/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"

#include "ac_debug.h"

#include "util/fast_idiv_by_const.h"

enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS)
};

enum {
   RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
                         VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
                         VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family,
                                         uint32_t dst_family, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

static void
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radv_shader_info *info;

   if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
      return;

   info = &pipeline->streamout_shader->info;
   for (int i = 0; i < MAX_SO_BUFFERS; i++)
      so->stride_in_dw[i] = info->so.strides[i];

   so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
}

bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
}

enum ring_type
radv_queue_family_to_ring(int f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return RING_GFX;
   case RADV_QUEUE_COMPUTE:
      return RING_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return RING_DMA;
   default:
      unreachable("Unknown queue family");
   }
}

static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
      vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result =
      vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->queue_family_index = pool->queue_family_index;

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
       cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in less of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
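   /* "gap" is the room left in the current scalar cache line; realign only
    * when the tail of this allocation (size modulo line_size) would not fit
    * in that gap, i.e. when starting on a fresh line touches fewer lines.
    */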
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;

   if (ptr)
      memcpy(ptr, data, size);

   return true;
}

void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.chip_class,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum ring_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   switch (ring) {
   case RING_GFX:
      va += 8;
      break;
   case RING_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid ring type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 32;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                           gl_shader_stage stage, int idx, uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
                            false);
}

static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
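   /* Selection-style pass: repeatedly pick the closest remaining sample and
    * retire it by overwriting its distance with 0xffffffff.
    */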
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
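   /* 2x/4x only needs the first PA_SC_AA_SAMPLE_LOCS_PIXEL_X?Y?_0 register of
    * each pixel in the 2x2 grid; 8x also programs the _1 registers.
    */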
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                             gl_shader_stage stage, int idx, uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   int num_samples = pipeline->graphics.ms.num_samples;
   struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
          pipeline->graphics.binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->graphics.binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_variant_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                      bool vertex_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   if (vertex_stage_only) {
      /* Fast prefetch path for starting draws as soon as possible.
       */
      mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
   }

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(pipeline))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   if (!cmd_buffer->state.attachments || !subpass)
      return;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

      unsigned format = G_028C70_FORMAT(cb->cb_color_info);
      unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
      uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;

      bool has_alpha, has_rgb;

      /* Set if RGB and A are present. */
      has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);

      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_5_6_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_1_5_5_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_4_4_4_4:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_32:
         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
         break;

      case V_028C70_COLOR_16:
      case V_028C70_COLOR_16_16:
         /* For 1-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
            else
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
         }
         break;

      case V_028C70_COLOR_10_11_11:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
         break;

      case V_028C70_COLOR_2_10_10_10:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
         }
         break;
      case V_028C70_COLOR_5_9_9_9:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
         break;
      }
   }

   /* Do not set the DISABLE bits for the unused attachments, as that
    * breaks dual source blending in SkQP and does not seem to improve
    * performance. */

   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
      return;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;

   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
}

static void
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->pbb_allowed)
      return;

   struct radv_binning_settings settings =
      radv_get_binning_settings(cmd_buffer->device->physical_device);
   bool break_for_new_ps =
      (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
          cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
      (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
   bool break_for_new_cb_target_mask =
      (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
      settings.context_states_per_bin > 1;

   if (!break_for_new_ps && !break_for_new_cb_target_mask)
      return;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}

static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;

   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
      return;

   radv_update_multisample_state(cmd_buffer, pipeline);
   radv_update_binning_state(cmd_buffer, pipeline);

   cmd_buffer->scratch_size_per_wave_needed =
      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves);
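
   /* Mark dynamic states dirty when the register values baked into the new
    * pipeline differ from the currently emitted one, so the combined packets
    * get re-emitted.
    */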
   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
          pipeline->graphics.can_use_guardband)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
          pipeline->graphics.pa_su_sc_mode_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl !=
          pipeline->graphics.pa_cl_clip_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.cb_color_control !=
          pipeline->graphics.cb_color_control)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;

   if (!cmd_buffer->state.emitted_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
          pipeline->graphics.db_depth_control)
      cmd_buffer->state.dirty |=
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
          pipeline->graphics.cb_target_mask) {
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
   }

   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

   if (pipeline->graphics.has_ngg_culling &&
       pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
       !cmd_buffer->state.last_nggc_settings) {
      /* The already emitted RSRC2 contains the LDS required for NGG culling.
       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
       * API GS always needs LDS, so this isn't useful there.
       */
      struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
                           S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
   }

   if (!cmd_buffer->state.emitted_pipeline ||
       cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
       cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
       memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf,
              pipeline->ctx_cs.cdw * 4)) {
      radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_emit_batch_break_on_new_ps(cmd_buffer);

   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
      if (!pipeline->shaders[i])
         continue;

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
   }

   if (radv_pipeline_has_gs_copy_shader(pipeline))
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, pipeline);

   cmd_buffer->state.emitted_pipeline = pipeline;

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
}

static void
radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
   int i;
   const unsigned count = viewport->count;

   assert(count);
   radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);

   for (i = 0; i < count; i++) {
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[2]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[2]));
   }

   radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
   for (i = 0; i < count; i++) {
      float zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
      float zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
      radeon_emit(cmd_buffer->cs, fui(zmin));
      radeon_emit(cmd_buffer->cs, fui(zmax));
   }
}

static void
radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
{
   uint32_t count = cmd_buffer->state.dynamic.scissor.count;

   si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
                     cmd_buffer->state.dynamic.viewport.viewports,
                     cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);

   cmd_buffer->state.context_roll_without_scissor_emitted = false;
}

static void
radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->state.dynamic.discard_rectangle.count)
      return;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
                              cmd_buffer->state.dynamic.discard_rectangle.count * 2);
   for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
      VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
      radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
      radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
                                     S_028214_BR_Y(rect.offset.y + rect.extent.height));
   }
}

static void
radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
{
   unsigned width = cmd_buffer->state.dynamic.line_width * 8;

   radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
                          S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
}

static void
radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
   radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
}

static void
radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
   radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
                                  S_028430_STENCILMASK(d->stencil_compare_mask.front) |
                                  S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
                                  S_028430_STENCILOPVAL(1));
   radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
                                  S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
                                  S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
                                  S_028434_STENCILOPVAL_BF(1));
}

static void
radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
}

static void
radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   unsigned slope = fui(d->depth_bias.slope * 16.0f);

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
   radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
   radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
}

static void
radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   uint32_t auto_reset_cntl = 1;

   if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
      auto_reset_cntl = 2;

   radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
                          S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
                             S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
                             S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
}

static void
radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
{
   unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
                         C_028814_CULL_BACK &
                         C_028814_FACE &
                         C_028814_POLY_OFFSET_FRONT_ENABLE &
                         C_028814_POLY_OFFSET_BACK_ENABLE &
                         C_028814_POLY_OFFSET_PARA_ENABLE;

   pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
                         S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
                         S_028814_FACE(d->front_face) |
                         S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
                         S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
                         S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);

   radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
}

static void
radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
                                 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
   } else {
      radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
   }
}

static void
radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
{
   unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   db_depth_control &= C_028800_Z_ENABLE &
                       C_028800_Z_WRITE_ENABLE &
                       C_028800_ZFUNC &
                       C_028800_DEPTH_BOUNDS_ENABLE &
                       C_028800_STENCIL_ENABLE &
                       C_028800_BACKFACE_ENABLE &
                       C_028800_STENCILFUNC &
                       C_028800_STENCILFUNC_BF;

   db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
                       S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
                       S_028800_ZFUNC(d->depth_compare_op) |
                       S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
                       S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
                       S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1 : 0) | 1594 S_028800_STENCILFUNC(d->stencil_op.front.compare_op) | 1595 S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op); 1596 1597 radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control); 1598} 1599 1600static void 1601radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer) 1602{ 1603 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1604 1605 radeon_set_context_reg( 1606 cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, 1607 S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) | 1608 S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) | 1609 S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) | 1610 S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) | 1611 S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) | 1612 S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op))); 1613} 1614 1615static void 1616radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer) 1617{ 1618 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 1619 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 1620 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1621 uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1; 1622 uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1; 1623 uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl; 1624 uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0]; 1625 uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1]; 1626 1627 if (subpass && !subpass->vrs_attachment) { 1628 /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we 1629 * can cheat by tweaking the different combiner modes. 1630 */ 1631 switch (htile_comb_mode) { 1632 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR: 1633 /* The result of min(A, 1x1) is always 1x1. */ 1634 FALLTHROUGH; 1635 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR: 1636 /* Force the per-draw VRS rate to 1x1. */ 1637 rate_x = rate_y = 0; 1638 1639 /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate 1640 * combiner mode as passthrough. 1641 */ 1642 vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU; 1643 break; 1644 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR: 1645 /* The result of max(A, 1x1) is always A. */ 1646 FALLTHROUGH; 1647 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR: 1648 /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */ 1649 break; 1650 default: 1651 break; 1652 } 1653 } 1654 1655 /* Emit per-draw VRS rate which is the first combiner. */ 1656 radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, 1657 S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y)); 1658 1659 /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the 1660 * draw rate and the vertex rate. 1661 */ 1662 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode); 1663 1664 /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE 1665 * rate. 
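    * The combiners form a cascade: the per-draw rate written to GE_VRS_RATE above is first
    * combined with the per-vertex/per-primitive rate, and that result is then combined with the
    * rate stored in HTILE. A passthrough/KEEP combiner simply forwards the previous result,
    * which is what the code above relies on to force an effective 1x1 rate (e.g. min(2x2, 1x1)
    * is always 1x1) when the subpass has no VRS attachment.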
1666 */ 1667 pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode); 1668 1669 radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl); 1670} 1671 1672static void 1673radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer) 1674{ 1675 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1676 1677 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 1678 radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, 1679 d->primitive_restart_enable); 1680 } else { 1681 radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 1682 d->primitive_restart_enable); 1683 } 1684} 1685 1686static void 1687radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer) 1688{ 1689 unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl; 1690 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1691 1692 pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL; 1693 pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable); 1694 1695 radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl); 1696} 1697 1698static void 1699radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer) 1700{ 1701 unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control; 1702 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1703 1704 cb_color_control &= C_028808_ROP3; 1705 cb_color_control |= S_028808_ROP3(d->logic_op); 1706 1707 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control); 1708} 1709 1710static void 1711radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer) 1712{ 1713 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 1714 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1715 1716 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, 1717 pipeline->graphics.cb_target_mask & d->color_write_enable); 1718} 1719 1720static void 1721radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, 1722 struct radv_color_buffer_info *cb, struct radv_image_view *iview, 1723 VkImageLayout layout, bool in_render_loop, bool disable_dcc) 1724{ 1725 bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8; 1726 uint32_t cb_color_info = cb->cb_color_info; 1727 struct radv_image *image = iview->image; 1728 1729 if (!radv_layout_dcc_compressed( 1730 cmd_buffer->device, image, iview->base_mip, layout, in_render_loop, 1731 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 1732 cmd_buffer->queue_family_index)) || 1733 disable_dcc) { 1734 cb_color_info &= C_028C70_DCC_ENABLE; 1735 } 1736 1737 if (!radv_layout_fmask_compressed( 1738 cmd_buffer->device, image, layout, 1739 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 1740 cmd_buffer->queue_family_index))) { 1741 cb_color_info &= C_028C70_COMPRESSION; 1742 } 1743 1744 if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) || 1745 radv_is_dcc_decompress_pipeline(cmd_buffer))) { 1746 /* If this bit is set, the FMASK decompression operation 1747 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS). 
1748 */ 1749 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY; 1750 } 1751 1752 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 1753 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 1754 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 1755 radeon_emit(cmd_buffer->cs, 0); 1756 radeon_emit(cmd_buffer->cs, 0); 1757 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 1758 radeon_emit(cmd_buffer->cs, cb_color_info); 1759 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 1760 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 1761 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 1762 radeon_emit(cmd_buffer->cs, 0); 1763 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 1764 radeon_emit(cmd_buffer->cs, 0); 1765 1766 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); 1767 1768 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, 1769 cb->cb_color_base >> 32); 1770 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4, 1771 cb->cb_color_cmask >> 32); 1772 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4, 1773 cb->cb_color_fmask >> 32); 1774 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, 1775 cb->cb_dcc_base >> 32); 1776 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, 1777 cb->cb_color_attrib2); 1778 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, 1779 cb->cb_color_attrib3); 1780 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { 1781 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 1782 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 1783 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32)); 1784 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2); 1785 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 1786 radeon_emit(cmd_buffer->cs, cb_color_info); 1787 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 1788 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 1789 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 1790 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32)); 1791 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 1792 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32)); 1793 1794 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2); 1795 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base); 1796 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32)); 1797 1798 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, 1799 cb->cb_mrt_epitch); 1800 } else { 1801 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 1802 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 1803 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); 1804 radeon_emit(cmd_buffer->cs, cb->cb_color_slice); 1805 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 1806 radeon_emit(cmd_buffer->cs, cb_color_info); 1807 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 1808 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 1809 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 1810 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); 1811 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 1812 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); 1813 1814 if (is_vi) { /* DCC BASE */ 1815 
radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 1816 cb->cb_dcc_base); 1817 } 1818 } 1819 1820 if (G_028C70_DCC_ENABLE(cb_color_info)) { 1821 /* Drawing with DCC enabled also compresses colorbuffers. */ 1822 VkImageSubresourceRange range = { 1823 .aspectMask = iview->aspect_mask, 1824 .baseMipLevel = iview->base_mip, 1825 .levelCount = iview->level_count, 1826 .baseArrayLayer = iview->base_layer, 1827 .layerCount = iview->layer_count, 1828 }; 1829 1830 radv_update_dcc_metadata(cmd_buffer, image, &range, true); 1831 } 1832} 1833 1834static void 1835radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 1836 const struct radv_image_view *iview, VkImageLayout layout, 1837 bool in_render_loop, bool requires_cond_exec) 1838{ 1839 const struct radv_image *image = iview->image; 1840 uint32_t db_z_info = ds->db_z_info; 1841 uint32_t db_z_info_reg; 1842 1843 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || 1844 !radv_image_is_tc_compat_htile(image)) 1845 return; 1846 1847 if (!radv_layout_is_htile_compressed( 1848 cmd_buffer->device, image, layout, in_render_loop, 1849 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 1850 cmd_buffer->queue_family_index))) { 1851 db_z_info &= C_028040_TILE_SURFACE_ENABLE; 1852 } 1853 1854 db_z_info &= C_028040_ZRANGE_PRECISION; 1855 1856 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { 1857 db_z_info_reg = R_028038_DB_Z_INFO; 1858 } else { 1859 db_z_info_reg = R_028040_DB_Z_INFO; 1860 } 1861 1862 /* When we don't know the last fast clear value we need to emit a 1863 * conditional packet that will eventually skip the following 1864 * SET_CONTEXT_REG packet. 1865 */ 1866 if (requires_cond_exec) { 1867 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip); 1868 1869 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0)); 1870 radeon_emit(cmd_buffer->cs, va); 1871 radeon_emit(cmd_buffer->cs, va >> 32); 1872 radeon_emit(cmd_buffer->cs, 0); 1873 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */ 1874 } 1875 1876 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info); 1877} 1878 1879static void 1880radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 1881 struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop) 1882{ 1883 const struct radv_image *image = iview->image; 1884 uint32_t db_z_info = ds->db_z_info; 1885 uint32_t db_stencil_info = ds->db_stencil_info; 1886 1887 if (!radv_layout_is_htile_compressed( 1888 cmd_buffer->device, image, layout, in_render_loop, 1889 radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 1890 cmd_buffer->queue_family_index))) { 1891 db_z_info &= C_028040_TILE_SURFACE_ENABLE; 1892 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); 1893 } 1894 1895 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); 1896 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); 1897 1898 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 1899 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); 1900 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size); 1901 1902 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7); 1903 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1)); 1904 radeon_emit(cmd_buffer->cs, db_z_info); 1905 
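      /* The remaining dwords of this packed sequence are DB_STENCIL_INFO and the Z/stencil base
       * addresses; note that on GFX10 the read and write base registers are programmed with the
       * same values (db_z_read_base and db_stencil_read_base are each emitted twice).
       */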
radeon_emit(cmd_buffer->cs, db_stencil_info);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
   } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
      radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
      radeon_emit(cmd_buffer->cs, ds->db_depth_size);

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
      radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
      radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
      radeon_emit(cmd_buffer->cs, ds->db_z_info2);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
   } else {
      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);

      radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
      radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
      radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
      radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
      radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
   }

   /* Update the ZRANGE_PRECISION value for the TC-compat bug.
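    * On affected chips the precision bit has to be 0 when the depth buffer was last fast-cleared
    * to 0.0f; since the clear value is only known at execution time, the register write inside
    * radv_update_zrange_precision() is guarded by a COND_EXEC packet that reads the
    * tc_compat_zrange metadata.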
*/ 1958 radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true); 1959 1960 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, 1961 ds->pa_su_poly_offset_db_fmt_cntl); 1962} 1963 1964/** 1965 * Update the fast clear depth/stencil values if the image is bound as a 1966 * depth/stencil buffer. 1967 */ 1968static void 1969radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, 1970 const struct radv_image_view *iview, 1971 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 1972{ 1973 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 1974 const struct radv_image *image = iview->image; 1975 struct radeon_cmdbuf *cs = cmd_buffer->cs; 1976 uint32_t att_idx; 1977 1978 if (!cmd_buffer->state.attachments || !subpass) 1979 return; 1980 1981 if (!subpass->depth_stencil_attachment) 1982 return; 1983 1984 att_idx = subpass->depth_stencil_attachment->attachment; 1985 if (cmd_buffer->state.attachments[att_idx].iview->image != image) 1986 return; 1987 1988 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 1989 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); 1990 radeon_emit(cs, ds_clear_value.stencil); 1991 radeon_emit(cs, fui(ds_clear_value.depth)); 1992 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 1993 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth)); 1994 } else { 1995 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 1996 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil); 1997 } 1998 1999 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is 2000 * only needed when clearing Z to 0.0. 2001 */ 2002 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) { 2003 VkImageLayout layout = subpass->depth_stencil_attachment->layout; 2004 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 2005 2006 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview, 2007 layout, in_render_loop, false); 2008 } 2009 2010 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2011} 2012 2013/** 2014 * Set the clear depth/stencil values to the image's metadata. 2015 */ 2016static void 2017radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2018 const VkImageSubresourceRange *range, 2019 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 2020{ 2021 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2022 uint32_t level_count = radv_get_levelCount(image, range); 2023 2024 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 2025 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel); 2026 2027 /* Use the fastest way when both aspects are used. */ 2028 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating)); 2029 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2030 radeon_emit(cs, va); 2031 radeon_emit(cs, va >> 32); 2032 2033 for (uint32_t l = 0; l < level_count; l++) { 2034 radeon_emit(cs, ds_clear_value.stencil); 2035 radeon_emit(cs, fui(ds_clear_value.depth)); 2036 } 2037 } else { 2038 /* Otherwise we need one WRITE_DATA packet per level. 
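       * Only one of the two clear dwords is written per level (stencil at offset 0 or depth at
       * offset 4), so the destination addresses are not contiguous and a single packet cannot
       * cover them.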
*/ 2039 for (uint32_t l = 0; l < level_count; l++) { 2040 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l); 2041 unsigned value; 2042 2043 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 2044 value = fui(ds_clear_value.depth); 2045 va += 4; 2046 } else { 2047 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 2048 value = ds_clear_value.stencil; 2049 } 2050 2051 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); 2052 radeon_emit(cs, 2053 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2054 radeon_emit(cs, va); 2055 radeon_emit(cs, va >> 32); 2056 radeon_emit(cs, value); 2057 } 2058 } 2059} 2060 2061/** 2062 * Update the TC-compat metadata value for this image. 2063 */ 2064static void 2065radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2066 const VkImageSubresourceRange *range, uint32_t value) 2067{ 2068 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2069 2070 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug) 2071 return; 2072 2073 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel); 2074 uint32_t level_count = radv_get_levelCount(image, range); 2075 2076 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating)); 2077 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2078 radeon_emit(cs, va); 2079 radeon_emit(cs, va >> 32); 2080 2081 for (uint32_t l = 0; l < level_count; l++) 2082 radeon_emit(cs, value); 2083} 2084 2085static void 2086radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, 2087 const struct radv_image_view *iview, 2088 VkClearDepthStencilValue ds_clear_value) 2089{ 2090 VkImageSubresourceRange range = { 2091 .aspectMask = iview->aspect_mask, 2092 .baseMipLevel = iview->base_mip, 2093 .levelCount = iview->level_count, 2094 .baseArrayLayer = iview->base_layer, 2095 .layerCount = iview->layer_count, 2096 }; 2097 uint32_t cond_val; 2098 2099 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last 2100 * depth clear value is 0.0f. 2101 */ 2102 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0; 2103 2104 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val); 2105} 2106 2107/** 2108 * Update the clear depth/stencil values for this image. 2109 */ 2110void 2111radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 2112 const struct radv_image_view *iview, 2113 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 2114{ 2115 VkImageSubresourceRange range = { 2116 .aspectMask = iview->aspect_mask, 2117 .baseMipLevel = iview->base_mip, 2118 .levelCount = iview->level_count, 2119 .baseArrayLayer = iview->base_layer, 2120 .layerCount = iview->layer_count, 2121 }; 2122 struct radv_image *image = iview->image; 2123 2124 assert(radv_htile_enabled(image, range.baseMipLevel)); 2125 2126 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects); 2127 2128 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { 2129 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value); 2130 } 2131 2132 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects); 2133} 2134 2135/** 2136 * Load the clear depth/stencil values from the image's metadata. 
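 * The values are copied from the per-level metadata into DB_STENCIL_CLEAR/DB_DEPTH_CLEAR, using
 * LOAD_CONTEXT_REG_INDEX when the firmware supports it and a COPY_DATA + PFP_SYNC_ME fallback
 * otherwise.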
2137 */ 2138static void 2139radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview) 2140{ 2141 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2142 const struct radv_image *image = iview->image; 2143 VkImageAspectFlags aspects = vk_format_aspects(image->vk_format); 2144 uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip); 2145 unsigned reg_offset = 0, reg_count = 0; 2146 2147 assert(radv_image_has_htile(image)); 2148 2149 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 2150 ++reg_count; 2151 } else { 2152 ++reg_offset; 2153 va += 4; 2154 } 2155 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) 2156 ++reg_count; 2157 2158 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; 2159 2160 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 2161 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0)); 2162 radeon_emit(cs, va); 2163 radeon_emit(cs, va >> 32); 2164 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 2165 radeon_emit(cs, reg_count); 2166 } else { 2167 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 2168 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 2169 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0)); 2170 radeon_emit(cs, va); 2171 radeon_emit(cs, va >> 32); 2172 radeon_emit(cs, reg >> 2); 2173 radeon_emit(cs, 0); 2174 2175 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 2176 radeon_emit(cs, 0); 2177 } 2178} 2179 2180/* 2181 * With DCC some colors don't require CMASK elimination before being 2182 * used as a texture. This sets a predicate value to determine if the 2183 * cmask eliminate is required. 2184 */ 2185void 2186radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2187 const VkImageSubresourceRange *range, bool value) 2188{ 2189 if (!image->fce_pred_offset) 2190 return; 2191 2192 uint64_t pred_val = value; 2193 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel); 2194 uint32_t level_count = radv_get_levelCount(image, range); 2195 uint32_t count = 2 * level_count; 2196 2197 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 2198 radeon_emit(cmd_buffer->cs, 2199 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2200 radeon_emit(cmd_buffer->cs, va); 2201 radeon_emit(cmd_buffer->cs, va >> 32); 2202 2203 for (uint32_t l = 0; l < level_count; l++) { 2204 radeon_emit(cmd_buffer->cs, pred_val); 2205 radeon_emit(cmd_buffer->cs, pred_val >> 32); 2206 } 2207} 2208 2209/** 2210 * Update the DCC predicate to reflect the compression state. 
2211 */ 2212void 2213radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2214 const VkImageSubresourceRange *range, bool value) 2215{ 2216 if (image->dcc_pred_offset == 0) 2217 return; 2218 2219 uint64_t pred_val = value; 2220 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel); 2221 uint32_t level_count = radv_get_levelCount(image, range); 2222 uint32_t count = 2 * level_count; 2223 2224 assert(radv_dcc_enabled(image, range->baseMipLevel)); 2225 2226 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 2227 radeon_emit(cmd_buffer->cs, 2228 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2229 radeon_emit(cmd_buffer->cs, va); 2230 radeon_emit(cmd_buffer->cs, va >> 32); 2231 2232 for (uint32_t l = 0; l < level_count; l++) { 2233 radeon_emit(cmd_buffer->cs, pred_val); 2234 radeon_emit(cmd_buffer->cs, pred_val >> 32); 2235 } 2236} 2237 2238/** 2239 * Update the fast clear color values if the image is bound as a color buffer. 2240 */ 2241static void 2242radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2243 int cb_idx, uint32_t color_values[2]) 2244{ 2245 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2246 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2247 uint32_t att_idx; 2248 2249 if (!cmd_buffer->state.attachments || !subpass) 2250 return; 2251 2252 att_idx = subpass->color_attachments[cb_idx].attachment; 2253 if (att_idx == VK_ATTACHMENT_UNUSED) 2254 return; 2255 2256 if (cmd_buffer->state.attachments[att_idx].iview->image != image) 2257 return; 2258 2259 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); 2260 radeon_emit(cs, color_values[0]); 2261 radeon_emit(cs, color_values[1]); 2262 2263 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2264} 2265 2266/** 2267 * Set the clear color values to the image's metadata. 2268 */ 2269static void 2270radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2271 const VkImageSubresourceRange *range, uint32_t color_values[2]) 2272{ 2273 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2274 uint32_t level_count = radv_get_levelCount(image, range); 2275 uint32_t count = 2 * level_count; 2276 2277 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)); 2278 2279 if (radv_image_has_clear_value(image)) { 2280 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel); 2281 2282 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating)); 2283 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2284 radeon_emit(cs, va); 2285 radeon_emit(cs, va >> 32); 2286 2287 for (uint32_t l = 0; l < level_count; l++) { 2288 radeon_emit(cs, color_values[0]); 2289 radeon_emit(cs, color_values[1]); 2290 } 2291 } else { 2292 /* Some default value we can set in the update. */ 2293 assert(color_values[0] == 0 && color_values[1] == 0); 2294 } 2295} 2296 2297/** 2298 * Update the clear color values for this image. 
2299 */ 2300void 2301radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 2302 const struct radv_image_view *iview, int cb_idx, 2303 uint32_t color_values[2]) 2304{ 2305 struct radv_image *image = iview->image; 2306 VkImageSubresourceRange range = { 2307 .aspectMask = iview->aspect_mask, 2308 .baseMipLevel = iview->base_mip, 2309 .levelCount = iview->level_count, 2310 .baseArrayLayer = iview->base_layer, 2311 .layerCount = iview->layer_count, 2312 }; 2313 2314 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip)); 2315 2316 /* Do not need to update the clear value for images that are fast cleared with the comp-to-single 2317 * mode because the hardware gets the value from the image directly. 2318 */ 2319 if (iview->image->support_comp_to_single) 2320 return; 2321 2322 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values); 2323 2324 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 2325} 2326 2327/** 2328 * Load the clear color values from the image's metadata. 2329 */ 2330static void 2331radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, 2332 int cb_idx) 2333{ 2334 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2335 struct radv_image *image = iview->image; 2336 2337 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip)) 2338 return; 2339 2340 if (iview->image->support_comp_to_single) 2341 return; 2342 2343 if (!radv_image_has_clear_value(image)) { 2344 uint32_t color_values[2] = {0, 0}; 2345 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 2346 return; 2347 } 2348 2349 uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip); 2350 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; 2351 2352 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 2353 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating)); 2354 radeon_emit(cs, va); 2355 radeon_emit(cs, va >> 32); 2356 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 2357 radeon_emit(cs, 2); 2358 } else { 2359 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 2360 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 2361 COPY_DATA_COUNT_SEL); 2362 radeon_emit(cs, va); 2363 radeon_emit(cs, va >> 32); 2364 radeon_emit(cs, reg >> 2); 2365 radeon_emit(cs, 0); 2366 2367 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 2368 radeon_emit(cs, 0); 2369 } 2370} 2371 2372/* GFX9+ metadata cache flushing workaround. metadata cache coherency is 2373 * broken if the CB caches data of multiple mips of the same image at the 2374 * same time. 2375 * 2376 * Insert some flushes to avoid this. 
2377 */ 2378static void 2379radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer) 2380{ 2381 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 2382 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2383 bool color_mip_changed = false; 2384 2385 /* Entire workaround is not applicable before GFX9 */ 2386 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) 2387 return; 2388 2389 if (!framebuffer) 2390 return; 2391 2392 for (int i = 0; i < subpass->color_count; ++i) { 2393 int idx = subpass->color_attachments[i].attachment; 2394 if (idx == VK_ATTACHMENT_UNUSED) 2395 continue; 2396 2397 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2398 2399 if ((radv_image_has_CB_metadata(iview->image) || 2400 radv_dcc_enabled(iview->image, iview->base_mip) || 2401 radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) && 2402 cmd_buffer->state.cb_mip[i] != iview->base_mip) 2403 color_mip_changed = true; 2404 2405 cmd_buffer->state.cb_mip[i] = iview->base_mip; 2406 } 2407 2408 if (color_mip_changed) { 2409 cmd_buffer->state.flush_bits |= 2410 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 2411 } 2412} 2413 2414/* This function does the flushes for mip changes if the levels are not zero for 2415 * all render targets. This way we can assume at the start of the next cmd_buffer 2416 * that rendering to mip 0 doesn't need any flushes. As that is the most common 2417 * case that saves some flushes. */ 2418static void 2419radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer) 2420{ 2421 /* Entire workaround is not applicable before GFX9 */ 2422 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) 2423 return; 2424 2425 bool need_color_mip_flush = false; 2426 for (unsigned i = 0; i < 8; ++i) { 2427 if (cmd_buffer->state.cb_mip[i]) { 2428 need_color_mip_flush = true; 2429 break; 2430 } 2431 } 2432 2433 if (need_color_mip_flush) { 2434 cmd_buffer->state.flush_bits |= 2435 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 2436 } 2437 2438 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip)); 2439} 2440 2441static struct radv_image * 2442radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer) 2443{ 2444 struct radv_device *device = cmd_buffer->device; 2445 2446 if (!device->vrs.image) { 2447 VkResult result; 2448 2449 /* The global VRS state is initialized on-demand to avoid wasting VRAM. 
*/ 2450 result = radv_device_init_vrs_state(device); 2451 if (result != VK_SUCCESS) { 2452 cmd_buffer->record_result = result; 2453 return NULL; 2454 } 2455 } 2456 2457 return device->vrs.image; 2458} 2459 2460static void 2461radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) 2462{ 2463 int i; 2464 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 2465 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2466 2467 /* this may happen for inherited secondary recording */ 2468 if (!framebuffer) 2469 return; 2470 2471 for (i = 0; i < 8; ++i) { 2472 if (i >= subpass->color_count || 2473 subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { 2474 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 2475 S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 2476 continue; 2477 } 2478 2479 int idx = subpass->color_attachments[i].attachment; 2480 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2481 VkImageLayout layout = subpass->color_attachments[i].layout; 2482 bool in_render_loop = subpass->color_attachments[i].in_render_loop; 2483 2484 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo); 2485 2486 assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT | 2487 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)); 2488 radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, 2489 in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc); 2490 2491 radv_load_color_clear_metadata(cmd_buffer, iview, i); 2492 } 2493 2494 if (subpass->depth_stencil_attachment) { 2495 int idx = subpass->depth_stencil_attachment->attachment; 2496 VkImageLayout layout = subpass->depth_stencil_attachment->layout; 2497 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 2498 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2499 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 2500 cmd_buffer->state.attachments[idx].iview->image->bo); 2501 2502 radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, 2503 in_render_loop); 2504 2505 if (radv_layout_is_htile_compressed( 2506 cmd_buffer->device, iview->image, layout, in_render_loop, 2507 radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, 2508 cmd_buffer->queue_family_index))) { 2509 /* Only load the depth/stencil fast clear values when 2510 * compressed rendering is enabled. 2511 */ 2512 radv_load_ds_clear_metadata(cmd_buffer, iview); 2513 } 2514 } else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) { 2515 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to 2516 * bind our internal depth buffer that contains the VRS data as part of HTILE. 
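      * The rates are encoded in that buffer's HTILE metadata, so the DB can still fetch a
      * per-tile shading rate even though the application did not bind a depth/stencil
      * attachment.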
2517 */ 2518 VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; 2519 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 2520 struct radv_image *image = cmd_buffer->device->vrs.image; 2521 struct radv_ds_buffer_info ds; 2522 struct radv_image_view iview; 2523 2524 radv_image_view_init(&iview, cmd_buffer->device, 2525 &(VkImageViewCreateInfo){ 2526 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, 2527 .image = radv_image_to_handle(image), 2528 .viewType = radv_meta_get_view_type(image), 2529 .format = image->vk_format, 2530 .subresourceRange = 2531 { 2532 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, 2533 .baseMipLevel = 0, 2534 .levelCount = 1, 2535 .baseArrayLayer = 0, 2536 .layerCount = 1, 2537 }, 2538 }, 2539 NULL); 2540 2541 radv_initialise_vrs_surface(image, htile_buffer, &ds); 2542 2543 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo); 2544 2545 radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false); 2546 2547 radv_image_view_finish(&iview); 2548 } else { 2549 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) 2550 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); 2551 else 2552 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); 2553 2554 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ 2555 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ 2556 } 2557 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, 2558 S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height)); 2559 2560 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) { 2561 bool disable_constant_encode = 2562 cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode; 2563 enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; 2564 uint8_t watermark = chip_class >= GFX10 ? 6 : 4; 2565 2566 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL, 2567 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) | 2568 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | 2569 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode)); 2570 } 2571 2572 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER; 2573} 2574 2575static void 2576radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect) 2577{ 2578 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2579 struct radv_cmd_state *state = &cmd_buffer->state; 2580 2581 if (state->index_type != state->last_index_type) { 2582 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 2583 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, 2584 R_03090C_VGT_INDEX_TYPE, 2, state->index_type); 2585 } else { 2586 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); 2587 radeon_emit(cs, state->index_type); 2588 } 2589 2590 state->last_index_type = state->index_type; 2591 } 2592 2593 /* For the direct indexed draws we use DRAW_INDEX_2, which includes 2594 * the index_va and max_index_count already. 
*/ 2595 if (!indirect) 2596 return; 2597 2598 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); 2599 radeon_emit(cs, state->index_va); 2600 radeon_emit(cs, state->index_va >> 32); 2601 2602 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); 2603 radeon_emit(cs, state->max_index_count); 2604 2605 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER; 2606} 2607 2608void 2609radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) 2610{ 2611 bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled; 2612 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 2613 uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0; 2614 uint32_t db_count_control; 2615 2616 if (!cmd_buffer->state.active_occlusion_queries) { 2617 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 2618 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) && 2619 pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) { 2620 /* Re-enable out-of-order rasterization if the 2621 * bound pipeline supports it and if it's has 2622 * been disabled before starting any perfect 2623 * occlusion queries. 2624 */ 2625 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1); 2626 } 2627 } 2628 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); 2629 } else { 2630 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2631 uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0; 2632 bool gfx10_perfect = 2633 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries; 2634 2635 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 2636 /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially 2637 * covered tiles, discards, and early depth testing. For more details, 2638 * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */ 2639 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | 2640 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | 2641 S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) | 2642 S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1); 2643 2644 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) && 2645 pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) { 2646 /* If the bound pipeline has enabled 2647 * out-of-order rasterization, we should 2648 * disable it before starting any perfect 2649 * occlusion queries. 2650 */ 2651 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE; 2652 2653 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1); 2654 } 2655 } else { 2656 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate); 2657 } 2658 } 2659 2660 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); 2661 2662 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2663} 2664 2665unsigned 2666radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs) 2667{ 2668 /* instance_rate_vs_prologs is a flattened array of array of arrays of different sizes, or a 2669 * single array sorted in ascending order using: 2670 * - total number of attributes 2671 * - number of instanced attributes 2672 * - index of first instanced attribute 2673 */ 2674 2675 /* From total number of attributes to offset. 
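    * For a given total of n attributes there are n * (n + 1) / 2 prolog variants (one per
    * contiguous range of instanced attributes), so the table is a running sum of triangular
    * numbers: e.g. num_attributes == 3 starts at 1 + 3 == 4, matching total_to_offset[2].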
*/ 2676 static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 2677 120, 165, 220, 286, 364, 455, 560, 680}; 2678 unsigned start_index = total_to_offset[num_attributes - 1]; 2679 2680 /* From number of instanced attributes to offset. This would require a different LUT depending on 2681 * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total 2682 * attributes. 2683 */ 2684 static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91, 2685 100, 108, 115, 121, 126, 130, 133, 135}; 2686 unsigned count = util_bitcount(instance_rate_inputs); 2687 unsigned offset_from_start_index = 2688 count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1)); 2689 2690 unsigned first = ffs(instance_rate_inputs) - 1; 2691 return start_index + offset_from_start_index + first; 2692} 2693 2694union vs_prolog_key_header { 2695 struct { 2696 uint32_t key_size : 8; 2697 uint32_t num_attributes : 6; 2698 uint32_t as_ls : 1; 2699 uint32_t is_ngg : 1; 2700 uint32_t wave32 : 1; 2701 uint32_t next_stage : 3; 2702 uint32_t instance_rate_inputs : 1; 2703 uint32_t alpha_adjust_lo : 1; 2704 uint32_t alpha_adjust_hi : 1; 2705 uint32_t misaligned_mask : 1; 2706 uint32_t post_shuffle : 1; 2707 uint32_t nontrivial_divisors : 1; 2708 /* We need this to ensure the padding is zero. It's useful even if it's unused. */ 2709 uint32_t padding0 : 6; 2710 }; 2711 uint32_t v; 2712}; 2713 2714uint32_t 2715radv_hash_vs_prolog(const void *key_) 2716{ 2717 const uint32_t *key = key_; 2718 union vs_prolog_key_header header; 2719 header.v = key[0]; 2720 return _mesa_hash_data(key, header.key_size); 2721} 2722 2723bool 2724radv_cmp_vs_prolog(const void *a_, const void *b_) 2725{ 2726 const uint32_t *a = a_; 2727 const uint32_t *b = b_; 2728 if (a[0] != b[0]) 2729 return false; 2730 2731 union vs_prolog_key_header header; 2732 header.v = a[0]; 2733 return memcmp(a, b, header.key_size) == 0; 2734} 2735 2736static struct radv_shader_prolog * 2737lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, 2738 uint32_t *nontrivial_divisors) 2739{ 2740 STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4); 2741 assert(vs_shader->info.vs.dynamic_inputs); 2742 2743 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 2744 const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 2745 struct radv_device *device = cmd_buffer->device; 2746 2747 unsigned num_attributes = pipeline->last_vertex_attrib_bit; 2748 uint32_t attribute_mask = BITFIELD_MASK(num_attributes); 2749 2750 uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask; 2751 *nontrivial_divisors = state->nontrivial_divisors & attribute_mask; 2752 enum chip_class chip = device->physical_device->rad_info.chip_class; 2753 const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? 
cmd_buffer->state.vbo_misaligned_mask : 0; 2754 2755 /* try to use a pre-compiled prolog first */ 2756 struct radv_shader_prolog *prolog = NULL; 2757 if (pipeline->can_use_simple_input && 2758 (!vs_shader->info.vs.as_ls || !instance_rate_inputs) && 2759 !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) { 2760 if (!instance_rate_inputs) { 2761 prolog = device->simple_vs_prologs[num_attributes - 1]; 2762 } else if (num_attributes <= 16 && !*nontrivial_divisors && 2763 util_bitcount(instance_rate_inputs) == 2764 (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) { 2765 unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs); 2766 prolog = device->instance_rate_vs_prologs[index]; 2767 } 2768 } 2769 if (prolog) 2770 return prolog; 2771 2772 /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */ 2773 uint32_t key_words[16]; 2774 unsigned key_size = 1; 2775 2776 struct radv_vs_prolog_key key; 2777 key.state = state; 2778 key.num_attributes = num_attributes; 2779 key.misaligned_mask = misaligned_mask; 2780 /* The instance ID input VGPR is placed differently when as_ls=true. */ 2781 key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs; 2782 key.is_ngg = vs_shader->info.is_ngg; 2783 key.wave32 = vs_shader->info.wave_size == 32; 2784 key.next_stage = pipeline->next_vertex_stage; 2785 2786 union vs_prolog_key_header header; 2787 header.v = 0; 2788 header.num_attributes = num_attributes; 2789 header.as_ls = key.as_ls; 2790 header.is_ngg = key.is_ngg; 2791 header.wave32 = key.wave32; 2792 header.next_stage = key.next_stage; 2793 2794 if (instance_rate_inputs & ~*nontrivial_divisors) { 2795 header.instance_rate_inputs = true; 2796 key_words[key_size++] = instance_rate_inputs; 2797 } 2798 if (*nontrivial_divisors) { 2799 header.nontrivial_divisors = true; 2800 key_words[key_size++] = *nontrivial_divisors; 2801 } 2802 if (misaligned_mask) { 2803 header.misaligned_mask = true; 2804 key_words[key_size++] = misaligned_mask; 2805 2806 uint8_t *formats = (uint8_t *)&key_words[key_size]; 2807 unsigned num_formats = 0; 2808 u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index]; 2809 while (num_formats & 0x3) 2810 formats[num_formats++] = 0; 2811 key_size += num_formats / 4u; 2812 2813 if (state->post_shuffle & attribute_mask) { 2814 header.post_shuffle = true; 2815 key_words[key_size++] = state->post_shuffle & attribute_mask; 2816 } 2817 } 2818 if (state->alpha_adjust_lo & attribute_mask) { 2819 header.alpha_adjust_lo = true; 2820 key_words[key_size++] = state->alpha_adjust_lo & attribute_mask; 2821 } 2822 if (state->alpha_adjust_hi & attribute_mask) { 2823 header.alpha_adjust_hi = true; 2824 key_words[key_size++] = state->alpha_adjust_hi & attribute_mask; 2825 } 2826 2827 header.key_size = key_size * sizeof(key_words[0]); 2828 key_words[0] = header.v; 2829 2830 uint32_t hash = radv_hash_vs_prolog(key_words); 2831 2832 if (cmd_buffer->state.emitted_vs_prolog && 2833 cmd_buffer->state.emitted_vs_prolog_key_hash == hash && 2834 radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key)) 2835 return cmd_buffer->state.emitted_vs_prolog; 2836 2837 u_rwlock_rdlock(&device->vs_prologs_lock); 2838 struct hash_entry *prolog_entry = 2839 _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); 2840 u_rwlock_rdunlock(&device->vs_prologs_lock); 2841 2842 if (!prolog_entry) { 2843 u_rwlock_wrlock(&device->vs_prologs_lock); 2844 prolog_entry = 
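         /* Look the key up again while holding the write lock: another thread may have created
          * and inserted the same prolog between the read-unlock above and taking the write lock.
          */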
_mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
      if (prolog_entry) {
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return prolog_entry->data;
      }

      prolog = radv_create_vs_prolog(device, &key);
      uint32_t *key2 = malloc(key_size * 4);
      if (!prolog || !key2) {
         radv_prolog_destroy(device, prolog);
         free(key2);
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return NULL;
      }
      memcpy(key2, key_words, key_size * 4);
      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);

      u_rwlock_wrunlock(&device->vs_prologs_lock);
      return prolog;
   }

   return prolog_entry->data;
}

static void
emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                 struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
      return;

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;

   assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
   assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);

   uint32_t rsrc1 = vs_shader->config.rsrc1;
   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);

   /* The main shader must not use fewer VGPRs than the prolog, otherwise shared vgprs might not
    * work.
    */
   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));

   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
   if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
   } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ?
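                   /* The VS runs merged into the TCS here (LS-HS), so the prolog address goes to
                    * the merged LS/HS program registers; their offsets differ between GFX9 and
                    * GFX10.
                    */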
R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS; 2899 rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS; 2900 } else if (vs_shader->info.vs.as_ls) { 2901 pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS; 2902 rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS; 2903 } else if (vs_shader->info.vs.as_es) { 2904 pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES; 2905 rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES; 2906 } 2907 2908 radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2); 2909 radeon_emit(cmd_buffer->cs, prolog_va >> 8); 2910 radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40)); 2911 2912 if (chip < GFX10) 2913 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1); 2914 else 2915 assert(rsrc1 == vs_shader->config.rsrc1); 2916 2917 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo); 2918} 2919 2920static void 2921emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, 2922 uint32_t nontrivial_divisors, bool pipeline_is_dirty) 2923{ 2924 /* no need to re-emit anything in this case */ 2925 if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog && 2926 !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors) 2927 return; 2928 2929 struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 2930 uint64_t input_va = radv_shader_variant_get_va(vs_shader); 2931 2932 if (nontrivial_divisors) { 2933 unsigned inputs_offset; 2934 uint32_t *inputs; 2935 unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8; 2936 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs)) 2937 return; 2938 2939 *(inputs++) = input_va; 2940 *(inputs++) = input_va >> 32; 2941 2942 u_foreach_bit(index, nontrivial_divisors) 2943 { 2944 uint32_t div = state->divisors[index]; 2945 if (div == 0) { 2946 *(inputs++) = 0; 2947 *(inputs++) = 1; 2948 } else if (util_is_power_of_two_or_zero(div)) { 2949 *(inputs++) = util_logbase2(div) | (1 << 8); 2950 *(inputs++) = 0xffffffffu; 2951 } else { 2952 struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32); 2953 *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16); 2954 *(inputs++) = info.multiplier; 2955 } 2956 } 2957 2958 input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset; 2959 } 2960 2961 struct radv_userdata_info *loc = 2962 &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS]; 2963 uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX]; 2964 assert(loc->sgpr_idx != -1); 2965 assert(loc->num_sgprs == 2); 2966 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 2967 input_va, true); 2968} 2969 2970static void 2971radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 2972{ 2973 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 2974 struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); 2975 2976 if (!vs_shader->info.vs.has_prolog) 2977 return; 2978 2979 uint32_t nontrivial_divisors; 2980 struct radv_shader_prolog *prolog = 2981 lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors); 2982 if (!prolog) { 2983 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 2984 return; 2985 } 2986 emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty); 2987 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty); 2988 2989 cmd_buffer->state.emitted_vs_prolog = prolog; 2990} 2991 2992static void 
2993radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 2994{ 2995 uint64_t states = 2996 cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state; 2997 2998 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) 2999 radv_emit_viewport(cmd_buffer); 3000 3001 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) && 3002 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 3003 radv_emit_scissor(cmd_buffer); 3004 3005 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) 3006 radv_emit_line_width(cmd_buffer); 3007 3008 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) 3009 radv_emit_blend_constants(cmd_buffer); 3010 3011 if (states & 3012 (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | 3013 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) 3014 radv_emit_stencil(cmd_buffer); 3015 3016 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS) 3017 radv_emit_depth_bounds(cmd_buffer); 3018 3019 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS) 3020 radv_emit_depth_bias(cmd_buffer); 3021 3022 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE) 3023 radv_emit_discard_rectangle(cmd_buffer); 3024 3025 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) 3026 radv_emit_sample_locations(cmd_buffer); 3027 3028 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) 3029 radv_emit_line_stipple(cmd_buffer); 3030 3031 if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 3032 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE)) 3033 radv_emit_culling(cmd_buffer, states); 3034 3035 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) 3036 radv_emit_primitive_topology(cmd_buffer); 3037 3038 if (states & 3039 (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | 3040 RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | 3041 RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) 3042 radv_emit_depth_control(cmd_buffer, states); 3043 3044 if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP) 3045 radv_emit_stencil_control(cmd_buffer); 3046 3047 if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) 3048 radv_emit_fragment_shading_rate(cmd_buffer); 3049 3050 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE) 3051 radv_emit_primitive_restart_enable(cmd_buffer); 3052 3053 if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) 3054 radv_emit_rasterizer_discard_enable(cmd_buffer); 3055 3056 if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP) 3057 radv_emit_logic_op(cmd_buffer); 3058 3059 if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) 3060 radv_emit_color_write_enable(cmd_buffer); 3061 3062 if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT) 3063 radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty); 3064 3065 cmd_buffer->state.dirty &= ~states; 3066} 3067 3068static void 3069radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) 3070{ 3071 struct radv_descriptor_state *descriptors_state = 3072 radv_get_descriptors_state(cmd_buffer, bind_point); 3073 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set; 3074 unsigned bo_offset; 3075 3076 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, 3077 &bo_offset)) 3078 return; 3079 3080 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3081 set->header.va += 
bo_offset; 3082} 3083 3084static void 3085radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, 3086 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3087{ 3088 struct radv_descriptor_state *descriptors_state = 3089 radv_get_descriptors_state(cmd_buffer, bind_point); 3090 uint32_t size = MAX_SETS * 4; 3091 uint32_t offset; 3092 void *ptr; 3093 3094 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr)) 3095 return; 3096 3097 for (unsigned i = 0; i < MAX_SETS; i++) { 3098 uint32_t *uptr = ((uint32_t *)ptr) + i; 3099 uint64_t set_va = 0; 3100 struct radv_descriptor_set *set = descriptors_state->sets[i]; 3101 if (descriptors_state->valid & (1u << i)) 3102 set_va = set->header.va; 3103 uptr[0] = set_va & 0xffffffff; 3104 } 3105 3106 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3107 va += offset; 3108 3109 if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { 3110 if (pipeline->shaders[MESA_SHADER_VERTEX]) 3111 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, 3112 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3113 3114 if (pipeline->shaders[MESA_SHADER_FRAGMENT]) 3115 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, 3116 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3117 3118 if (radv_pipeline_has_gs(pipeline)) 3119 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY, 3120 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3121 3122 if (radv_pipeline_has_tess(pipeline)) 3123 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL, 3124 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3125 3126 if (radv_pipeline_has_tess(pipeline)) 3127 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL, 3128 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3129 } else { 3130 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE, 3131 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3132 } 3133} 3134 3135static void 3136radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 3137 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3138{ 3139 struct radv_descriptor_state *descriptors_state = 3140 radv_get_descriptors_state(cmd_buffer, bind_point); 3141 bool flush_indirect_descriptors; 3142 3143 if (!descriptors_state->dirty) 3144 return; 3145 3146 if (descriptors_state->push_dirty) 3147 radv_flush_push_descriptors(cmd_buffer, bind_point); 3148 3149 flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets; 3150 3151 if (flush_indirect_descriptors) 3152 radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point); 3153 3154 ASSERTED unsigned cdw_max = 3155 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4); 3156 3157 if (pipeline) { 3158 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 3159 radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, 3160 MESA_SHADER_COMPUTE); 3161 } else { 3162 radv_foreach_stage(stage, stages) 3163 { 3164 if (!cmd_buffer->state.pipeline->shaders[stage]) 3165 continue; 3166 3167 radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage); 3168 } 3169 } 3170 } 3171 3172 descriptors_state->dirty = 0; 3173 descriptors_state->push_dirty = false; 3174 3175 assert(cmd_buffer->cs->cdw <= cdw_max); 3176 3177 if (unlikely(cmd_buffer->device->trace_bo)) 3178 radv_save_descriptors(cmd_buffer, bind_point); 3179} 3180 3181static bool 3182radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage) 3183{ 3184 struct 
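/* Note on radv_flush_indirect_descriptor_sets() above: the indirect table is
 * MAX_SETS consecutive 32-bit entries, one per descriptor set slot, each
 * holding only the low dword of the bound set's GPU address (unused slots are
 * left as 0), presumably relying on descriptor memory being allocated in a
 * 32-bit addressable range so the high half of the pointer is implied. */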
radv_userdata_info *loc = 3185 radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS); 3186 return loc->sgpr_idx != -1; 3187} 3188 3189static void 3190radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 3191 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3192{ 3193 struct radv_descriptor_state *descriptors_state = 3194 radv_get_descriptors_state(cmd_buffer, bind_point); 3195 struct radv_shader_variant *shader, *prev_shader; 3196 bool need_push_constants = false; 3197 unsigned offset; 3198 void *ptr; 3199 uint64_t va; 3200 uint32_t internal_stages; 3201 uint32_t dirty_stages = 0; 3202 3203 stages &= cmd_buffer->push_constant_stages; 3204 if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count)) 3205 return; 3206 3207 internal_stages = stages; 3208 switch (bind_point) { 3209 case VK_PIPELINE_BIND_POINT_GRAPHICS: 3210 break; 3211 case VK_PIPELINE_BIND_POINT_COMPUTE: 3212 dirty_stages = RADV_RT_STAGE_BITS; 3213 break; 3214 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: 3215 internal_stages = VK_SHADER_STAGE_COMPUTE_BIT; 3216 dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT; 3217 break; 3218 default: 3219 unreachable("Unhandled bind point"); 3220 } 3221 3222 radv_foreach_stage(stage, internal_stages) 3223 { 3224 shader = radv_get_shader(pipeline, stage); 3225 if (!shader) 3226 continue; 3227 3228 need_push_constants |= radv_shader_loads_push_constants(pipeline, stage); 3229 3230 uint8_t base = shader->info.min_push_constant_used / 4; 3231 3232 radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, 3233 (uint32_t *)&cmd_buffer->push_constants[base * 4]); 3234 } 3235 3236 if (need_push_constants) { 3237 if (!radv_cmd_buffer_upload_alloc( 3238 cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset, 3239 &ptr)) 3240 return; 3241 3242 memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size); 3243 memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers, 3244 16 * pipeline->dynamic_offset_count); 3245 3246 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3247 va += offset; 3248 3249 ASSERTED unsigned cdw_max = 3250 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4); 3251 3252 prev_shader = NULL; 3253 radv_foreach_stage(stage, internal_stages) 3254 { 3255 shader = radv_get_shader(pipeline, stage); 3256 3257 /* Avoid redundantly emitting the address for merged stages. 
*/ 3258 if (shader && shader != prev_shader) { 3259 radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va); 3260 3261 prev_shader = shader; 3262 } 3263 } 3264 assert(cmd_buffer->cs->cdw <= cdw_max); 3265 } 3266 3267 cmd_buffer->push_constant_stages &= ~stages; 3268 cmd_buffer->push_constant_stages |= dirty_stages; 3269} 3270 3271enum radv_dst_sel { 3272 DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 3273 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3274 DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 3275 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3276 DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3277 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3278 DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3279 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3280 DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3281 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 3282 DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3283 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 3284}; 3285 3286static const uint32_t data_format_dst_sel[] = { 3287 [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001, 3288 [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001, 3289 [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001, 3290 [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01, 3291 [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001, 3292 [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01, 3293 [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1, 3294 [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1, 3295 [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW, 3296 [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW, 3297 [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW, 3298 [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01, 3299 [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW, 3300 [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1, 3301 [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW, 3302}; 3303 3304static void 3305radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 3306{ 3307 if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && 3308 cmd_buffer->state.pipeline->vb_desc_usage_mask) { 3309 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 3310 struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); 3311 enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; 3312 unsigned vb_offset; 3313 void *vb_ptr; 3314 unsigned desc_index = 0; 3315 uint32_t mask = pipeline->vb_desc_usage_mask; 3316 uint64_t va; 3317 struct radv_vs_input_state *vs_state = 3318 vs_shader->info.vs.dynamic_inputs ? 
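/* Reminder for the vertex descriptor building below: the DST_SEL_* values
 * above encode the destination swizzle of a buffer resource descriptor.
 * data_format_dst_sel[] picks a swizzle that fills the components missing from
 * narrow formats with (0, 0, 0, 1), e.g. DST_SEL_X001 makes a one-component
 * fetch return (x, 0, 0, 1), while DST_SEL_ZYXW swaps R and B for post-shuffled
 * (typically BGRA) vertex formats. */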
&cmd_buffer->state.dynamic_vs_input : NULL; 3319 3320 /* allocate some descriptor state for vertex buffers */ 3321 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr)) 3322 return; 3323 3324 assert(!vs_state || pipeline->use_per_attribute_vb_descs); 3325 3326 while (mask) { 3327 unsigned i = u_bit_scan(&mask); 3328 uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4]; 3329 uint32_t offset, rsrc_word3; 3330 unsigned binding = 3331 vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i] 3332 : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i); 3333 struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer; 3334 unsigned num_records; 3335 unsigned stride; 3336 3337 if (vs_state) { 3338 unsigned format = vs_state->formats[i]; 3339 unsigned dfmt = format & 0xf; 3340 unsigned nfmt = (format >> 4) & 0x7; 3341 3342 rsrc_word3 = 3343 vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt]; 3344 3345 if (chip >= GFX10) 3346 rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt)); 3347 else 3348 rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt); 3349 } else { 3350 if (chip >= GFX10) 3351 rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT); 3352 else 3353 rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | 3354 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 3355 } 3356 3357 if (!buffer) { 3358 if (vs_state) { 3359 /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need 3360 * to include the format/word3 so that the alpha channel is 1 for formats without an 3361 * alpha channel. 3362 */ 3363 desc[0] = 0; 3364 desc[1] = S_008F04_STRIDE(16); 3365 desc[2] = 0; 3366 desc[3] = rsrc_word3; 3367 } else { 3368 memset(desc, 0, 4 * 4); 3369 } 3370 continue; 3371 } 3372 3373 va = radv_buffer_get_va(buffer->bo); 3374 3375 offset = cmd_buffer->vertex_bindings[binding].offset; 3376 va += offset + buffer->offset; 3377 if (vs_state) 3378 va += vs_state->offsets[i]; 3379 3380 if (cmd_buffer->vertex_bindings[binding].size) { 3381 num_records = cmd_buffer->vertex_bindings[binding].size; 3382 } else { 3383 num_records = buffer->size - offset; 3384 } 3385 3386 if (pipeline->graphics.uses_dynamic_stride) { 3387 stride = cmd_buffer->vertex_bindings[binding].stride; 3388 } else { 3389 stride = pipeline->binding_stride[binding]; 3390 } 3391 3392 if (pipeline->use_per_attribute_vb_descs) { 3393 uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] 3394 : pipeline->attrib_ends[i]; 3395 3396 if (num_records < attrib_end) { 3397 num_records = 0; /* not enough space for one vertex */ 3398 } else if (stride == 0) { 3399 num_records = 1; /* only one vertex */ 3400 } else { 3401 num_records = (num_records - attrib_end) / stride + 1; 3402 /* If attrib_offset>stride, then the compiler will increase the vertex index by 3403 * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is 3404 * only allowed with static strides. 3405 */ 3406 num_records += pipeline->attrib_index_offset[i]; 3407 } 3408 3409 /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into 3410 * into bytes in that case. GFX8 always uses bytes. 
3411 */ 3412 if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) { 3413 num_records = (num_records - 1) * stride + attrib_end; 3414 } else if (!num_records) { 3415 /* On GFX9, it seems bounds checking is disabled if both 3416 * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and 3417 * GFX10.3 but it doesn't hurt. 3418 */ 3419 if (vs_state) { 3420 desc[0] = 0; 3421 desc[1] = S_008F04_STRIDE(16); 3422 desc[2] = 0; 3423 desc[3] = rsrc_word3; 3424 } else { 3425 memset(desc, 0, 16); 3426 } 3427 continue; 3428 } 3429 } else { 3430 if (chip != GFX8 && stride) 3431 num_records = DIV_ROUND_UP(num_records, stride); 3432 } 3433 3434 if (chip >= GFX10) { 3435 /* OOB_SELECT chooses the out-of-bounds check: 3436 * - 1: index >= NUM_RECORDS (Structured) 3437 * - 3: offset >= NUM_RECORDS (Raw) 3438 */ 3439 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW; 3440 rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1); 3441 } 3442 3443 desc[0] = va; 3444 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); 3445 desc[2] = num_records; 3446 desc[3] = rsrc_word3; 3447 } 3448 3449 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3450 va += vb_offset; 3451 3452 radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, 3453 va); 3454 3455 cmd_buffer->state.vb_va = va; 3456 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS; 3457 3458 if (unlikely(cmd_buffer->device->trace_bo)) 3459 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr); 3460 } 3461 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER; 3462} 3463 3464static void 3465radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) 3466{ 3467 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 3468 struct radv_userdata_info *loc; 3469 uint32_t base_reg; 3470 3471 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { 3472 if (!radv_get_shader(pipeline, stage)) 3473 continue; 3474 3475 loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS); 3476 if (loc->sgpr_idx == -1) 3477 continue; 3478 3479 base_reg = pipeline->user_data_0[stage]; 3480 3481 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, 3482 false); 3483 } 3484 3485 if (radv_pipeline_has_gs_copy_shader(pipeline)) { 3486 loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS]; 3487 if (loc->sgpr_idx != -1) { 3488 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; 3489 3490 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3491 va, false); 3492 } 3493 } 3494} 3495 3496static void 3497radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) 3498{ 3499 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) { 3500 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 3501 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 3502 unsigned so_offset; 3503 void *so_ptr; 3504 uint64_t va; 3505 3506 /* Allocate some descriptor state for streamout buffers. 
*/ 3507 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr)) 3508 return; 3509 3510 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { 3511 struct radv_buffer *buffer = sb[i].buffer; 3512 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4]; 3513 3514 if (!(so->enabled_mask & (1 << i))) 3515 continue; 3516 3517 va = radv_buffer_get_va(buffer->bo) + buffer->offset; 3518 3519 va += sb[i].offset; 3520 3521 /* Set the descriptor. 3522 * 3523 * On GFX8, the format must be non-INVALID, otherwise 3524 * the buffer will be considered not bound and store 3525 * instructions will be no-ops. 3526 */ 3527 uint32_t size = 0xffffffff; 3528 3529 /* Compute the correct buffer size for NGG streamout 3530 * because it's used to determine the max emit per 3531 * buffer. 3532 */ 3533 if (cmd_buffer->device->physical_device->use_ngg_streamout) 3534 size = buffer->size - sb[i].offset; 3535 3536 uint32_t rsrc_word3 = 3537 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3538 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 3539 3540 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 3541 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 3542 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 3543 } else { 3544 rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 3545 } 3546 3547 desc[0] = va; 3548 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); 3549 desc[2] = size; 3550 desc[3] = rsrc_word3; 3551 } 3552 3553 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3554 va += so_offset; 3555 3556 radv_emit_streamout_buffers(cmd_buffer, va); 3557 } 3558 3559 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; 3560} 3561 3562static void 3563radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer) 3564{ 3565 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 3566 struct radv_userdata_info *loc; 3567 uint32_t ngg_gs_state = 0; 3568 uint32_t base_reg; 3569 3570 if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg) 3571 return; 3572 3573 /* By default NGG GS queries are disabled but they are enabled if the 3574 * command buffer has active GDS queries or if it's a secondary command 3575 * buffer that inherits the number of generated primitives. 3576 */ 3577 if (cmd_buffer->state.active_pipeline_gds_queries || 3578 (cmd_buffer->state.inherited_pipeline_statistics & 3579 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) 3580 ngg_gs_state = 1; 3581 3582 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE); 3583 base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY]; 3584 assert(loc->sgpr_idx != -1); 3585 3586 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state); 3587} 3588 3589static void 3590radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 3591{ 3592 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty); 3593 radv_flush_streamout_descriptors(cmd_buffer); 3594 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline, 3595 VK_PIPELINE_BIND_POINT_GRAPHICS); 3596 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline, 3597 VK_PIPELINE_BIND_POINT_GRAPHICS); 3598 radv_flush_ngg_gs_state(cmd_buffer); 3599} 3600 3601struct radv_draw_info { 3602 /** 3603 * Number of vertices. 3604 */ 3605 uint32_t count; 3606 3607 /** 3608 * First instance id. 
3609 */ 3610 uint32_t first_instance; 3611 3612 /** 3613 * Number of instances. 3614 */ 3615 uint32_t instance_count; 3616 3617 /** 3618 * Whether it's an indexed draw. 3619 */ 3620 bool indexed; 3621 3622 /** 3623 * Indirect draw parameters resource. 3624 */ 3625 struct radv_buffer *indirect; 3626 uint64_t indirect_offset; 3627 uint32_t stride; 3628 3629 /** 3630 * Draw count parameters resource. 3631 */ 3632 struct radv_buffer *count_buffer; 3633 uint64_t count_buffer_offset; 3634 3635 /** 3636 * Stream output parameters resource. 3637 */ 3638 struct radv_buffer *strmout_buffer; 3639 uint64_t strmout_buffer_offset; 3640}; 3641 3642static uint32_t 3643radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer) 3644{ 3645 switch (cmd_buffer->state.index_type) { 3646 case V_028A7C_VGT_INDEX_8: 3647 return 0xffu; 3648 case V_028A7C_VGT_INDEX_16: 3649 return 0xffffu; 3650 case V_028A7C_VGT_INDEX_32: 3651 return 0xffffffffu; 3652 default: 3653 unreachable("invalid index type"); 3654 } 3655} 3656 3657static void 3658si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, 3659 bool indirect_draw, bool count_from_stream_output, 3660 uint32_t draw_vertex_count) 3661{ 3662 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 3663 struct radv_cmd_state *state = &cmd_buffer->state; 3664 unsigned topology = state->dynamic.primitive_topology; 3665 bool prim_restart_enable = state->dynamic.primitive_restart_enable; 3666 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3667 unsigned ia_multi_vgt_param; 3668 3669 ia_multi_vgt_param = 3670 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output, 3671 draw_vertex_count, topology, prim_restart_enable); 3672 3673 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { 3674 if (info->chip_class == GFX9) { 3675 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, 3676 R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); 3677 } else if (info->chip_class >= GFX7) { 3678 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); 3679 } else { 3680 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); 3681 } 3682 state->last_ia_multi_vgt_param = ia_multi_vgt_param; 3683 } 3684} 3685 3686static void 3687radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 3688{ 3689 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 3690 struct radv_cmd_state *state = &cmd_buffer->state; 3691 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3692 3693 /* Draw state. */ 3694 if (info->chip_class < GFX10) { 3695 si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect, 3696 !!draw_info->strmout_buffer, 3697 draw_info->indirect ? 
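/* draw_vertex_count is only a heuristic input to IA_MULTI_VGT_PARAM (primgroup
 * and wave switching); for indirect draws the real count isn't known on the
 * CPU, so 0 is passed here. GFX10+ doesn't use IA_MULTI_VGT_PARAM, which is why
 * the call is guarded by chip_class < GFX10 above. */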
0 : draw_info->count); 3698 } 3699 3700 if (state->dynamic.primitive_restart_enable) { 3701 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer); 3702 3703 if (primitive_reset_index != state->last_primitive_reset_index) { 3704 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index); 3705 state->last_primitive_reset_index = primitive_reset_index; 3706 } 3707 } 3708 3709 if (draw_info->strmout_buffer) { 3710 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo); 3711 3712 va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset; 3713 3714 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride); 3715 3716 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 3717 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 3718 COPY_DATA_WR_CONFIRM); 3719 radeon_emit(cs, va); 3720 radeon_emit(cs, va >> 32); 3721 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); 3722 radeon_emit(cs, 0); /* unused */ 3723 3724 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo); 3725 } 3726} 3727 3728static void 3729radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask) 3730{ 3731 if (src_stage_mask & 3732 (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT | 3733 VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | 3734 VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | 3735 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 3736 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 3737 } 3738 3739 if (src_stage_mask & 3740 (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | 3741 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | 3742 VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | 3743 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 3744 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; 3745 } else if (src_stage_mask & 3746 (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | 3747 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 3748 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 3749 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | 3750 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 3751 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) { 3752 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; 3753 } 3754} 3755 3756static bool 3757can_skip_buffer_l2_flushes(struct radv_device *device) 3758{ 3759 return device->physical_device->rad_info.chip_class == GFX9 || 3760 (device->physical_device->rad_info.chip_class >= GFX10 && 3761 !device->physical_device->rad_info.tcc_rb_non_coherent); 3762} 3763 3764/* 3765 * In vulkan barriers have two kinds of operations: 3766 * 3767 * - visibility (implemented with radv_src_access_flush) 3768 * - availability (implemented with radv_dst_access_flush) 3769 * 3770 * for a memory operation to observe the result of a previous memory operation 3771 * one needs to do a visibility operation from the source memory and then an 3772 * availability operation to the target memory. 3773 * 3774 * The complication is the availability and visibility operations do not need to 3775 * be in the same barrier. 
 *
 * The cleanest way to implement this is to define the visibility operation to
 * bring the caches to a "state of rest", which leaves none of the caches below
 * that level dirty.
 *
 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
 *
 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
 * images. However, given the existence of memory barriers which do not specify
 * the image/buffer, it often devolves to just VRAM/GTT anyway.
 *
 * To help reduce the invalidations for GPUs that have L2 coherency between the
 * RB and the shader caches, we always invalidate L2 on the src side, as we can
 * use our knowledge of past usage to optimize flushes away.
 */

enum radv_cmd_flush_bits
radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
                      const struct radv_image *image)
{
   bool has_CB_meta = true, has_DB_meta = true;
   bool image_is_coherent = image ? image->l2_coherent : false;
   enum radv_cmd_flush_bits flush_bits = 0;

   if (image) {
      if (!radv_image_has_CB_metadata(image))
         has_CB_meta = false;
      if (!radv_image_has_htile(image))
         has_DB_meta = false;
   }

   u_foreach_bit(b, src_flags)
   {
      switch ((VkAccessFlagBits)(1 << b)) {
      case VK_ACCESS_SHADER_WRITE_BIT:
         /* Since the STORAGE bit isn't set we know that this is a meta operation.
          * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
          * set it here. */
         if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
            if (vk_format_is_depth_or_stencil(image->vk_format)) {
               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
            } else {
               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
            }
         }

         /* This is valid even for the rb_noncoherent_dirty case, because with how we account
          * for dirtiness, if it isn't dirty it doesn't contain the data at all and hence
          * doesn't need invalidating.
*/ 3826 if (!image_is_coherent) 3827 flush_bits |= RADV_CMD_FLAG_WB_L2; 3828 break; 3829 case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: 3830 case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: 3831 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: 3832 if (!image_is_coherent) 3833 flush_bits |= RADV_CMD_FLAG_WB_L2; 3834 break; 3835 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: 3836 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 3837 if (has_CB_meta) 3838 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 3839 break; 3840 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: 3841 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 3842 if (has_DB_meta) 3843 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 3844 break; 3845 case VK_ACCESS_TRANSFER_WRITE_BIT: 3846 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 3847 3848 if (!image_is_coherent) 3849 flush_bits |= RADV_CMD_FLAG_INV_L2; 3850 if (has_CB_meta) 3851 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 3852 if (has_DB_meta) 3853 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 3854 break; 3855 case VK_ACCESS_MEMORY_WRITE_BIT: 3856 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 3857 3858 if (!image_is_coherent) 3859 flush_bits |= RADV_CMD_FLAG_INV_L2; 3860 if (has_CB_meta) 3861 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 3862 if (has_DB_meta) 3863 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 3864 break; 3865 default: 3866 break; 3867 } 3868 } 3869 return flush_bits; 3870} 3871 3872enum radv_cmd_flush_bits 3873radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags, 3874 const struct radv_image *image) 3875{ 3876 bool has_CB_meta = true, has_DB_meta = true; 3877 enum radv_cmd_flush_bits flush_bits = 0; 3878 bool flush_CB = true, flush_DB = true; 3879 bool image_is_coherent = image ? image->l2_coherent : false; 3880 3881 if (image) { 3882 if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) { 3883 flush_CB = false; 3884 flush_DB = false; 3885 } 3886 3887 if (!radv_image_has_CB_metadata(image)) 3888 has_CB_meta = false; 3889 if (!radv_image_has_htile(image)) 3890 has_DB_meta = false; 3891 } 3892 3893 /* All the L2 invalidations below are not the CB/DB. So if there are no incoherent images 3894 * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */ 3895 image_is_coherent |= 3896 can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty; 3897 3898 u_foreach_bit(b, dst_flags) 3899 { 3900 switch ((VkAccessFlagBits)(1 << b)) { 3901 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: 3902 case VK_ACCESS_INDEX_READ_BIT: 3903 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: 3904 break; 3905 case VK_ACCESS_UNIFORM_READ_BIT: 3906 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 3907 break; 3908 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: 3909 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: 3910 case VK_ACCESS_TRANSFER_READ_BIT: 3911 case VK_ACCESS_TRANSFER_WRITE_BIT: 3912 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 3913 3914 if (has_CB_meta || has_DB_meta) 3915 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 3916 if (!image_is_coherent) 3917 flush_bits |= RADV_CMD_FLAG_INV_L2; 3918 break; 3919 case VK_ACCESS_SHADER_READ_BIT: 3920 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 3921 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to 3922 * invalidate the scalar cache. 
*/ 3923 if (!cmd_buffer->device->physical_device->use_llvm && !image) 3924 flush_bits |= RADV_CMD_FLAG_INV_SCACHE; 3925 3926 if (has_CB_meta || has_DB_meta) 3927 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 3928 if (!image_is_coherent) 3929 flush_bits |= RADV_CMD_FLAG_INV_L2; 3930 break; 3931 case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR: 3932 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 3933 if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) 3934 flush_bits |= RADV_CMD_FLAG_INV_L2; 3935 break; 3936 case VK_ACCESS_SHADER_WRITE_BIT: 3937 case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: 3938 break; 3939 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: 3940 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: 3941 if (flush_CB) 3942 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 3943 if (has_CB_meta) 3944 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 3945 break; 3946 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT: 3947 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: 3948 if (flush_DB) 3949 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 3950 if (has_DB_meta) 3951 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 3952 break; 3953 case VK_ACCESS_MEMORY_READ_BIT: 3954 case VK_ACCESS_MEMORY_WRITE_BIT: 3955 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 3956 if (!image_is_coherent) 3957 flush_bits |= RADV_CMD_FLAG_INV_L2; 3958 if (flush_CB) 3959 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 3960 if (has_CB_meta) 3961 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 3962 if (flush_DB) 3963 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 3964 if (has_DB_meta) 3965 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 3966 break; 3967 default: 3968 break; 3969 } 3970 } 3971 return flush_bits; 3972} 3973 3974void 3975radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier) 3976{ 3977 struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; 3978 if (fb && !fb->imageless) { 3979 for (int i = 0; i < fb->attachment_count; ++i) { 3980 cmd_buffer->state.flush_bits |= 3981 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image); 3982 } 3983 } else { 3984 cmd_buffer->state.flush_bits |= 3985 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL); 3986 } 3987 3988 radv_stage_flush(cmd_buffer, barrier->src_stage_mask); 3989 3990 if (fb && !fb->imageless) { 3991 for (int i = 0; i < fb->attachment_count; ++i) { 3992 cmd_buffer->state.flush_bits |= 3993 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image); 3994 } 3995 } else { 3996 cmd_buffer->state.flush_bits |= 3997 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL); 3998 } 3999} 4000 4001uint32_t 4002radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer) 4003{ 4004 struct radv_cmd_state *state = &cmd_buffer->state; 4005 uint32_t subpass_id = state->subpass - state->pass->subpasses; 4006 4007 /* The id of this subpass shouldn't exceed the number of subpasses in 4008 * this render pass minus 1. 
4009 */ 4010 assert(subpass_id < state->pass->subpass_count); 4011 return subpass_id; 4012} 4013 4014static struct radv_sample_locations_state * 4015radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx, 4016 bool begin_subpass) 4017{ 4018 struct radv_cmd_state *state = &cmd_buffer->state; 4019 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer); 4020 struct radv_image_view *view = state->attachments[att_idx].iview; 4021 4022 if (view->image->info.samples == 1) 4023 return NULL; 4024 4025 if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) { 4026 /* Return the initial sample locations if this is the initial 4027 * layout transition of the given subpass attachemnt. 4028 */ 4029 if (state->attachments[att_idx].sample_location.count > 0) 4030 return &state->attachments[att_idx].sample_location; 4031 } else { 4032 /* Otherwise return the subpass sample locations if defined. */ 4033 if (state->subpass_sample_locs) { 4034 /* Because the driver sets the current subpass before 4035 * initial layout transitions, we should use the sample 4036 * locations from the previous subpass to avoid an 4037 * off-by-one problem. Otherwise, use the sample 4038 * locations for the current subpass for final layout 4039 * transitions. 4040 */ 4041 if (begin_subpass) 4042 subpass_id--; 4043 4044 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) { 4045 if (state->subpass_sample_locs[i].subpass_idx == subpass_id) 4046 return &state->subpass_sample_locs[i].sample_location; 4047 } 4048 } 4049 } 4050 4051 return NULL; 4052} 4053 4054static void 4055radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer, 4056 struct radv_subpass_attachment att, bool begin_subpass) 4057{ 4058 unsigned idx = att.attachment; 4059 struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview; 4060 struct radv_sample_locations_state *sample_locs; 4061 VkImageSubresourceRange range; 4062 range.aspectMask = view->aspect_mask; 4063 range.baseMipLevel = view->base_mip; 4064 range.levelCount = 1; 4065 range.baseArrayLayer = view->base_layer; 4066 range.layerCount = cmd_buffer->state.framebuffer->layers; 4067 4068 if (cmd_buffer->state.subpass->view_mask) { 4069 /* If the current subpass uses multiview, the driver might have 4070 * performed a fast color/depth clear to the whole image 4071 * (including all layers). To make sure the driver will 4072 * decompress the image correctly (if needed), we have to 4073 * account for the "real" number of layers. If the view mask is 4074 * sparse, this will decompress more layers than needed. 4075 */ 4076 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask); 4077 } 4078 4079 /* Get the subpass sample locations for the given attachment, if NULL 4080 * is returned the driver will use the default HW locations. 4081 */ 4082 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass); 4083 4084 /* Determine if the subpass uses separate depth/stencil layouts. */ 4085 bool uses_separate_depth_stencil_layouts = false; 4086 if ((cmd_buffer->state.attachments[idx].current_layout != 4087 cmd_buffer->state.attachments[idx].current_stencil_layout) || 4088 (att.layout != att.stencil_layout)) { 4089 uses_separate_depth_stencil_layouts = true; 4090 } 4091 4092 /* For separate layouts, perform depth and stencil transitions 4093 * separately. 
4094 */ 4095 if (uses_separate_depth_stencil_layouts && 4096 (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) { 4097 /* Depth-only transitions. */ 4098 range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; 4099 radv_handle_image_transition(cmd_buffer, view->image, 4100 cmd_buffer->state.attachments[idx].current_layout, 4101 cmd_buffer->state.attachments[idx].current_in_render_loop, 4102 att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 4103 4104 /* Stencil-only transitions. */ 4105 range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; 4106 radv_handle_image_transition( 4107 cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout, 4108 cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout, 4109 att.in_render_loop, 0, 0, &range, sample_locs); 4110 } else { 4111 radv_handle_image_transition(cmd_buffer, view->image, 4112 cmd_buffer->state.attachments[idx].current_layout, 4113 cmd_buffer->state.attachments[idx].current_in_render_loop, 4114 att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 4115 } 4116 4117 cmd_buffer->state.attachments[idx].current_layout = att.layout; 4118 cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout; 4119 cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop; 4120} 4121 4122void 4123radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass) 4124{ 4125 cmd_buffer->state.subpass = subpass; 4126 4127 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER; 4128} 4129 4130static VkResult 4131radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer, 4132 struct radv_render_pass *pass, 4133 const VkRenderPassBeginInfo *info) 4134{ 4135 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs = 4136 vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT); 4137 struct radv_cmd_state *state = &cmd_buffer->state; 4138 4139 if (!sample_locs) { 4140 state->subpass_sample_locs = NULL; 4141 return VK_SUCCESS; 4142 } 4143 4144 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) { 4145 const VkAttachmentSampleLocationsEXT *att_sample_locs = 4146 &sample_locs->pAttachmentInitialSampleLocations[i]; 4147 uint32_t att_idx = att_sample_locs->attachmentIndex; 4148 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image; 4149 4150 assert(vk_format_is_depth_or_stencil(image->vk_format)); 4151 4152 /* From the Vulkan spec 1.1.108: 4153 * 4154 * "If the image referenced by the framebuffer attachment at 4155 * index attachmentIndex was not created with 4156 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT 4157 * then the values specified in sampleLocationsInfo are 4158 * ignored." 
4159 */ 4160 if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT)) 4161 continue; 4162 4163 const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo; 4164 4165 state->attachments[att_idx].sample_location.per_pixel = 4166 sample_locs_info->sampleLocationsPerPixel; 4167 state->attachments[att_idx].sample_location.grid_size = 4168 sample_locs_info->sampleLocationGridSize; 4169 state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount; 4170 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0], 4171 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 4172 } 4173 4174 state->subpass_sample_locs = 4175 vk_alloc(&cmd_buffer->pool->alloc, 4176 sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]), 4177 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 4178 if (state->subpass_sample_locs == NULL) { 4179 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 4180 return cmd_buffer->record_result; 4181 } 4182 4183 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount; 4184 4185 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) { 4186 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info = 4187 &sample_locs->pPostSubpassSampleLocations[i]; 4188 const VkSampleLocationsInfoEXT *sample_locs_info = 4189 &subpass_sample_locs_info->sampleLocationsInfo; 4190 4191 state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex; 4192 state->subpass_sample_locs[i].sample_location.per_pixel = 4193 sample_locs_info->sampleLocationsPerPixel; 4194 state->subpass_sample_locs[i].sample_location.grid_size = 4195 sample_locs_info->sampleLocationGridSize; 4196 state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount; 4197 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0], 4198 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 4199 } 4200 4201 return VK_SUCCESS; 4202} 4203 4204static VkResult 4205radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass, 4206 const VkRenderPassBeginInfo *info, 4207 const struct radv_extra_render_pass_begin_info *extra) 4208{ 4209 struct radv_cmd_state *state = &cmd_buffer->state; 4210 const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL; 4211 4212 if (info) { 4213 attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO); 4214 } 4215 4216 if (pass->attachment_count == 0) { 4217 state->attachments = NULL; 4218 return VK_SUCCESS; 4219 } 4220 4221 state->attachments = 4222 vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * sizeof(state->attachments[0]), 8, 4223 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 4224 if (state->attachments == NULL) { 4225 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 4226 return cmd_buffer->record_result; 4227 } 4228 4229 for (uint32_t i = 0; i < pass->attachment_count; ++i) { 4230 struct radv_render_pass_attachment *att = &pass->attachments[i]; 4231 VkImageAspectFlags att_aspects = vk_format_aspects(att->format); 4232 VkImageAspectFlags clear_aspects = 0; 4233 4234 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 4235 /* color attachment */ 4236 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 4237 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; 4238 } 4239 } else { 4240 /* depthstencil attachment */ 4241 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 4242 
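/* Note: when the depth aspect is cleared below and the stencil load op is
 * DONT_CARE, stencil is added to the clear aspects as well. Since DONT_CARE
 * means the previous stencil contents are undefined, clearing both aspects
 * together typically allows a faster combined depth+stencil clear. */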
att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 4243 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; 4244 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 4245 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) 4246 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 4247 } 4248 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 4249 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 4250 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 4251 } 4252 } 4253 4254 state->attachments[i].pending_clear_aspects = clear_aspects; 4255 state->attachments[i].cleared_views = 0; 4256 if (clear_aspects && info) { 4257 assert(info->clearValueCount > i); 4258 state->attachments[i].clear_value = info->pClearValues[i]; 4259 } 4260 4261 state->attachments[i].current_layout = att->initial_layout; 4262 state->attachments[i].current_in_render_loop = false; 4263 state->attachments[i].current_stencil_layout = att->stencil_initial_layout; 4264 state->attachments[i].disable_dcc = extra && extra->disable_dcc; 4265 state->attachments[i].sample_location.count = 0; 4266 4267 struct radv_image_view *iview; 4268 if (attachment_info && attachment_info->attachmentCount > i) { 4269 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]); 4270 } else { 4271 iview = state->framebuffer->attachments[i]; 4272 } 4273 4274 state->attachments[i].iview = iview; 4275 if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 4276 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview); 4277 } else { 4278 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview); 4279 } 4280 } 4281 4282 return VK_SUCCESS; 4283} 4284 4285VkResult 4286radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo, 4287 VkCommandBuffer *pCommandBuffers) 4288{ 4289 RADV_FROM_HANDLE(radv_device, device, _device); 4290 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool); 4291 4292 VkResult result = VK_SUCCESS; 4293 uint32_t i; 4294 4295 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { 4296 4297 if (!list_is_empty(&pool->free_cmd_buffers)) { 4298 struct radv_cmd_buffer *cmd_buffer = 4299 list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link); 4300 4301 list_del(&cmd_buffer->pool_link); 4302 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); 4303 4304 result = radv_reset_cmd_buffer(cmd_buffer); 4305 cmd_buffer->level = pAllocateInfo->level; 4306 vk_command_buffer_finish(&cmd_buffer->vk); 4307 VkResult init_result = 4308 vk_command_buffer_init(&cmd_buffer->vk, &device->vk); 4309 if (init_result != VK_SUCCESS) 4310 result = init_result; 4311 4312 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); 4313 } else { 4314 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]); 4315 } 4316 if (result != VK_SUCCESS) 4317 break; 4318 } 4319 4320 if (result != VK_SUCCESS) { 4321 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers); 4322 4323 /* From the Vulkan 1.0.66 spec: 4324 * 4325 * "vkAllocateCommandBuffers can be used to create multiple 4326 * command buffers. If the creation of any of those command 4327 * buffers fails, the implementation must destroy all 4328 * successfully created command buffer objects from this 4329 * command, set all entries of the pCommandBuffers array to 4330 * NULL and return the error." 
4331 */ 4332 memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount); 4333 } 4334 4335 return result; 4336} 4337 4338void 4339radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount, 4340 const VkCommandBuffer *pCommandBuffers) 4341{ 4342 for (uint32_t i = 0; i < commandBufferCount; i++) { 4343 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); 4344 4345 if (cmd_buffer) { 4346 if (cmd_buffer->pool) { 4347 list_del(&cmd_buffer->pool_link); 4348 list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers); 4349 } else 4350 radv_destroy_cmd_buffer(cmd_buffer); 4351 } 4352 } 4353} 4354 4355VkResult 4356radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags) 4357{ 4358 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4359 return radv_reset_cmd_buffer(cmd_buffer); 4360} 4361 4362VkResult 4363radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) 4364{ 4365 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4366 VkResult result = VK_SUCCESS; 4367 4368 if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) { 4369 /* If the command buffer has already been resetted with 4370 * vkResetCommandBuffer, no need to do it again. 4371 */ 4372 result = radv_reset_cmd_buffer(cmd_buffer); 4373 if (result != VK_SUCCESS) 4374 return result; 4375 } 4376 4377 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); 4378 cmd_buffer->state.last_primitive_reset_en = -1; 4379 cmd_buffer->state.last_index_type = -1; 4380 cmd_buffer->state.last_num_instances = -1; 4381 cmd_buffer->state.last_vertex_offset = -1; 4382 cmd_buffer->state.last_first_instance = -1; 4383 cmd_buffer->state.last_drawid = -1; 4384 cmd_buffer->state.predication_type = -1; 4385 cmd_buffer->state.last_sx_ps_downconvert = -1; 4386 cmd_buffer->state.last_sx_blend_opt_epsilon = -1; 4387 cmd_buffer->state.last_sx_blend_opt_control = -1; 4388 cmd_buffer->state.last_nggc_settings = -1; 4389 cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 4390 cmd_buffer->usage_flags = pBeginInfo->flags; 4391 4392 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && 4393 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { 4394 assert(pBeginInfo->pInheritanceInfo); 4395 cmd_buffer->state.framebuffer = 4396 radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); 4397 cmd_buffer->state.pass = 4398 radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); 4399 4400 struct radv_subpass *subpass = 4401 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; 4402 4403 if (cmd_buffer->state.framebuffer) { 4404 result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL); 4405 if (result != VK_SUCCESS) 4406 return result; 4407 } 4408 4409 cmd_buffer->state.inherited_pipeline_statistics = 4410 pBeginInfo->pInheritanceInfo->pipelineStatistics; 4411 4412 radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 4413 } 4414 4415 if (unlikely(cmd_buffer->device->trace_bo)) 4416 radv_cmd_buffer_trace_emit(cmd_buffer); 4417 4418 radv_describe_begin_cmd_buffer(cmd_buffer); 4419 4420 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING; 4421 4422 return result; 4423} 4424 4425void 4426radv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding, 4427 uint32_t bindingCount, const VkBuffer *pBuffers, 4428 const VkDeviceSize *pOffsets) 4429{ 4430 
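/* Thin wrapper: forwards to the EXT entry point with pSizes/pStrides = NULL, so
 * each binding's size defaults to "whole buffer from offset" and any previously
 * programmed stride (from the pipeline or a prior dynamic-state call) is left
 * untouched. */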
radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets, 4431 NULL, NULL); 4432} 4433 4434void 4435radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, 4436 uint32_t bindingCount, const VkBuffer *pBuffers, 4437 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes, 4438 const VkDeviceSize *pStrides) 4439{ 4440 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4441 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings; 4442 struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 4443 bool changed = false; 4444 4445 /* We have to defer setting up vertex buffer since we need the buffer 4446 * stride from the pipeline. */ 4447 4448 assert(firstBinding + bindingCount <= MAX_VBS); 4449 cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask; 4450 enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; 4451 for (uint32_t i = 0; i < bindingCount; i++) { 4452 RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]); 4453 uint32_t idx = firstBinding + i; 4454 VkDeviceSize size = pSizes ? pSizes[i] : 0; 4455 VkDeviceSize stride = pStrides ? pStrides[i] : 0; 4456 4457 /* pSizes and pStrides are optional. */ 4458 if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] || 4459 vb[idx].size != size || (pStrides && vb[idx].stride != stride))) { 4460 changed = true; 4461 } 4462 4463 vb[idx].buffer = buffer; 4464 vb[idx].offset = pOffsets[i]; 4465 vb[idx].size = size; 4466 /* if pStrides=NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT */ 4467 4468 if (chip == GFX6 || chip >= GFX10) { 4469 const uint32_t bit = 1u << idx; 4470 if (!buffer) { 4471 cmd_buffer->state.vbo_misaligned_mask &= ~bit; 4472 cmd_buffer->state.vbo_bound_mask &= ~bit; 4473 } else { 4474 cmd_buffer->state.vbo_bound_mask |= bit; 4475 if (pStrides && vb[idx].stride != stride) { 4476 if (stride & state->format_align_req_minus_1[idx]) 4477 cmd_buffer->state.vbo_misaligned_mask |= bit; 4478 else 4479 cmd_buffer->state.vbo_misaligned_mask &= ~bit; 4480 } 4481 if (state->possibly_misaligned_mask & bit && 4482 (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx]) 4483 cmd_buffer->state.vbo_misaligned_mask |= bit; 4484 } 4485 } 4486 4487 if (pStrides) 4488 vb[idx].stride = stride; 4489 4490 if (buffer) { 4491 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo); 4492 } 4493 } 4494 4495 if (!changed) { 4496 /* No state changes. 
*/ 4497 return; 4498 } 4499 4500 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | 4501 RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; 4502} 4503 4504static uint32_t 4505vk_to_index_type(VkIndexType type) 4506{ 4507 switch (type) { 4508 case VK_INDEX_TYPE_UINT8_EXT: 4509 return V_028A7C_VGT_INDEX_8; 4510 case VK_INDEX_TYPE_UINT16: 4511 return V_028A7C_VGT_INDEX_16; 4512 case VK_INDEX_TYPE_UINT32: 4513 return V_028A7C_VGT_INDEX_32; 4514 default: 4515 unreachable("invalid index type"); 4516 } 4517} 4518 4519static uint32_t 4520radv_get_vgt_index_size(uint32_t type) 4521{ 4522 switch (type) { 4523 case V_028A7C_VGT_INDEX_8: 4524 return 1; 4525 case V_028A7C_VGT_INDEX_16: 4526 return 2; 4527 case V_028A7C_VGT_INDEX_32: 4528 return 4; 4529 default: 4530 unreachable("invalid index type"); 4531 } 4532} 4533 4534void 4535radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, 4536 VkIndexType indexType) 4537{ 4538 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4539 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer); 4540 4541 if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset && 4542 cmd_buffer->state.index_type == indexType) { 4543 /* No state changes. */ 4544 return; 4545 } 4546 4547 cmd_buffer->state.index_buffer = index_buffer; 4548 cmd_buffer->state.index_offset = offset; 4549 cmd_buffer->state.index_type = vk_to_index_type(indexType); 4550 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo); 4551 cmd_buffer->state.index_va += index_buffer->offset + offset; 4552 4553 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType)); 4554 cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size; 4555 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 4556 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo); 4557} 4558 4559static void 4560radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point, 4561 struct radv_descriptor_set *set, unsigned idx) 4562{ 4563 struct radeon_winsys *ws = cmd_buffer->device->ws; 4564 4565 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx); 4566 4567 assert(set); 4568 4569 if (!cmd_buffer->device->use_global_bo_list) { 4570 for (unsigned j = 0; j < set->header.buffer_count; ++j) 4571 if (set->descriptors[j]) 4572 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]); 4573 } 4574 4575 if (set->header.bo) 4576 radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo); 4577} 4578 4579void 4580radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 4581 VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount, 4582 const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount, 4583 const uint32_t *pDynamicOffsets) 4584{ 4585 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4586 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 4587 unsigned dyn_idx = 0; 4588 4589 const bool no_dynamic_bounds = 4590 cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS; 4591 struct radv_descriptor_state *descriptors_state = 4592 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint); 4593 4594 for (unsigned i = 0; i < descriptorSetCount; ++i) { 4595 unsigned set_idx = i + firstSet; 4596 RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); 4597 4598 /* If the set is already bound we only need to update the 4599 * (potentially changed) dynamic offsets. 
*/ 4600 if (descriptors_state->sets[set_idx] != set || 4601 !(descriptors_state->valid & (1u << set_idx))) { 4602 radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx); 4603 } 4604 4605 for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++dyn_idx) { 4606 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start; 4607 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4; 4608 assert(dyn_idx < dynamicOffsetCount); 4609 4610 struct radv_descriptor_range *range = set->header.dynamic_descriptors + j; 4611 4612 if (!range->va) { 4613 memset(dst, 0, 4 * 4); 4614 } else { 4615 uint64_t va = range->va + pDynamicOffsets[dyn_idx]; 4616 dst[0] = va; 4617 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); 4618 dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size; 4619 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 4620 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 4621 4622 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 4623 dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 4624 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 4625 } else { 4626 dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 4627 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 4628 } 4629 } 4630 4631 cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages; 4632 } 4633 } 4634} 4635 4636static bool 4637radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set, 4638 struct radv_descriptor_set_layout *layout, 4639 VkPipelineBindPoint bind_point) 4640{ 4641 struct radv_descriptor_state *descriptors_state = 4642 radv_get_descriptors_state(cmd_buffer, bind_point); 4643 set->header.size = layout->size; 4644 set->header.layout = layout; 4645 4646 if (descriptors_state->push_set.capacity < set->header.size) { 4647 size_t new_size = MAX2(set->header.size, 1024); 4648 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity); 4649 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS); 4650 4651 free(set->header.mapped_ptr); 4652 set->header.mapped_ptr = malloc(new_size); 4653 4654 if (!set->header.mapped_ptr) { 4655 descriptors_state->push_set.capacity = 0; 4656 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 4657 return false; 4658 } 4659 4660 descriptors_state->push_set.capacity = new_size; 4661 } 4662 4663 return true; 4664} 4665 4666void 4667radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, 4668 VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout, 4669 uint32_t set, uint32_t descriptorWriteCount, 4670 const VkWriteDescriptorSet *pDescriptorWrites) 4671{ 4672 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 4673 struct radv_descriptor_set *push_set = 4674 (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors; 4675 unsigned bo_offset; 4676 4677 assert(set == 0); 4678 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 4679 4680 push_set->header.size = layout->set[set].layout->size; 4681 push_set->header.layout = layout->set[set].layout; 4682 4683 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset, 4684 (void **)&push_set->header.mapped_ptr)) 4685 return; 4686 4687 push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 4688 push_set->header.va += bo_offset; 4689 4690 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer, 
4691 radv_descriptor_set_to_handle(push_set), descriptorWriteCount, 4692 pDescriptorWrites, 0, NULL); 4693 4694 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set); 4695} 4696 4697void 4698radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 4699 VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount, 4700 const VkWriteDescriptorSet *pDescriptorWrites) 4701{ 4702 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4703 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 4704 struct radv_descriptor_state *descriptors_state = 4705 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint); 4706 struct radv_descriptor_set *push_set = 4707 (struct radv_descriptor_set *)&descriptors_state->push_set.set; 4708 4709 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 4710 4711 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout, 4712 pipelineBindPoint)) 4713 return; 4714 4715 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR() 4716 * because it is invalid, according to Vulkan spec. 4717 */ 4718 for (int i = 0; i < descriptorWriteCount; i++) { 4719 ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i]; 4720 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT); 4721 } 4722 4723 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer, 4724 radv_descriptor_set_to_handle(push_set), descriptorWriteCount, 4725 pDescriptorWrites, 0, NULL); 4726 4727 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set); 4728 descriptors_state->push_dirty = true; 4729} 4730 4731void 4732radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer, 4733 VkDescriptorUpdateTemplate descriptorUpdateTemplate, 4734 VkPipelineLayout _layout, uint32_t set, const void *pData) 4735{ 4736 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4737 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 4738 RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate); 4739 struct radv_descriptor_state *descriptors_state = 4740 radv_get_descriptors_state(cmd_buffer, templ->bind_point); 4741 struct radv_descriptor_set *push_set = 4742 (struct radv_descriptor_set *)&descriptors_state->push_set.set; 4743 4744 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 4745 4746 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout, 4747 templ->bind_point)) 4748 return; 4749 4750 radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set, 4751 descriptorUpdateTemplate, pData); 4752 4753 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set); 4754 descriptors_state->push_dirty = true; 4755} 4756 4757void 4758radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, 4759 VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, 4760 const void *pValues) 4761{ 4762 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4763 memcpy(cmd_buffer->push_constants + offset, pValues, size); 4764 cmd_buffer->push_constant_stages |= stageFlags; 4765} 4766 4767VkResult 4768radv_EndCommandBuffer(VkCommandBuffer commandBuffer) 4769{ 4770 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4771 4772 radv_emit_mip_change_flush_default(cmd_buffer); 4773 4774 if (cmd_buffer->queue_family_index != 
RADV_QUEUE_TRANSFER) { 4775 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6) 4776 cmd_buffer->state.flush_bits |= 4777 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2; 4778 4779 /* Make sure to sync all pending active queries at the end of 4780 * command buffer. 4781 */ 4782 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; 4783 4784 /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a 4785 * command buffer. 4786 */ 4787 if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device)) 4788 cmd_buffer->state.flush_bits |= radv_src_access_flush( 4789 cmd_buffer, 4790 VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, 4791 NULL); 4792 4793 /* Since NGG streamout uses GDS, we need to make GDS idle when 4794 * we leave the IB, otherwise another process might overwrite 4795 * it while our shaders are busy. 4796 */ 4797 if (cmd_buffer->gds_needed) 4798 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; 4799 4800 si_emit_cache_flush(cmd_buffer); 4801 } 4802 4803 /* Make sure CP DMA is idle at the end of IBs because the kernel 4804 * doesn't wait for it. 4805 */ 4806 si_cp_dma_wait_for_idle(cmd_buffer); 4807 4808 radv_describe_end_cmd_buffer(cmd_buffer); 4809 4810 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); 4811 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs); 4812 4813 VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs); 4814 if (result != VK_SUCCESS) 4815 return vk_error(cmd_buffer, result); 4816 4817 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE; 4818 4819 return cmd_buffer->record_result; 4820} 4821 4822static void 4823radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) 4824{ 4825 if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) 4826 return; 4827 4828 assert(!pipeline->ctx_cs.cdw); 4829 4830 cmd_buffer->state.emitted_compute_pipeline = pipeline; 4831 4832 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); 4833 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); 4834 4835 cmd_buffer->compute_scratch_size_per_wave_needed = 4836 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave); 4837 cmd_buffer->compute_scratch_waves_wanted = 4838 MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves); 4839 4840 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 4841 pipeline->shaders[MESA_SHADER_COMPUTE]->bo); 4842 4843 if (unlikely(cmd_buffer->device->trace_bo)) 4844 radv_save_pipeline(cmd_buffer, pipeline); 4845} 4846 4847static void 4848radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) 4849{ 4850 struct radv_descriptor_state *descriptors_state = 4851 radv_get_descriptors_state(cmd_buffer, bind_point); 4852 4853 descriptors_state->dirty |= descriptors_state->valid; 4854} 4855 4856void 4857radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 4858 VkPipeline _pipeline) 4859{ 4860 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4861 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); 4862 4863 switch (pipelineBindPoint) { 4864 case VK_PIPELINE_BIND_POINT_COMPUTE: 4865 if (cmd_buffer->state.compute_pipeline == pipeline) 4866 return; 4867 radv_mark_descriptor_sets_dirty(cmd_buffer, 
pipelineBindPoint); 4868 4869 cmd_buffer->state.compute_pipeline = pipeline; 4870 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT; 4871 break; 4872 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: 4873 if (cmd_buffer->state.rt_pipeline == pipeline) 4874 return; 4875 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint); 4876 4877 cmd_buffer->state.rt_pipeline = pipeline; 4878 cmd_buffer->push_constant_stages |= 4879 (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR | 4880 VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR | 4881 VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR); 4882 radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size); 4883 break; 4884 case VK_PIPELINE_BIND_POINT_GRAPHICS: 4885 if (cmd_buffer->state.pipeline == pipeline) 4886 return; 4887 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint); 4888 4889 bool vtx_emit_count_changed = 4890 !pipeline || !cmd_buffer->state.pipeline || 4891 cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num || 4892 cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr; 4893 cmd_buffer->state.pipeline = pipeline; 4894 if (!pipeline) 4895 break; 4896 4897 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; 4898 cmd_buffer->push_constant_stages |= pipeline->active_stages; 4899 4900 /* the new vertex shader might not have the same user regs */ 4901 if (vtx_emit_count_changed) { 4902 cmd_buffer->state.last_first_instance = -1; 4903 cmd_buffer->state.last_vertex_offset = -1; 4904 cmd_buffer->state.last_drawid = -1; 4905 } 4906 4907 /* Prefetch all pipeline shaders at first draw time. */ 4908 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS; 4909 4910 if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug && 4911 cmd_buffer->state.emitted_pipeline && 4912 cmd_buffer->state.emitted_pipeline->graphics.is_ngg && 4913 !cmd_buffer->state.pipeline->graphics.is_ngg) { 4914 /* Transitioning from NGG to legacy GS requires 4915 * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH 4916 * is also emitted at the beginning of IBs when legacy 4917 * GS ring pointers are set. 
4918 */ 4919 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH; 4920 } 4921 4922 radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state); 4923 radv_bind_streamout_state(cmd_buffer, pipeline); 4924 4925 if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed) 4926 cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size; 4927 if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed) 4928 cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size; 4929 4930 if (radv_pipeline_has_tess(pipeline)) 4931 cmd_buffer->tess_rings_needed = true; 4932 break; 4933 default: 4934 assert(!"invalid bind point"); 4935 break; 4936 } 4937} 4938 4939void 4940radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, 4941 const VkViewport *pViewports) 4942{ 4943 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4944 struct radv_cmd_state *state = &cmd_buffer->state; 4945 ASSERTED const uint32_t total_count = firstViewport + viewportCount; 4946 4947 assert(firstViewport < MAX_VIEWPORTS); 4948 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); 4949 4950 if (total_count <= state->dynamic.viewport.count && 4951 !memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports, 4952 viewportCount * sizeof(*pViewports))) { 4953 return; 4954 } 4955 4956 if (state->dynamic.viewport.count < total_count) 4957 state->dynamic.viewport.count = total_count; 4958 4959 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, 4960 viewportCount * sizeof(*pViewports)); 4961 for (unsigned i = 0; i < viewportCount; i++) { 4962 radv_get_viewport_xform(&pViewports[i], 4963 state->dynamic.viewport.xform[i + firstViewport].scale, 4964 state->dynamic.viewport.xform[i + firstViewport].translate); 4965 } 4966 4967 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT; 4968} 4969 4970void 4971radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, 4972 const VkRect2D *pScissors) 4973{ 4974 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4975 struct radv_cmd_state *state = &cmd_buffer->state; 4976 ASSERTED const uint32_t total_count = firstScissor + scissorCount; 4977 4978 assert(firstScissor < MAX_SCISSORS); 4979 assert(total_count >= 1 && total_count <= MAX_SCISSORS); 4980 4981 if (total_count <= state->dynamic.scissor.count && 4982 !memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors, 4983 scissorCount * sizeof(*pScissors))) { 4984 return; 4985 } 4986 4987 if (state->dynamic.scissor.count < total_count) 4988 state->dynamic.scissor.count = total_count; 4989 4990 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, 4991 scissorCount * sizeof(*pScissors)); 4992 4993 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; 4994} 4995 4996void 4997radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth) 4998{ 4999 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5000 5001 if (cmd_buffer->state.dynamic.line_width == lineWidth) 5002 return; 5003 5004 cmd_buffer->state.dynamic.line_width = lineWidth; 5005 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; 5006} 5007 5008void 5009radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, 5010 float depthBiasClamp, float depthBiasSlopeFactor) 5011{ 5012 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5013 struct radv_cmd_state *state = &cmd_buffer->state; 5014 5015 if (state->dynamic.depth_bias.bias == 
depthBiasConstantFactor && 5016 state->dynamic.depth_bias.clamp == depthBiasClamp && 5017 state->dynamic.depth_bias.slope == depthBiasSlopeFactor) { 5018 return; 5019 } 5020 5021 state->dynamic.depth_bias.bias = depthBiasConstantFactor; 5022 state->dynamic.depth_bias.clamp = depthBiasClamp; 5023 state->dynamic.depth_bias.slope = depthBiasSlopeFactor; 5024 5025 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; 5026} 5027 5028void 5029radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4]) 5030{ 5031 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5032 struct radv_cmd_state *state = &cmd_buffer->state; 5033 5034 if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4)) 5035 return; 5036 5037 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4); 5038 5039 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; 5040} 5041 5042void 5043radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds) 5044{ 5045 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5046 struct radv_cmd_state *state = &cmd_buffer->state; 5047 5048 if (state->dynamic.depth_bounds.min == minDepthBounds && 5049 state->dynamic.depth_bounds.max == maxDepthBounds) { 5050 return; 5051 } 5052 5053 state->dynamic.depth_bounds.min = minDepthBounds; 5054 state->dynamic.depth_bounds.max = maxDepthBounds; 5055 5056 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; 5057} 5058 5059void 5060radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5061 uint32_t compareMask) 5062{ 5063 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5064 struct radv_cmd_state *state = &cmd_buffer->state; 5065 bool front_same = state->dynamic.stencil_compare_mask.front == compareMask; 5066 bool back_same = state->dynamic.stencil_compare_mask.back == compareMask; 5067 5068 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) && 5069 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) { 5070 return; 5071 } 5072 5073 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5074 state->dynamic.stencil_compare_mask.front = compareMask; 5075 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5076 state->dynamic.stencil_compare_mask.back = compareMask; 5077 5078 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK; 5079} 5080 5081void 5082radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5083 uint32_t writeMask) 5084{ 5085 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5086 struct radv_cmd_state *state = &cmd_buffer->state; 5087 bool front_same = state->dynamic.stencil_write_mask.front == writeMask; 5088 bool back_same = state->dynamic.stencil_write_mask.back == writeMask; 5089 5090 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) && 5091 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) { 5092 return; 5093 } 5094 5095 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5096 state->dynamic.stencil_write_mask.front = writeMask; 5097 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5098 state->dynamic.stencil_write_mask.back = writeMask; 5099 5100 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK; 5101} 5102 5103void 5104radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5105 uint32_t reference) 5106{ 5107 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5108 struct radv_cmd_state *state = &cmd_buffer->state; 5109 bool front_same = state->dynamic.stencil_reference.front == 
reference; 5110 bool back_same = state->dynamic.stencil_reference.back == reference; 5111 5112 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) && 5113 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) { 5114 return; 5115 } 5116 5117 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5118 cmd_buffer->state.dynamic.stencil_reference.front = reference; 5119 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5120 cmd_buffer->state.dynamic.stencil_reference.back = reference; 5121 5122 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; 5123} 5124 5125void 5126radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle, 5127 uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles) 5128{ 5129 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5130 struct radv_cmd_state *state = &cmd_buffer->state; 5131 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount; 5132 5133 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES); 5134 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES); 5135 5136 if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle, 5137 pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) { 5138 return; 5139 } 5140 5141 typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle], 5142 pDiscardRectangles, discardRectangleCount); 5143 5144 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE; 5145} 5146 5147void 5148radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, 5149 const VkSampleLocationsInfoEXT *pSampleLocationsInfo) 5150{ 5151 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5152 struct radv_cmd_state *state = &cmd_buffer->state; 5153 5154 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); 5155 5156 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; 5157 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; 5158 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount; 5159 typed_memcpy(&state->dynamic.sample_location.locations[0], 5160 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount); 5161 5162 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS; 5163} 5164 5165void 5166radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, 5167 uint16_t lineStipplePattern) 5168{ 5169 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5170 struct radv_cmd_state *state = &cmd_buffer->state; 5171 5172 if (state->dynamic.line_stipple.factor == lineStippleFactor && 5173 state->dynamic.line_stipple.pattern == lineStipplePattern) 5174 return; 5175 5176 state->dynamic.line_stipple.factor = lineStippleFactor; 5177 state->dynamic.line_stipple.pattern = lineStipplePattern; 5178 5179 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE; 5180} 5181 5182void 5183radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode) 5184{ 5185 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5186 struct radv_cmd_state *state = &cmd_buffer->state; 5187 5188 if (state->dynamic.cull_mode == cullMode) 5189 return; 5190 5191 state->dynamic.cull_mode = cullMode; 5192 5193 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE; 5194} 5195 5196void 5197radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace) 5198{ 5199 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, 
commandBuffer); 5200 struct radv_cmd_state *state = &cmd_buffer->state; 5201 5202 if (state->dynamic.front_face == frontFace) 5203 return; 5204 5205 state->dynamic.front_face = frontFace; 5206 5207 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE; 5208} 5209 5210void 5211radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer, 5212 VkPrimitiveTopology primitiveTopology) 5213{ 5214 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5215 struct radv_cmd_state *state = &cmd_buffer->state; 5216 unsigned primitive_topology = si_translate_prim(primitiveTopology); 5217 5218 if (state->dynamic.primitive_topology == primitive_topology) 5219 return; 5220 5221 state->dynamic.primitive_topology = primitive_topology; 5222 5223 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY; 5224} 5225 5226void 5227radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount, 5228 const VkViewport *pViewports) 5229{ 5230 radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); 5231} 5232 5233void 5234radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount, 5235 const VkRect2D *pScissors) 5236{ 5237 radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors); 5238} 5239 5240void 5241radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable) 5242 5243{ 5244 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5245 struct radv_cmd_state *state = &cmd_buffer->state; 5246 5247 if (state->dynamic.depth_test_enable == depthTestEnable) 5248 return; 5249 5250 state->dynamic.depth_test_enable = depthTestEnable; 5251 5252 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE; 5253} 5254 5255void 5256radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable) 5257{ 5258 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5259 struct radv_cmd_state *state = &cmd_buffer->state; 5260 5261 if (state->dynamic.depth_write_enable == depthWriteEnable) 5262 return; 5263 5264 state->dynamic.depth_write_enable = depthWriteEnable; 5265 5266 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE; 5267} 5268 5269void 5270radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp) 5271{ 5272 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5273 struct radv_cmd_state *state = &cmd_buffer->state; 5274 5275 if (state->dynamic.depth_compare_op == depthCompareOp) 5276 return; 5277 5278 state->dynamic.depth_compare_op = depthCompareOp; 5279 5280 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP; 5281} 5282 5283void 5284radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable) 5285{ 5286 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5287 struct radv_cmd_state *state = &cmd_buffer->state; 5288 5289 if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable) 5290 return; 5291 5292 state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable; 5293 5294 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE; 5295} 5296 5297void 5298radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable) 5299{ 5300 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5301 struct radv_cmd_state *state = &cmd_buffer->state; 5302 5303 if (state->dynamic.stencil_test_enable == stencilTestEnable) 5304 return; 5305 5306 state->dynamic.stencil_test_enable = stencilTestEnable; 5307 5308 state->dirty |= 
RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE; 5309} 5310 5311void 5312radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5313 VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp, 5314 VkCompareOp compareOp) 5315{ 5316 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5317 struct radv_cmd_state *state = &cmd_buffer->state; 5318 bool front_same = state->dynamic.stencil_op.front.fail_op == failOp && 5319 state->dynamic.stencil_op.front.pass_op == passOp && 5320 state->dynamic.stencil_op.front.depth_fail_op == depthFailOp && 5321 state->dynamic.stencil_op.front.compare_op == compareOp; 5322 bool back_same = state->dynamic.stencil_op.back.fail_op == failOp && 5323 state->dynamic.stencil_op.back.pass_op == passOp && 5324 state->dynamic.stencil_op.back.depth_fail_op == depthFailOp && 5325 state->dynamic.stencil_op.back.compare_op == compareOp; 5326 5327 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) && 5328 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) 5329 return; 5330 5331 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) { 5332 state->dynamic.stencil_op.front.fail_op = failOp; 5333 state->dynamic.stencil_op.front.pass_op = passOp; 5334 state->dynamic.stencil_op.front.depth_fail_op = depthFailOp; 5335 state->dynamic.stencil_op.front.compare_op = compareOp; 5336 } 5337 5338 if (faceMask & VK_STENCIL_FACE_BACK_BIT) { 5339 state->dynamic.stencil_op.back.fail_op = failOp; 5340 state->dynamic.stencil_op.back.pass_op = passOp; 5341 state->dynamic.stencil_op.back.depth_fail_op = depthFailOp; 5342 state->dynamic.stencil_op.back.compare_op = compareOp; 5343 } 5344 5345 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP; 5346} 5347 5348void 5349radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize, 5350 const VkFragmentShadingRateCombinerOpKHR combinerOps[2]) 5351{ 5352 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5353 struct radv_cmd_state *state = &cmd_buffer->state; 5354 5355 if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width && 5356 state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height && 5357 state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] && 5358 state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1]) 5359 return; 5360 5361 state->dynamic.fragment_shading_rate.size = *pFragmentSize; 5362 for (unsigned i = 0; i < 2; i++) 5363 state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i]; 5364 5365 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE; 5366} 5367 5368void 5369radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable) 5370{ 5371 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5372 struct radv_cmd_state *state = &cmd_buffer->state; 5373 5374 if (state->dynamic.depth_bias_enable == depthBiasEnable) 5375 return; 5376 5377 state->dynamic.depth_bias_enable = depthBiasEnable; 5378 5379 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE; 5380} 5381 5382void 5383radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable) 5384{ 5385 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5386 struct radv_cmd_state *state = &cmd_buffer->state; 5387 5388 if (state->dynamic.primitive_restart_enable == primitiveRestartEnable) 5389 return; 5390 5391 state->dynamic.primitive_restart_enable = primitiveRestartEnable; 5392 5393 state->dirty |= 
RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE; 5394} 5395 5396void 5397radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer, 5398 VkBool32 rasterizerDiscardEnable) 5399{ 5400 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5401 struct radv_cmd_state *state = &cmd_buffer->state; 5402 5403 if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable) 5404 return; 5405 5406 state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable; 5407 5408 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE; 5409} 5410 5411void 5412radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints) 5413{ 5414 /* not implemented */ 5415} 5416 5417void 5418radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp) 5419{ 5420 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5421 struct radv_cmd_state *state = &cmd_buffer->state; 5422 unsigned logic_op = si_translate_blend_logic_op(logicOp); 5423 5424 if (state->dynamic.logic_op == logic_op) 5425 return; 5426 5427 state->dynamic.logic_op = logic_op; 5428 5429 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP; 5430} 5431 5432void 5433radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount, 5434 const VkBool32 *pColorWriteEnables) 5435{ 5436 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5437 struct radv_cmd_state *state = &cmd_buffer->state; 5438 uint32_t color_write_enable = 0; 5439 5440 assert(attachmentCount < MAX_RTS); 5441 5442 for (uint32_t i = 0; i < attachmentCount; i++) { 5443 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; 5444 } 5445 5446 if (state->dynamic.color_write_enable == color_write_enable) 5447 return; 5448 5449 state->dynamic.color_write_enable = color_write_enable; 5450 5451 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE; 5452} 5453 5454void 5455radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount, 5456 const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions, 5457 uint32_t vertexAttributeDescriptionCount, 5458 const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions) 5459{ 5460 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5461 struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 5462 5463 const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS]; 5464 for (unsigned i = 0; i < vertexBindingDescriptionCount; i++) 5465 bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i]; 5466 5467 cmd_buffer->state.vbo_misaligned_mask = 0; 5468 5469 memset(state, 0, sizeof(*state)); 5470 5471 enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; 5472 for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) { 5473 const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i]; 5474 const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding]; 5475 unsigned loc = attrib->location; 5476 const struct util_format_description *format_desc = vk_format_description(attrib->format); 5477 unsigned nfmt, dfmt; 5478 bool post_shuffle; 5479 enum radv_vs_input_alpha_adjust alpha_adjust; 5480 5481 state->attribute_mask |= 1u << loc; 5482 state->bindings[loc] = attrib->binding; 5483 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) { 5484 state->instance_rate_inputs |= 1u << loc; 5485 state->divisors[loc] = binding->divisor; 5486 if (binding->divisor != 1) 5487 
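            /* Any divisor other than 1 (including 0, which makes every instance read
             * the same element) is recorded here; presumably the VS prolog then applies
             * the divisor to the instance id, e.g. via a precomputed multiply-shift
             * rather than an integer division. */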
            state->nontrivial_divisors |= 1u << loc;
      }
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                   &dfmt, &nfmt, &post_shuffle, &alpha_adjust);

      state->formats[loc] = dfmt | (nfmt << 4);
      const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
                                               (format_desc->block.bits / 8u - 1);
      state->format_align_req_minus_1[loc] = format_align_req_minus_1;
      state->format_sizes[loc] = format_desc->block.bits / 8u;

      if (chip == GFX6 || chip >= GFX10) {
         struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
         unsigned bit = 1u << loc;
         if (binding->stride & format_align_req_minus_1) {
            state->misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit)
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         } else {
            state->possibly_misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit &&
                ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1))
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         }
      }

      if (alpha_adjust) {
         state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc;
         state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc;
      }

      if (post_shuffle)
         state->post_shuffle |= 1u << loc;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

void
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on the primary prior to executing the secondaries. */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on the primary prior to executing the secondaries. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->queue_family_index == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL.
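          * Take the same allow_ib2 = false fallback as the GFX7 case above; the
          * winsys then presumably copies the secondary's packets into the primary
          * CS instead of chaining to them with an INDIRECT_BUFFER packet.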
*/ 5559 allow_ib2 = false; 5560 } 5561 5562 primary->scratch_size_per_wave_needed = 5563 MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed); 5564 primary->scratch_waves_wanted = 5565 MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted); 5566 primary->compute_scratch_size_per_wave_needed = 5567 MAX2(primary->compute_scratch_size_per_wave_needed, 5568 secondary->compute_scratch_size_per_wave_needed); 5569 primary->compute_scratch_waves_wanted = 5570 MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted); 5571 5572 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed) 5573 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; 5574 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed) 5575 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; 5576 if (secondary->tess_rings_needed) 5577 primary->tess_rings_needed = true; 5578 if (secondary->sample_positions_needed) 5579 primary->sample_positions_needed = true; 5580 if (secondary->gds_needed) 5581 primary->gds_needed = true; 5582 5583 if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) { 5584 /* Emit the framebuffer state from primary if secondary 5585 * has been recorded without a framebuffer, otherwise 5586 * fast color/depth clears can't work. 5587 */ 5588 radv_emit_fb_mip_change_flush(primary); 5589 radv_emit_framebuffer_state(primary); 5590 } 5591 5592 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2); 5593 5594 /* When the secondary command buffer is compute only we don't 5595 * need to re-emit the current graphics pipeline. 5596 */ 5597 if (secondary->state.emitted_pipeline) { 5598 primary->state.emitted_pipeline = secondary->state.emitted_pipeline; 5599 } 5600 5601 /* When the secondary command buffer is graphics only we don't 5602 * need to re-emit the current compute pipeline. 5603 */ 5604 if (secondary->state.emitted_compute_pipeline) { 5605 primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline; 5606 } 5607 5608 /* Only re-emit the draw packets when needed. 
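       * A sentinel of -1 (or 0 for the reset index and IA_MULTI_VGT_PARAM) means
       * the secondary never emitted that state, so the primary keeps its own
       * cached value instead of overwriting it with an unset one.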
*/ 5609 if (secondary->state.last_primitive_reset_en != -1) { 5610 primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en; 5611 } 5612 5613 if (secondary->state.last_primitive_reset_index) { 5614 primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index; 5615 } 5616 5617 if (secondary->state.last_ia_multi_vgt_param) { 5618 primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param; 5619 } 5620 5621 primary->state.last_first_instance = secondary->state.last_first_instance; 5622 primary->state.last_num_instances = secondary->state.last_num_instances; 5623 primary->state.last_drawid = secondary->state.last_drawid; 5624 primary->state.last_vertex_offset = secondary->state.last_vertex_offset; 5625 primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert; 5626 primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon; 5627 primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control; 5628 5629 if (secondary->state.last_index_type != -1) { 5630 primary->state.last_index_type = secondary->state.last_index_type; 5631 } 5632 5633 primary->state.last_nggc_settings = secondary->state.last_nggc_settings; 5634 primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx; 5635 primary->state.last_nggc_skip = secondary->state.last_nggc_skip; 5636 } 5637 5638 /* After executing commands from secondary buffers we have to dirty 5639 * some states. 5640 */ 5641 primary->state.dirty |= 5642 RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL; 5643 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS); 5644 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE); 5645} 5646 5647VkResult 5648radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo, 5649 const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool) 5650{ 5651 RADV_FROM_HANDLE(radv_device, device, _device); 5652 struct radv_cmd_pool *pool; 5653 5654 pool = 5655 vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 5656 if (pool == NULL) 5657 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 5658 5659 vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL); 5660 5661 if (pAllocator) 5662 pool->alloc = *pAllocator; 5663 else 5664 pool->alloc = device->vk.alloc; 5665 5666 list_inithead(&pool->cmd_buffers); 5667 list_inithead(&pool->free_cmd_buffers); 5668 5669 pool->queue_family_index = pCreateInfo->queueFamilyIndex; 5670 5671 *pCmdPool = radv_cmd_pool_to_handle(pool); 5672 5673 return VK_SUCCESS; 5674} 5675 5676void 5677radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, 5678 const VkAllocationCallbacks *pAllocator) 5679{ 5680 RADV_FROM_HANDLE(radv_device, device, _device); 5681 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 5682 5683 if (!pool) 5684 return; 5685 5686 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 5687 { 5688 radv_destroy_cmd_buffer(cmd_buffer); 5689 } 5690 5691 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link) 5692 { 5693 radv_destroy_cmd_buffer(cmd_buffer); 5694 } 5695 5696 vk_object_base_finish(&pool->base); 5697 vk_free2(&device->vk.alloc, pAllocator, pool); 5698} 5699 5700VkResult 5701radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags 
flags) 5702{ 5703 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 5704 VkResult result; 5705 5706 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 5707 { 5708 result = radv_reset_cmd_buffer(cmd_buffer); 5709 if (result != VK_SUCCESS) 5710 return result; 5711 } 5712 5713 return VK_SUCCESS; 5714} 5715 5716void 5717radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags) 5718{ 5719 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 5720 5721 if (!pool) 5722 return; 5723 5724 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link) 5725 { 5726 radv_destroy_cmd_buffer(cmd_buffer); 5727 } 5728} 5729 5730static void 5731radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id) 5732{ 5733 struct radv_cmd_state *state = &cmd_buffer->state; 5734 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id]; 5735 5736 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096); 5737 5738 radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier); 5739 5740 radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 5741 5742 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC); 5743 5744 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 5745 const uint32_t a = subpass->attachments[i].attachment; 5746 if (a == VK_ATTACHMENT_UNUSED) 5747 continue; 5748 5749 radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true); 5750 } 5751 5752 if (subpass->vrs_attachment) { 5753 int idx = subpass->vrs_attachment->attachment; 5754 struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview; 5755 5756 if (subpass->depth_stencil_attachment) { 5757 /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to 5758 * copy the VRS rates to the HTILE buffer of the attachment. 5759 */ 5760 int ds_idx = subpass->depth_stencil_attachment->attachment; 5761 struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview; 5762 struct radv_image *ds_image = ds_iview->image; 5763 5764 VkExtent2D extent = { 5765 .width = ds_image->info.width, 5766 .height = ds_image->info.height, 5767 }; 5768 5769 /* HTILE buffer */ 5770 uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset; 5771 uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size; 5772 struct radv_buffer htile_buffer; 5773 5774 radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset); 5775 5776 /* Copy the VRS rates to the HTILE buffer. */ 5777 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true); 5778 5779 radv_buffer_finish(&htile_buffer); 5780 } else { 5781 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have 5782 * to copy the VRS rates to our internal HTILE buffer. 5783 */ 5784 struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; 5785 struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer); 5786 5787 if (ds_image) { 5788 /* HTILE buffer */ 5789 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 5790 5791 VkExtent2D extent = { 5792 .width = MIN2(fb->width, ds_image->info.width), 5793 .height = MIN2(fb->height, ds_image->info.height), 5794 }; 5795 5796 /* Copy the VRS rates to the HTILE buffer. 
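             * This path uses the device's internal VRS image and HTILE buffer, so
             * the copy extent is clamped to the framebuffer size above because that
             * image may be larger than the current render area.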
*/ 5797 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false); 5798 } 5799 } 5800 } 5801 5802 radv_describe_barrier_end(cmd_buffer); 5803 5804 radv_cmd_buffer_clear_subpass(cmd_buffer); 5805 5806 assert(cmd_buffer->cs->cdw <= cdw_max); 5807} 5808 5809static void 5810radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer) 5811{ 5812 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 5813 5814 /* Have to be conservative in cmdbuffers with inherited attachments. */ 5815 if (!cmd_buffer->state.attachments) { 5816 cmd_buffer->state.rb_noncoherent_dirty = true; 5817 return; 5818 } 5819 5820 for (uint32_t i = 0; i < subpass->color_count; ++i) { 5821 const uint32_t a = subpass->color_attachments[i].attachment; 5822 if (a == VK_ATTACHMENT_UNUSED) 5823 continue; 5824 if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) { 5825 cmd_buffer->state.rb_noncoherent_dirty = true; 5826 return; 5827 } 5828 } 5829 if (subpass->depth_stencil_attachment && 5830 !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment] 5831 .iview->image->l2_coherent) 5832 cmd_buffer->state.rb_noncoherent_dirty = true; 5833} 5834 5835void 5836radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer, 5837 const struct radv_subpass *subpass) 5838{ 5839 radv_mark_noncoherent_rb(cmd_buffer); 5840 radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 5841} 5842 5843static void 5844radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer) 5845{ 5846 struct radv_cmd_state *state = &cmd_buffer->state; 5847 const struct radv_subpass *subpass = state->subpass; 5848 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer); 5849 5850 radv_cmd_buffer_resolve_subpass(cmd_buffer); 5851 5852 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC); 5853 5854 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 5855 const uint32_t a = subpass->attachments[i].attachment; 5856 if (a == VK_ATTACHMENT_UNUSED) 5857 continue; 5858 5859 if (state->pass->attachments[a].last_subpass_idx != subpass_id) 5860 continue; 5861 5862 VkImageLayout layout = state->pass->attachments[a].final_layout; 5863 VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout; 5864 struct radv_subpass_attachment att = {a, layout, stencil_layout}; 5865 radv_handle_subpass_image_transition(cmd_buffer, att, false); 5866 } 5867 5868 radv_describe_barrier_end(cmd_buffer); 5869} 5870 5871void 5872radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer, 5873 const VkRenderPassBeginInfo *pRenderPassBegin, 5874 const struct radv_extra_render_pass_begin_info *extra_info) 5875{ 5876 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass); 5877 RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); 5878 VkResult result; 5879 5880 cmd_buffer->state.framebuffer = framebuffer; 5881 cmd_buffer->state.pass = pass; 5882 cmd_buffer->state.render_area = pRenderPassBegin->renderArea; 5883 5884 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info); 5885 if (result != VK_SUCCESS) 5886 return; 5887 5888 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin); 5889 if (result != VK_SUCCESS) 5890 return; 5891} 5892 5893void 5894radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, 5895 const VkRenderPassBeginInfo *pRenderPassBeginInfo, 5896 const VkSubpassBeginInfo *pSubpassBeginInfo) 5897{ 5898 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, 
commandBuffer);

   radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL);

   radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
}

void
radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
                     const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
   radv_cmd_buffer_end_subpass(cmd_buffer);
   radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

static void
radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      if (!radv_get_shader(pipeline, stage))
         continue;

      struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
      if (loc->sgpr_idx == -1)
         continue;
      uint32_t base_reg = pipeline->user_data_0[stage];
      radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
   }
   if (radv_pipeline_has_gs_copy_shader(pipeline)) {
      struct radv_userdata_info *loc =
         &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
      if (loc->sgpr_idx != -1) {
         uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
         radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
      }
   }
}

static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
                         uint32_t use_opaque)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, vertex_count);
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
}

/**
 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
 *
 * The starting address "index_va" may point anywhere within the index buffer. The number of
 * indices allocated in the index buffer *past that point* is specified by "max_index_count".
 * Hardware uses this information to return 0 for out-of-bounds reads.
 */
static void
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
                                 uint32_t max_index_count, uint32_t index_count, bool not_eop)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, max_index_count);
   radeon_emit(cmd_buffer->cs, index_va);
   radeon_emit(cmd_buffer->cs, index_va >> 32);
   radeon_emit(cmd_buffer->cs, index_count);
   /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
    * can be changed between draws and GS fast launch must be disabled.
    * NOT_EOP doesn't work on gfx9 and older.
    */
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
}

/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
                                  uint32_t draw_count, uint64_t count_va, uint32_t stride)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const unsigned di_src_sel = indexed ?
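      /* DI_SRC_SEL_DMA fetches indices from the bound index buffer, while
       * DI_SRC_SEL_AUTO_INDEX lets the hardware generate them for non-indexed draws. */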
V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; 5981 bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid; 5982 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; 5983 uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0; 5984 bool predicating = cmd_buffer->state.predicating; 5985 assert(base_reg); 5986 5987 /* just reset draw state for vertex data */ 5988 cmd_buffer->state.last_first_instance = -1; 5989 cmd_buffer->state.last_num_instances = -1; 5990 cmd_buffer->state.last_drawid = -1; 5991 cmd_buffer->state.last_vertex_offset = -1; 5992 5993 vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2; 5994 if (cmd_buffer->state.pipeline->graphics.uses_baseinstance) 5995 start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2; 5996 if (draw_id_enable) 5997 draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2; 5998 5999 if (draw_count == 1 && !count_va && !draw_id_enable) { 6000 radeon_emit(cs, 6001 PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating)); 6002 radeon_emit(cs, 0); 6003 radeon_emit(cs, vertex_offset_reg); 6004 radeon_emit(cs, start_instance_reg); 6005 radeon_emit(cs, di_src_sel); 6006 } else { 6007 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, 6008 predicating)); 6009 radeon_emit(cs, 0); 6010 radeon_emit(cs, vertex_offset_reg); 6011 radeon_emit(cs, start_instance_reg); 6012 radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | 6013 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); 6014 radeon_emit(cs, draw_count); /* count */ 6015 radeon_emit(cs, count_va); /* count_addr */ 6016 radeon_emit(cs, count_va >> 32); 6017 radeon_emit(cs, stride); /* stride */ 6018 radeon_emit(cs, di_src_sel); 6019 6020 cmd_buffer->state.uses_draw_indirect_multi = true; 6021 } 6022} 6023 6024static inline void 6025radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, 6026 const struct radv_draw_info *info, const uint32_t vertex_offset) 6027{ 6028 struct radv_cmd_state *state = &cmd_buffer->state; 6029 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6030 const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance; 6031 const bool uses_drawid = state->pipeline->graphics.uses_drawid; 6032 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 6033 state->pipeline->graphics.vtx_emit_num); 6034 6035 radeon_emit(cs, vertex_offset); 6036 state->last_vertex_offset = vertex_offset; 6037 if (uses_drawid) { 6038 radeon_emit(cs, 0); 6039 state->last_drawid = 0; 6040 } 6041 if (uses_baseinstance) { 6042 radeon_emit(cs, info->first_instance); 6043 state->last_first_instance = info->first_instance; 6044 } 6045} 6046 6047ALWAYS_INLINE static void 6048radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 6049 const uint32_t vertex_offset) 6050{ 6051 const struct radv_cmd_state *state = &cmd_buffer->state; 6052 const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance; 6053 const bool uses_drawid = state->pipeline->graphics.uses_drawid; 6054 6055 /* this looks very dumb, but it allows the compiler to optimize better and yields 6056 * ~3-4% perf increase in drawoverhead 6057 */ 6058 if (vertex_offset != state->last_vertex_offset) { 6059 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 6060 } else if (uses_drawid && 0 != state->last_drawid) { 6061 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 6062 } else if 
(uses_baseinstance && info->first_instance != state->last_first_instance) { 6063 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 6064 } 6065} 6066 6067ALWAYS_INLINE static void 6068radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid) 6069{ 6070 struct radv_cmd_state *state = &cmd_buffer->state; 6071 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6072 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid); 6073 radeon_emit(cs, vertex_offset); 6074 state->last_vertex_offset = vertex_offset; 6075 if (drawid) 6076 radeon_emit(cs, drawid); 6077 6078} 6079 6080ALWAYS_INLINE static void 6081radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, 6082 const struct radv_draw_info *info, 6083 uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, 6084 uint32_t stride, 6085 const int32_t *vertexOffset) 6086 6087{ 6088 struct radv_cmd_state *state = &cmd_buffer->state; 6089 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6090 const int index_size = radv_get_vgt_index_size(state->index_type); 6091 unsigned i = 0; 6092 const bool uses_drawid = state->pipeline->graphics.uses_drawid; 6093 const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10; 6094 6095 if (uses_drawid) { 6096 if (vertexOffset) { 6097 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 6098 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 6099 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 6100 6101 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 6102 if (!remaining_indexes && 6103 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 6104 continue; 6105 6106 if (i > 0) 6107 radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i); 6108 6109 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 6110 6111 if (!state->subpass->view_mask) { 6112 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 6113 } else { 6114 u_foreach_bit(view, state->subpass->view_mask) { 6115 radv_emit_view_index(cmd_buffer, view); 6116 6117 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 6118 } 6119 } 6120 } 6121 } else { 6122 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 6123 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 6124 6125 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 6126 if (!remaining_indexes && 6127 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 6128 continue; 6129 6130 if (i > 0) { 6131 if (state->last_vertex_offset != draw->vertexOffset) 6132 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i); 6133 else 6134 radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i); 6135 } else 6136 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 6137 6138 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 6139 6140 if (!state->subpass->view_mask) { 6141 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 6142 } else { 6143 u_foreach_bit(view, state->subpass->view_mask) { 6144 radv_emit_view_index(cmd_buffer, view); 6145 6146 radv_cs_emit_draw_indexed_packet(cmd_buffer, 
index_va, remaining_indexes, draw->indexCount, false); 6147 } 6148 } 6149 } 6150 } 6151 if (drawCount > 1) { 6152 state->last_drawid = drawCount - 1; 6153 } 6154 } else { 6155 if (vertexOffset) { 6156 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) { 6157 /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have 6158 * count == 0 for the last draw that doesn't have NOT_EOP. 6159 */ 6160 while (drawCount > 1) { 6161 const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride); 6162 if (last->indexCount) 6163 break; 6164 drawCount--; 6165 } 6166 } 6167 6168 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 6169 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 6170 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 6171 6172 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 6173 if (!remaining_indexes && 6174 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 6175 continue; 6176 6177 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 6178 6179 if (!state->subpass->view_mask) { 6180 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1); 6181 } else { 6182 u_foreach_bit(view, state->subpass->view_mask) { 6183 radv_emit_view_index(cmd_buffer, view); 6184 6185 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 6186 } 6187 } 6188 } 6189 } else { 6190 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 6191 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 6192 6193 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 6194 if (!remaining_indexes && 6195 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 6196 continue; 6197 6198 const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? 
((uint8_t*)draw + stride) : NULL); 6199 const bool offset_changes = next && next->vertexOffset != draw->vertexOffset; 6200 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 6201 6202 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 6203 6204 if (!state->subpass->view_mask) { 6205 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1); 6206 } else { 6207 u_foreach_bit(view, state->subpass->view_mask) { 6208 radv_emit_view_index(cmd_buffer, view); 6209 6210 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 6211 } 6212 } 6213 } 6214 } 6215 if (drawCount > 1) { 6216 state->last_drawid = drawCount - 1; 6217 } 6218 } 6219} 6220 6221ALWAYS_INLINE static void 6222radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 6223 uint32_t drawCount, const VkMultiDrawInfoEXT *minfo, 6224 uint32_t use_opaque, uint32_t stride) 6225{ 6226 unsigned i = 0; 6227 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 6228 const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid; 6229 uint32_t last_start = 0; 6230 6231 vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) { 6232 if (!i) 6233 radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex); 6234 else 6235 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0); 6236 6237 if (!view_mask) { 6238 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 6239 } else { 6240 u_foreach_bit(view, view_mask) { 6241 radv_emit_view_index(cmd_buffer, view); 6242 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 6243 } 6244 } 6245 last_start = draw->firstVertex; 6246 } 6247 if (drawCount > 1) { 6248 struct radv_cmd_state *state = &cmd_buffer->state; 6249 state->last_vertex_offset = last_start; 6250 if (uses_drawid) 6251 state->last_drawid = drawCount - 1; 6252 } 6253} 6254 6255static void 6256radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, 6257 const struct radv_draw_info *info) 6258{ 6259 const struct radv_cmd_state *state = &cmd_buffer->state; 6260 struct radeon_winsys *ws = cmd_buffer->device->ws; 6261 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6262 const uint64_t va = 6263 radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset; 6264 const uint64_t count_va = info->count_buffer 6265 ? radv_buffer_get_va(info->count_buffer->bo) + 6266 info->count_buffer->offset + info->count_buffer_offset 6267 : 0; 6268 6269 radv_cs_add_buffer(ws, cs, info->indirect->bo); 6270 6271 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); 6272 radeon_emit(cs, 1); 6273 radeon_emit(cs, va); 6274 radeon_emit(cs, va >> 32); 6275 6276 if (info->count_buffer) { 6277 radv_cs_add_buffer(ws, cs, info->count_buffer->bo); 6278 } 6279 6280 if (!state->subpass->view_mask) { 6281 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 6282 info->stride); 6283 } else { 6284 u_foreach_bit(i, state->subpass->view_mask) 6285 { 6286 radv_emit_view_index(cmd_buffer, i); 6287 6288 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 6289 info->stride); 6290 } 6291 } 6292} 6293 6294/* 6295 * Vega and raven have a bug which triggers if there are multiple context 6296 * register contexts active at the same time with different scissor values. 
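 * (A "context" here is one of the hardware's context register sets; a context
 * roll switches subsequent draws to a fresh set, which is how two different
 * scissor values can be live at the same time.)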
6297 * 6298 * There are two possible workarounds: 6299 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way 6300 * there is only ever 1 active set of scissor values at the same time. 6301 * 6302 * 2) Whenever the hardware switches contexts we have to set the scissor 6303 * registers again even if it is a noop. That way the new context gets 6304 * the correct scissor values. 6305 * 6306 * This implements option 2. radv_need_late_scissor_emission needs to 6307 * return true on affected HW if radv_emit_all_graphics_states sets 6308 * any context registers. 6309 */ 6310static bool 6311radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, 6312 const struct radv_draw_info *info) 6313{ 6314 struct radv_cmd_state *state = &cmd_buffer->state; 6315 6316 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 6317 return false; 6318 6319 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer) 6320 return true; 6321 6322 uint64_t used_states = 6323 cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; 6324 6325 /* Index, vertex and streamout buffers don't change context regs, and 6326 * pipeline is already handled. 6327 */ 6328 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | 6329 RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER | 6330 RADV_CMD_DIRTY_PIPELINE); 6331 6332 if (cmd_buffer->state.dirty & used_states) 6333 return true; 6334 6335 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer); 6336 6337 if (info->indexed && state->dynamic.primitive_restart_enable && 6338 primitive_reset_index != state->last_primitive_reset_index) 6339 return true; 6340 6341 return false; 6342} 6343 6344enum { 6345 ngg_cull_none = 0, 6346 ngg_cull_front_face = 1, 6347 ngg_cull_back_face = 2, 6348 ngg_cull_face_is_ccw = 4, 6349 ngg_cull_small_primitives = 8, 6350}; 6351 6352ALWAYS_INLINE static bool 6353radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt, 6354 bool indirect) 6355{ 6356 /* If we have to draw only a few vertices, we get better latency if 6357 * we disable NGG culling. 6358 * 6359 * When tessellation is used, what matters is the number of tessellated 6360 * vertices, so let's always assume it's not a small draw. 6361 */ 6362 return !has_tess && !indirect && vtx_cnt < 128; 6363} 6364 6365ALWAYS_INLINE static uint32_t 6366radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted) 6367{ 6368 const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 6369 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 6370 6371 /* Cull every triangle when rasterizer discard is enabled. */ 6372 if (d->rasterizer_discard_enable || 6373 G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl)) 6374 return ngg_cull_front_face | ngg_cull_back_face; 6375 6376 uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl; 6377 uint32_t nggc_settings = ngg_cull_none; 6378 6379 /* The culling code needs to know whether face is CW or CCW. */ 6380 bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE) 6381 ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE 6382 : G_028814_FACE(pa_su_sc_mode_cntl) == 0; 6383 6384 /* Take inverted viewport into account. */ 6385 ccw ^= vp_y_inverted; 6386 6387 if (ccw) 6388 nggc_settings |= ngg_cull_face_is_ccw; 6389 6390 /* Face culling settings. 
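    * When the cull mode is a dynamic state, take it from the dynamic state;
    * otherwise read it back from the pipeline's PA_SU_SC_MODE_CNTL value.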
*/ 6391 if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) 6392 ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT) 6393 : G_028814_CULL_FRONT(pa_su_sc_mode_cntl)) 6394 nggc_settings |= ngg_cull_front_face; 6395 if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) 6396 ? (d->cull_mode & VK_CULL_MODE_BACK_BIT) 6397 : G_028814_CULL_BACK(pa_su_sc_mode_cntl)) 6398 nggc_settings |= ngg_cull_back_face; 6399 6400 /* Small primitive culling is only valid when conservative overestimation is not used. */ 6401 if (!pipeline->graphics.uses_conservative_overestimate) { 6402 nggc_settings |= ngg_cull_small_primitives; 6403 6404 /* small_prim_precision = num_samples / 2^subpixel_bits 6405 * num_samples is also always a power of two, so the small prim precision can only be 6406 * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent. 6407 */ 6408 unsigned subpixel_bits = 256; 6409 int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits); 6410 nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u); 6411 } 6412 6413 return nggc_settings; 6414} 6415 6416static void 6417radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 6418{ 6419 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 6420 const unsigned stage = pipeline->graphics.last_vgt_api_stage; 6421 const bool nggc_supported = pipeline->graphics.has_ngg_culling; 6422 6423 if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) { 6424 /* Current shader doesn't support culling and culling was already disabled: 6425 * No further steps needed, just remember the SGPR's location is not set. 6426 */ 6427 cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 6428 return; 6429 } 6430 6431 /* Check dirty flags: 6432 * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed). 6433 * - Dirty dynamic flags: culling settings may have changed. 6434 */ 6435 const bool dirty = 6436 cmd_buffer->state.dirty & 6437 (RADV_CMD_DIRTY_PIPELINE | 6438 RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 6439 RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT); 6440 6441 /* Check small draw status: 6442 * For small draw calls, we disable culling by setting the SGPR to 0. 6443 */ 6444 const bool skip = 6445 radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect); 6446 6447 /* See if anything changed. */ 6448 if (!dirty && skip == cmd_buffer->state.last_nggc_skip) 6449 return; 6450 6451 /* Remember small draw state. */ 6452 cmd_buffer->state.last_nggc_skip = skip; 6453 const struct radv_shader_variant *v = pipeline->shaders[stage]; 6454 assert(v->info.has_ngg_culling == nggc_supported); 6455 6456 /* Find the user SGPR. */ 6457 const uint32_t base_reg = pipeline->user_data_0[stage]; 6458 const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx; 6459 assert(!nggc_supported || nggc_sgpr_idx != -1); 6460 6461 /* Get viewport transform. */ 6462 float vp_scale[2], vp_translate[2]; 6463 memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float)); 6464 memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float)); 6465 bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]); 6466 6467 /* Get current culling settings. 
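    * ngg_cull_none (0) keeps culling disabled in the shader, which is what we
    * want for skipped (small) draws and for shaders without NGG culling support.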
*/ 6468 uint32_t nggc_settings = nggc_supported && !skip 6469 ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted) 6470 : ngg_cull_none; 6471 6472 bool emit_viewport = nggc_settings && 6473 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT || 6474 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx || 6475 !cmd_buffer->state.last_nggc_settings); 6476 6477 if (emit_viewport) { 6478 /* Correction for inverted Y */ 6479 if (vp_y_inverted) { 6480 vp_scale[1] = -vp_scale[1]; 6481 vp_translate[1] = -vp_translate[1]; 6482 } 6483 6484 /* Correction for number of samples per pixel. */ 6485 for (unsigned i = 0; i < 2; ++i) { 6486 vp_scale[i] *= (float) pipeline->graphics.ms.num_samples; 6487 vp_translate[i] *= (float) pipeline->graphics.ms.num_samples; 6488 } 6489 6490 uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])}; 6491 const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx; 6492 assert(vp_sgpr_idx != -1); 6493 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4); 6494 radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4); 6495 } 6496 6497 bool emit_settings = nggc_supported && 6498 (cmd_buffer->state.last_nggc_settings != nggc_settings || 6499 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx); 6500 6501 /* This needs to be emitted when culling is turned on 6502 * and when it's already on but some settings change. 6503 */ 6504 if (emit_settings) { 6505 assert(nggc_sgpr_idx >= 0); 6506 radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings); 6507 } 6508 6509 /* These only need to be emitted when culling is turned on or off, 6510 * but not when it stays on and just some settings change. 6511 */ 6512 if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) { 6513 uint32_t rsrc2 = v->config.rsrc2; 6514 6515 if (!nggc_settings) { 6516 /* Allocate less LDS when culling is disabled. (But GS always needs it.) */ 6517 if (stage != MESA_SHADER_GEOMETRY) 6518 rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling); 6519 } 6520 6521 /* When the pipeline is dirty and not yet emitted, don't write it here 6522 * because radv_emit_graphics_pipeline will overwrite this register. 6523 */ 6524 if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) || 6525 cmd_buffer->state.emitted_pipeline == pipeline) { 6526 radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); 6527 } 6528 } 6529 6530 cmd_buffer->state.last_nggc_settings = nggc_settings; 6531 cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx; 6532} 6533 6534static void 6535radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 6536 bool pipeline_is_dirty) 6537{ 6538 bool late_scissor_emission; 6539 6540 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || 6541 cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) 6542 radv_emit_rbplus_state(cmd_buffer); 6543 6544 if (cmd_buffer->device->physical_device->use_ngg_culling && 6545 cmd_buffer->state.pipeline->graphics.is_ngg) 6546 radv_emit_ngg_culling_state(cmd_buffer, info); 6547 6548 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) 6549 radv_emit_graphics_pipeline(cmd_buffer); 6550 6551 /* This should be before the cmd_buffer->state.dirty is cleared 6552 * (excluding RADV_CMD_DIRTY_PIPELINE) and after 6553 * cmd_buffer->state.context_roll_without_scissor_emitted is set. 
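    * (radv_need_late_scissor_emission reads both of those fields, which is why
    * the ordering matters.)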
*/ 6554 late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info); 6555 6556 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 6557 radv_emit_framebuffer_state(cmd_buffer); 6558 6559 if (info->indexed) { 6560 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER) 6561 radv_emit_index_buffer(cmd_buffer, info->indirect); 6562 } else { 6563 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, 6564 * so the state must be re-emitted before the next indexed 6565 * draw. 6566 */ 6567 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 6568 cmd_buffer->state.last_index_type = -1; 6569 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 6570 } 6571 } 6572 6573 radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty); 6574 6575 radv_emit_draw_registers(cmd_buffer, info); 6576 6577 if (late_scissor_emission) 6578 radv_emit_scissor(cmd_buffer); 6579} 6580 6581/* MUST inline this function to avoid massive perf loss in drawoverhead */ 6582ALWAYS_INLINE static bool 6583radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount) 6584{ 6585 const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 6586 const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) && 6587 cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline; 6588 6589 ASSERTED const unsigned cdw_max = 6590 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1)); 6591 6592 if (likely(!info->indirect)) { 6593 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is 6594 * no workaround for indirect draws, but we can at least skip 6595 * direct draws. 6596 */ 6597 if (unlikely(!info->instance_count)) 6598 return false; 6599 6600 /* Handle count == 0. */ 6601 if (unlikely(!info->count && !info->strmout_buffer)) 6602 return false; 6603 } 6604 6605 /* Need to apply this workaround early as it can set flush flags. */ 6606 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 6607 radv_emit_fb_mip_change_flush(cmd_buffer); 6608 6609 /* Use optimal packet order based on whether we need to sync the 6610 * pipeline. 6611 */ 6612 if (cmd_buffer->state.flush_bits & 6613 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 6614 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 6615 /* If we have to wait for idle, set all states first, so that 6616 * all SET packets are processed in parallel with previous draw 6617 * calls. Then upload descriptors, set shader pointers, and 6618 * draw, and prefetch at the end. This ensures that the time 6619 * the CUs are idle is very short. (there are only SET_SH 6620 * packets between the wait and the draw) 6621 */ 6622 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 6623 si_emit_cache_flush(cmd_buffer); 6624 /* <-- CUs are idle here --> */ 6625 6626 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 6627 } else { 6628 /* If we don't wait for idle, start prefetches first, then set 6629 * states, and draw at the end. 6630 */ 6631 si_emit_cache_flush(cmd_buffer); 6632 6633 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 6634 /* Only prefetch the vertex shader and VBO descriptors 6635 * in order to start the draw as soon as possible. 
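       * The remaining prefetches are issued after the draw has been started,
       * in radv_after_draw().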
6636 */ 6637 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true); 6638 } 6639 6640 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 6641 6642 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 6643 } 6644 6645 radv_describe_draw(cmd_buffer); 6646 if (likely(!info->indirect)) { 6647 struct radv_cmd_state *state = &cmd_buffer->state; 6648 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6649 assert(state->pipeline->graphics.vtx_base_sgpr); 6650 if (state->last_num_instances != info->instance_count) { 6651 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false)); 6652 radeon_emit(cs, info->instance_count); 6653 state->last_num_instances = info->instance_count; 6654 } 6655 } 6656 assert(cmd_buffer->cs->cdw <= cdw_max); 6657 6658 return true; 6659} 6660 6661static void 6662radv_after_draw(struct radv_cmd_buffer *cmd_buffer) 6663{ 6664 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info; 6665 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 6666 /* Start prefetches after the draw has been started. Both will 6667 * run in parallel, but starting the draw first is more 6668 * important. 6669 */ 6670 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 6671 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false); 6672 } 6673 6674 /* Workaround for a VGT hang when streamout is enabled. 6675 * It must be done after drawing. 6676 */ 6677 if (cmd_buffer->state.streamout.streamout_enabled && 6678 (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA || 6679 rad_info->family == CHIP_FIJI)) { 6680 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC; 6681 } 6682 6683 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH); 6684} 6685 6686void 6687radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, 6688 uint32_t firstVertex, uint32_t firstInstance) 6689{ 6690 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6691 struct radv_draw_info info; 6692 6693 info.count = vertexCount; 6694 info.instance_count = instanceCount; 6695 info.first_instance = firstInstance; 6696 info.strmout_buffer = NULL; 6697 info.indirect = NULL; 6698 info.indexed = false; 6699 6700 if (!radv_before_draw(cmd_buffer, &info, 1)) 6701 return; 6702 const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount }; 6703 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0); 6704 radv_after_draw(cmd_buffer); 6705} 6706 6707void 6708radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo, 6709 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride) 6710{ 6711 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6712 struct radv_draw_info info; 6713 6714 if (!drawCount) 6715 return; 6716 6717 info.count = pVertexInfo->vertexCount; 6718 info.instance_count = instanceCount; 6719 info.first_instance = firstInstance; 6720 info.strmout_buffer = NULL; 6721 info.indirect = NULL; 6722 info.indexed = false; 6723 6724 if (!radv_before_draw(cmd_buffer, &info, drawCount)) 6725 return; 6726 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride); 6727 radv_after_draw(cmd_buffer); 6728} 6729 6730void 6731radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, 6732 uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) 6733{ 6734 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, 
commandBuffer); 6735 struct radv_draw_info info; 6736 6737 info.indexed = true; 6738 info.count = indexCount; 6739 info.instance_count = instanceCount; 6740 info.first_instance = firstInstance; 6741 info.strmout_buffer = NULL; 6742 info.indirect = NULL; 6743 6744 if (!radv_before_draw(cmd_buffer, &info, 1)) 6745 return; 6746 const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset }; 6747 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL); 6748 radv_after_draw(cmd_buffer); 6749} 6750 6751void radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo, 6752 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset) 6753{ 6754 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6755 struct radv_draw_info info; 6756 6757 if (!drawCount) 6758 return; 6759 6760 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo; 6761 info.indexed = true; 6762 info.count = minfo->indexCount; 6763 info.instance_count = instanceCount; 6764 info.first_instance = firstInstance; 6765 info.strmout_buffer = NULL; 6766 info.indirect = NULL; 6767 6768 if (!radv_before_draw(cmd_buffer, &info, drawCount)) 6769 return; 6770 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset); 6771 radv_after_draw(cmd_buffer); 6772} 6773 6774void 6775radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 6776 uint32_t drawCount, uint32_t stride) 6777{ 6778 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6779 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 6780 struct radv_draw_info info; 6781 6782 info.count = drawCount; 6783 info.indirect = buffer; 6784 info.indirect_offset = offset; 6785 info.stride = stride; 6786 info.strmout_buffer = NULL; 6787 info.count_buffer = NULL; 6788 info.indexed = false; 6789 info.instance_count = 0; 6790 6791 if (!radv_before_draw(cmd_buffer, &info, 1)) 6792 return; 6793 radv_emit_indirect_draw_packets(cmd_buffer, &info); 6794 radv_after_draw(cmd_buffer); 6795} 6796 6797void 6798radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 6799 uint32_t drawCount, uint32_t stride) 6800{ 6801 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6802 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 6803 struct radv_draw_info info; 6804 6805 info.indexed = true; 6806 info.count = drawCount; 6807 info.indirect = buffer; 6808 info.indirect_offset = offset; 6809 info.stride = stride; 6810 info.count_buffer = NULL; 6811 info.strmout_buffer = NULL; 6812 info.instance_count = 0; 6813 6814 if (!radv_before_draw(cmd_buffer, &info, 1)) 6815 return; 6816 radv_emit_indirect_draw_packets(cmd_buffer, &info); 6817 radv_after_draw(cmd_buffer); 6818} 6819 6820void 6821radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 6822 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, 6823 uint32_t maxDrawCount, uint32_t stride) 6824{ 6825 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6826 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 6827 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 6828 struct radv_draw_info info; 6829 6830 info.count = maxDrawCount; 6831 info.indirect = buffer; 6832 info.indirect_offset = offset; 6833 info.count_buffer = count_buffer; 6834 info.count_buffer_offset = countBufferOffset; 6835 info.stride = stride; 6836 info.strmout_buffer = NULL; 6837 
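   /* info.count holds maxDrawCount here; the actual draw count is read by the
    * CP from the count buffer (COUNT_INDIRECT_ENABLE in the indirect multi
    * packet emitted by radv_cs_emit_indirect_draw_packet above). */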
info.indexed = false; 6838 info.instance_count = 0; 6839 6840 if (!radv_before_draw(cmd_buffer, &info, 1)) 6841 return; 6842 radv_emit_indirect_draw_packets(cmd_buffer, &info); 6843 radv_after_draw(cmd_buffer); 6844} 6845 6846void 6847radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, 6848 VkDeviceSize offset, VkBuffer _countBuffer, 6849 VkDeviceSize countBufferOffset, uint32_t maxDrawCount, 6850 uint32_t stride) 6851{ 6852 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6853 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 6854 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 6855 struct radv_draw_info info; 6856 6857 info.indexed = true; 6858 info.count = maxDrawCount; 6859 info.indirect = buffer; 6860 info.indirect_offset = offset; 6861 info.count_buffer = count_buffer; 6862 info.count_buffer_offset = countBufferOffset; 6863 info.stride = stride; 6864 info.strmout_buffer = NULL; 6865 info.instance_count = 0; 6866 6867 if (!radv_before_draw(cmd_buffer, &info, 1)) 6868 return; 6869 radv_emit_indirect_draw_packets(cmd_buffer, &info); 6870 radv_after_draw(cmd_buffer); 6871} 6872 6873struct radv_dispatch_info { 6874 /** 6875 * Determine the layout of the grid (in block units) to be used. 6876 */ 6877 uint32_t blocks[3]; 6878 6879 /** 6880 * A starting offset for the grid. If unaligned is set, the offset 6881 * must still be aligned. 6882 */ 6883 uint32_t offsets[3]; 6884 /** 6885 * Whether it's an unaligned compute dispatch. 6886 */ 6887 bool unaligned; 6888 6889 /** 6890 * Indirect compute parameters resource. 6891 */ 6892 struct radeon_winsys_bo *indirect; 6893 uint64_t va; 6894}; 6895 6896static void 6897radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, 6898 const struct radv_dispatch_info *info) 6899{ 6900 struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; 6901 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator; 6902 struct radeon_winsys *ws = cmd_buffer->device->ws; 6903 bool predicating = cmd_buffer->state.predicating; 6904 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6905 struct radv_userdata_info *loc; 6906 6907 radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]); 6908 6909 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); 6910 6911 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25); 6912 6913 if (compute_shader->info.wave_size == 32) { 6914 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); 6915 dispatch_initiator |= S_00B800_CS_W32_EN(1); 6916 } 6917 6918 if (info->indirect) { 6919 radv_cs_add_buffer(ws, cs, info->indirect); 6920 6921 if (loc->sgpr_idx != -1) { 6922 for (unsigned i = 0; i < 3; ++i) { 6923 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 6924 radeon_emit(cs, 6925 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG)); 6926 radeon_emit(cs, (info->va + 4 * i)); 6927 radeon_emit(cs, (info->va + 4 * i) >> 32); 6928 radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); 6929 radeon_emit(cs, 0); 6930 } 6931 } 6932 6933 if (radv_cmd_buffer_uses_mec(cmd_buffer)) { 6934 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1)); 6935 radeon_emit(cs, info->va); 6936 radeon_emit(cs, info->va >> 32); 6937 radeon_emit(cs, dispatch_initiator); 6938 } else { 6939 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); 6940 radeon_emit(cs, 1); 6941 radeon_emit(cs, 
info->va); 6942 radeon_emit(cs, info->va >> 32); 6943 6944 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1)); 6945 radeon_emit(cs, 0); 6946 radeon_emit(cs, dispatch_initiator); 6947 } 6948 } else { 6949 unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]}; 6950 unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]}; 6951 6952 if (info->unaligned) { 6953 unsigned *cs_block_size = compute_shader->info.cs.block_size; 6954 unsigned remainder[3]; 6955 6956 /* If aligned, these should be an entire block size, 6957 * not 0. 6958 */ 6959 remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]); 6960 remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]); 6961 remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]); 6962 6963 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); 6964 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]); 6965 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]); 6966 6967 for (unsigned i = 0; i < 3; ++i) { 6968 assert(offsets[i] % cs_block_size[i] == 0); 6969 offsets[i] /= cs_block_size[i]; 6970 } 6971 6972 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); 6973 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | 6974 S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); 6975 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | 6976 S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); 6977 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | 6978 S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); 6979 6980 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); 6981 } 6982 6983 if (loc->sgpr_idx != -1) { 6984 assert(loc->num_sgprs == 3); 6985 6986 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); 6987 radeon_emit(cs, blocks[0]); 6988 radeon_emit(cs, blocks[1]); 6989 radeon_emit(cs, blocks[2]); 6990 } 6991 6992 if (offsets[0] || offsets[1] || offsets[2]) { 6993 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); 6994 radeon_emit(cs, offsets[0]); 6995 radeon_emit(cs, offsets[1]); 6996 radeon_emit(cs, offsets[2]); 6997 6998 /* The blocks in the packet are not counts but end values. */ 6999 for (unsigned i = 0; i < 3; ++i) 7000 blocks[i] += offsets[i]; 7001 } else { 7002 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1); 7003 } 7004 7005 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1)); 7006 radeon_emit(cs, blocks[0]); 7007 radeon_emit(cs, blocks[1]); 7008 radeon_emit(cs, blocks[2]); 7009 radeon_emit(cs, dispatch_initiator); 7010 } 7011 7012 assert(cmd_buffer->cs->cdw <= cdw_max); 7013} 7014 7015static void 7016radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, 7017 struct radv_pipeline *pipeline, 7018 VkPipelineBindPoint bind_point) 7019{ 7020 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point); 7021 radv_flush_constants(cmd_buffer, 7022 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 7023 ? 
RADV_RT_STAGE_BITS 7024 : VK_SHADER_STAGE_COMPUTE_BIT, 7025 pipeline, bind_point); 7026} 7027 7028static void 7029radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info, 7030 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 7031{ 7032 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 7033 bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline; 7034 bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug && 7035 info->blocks[0] * info->blocks[1] * info->blocks[2] > 256; 7036 7037 if (cs_regalloc_hang) 7038 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 7039 RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 7040 7041 if (cmd_buffer->state.flush_bits & 7042 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 7043 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 7044 /* If we have to wait for idle, set all states first, so that 7045 * all SET packets are processed in parallel with previous draw 7046 * calls. Then upload descriptors, set shader pointers, and 7047 * dispatch, and prefetch at the end. This ensures that the 7048 * time the CUs are idle is very short. (there are only SET_SH 7049 * packets between the wait and the draw) 7050 */ 7051 radv_emit_compute_pipeline(cmd_buffer, pipeline); 7052 si_emit_cache_flush(cmd_buffer); 7053 /* <-- CUs are idle here --> */ 7054 7055 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 7056 7057 radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 7058 /* <-- CUs are busy here --> */ 7059 7060 /* Start prefetches after the dispatch has been started. Both 7061 * will run in parallel, but starting the dispatch first is 7062 * more important. 7063 */ 7064 if (has_prefetch && pipeline_is_dirty) { 7065 radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]); 7066 } 7067 } else { 7068 /* If we don't wait for idle, start prefetches first, then set 7069 * states, and dispatch at the end. 7070 */ 7071 si_emit_cache_flush(cmd_buffer); 7072 7073 if (has_prefetch && pipeline_is_dirty) { 7074 radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]); 7075 } 7076 7077 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 7078 7079 radv_emit_compute_pipeline(cmd_buffer, pipeline); 7080 radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 7081 } 7082 7083 if (pipeline_is_dirty) { 7084 /* Raytracing uses compute shaders but has separate bind points and pipelines. 7085 * So if we set compute userdata & shader registers we should dirty the raytracing 7086 * ones and the other way around. 7087 * 7088 * We only need to do this when the pipeline is dirty because when we switch between 7089 * the two we always need to switch pipelines. 7090 */ 7091 radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE 7092 ? 
VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 7093 : VK_PIPELINE_BIND_POINT_COMPUTE); 7094 } 7095 7096 if (cs_regalloc_hang) 7097 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 7098 7099 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH); 7100} 7101 7102static void 7103radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) 7104{ 7105 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, 7106 VK_PIPELINE_BIND_POINT_COMPUTE); 7107} 7108 7109void 7110radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, 7111 uint32_t base_z, uint32_t x, uint32_t y, uint32_t z) 7112{ 7113 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 7114 struct radv_dispatch_info info = {0}; 7115 7116 info.blocks[0] = x; 7117 info.blocks[1] = y; 7118 info.blocks[2] = z; 7119 7120 info.offsets[0] = base_x; 7121 info.offsets[1] = base_y; 7122 info.offsets[2] = base_z; 7123 radv_compute_dispatch(cmd_buffer, &info); 7124} 7125 7126void 7127radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) 7128{ 7129 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z); 7130} 7131 7132void 7133radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset) 7134{ 7135 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 7136 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 7137 struct radv_dispatch_info info = {0}; 7138 7139 info.indirect = buffer->bo; 7140 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 7141 7142 radv_compute_dispatch(cmd_buffer, &info); 7143} 7144 7145void 7146radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z) 7147{ 7148 struct radv_dispatch_info info = {0}; 7149 7150 info.blocks[0] = x; 7151 info.blocks[1] = y; 7152 info.blocks[2] = z; 7153 info.unaligned = 1; 7154 7155 radv_compute_dispatch(cmd_buffer, &info); 7156} 7157 7158void 7159radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va) 7160{ 7161 struct radv_dispatch_info info = {0}; 7162 7163 info.indirect = bo; 7164 info.va = va; 7165 7166 radv_compute_dispatch(cmd_buffer, &info); 7167} 7168 7169static void 7170radv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) 7171{ 7172 radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline, 7173 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); 7174} 7175 7176static bool 7177radv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer, 7178 const VkStridedDeviceAddressRegionKHR *tables) 7179{ 7180 struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline; 7181 uint32_t base_reg; 7182 void *ptr; 7183 uint32_t *desc_ptr; 7184 uint32_t offset; 7185 7186 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr)) 7187 return false; 7188 7189 desc_ptr = ptr; 7190 for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) { 7191 desc_ptr[0] = tables[i].deviceAddress; 7192 desc_ptr[1] = tables[i].deviceAddress >> 32; 7193 desc_ptr[2] = tables[i].stride; 7194 desc_ptr[3] = 0; 7195 } 7196 7197 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; 7198 struct radv_userdata_info *loc = 7199 radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS); 7200 if (loc->sgpr_idx == -1) 7201 return true; 7202 7203 base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE]; 7204 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, 7205 
false); 7206 return true; 7207} 7208 7209void 7210radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, 7211 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, 7212 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, 7213 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, 7214 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, 7215 uint32_t width, uint32_t height, uint32_t depth) 7216{ 7217 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 7218 struct radv_dispatch_info info = {0}; 7219 7220 info.blocks[0] = width; 7221 info.blocks[1] = height; 7222 info.blocks[2] = depth; 7223 info.unaligned = 1; 7224 7225 const VkStridedDeviceAddressRegionKHR tables[] = { 7226 *pRaygenShaderBindingTable, 7227 *pMissShaderBindingTable, 7228 *pHitShaderBindingTable, 7229 *pCallableShaderBindingTable, 7230 }; 7231 7232 if (!radv_rt_bind_tables(cmd_buffer, tables)) { 7233 return; 7234 } 7235 7236 struct radv_userdata_info *loc = radv_lookup_user_sgpr( 7237 cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE); 7238 7239 if (loc->sgpr_idx != -1) { 7240 assert(loc->num_sgprs == 3); 7241 7242 radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); 7243 radeon_emit(cmd_buffer->cs, width); 7244 radeon_emit(cmd_buffer->cs, height); 7245 radeon_emit(cmd_buffer->cs, depth); 7246 } 7247 7248 radv_rt_dispatch(cmd_buffer, &info); 7249} 7250 7251static void 7252radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size) 7253{ 7254 unsigned wave_size = 0; 7255 unsigned scratch_bytes_per_wave = 0; 7256 7257 if (cmd_buffer->state.rt_pipeline) { 7258 scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave; 7259 wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size; 7260 } 7261 7262 /* The hardware register is specified as a multiple of 256 DWORDS. 
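    * 256 dwords = 1024 bytes, hence the 1024-byte alignment below. "size" is the
    * per-invocation stack size in bytes, so multiplying by the wave size gives
    * the extra scratch bytes needed per wave. Illustrative numbers only: with
    * size = 1000 and wave_size = 32 this adds align(32000, 1024) = 32768 bytes.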
    */
   scratch_bytes_per_wave += align(size * wave_size, 1024);

   cmd_buffer->compute_scratch_size_per_wave_needed =
      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
}

void
radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_set_rt_stack_size(cmd_buffer, size);
   cmd_buffer->state.rt_stack_size = size;
}

void
radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
{
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);

   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.subpass_sample_locs = NULL;
}

void
radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);

   radv_cmd_buffer_end_subpass(cmd_buffer);

   radv_cmd_buffer_end_render_pass(cmd_buffer);
}

/*
 * For HTILE we have the following interesting clear words:
 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
 * 0xfffffff0: Clear depth to 1.0
 * 0x00000000: Clear depth to 0.0
 */
static void
radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                      const VkImageSubresourceRange *range)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
   VkClearDepthStencilValue value = {0};
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
    * in considering previous rendering work for WAW hazards. */
   state->flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);

   if (image->planes[0].surface.has_stencil &&
       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
      /* Flush caches before performing a separate aspect initialization because it's a
       * read-modify-write operation.
       */
      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
   }

   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);

   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);

   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      /* Initialize the TC-compat metadata value to 0 because by
       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
       * have to conditionally update its value when performing
       * a fast depth clear.
7346 */ 7347 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0); 7348 } 7349} 7350 7351static void 7352radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7353 VkImageLayout src_layout, bool src_render_loop, 7354 VkImageLayout dst_layout, bool dst_render_loop, 7355 unsigned src_queue_mask, unsigned dst_queue_mask, 7356 const VkImageSubresourceRange *range, 7357 struct radv_sample_locations_state *sample_locs) 7358{ 7359 struct radv_device *device = cmd_buffer->device; 7360 7361 if (!radv_htile_enabled(image, range->baseMipLevel)) 7362 return; 7363 7364 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 7365 radv_initialize_htile(cmd_buffer, image, range); 7366 } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop, 7367 src_queue_mask) && 7368 radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop, 7369 dst_queue_mask)) { 7370 radv_initialize_htile(cmd_buffer, image, range); 7371 } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop, 7372 src_queue_mask) && 7373 !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop, 7374 dst_queue_mask)) { 7375 cmd_buffer->state.flush_bits |= 7376 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 7377 7378 radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs); 7379 7380 cmd_buffer->state.flush_bits |= 7381 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 7382 } 7383} 7384 7385static uint32_t 7386radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7387 const VkImageSubresourceRange *range, uint32_t value) 7388{ 7389 struct radv_barrier_data barrier = {0}; 7390 7391 barrier.layout_transitions.init_mask_ram = 1; 7392 radv_describe_layout_transition(cmd_buffer, &barrier); 7393 7394 return radv_clear_cmask(cmd_buffer, image, range, value); 7395} 7396 7397uint32_t 7398radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7399 const VkImageSubresourceRange *range) 7400{ 7401 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210}; 7402 uint32_t log2_samples = util_logbase2(image->info.samples); 7403 uint32_t value = fmask_clear_values[log2_samples]; 7404 struct radv_barrier_data barrier = {0}; 7405 7406 barrier.layout_transitions.init_mask_ram = 1; 7407 radv_describe_layout_transition(cmd_buffer, &barrier); 7408 7409 return radv_clear_fmask(cmd_buffer, image, range, value); 7410} 7411 7412uint32_t 7413radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7414 const VkImageSubresourceRange *range, uint32_t value) 7415{ 7416 struct radv_barrier_data barrier = {0}; 7417 uint32_t flush_bits = 0; 7418 unsigned size = 0; 7419 7420 barrier.layout_transitions.init_mask_ram = 1; 7421 radv_describe_layout_transition(cmd_buffer, &barrier); 7422 7423 flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value); 7424 7425 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) { 7426 /* When DCC is enabled with mipmaps, some levels might not 7427 * support fast clears and we have to initialize them as "fully 7428 * expanded". 7429 */ 7430 /* Compute the size of all fast clearable DCC levels. 
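       * The levels are walked in order and the first level with a zero
       * dcc_slice_fast_clear_size ends the fast-clearable range, so after the
       * loop "size" covers exactly the fast-clearable DCC bytes; everything
       * past it is filled with 0xffffffff ("fully expanded") below.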
       */
      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
         unsigned dcc_fast_clear_size =
            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;

         if (!dcc_fast_clear_size)
            break;

         size = dcc_level->dcc_offset + dcc_fast_clear_size;
      }

      /* Initialize the mipmap levels without DCC. */
      if (size != image->planes[0].surface.meta_size) {
         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
                                        image->offset + image->planes[0].surface.meta_offset + size,
                                        image->planes[0].surface.meta_size - size, 0xffffffff);
      }
   }

   return flush_bits;
}

/**
 * Initialize DCC/FMASK/CMASK metadata for a color image.
 */
static void
radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                               VkImageLayout src_layout, bool src_render_loop,
                               VkImageLayout dst_layout, bool dst_render_loop,
                               unsigned src_queue_mask, unsigned dst_queue_mask,
                               const VkImageSubresourceRange *range)
{
   uint32_t flush_bits = 0;

   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
    * consistent in considering previous rendering work for WAW hazards.
    */
   cmd_buffer->state.flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);

   if (radv_image_has_cmask(image)) {
      uint32_t value;

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* TODO: Fix clearing CMASK layers on GFX9. */
         if (radv_image_is_tc_compat_cmask(image) ||
             (radv_image_has_fmask(image) &&
              radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
                                         dst_render_loop, dst_queue_mask))) {
            value = 0xccccccccu;
         } else {
            value = 0xffffffffu;
         }
      } else {
         static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
         uint32_t log2_samples = util_logbase2(image->info.samples);

         value = cmask_clear_values[log2_samples];
      }

      flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
   }

   if (radv_image_has_fmask(image)) {
      flush_bits |= radv_init_fmask(cmd_buffer, image, range);
   }

   if (radv_dcc_enabled(image, range->baseMipLevel)) {
      uint32_t value = 0xffffffffu; /* Fully expanded mode.
*/ 7500 7501 if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 7502 dst_layout, dst_render_loop, dst_queue_mask)) { 7503 value = 0u; 7504 } 7505 7506 flush_bits |= radv_init_dcc(cmd_buffer, image, range, value); 7507 } 7508 7509 if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) { 7510 radv_update_fce_metadata(cmd_buffer, image, range, false); 7511 7512 uint32_t color_values[2] = {0}; 7513 radv_set_color_clear_metadata(cmd_buffer, image, range, color_values); 7514 } 7515 7516 cmd_buffer->state.flush_bits |= flush_bits; 7517} 7518 7519static void 7520radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7521 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask) 7522{ 7523 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR && 7524 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || 7525 (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN)))) 7526 radv_retile_dcc(cmd_buffer, image); 7527} 7528 7529static bool 7530radv_image_need_retile(const struct radv_image *image) 7531{ 7532 return image->planes[0].surface.display_dcc_offset && 7533 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset; 7534} 7535 7536/** 7537 * Handle color image transitions for DCC/FMASK/CMASK. 7538 */ 7539static void 7540radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7541 VkImageLayout src_layout, bool src_render_loop, 7542 VkImageLayout dst_layout, bool dst_render_loop, 7543 unsigned src_queue_mask, unsigned dst_queue_mask, 7544 const VkImageSubresourceRange *range) 7545{ 7546 bool dcc_decompressed = false, fast_clear_flushed = false; 7547 7548 if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && 7549 !radv_dcc_enabled(image, range->baseMipLevel)) 7550 return; 7551 7552 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 7553 radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 7554 dst_render_loop, src_queue_mask, dst_queue_mask, range); 7555 7556 if (radv_image_need_retile(image)) 7557 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 7558 return; 7559 } 7560 7561 if (radv_dcc_enabled(image, range->baseMipLevel)) { 7562 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { 7563 cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu); 7564 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 7565 src_layout, src_render_loop, src_queue_mask) && 7566 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 7567 dst_layout, dst_render_loop, dst_queue_mask)) { 7568 radv_decompress_dcc(cmd_buffer, image, range); 7569 dcc_decompressed = true; 7570 } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 7571 src_layout, src_render_loop, src_queue_mask) && 7572 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 7573 dst_layout, dst_render_loop, dst_queue_mask)) { 7574 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 7575 fast_clear_flushed = true; 7576 } 7577 7578 if (radv_image_need_retile(image)) 7579 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 7580 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) { 7581 if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 7582 src_layout, src_render_loop, src_queue_mask) && 7583 
!radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 7584 dst_layout, dst_render_loop, dst_queue_mask)) { 7585 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 7586 fast_clear_flushed = true; 7587 } 7588 } 7589 7590 /* MSAA color decompress. */ 7591 if (radv_image_has_fmask(image) && 7592 (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) && 7593 radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) && 7594 !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) { 7595 if (radv_dcc_enabled(image, range->baseMipLevel) && 7596 !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) { 7597 /* A DCC decompress is required before expanding FMASK 7598 * when DCC stores aren't supported to avoid being in 7599 * a state where DCC is compressed and the main 7600 * surface is uncompressed. 7601 */ 7602 radv_decompress_dcc(cmd_buffer, image, range); 7603 } else if (!fast_clear_flushed) { 7604 /* A FMASK decompress is required before expanding 7605 * FMASK. 7606 */ 7607 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 7608 } 7609 7610 struct radv_barrier_data barrier = {0}; 7611 barrier.layout_transitions.fmask_color_expand = 1; 7612 radv_describe_layout_transition(cmd_buffer, &barrier); 7613 7614 radv_expand_fmask_image_inplace(cmd_buffer, image, range); 7615 } 7616} 7617 7618static void 7619radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 7620 VkImageLayout src_layout, bool src_render_loop, 7621 VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family, 7622 uint32_t dst_family, const VkImageSubresourceRange *range, 7623 struct radv_sample_locations_state *sample_locs) 7624{ 7625 if (image->exclusive && src_family != dst_family) { 7626 /* This is an acquire or a release operation and there will be 7627 * a corresponding release/acquire. Do the transition in the 7628 * most flexible queue. 
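       * ("Most flexible" matches the early returns below: never on the transfer
       * queue, and not on the compute queue when the other side is the general
       * queue, so the most capable queue involved ends up doing the transition.)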
*/ 7629 7630 assert(src_family == cmd_buffer->queue_family_index || 7631 dst_family == cmd_buffer->queue_family_index); 7632 7633 if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT) 7634 return; 7635 7636 if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) 7637 return; 7638 7639 if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE && 7640 (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL)) 7641 return; 7642 } 7643 7644 unsigned src_queue_mask = 7645 radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index); 7646 unsigned dst_queue_mask = 7647 radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index); 7648 7649 if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask) 7650 return; 7651 7652 if (vk_format_has_depth(image->vk_format)) { 7653 radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 7654 dst_render_loop, src_queue_mask, dst_queue_mask, range, 7655 sample_locs); 7656 } else { 7657 radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 7658 dst_render_loop, src_queue_mask, dst_queue_mask, range); 7659 } 7660} 7661 7662struct radv_barrier_info { 7663 enum rgp_barrier_reason reason; 7664 uint32_t eventCount; 7665 const VkEvent *pEvents; 7666 VkPipelineStageFlags srcStageMask; 7667 VkPipelineStageFlags dstStageMask; 7668}; 7669 7670static void 7671radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount, 7672 const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, 7673 const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, 7674 const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info) 7675{ 7676 struct radeon_cmdbuf *cs = cmd_buffer->cs; 7677 enum radv_cmd_flush_bits src_flush_bits = 0; 7678 enum radv_cmd_flush_bits dst_flush_bits = 0; 7679 7680 if (cmd_buffer->state.subpass) 7681 radv_mark_noncoherent_rb(cmd_buffer); 7682 7683 radv_describe_barrier_start(cmd_buffer, info->reason); 7684 7685 for (unsigned i = 0; i < info->eventCount; ++i) { 7686 RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]); 7687 uint64_t va = radv_buffer_get_va(event->bo); 7688 7689 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 7690 7691 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); 7692 7693 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff); 7694 assert(cmd_buffer->cs->cdw <= cdw_max); 7695 } 7696 7697 for (uint32_t i = 0; i < memoryBarrierCount; i++) { 7698 src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL); 7699 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL); 7700 } 7701 7702 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) { 7703 src_flush_bits |= 7704 radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL); 7705 dst_flush_bits |= 7706 radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL); 7707 } 7708 7709 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { 7710 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); 7711 7712 src_flush_bits |= 7713 radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image); 7714 dst_flush_bits |= 7715 radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image); 7716 } 7717 7718 /* The 
Vulkan spec 1.1.98 says: 7719 * 7720 * "An execution dependency with only 7721 * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask 7722 * will only prevent that stage from executing in subsequently 7723 * submitted commands. As this stage does not perform any actual 7724 * execution, this is not observable - in effect, it does not delay 7725 * processing of subsequent commands. Similarly an execution dependency 7726 * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask 7727 * will effectively not wait for any prior commands to complete." 7728 */ 7729 if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT) 7730 radv_stage_flush(cmd_buffer, info->srcStageMask); 7731 cmd_buffer->state.flush_bits |= src_flush_bits; 7732 7733 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { 7734 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); 7735 7736 const struct VkSampleLocationsInfoEXT *sample_locs_info = 7737 vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT); 7738 struct radv_sample_locations_state sample_locations = {0}; 7739 7740 if (sample_locs_info) { 7741 assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT); 7742 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel; 7743 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize; 7744 sample_locations.count = sample_locs_info->sampleLocationsCount; 7745 typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations, 7746 sample_locs_info->sampleLocationsCount); 7747 } 7748 7749 radv_handle_image_transition( 7750 cmd_buffer, image, pImageMemoryBarriers[i].oldLayout, 7751 false, /* Outside of a renderpass we are never in a renderloop */ 7752 pImageMemoryBarriers[i].newLayout, 7753 false, /* Outside of a renderpass we are never in a renderloop */ 7754 pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex, 7755 &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL); 7756 } 7757 7758 /* Make sure CP DMA is idle because the driver might have performed a 7759 * DMA operation for copying or filling buffers/images. 
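    * This is only needed when the source stages include TRANSFER or
    * BOTTOM_OF_PIPE, which is what the check below tests for.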
7760 */ 7761 if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)) 7762 si_cp_dma_wait_for_idle(cmd_buffer); 7763 7764 cmd_buffer->state.flush_bits |= dst_flush_bits; 7765 7766 radv_describe_barrier_end(cmd_buffer); 7767} 7768 7769void 7770radv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, 7771 VkPipelineStageFlags destStageMask, VkBool32 byRegion, 7772 uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, 7773 uint32_t bufferMemoryBarrierCount, 7774 const VkBufferMemoryBarrier *pBufferMemoryBarriers, 7775 uint32_t imageMemoryBarrierCount, 7776 const VkImageMemoryBarrier *pImageMemoryBarriers) 7777{ 7778 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 7779 struct radv_barrier_info info; 7780 7781 info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER; 7782 info.eventCount = 0; 7783 info.pEvents = NULL; 7784 info.srcStageMask = srcStageMask; 7785 info.dstStageMask = destStageMask; 7786 7787 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, 7788 pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info); 7789} 7790 7791static void 7792write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, 7793 VkPipelineStageFlags stageMask, unsigned value) 7794{ 7795 struct radeon_cmdbuf *cs = cmd_buffer->cs; 7796 uint64_t va = radv_buffer_get_va(event->bo); 7797 7798 si_emit_cache_flush(cmd_buffer); 7799 7800 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 7801 7802 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28); 7803 7804 /* Flags that only require a top-of-pipe event. */ 7805 VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; 7806 7807 /* Flags that only require a post-index-fetch event. */ 7808 VkPipelineStageFlags post_index_fetch_flags = 7809 top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; 7810 7811 /* Flags that only require signaling post PS. */ 7812 VkPipelineStageFlags post_ps_flags = 7813 post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 7814 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 7815 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 7816 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT | 7817 VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | 7818 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; 7819 7820 /* Flags that only require signaling post CS. */ 7821 VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; 7822 7823 /* Make sure CP DMA is idle because the driver might have performed a 7824 * DMA operation for copying or filling buffers/images. 7825 */ 7826 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)) 7827 si_cp_dma_wait_for_idle(cmd_buffer); 7828 7829 if (!(stageMask & ~top_of_pipe_flags)) { 7830 /* Just need to sync the PFP engine. */ 7831 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 7832 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 7833 radeon_emit(cs, va); 7834 radeon_emit(cs, va >> 32); 7835 radeon_emit(cs, value); 7836 } else if (!(stageMask & ~post_index_fetch_flags)) { 7837 /* Sync ME because PFP reads index and indirect buffers. 
       */
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, value);
   } else {
      unsigned event_type;

      if (!(stageMask & ~post_ps_flags)) {
         /* Sync previous fragment shaders. */
         event_type = V_028A90_PS_DONE;
      } else if (!(stageMask & ~post_cs_flags)) {
         /* Sync previous compute shaders. */
         event_type = V_028A90_CS_DONE;
      } else {
         /* Otherwise, sync all prior GPU work. */
         event_type = V_028A90_BOTTOM_OF_PIPE_TS;
      }

      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

void
radv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_event, event, _event);

   write_event(cmd_buffer, event, stageMask, 1);
}

void
radv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_event, event, _event);

   write_event(cmd_buffer, event, stageMask, 0);
}

void
radv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
                   VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
                   uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
                   uint32_t bufferMemoryBarrierCount,
                   const VkBufferMemoryBarrier *pBufferMemoryBarriers,
                   uint32_t imageMemoryBarrierCount,
                   const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_barrier_info info;

   info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS;
   info.eventCount = eventCount;
   info.pEvents = pEvents;
   info.srcStageMask = 0;
   info.dstStageMask = dstStageMask;

   radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}

void
radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
{
   /* No-op */
}

/* VK_EXT_conditional_rendering */
void
radv_CmdBeginConditionalRenderingEXT(
   VkCommandBuffer commandBuffer,
   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned pred_op = PREDICATION_OP_BOOL32;
   bool draw_visible = true;
   uint64_t va;

   va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;

   /* By default, if the 32-bit value at offset in buffer memory is zero,
    * then the rendering commands are discarded, otherwise they are
    * executed as normal. If the inverted flag is set, all commands are
    * discarded if the value is non-zero.
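    * The draw_visible flag below encodes which of the two behaviours is
    * requested; a typical use is predicating draws on a visibility or
    * query result that the application wrote into this buffer.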
7930 */ 7931 if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) { 7932 draw_visible = false; 7933 } 7934 7935 si_emit_cache_flush(cmd_buffer); 7936 7937 if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL && 7938 !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) { 7939 uint64_t pred_value = 0, pred_va; 7940 unsigned pred_offset; 7941 7942 /* From the Vulkan spec 1.1.107: 7943 * 7944 * "If the 32-bit value at offset in buffer memory is zero, 7945 * then the rendering commands are discarded, otherwise they 7946 * are executed as normal. If the value of the predicate in 7947 * buffer memory changes while conditional rendering is 7948 * active, the rendering commands may be discarded in an 7949 * implementation-dependent way. Some implementations may 7950 * latch the value of the predicate upon beginning conditional 7951 * rendering while others may read it before every rendering 7952 * command." 7953 * 7954 * But, the AMD hardware treats the predicate as a 64-bit 7955 * value which means we need a workaround in the driver. 7956 * Luckily, it's not required to support if the value changes 7957 * when predication is active. 7958 * 7959 * The workaround is as follows: 7960 * 1) allocate a 64-value in the upload BO and initialize it 7961 * to 0 7962 * 2) copy the 32-bit predicate value to the upload BO 7963 * 3) use the new allocated VA address for predication 7964 * 7965 * Based on the conditionalrender demo, it's faster to do the 7966 * COPY_DATA in ME (+ sync PFP) instead of PFP. 7967 */ 7968 radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset); 7969 7970 pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset; 7971 7972 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 7973 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7974 COPY_DATA_WR_CONFIRM); 7975 radeon_emit(cs, va); 7976 radeon_emit(cs, va >> 32); 7977 radeon_emit(cs, pred_va); 7978 radeon_emit(cs, pred_va >> 32); 7979 7980 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 7981 radeon_emit(cs, 0); 7982 7983 va = pred_va; 7984 pred_op = PREDICATION_OP_BOOL64; 7985 } 7986 7987 /* Enable predication for this command buffer. */ 7988 si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va); 7989 cmd_buffer->state.predicating = true; 7990 7991 /* Store conditional rendering user info. */ 7992 cmd_buffer->state.predication_type = draw_visible; 7993 cmd_buffer->state.predication_op = pred_op; 7994 cmd_buffer->state.predication_va = va; 7995} 7996 7997void 7998radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) 7999{ 8000 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8001 8002 /* Disable predication for this command buffer. */ 8003 si_emit_set_predication_state(cmd_buffer, false, 0, 0); 8004 cmd_buffer->state.predicating = false; 8005 8006 /* Reset conditional rendering user info. 
*/ 8007 cmd_buffer->state.predication_type = -1; 8008 cmd_buffer->state.predication_op = 0; 8009 cmd_buffer->state.predication_va = 0; 8010} 8011 8012/* VK_EXT_transform_feedback */ 8013void 8014radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, 8015 uint32_t bindingCount, const VkBuffer *pBuffers, 8016 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes) 8017{ 8018 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8019 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 8020 uint8_t enabled_mask = 0; 8021 8022 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS); 8023 for (uint32_t i = 0; i < bindingCount; i++) { 8024 uint32_t idx = firstBinding + i; 8025 8026 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]); 8027 sb[idx].offset = pOffsets[i]; 8028 8029 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) { 8030 sb[idx].size = sb[idx].buffer->size - sb[idx].offset; 8031 } else { 8032 sb[idx].size = pSizes[i]; 8033 } 8034 8035 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo); 8036 8037 enabled_mask |= 1 << idx; 8038 } 8039 8040 cmd_buffer->state.streamout.enabled_mask |= enabled_mask; 8041 8042 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; 8043} 8044 8045static void 8046radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer) 8047{ 8048 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 8049 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8050 8051 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2); 8052 radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) | 8053 S_028B94_STREAMOUT_1_EN(so->streamout_enabled) | 8054 S_028B94_STREAMOUT_2_EN(so->streamout_enabled) | 8055 S_028B94_STREAMOUT_3_EN(so->streamout_enabled)); 8056 radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask); 8057 8058 cmd_buffer->state.context_roll_without_scissor_emitted = true; 8059} 8060 8061static void 8062radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable) 8063{ 8064 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 8065 bool old_streamout_enabled = so->streamout_enabled; 8066 uint32_t old_hw_enabled_mask = so->hw_enabled_mask; 8067 8068 so->streamout_enabled = enable; 8069 8070 so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | 8071 (so->enabled_mask << 12); 8072 8073 if (!cmd_buffer->device->physical_device->use_ngg_streamout && 8074 ((old_streamout_enabled != so->streamout_enabled) || 8075 (old_hw_enabled_mask != so->hw_enabled_mask))) 8076 radv_emit_streamout_enable(cmd_buffer); 8077 8078 if (cmd_buffer->device->physical_device->use_ngg_streamout) { 8079 cmd_buffer->gds_needed = true; 8080 cmd_buffer->gds_oa_needed = true; 8081 } 8082} 8083 8084static void 8085radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) 8086{ 8087 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8088 unsigned reg_strmout_cntl; 8089 8090 /* The register is at different places on different ASICs. 
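    * GFX7 and newer expose CP_STRMOUT_CNTL in the UCONFIG range
    * (R_0300FC), older chips in the CONFIG range (R_0084FC), hence the
    * two code paths below.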
*/ 8091 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 8092 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 8093 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); 8094 } else { 8095 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; 8096 radeon_set_config_reg(cs, reg_strmout_cntl, 0); 8097 } 8098 8099 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 8100 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); 8101 8102 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); 8103 radeon_emit(cs, 8104 WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ 8105 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ 8106 radeon_emit(cs, 0); 8107 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ 8108 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ 8109 radeon_emit(cs, 4); /* poll interval */ 8110} 8111 8112static void 8113radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 8114 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 8115 const VkDeviceSize *pCounterBufferOffsets) 8116 8117{ 8118 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 8119 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 8120 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8121 8122 radv_flush_vgt_streamout(cmd_buffer); 8123 8124 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 8125 u_foreach_bit(i, so->enabled_mask) 8126 { 8127 int32_t counter_buffer_idx = i - firstCounterBuffer; 8128 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 8129 counter_buffer_idx = -1; 8130 8131 /* AMD GCN binds streamout buffers as shader resources. 8132 * VGT only counts primitives and tells the shader through 8133 * SGPRs what to do. 8134 */ 8135 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); 8136 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */ 8137 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */ 8138 8139 cmd_buffer->state.context_roll_without_scissor_emitted = true; 8140 8141 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 8142 /* The array of counter buffers is optional. */ 8143 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 8144 uint64_t va = radv_buffer_get_va(buffer->bo); 8145 uint64_t counter_buffer_offset = 0; 8146 8147 if (pCounterBufferOffsets) 8148 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 8149 8150 va += buffer->offset + counter_buffer_offset; 8151 8152 /* Append */ 8153 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 8154 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 8155 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ 8156 radeon_emit(cs, 0); /* unused */ 8157 radeon_emit(cs, 0); /* unused */ 8158 radeon_emit(cs, va); /* src address lo */ 8159 radeon_emit(cs, va >> 32); /* src address hi */ 8160 8161 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 8162 } else { 8163 /* Start from the beginning. 
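          * i.e. the buffer offset is taken from this packet (zero) via
          * STRMOUT_OFFSET_FROM_PACKET instead of being loaded from a
          * counter buffer in memory as in the append path above.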
*/ 8164 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 8165 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 8166 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ 8167 radeon_emit(cs, 0); /* unused */ 8168 radeon_emit(cs, 0); /* unused */ 8169 radeon_emit(cs, 0); /* unused */ 8170 radeon_emit(cs, 0); /* unused */ 8171 } 8172 } 8173 8174 radv_set_streamout_enable(cmd_buffer, true); 8175} 8176 8177static void 8178gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 8179 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 8180 const VkDeviceSize *pCounterBufferOffsets) 8181{ 8182 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 8183 unsigned last_target = util_last_bit(so->enabled_mask) - 1; 8184 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8185 8186 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); 8187 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 8188 8189 /* Sync because the next streamout operation will overwrite GDS and we 8190 * have to make sure it's idle. 8191 * TODO: Improve by tracking if there is a streamout operation in 8192 * flight. 8193 */ 8194 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; 8195 si_emit_cache_flush(cmd_buffer); 8196 8197 u_foreach_bit(i, so->enabled_mask) 8198 { 8199 int32_t counter_buffer_idx = i - firstCounterBuffer; 8200 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 8201 counter_buffer_idx = -1; 8202 8203 bool append = 8204 counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]; 8205 uint64_t va = 0; 8206 8207 if (append) { 8208 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 8209 uint64_t counter_buffer_offset = 0; 8210 8211 if (pCounterBufferOffsets) 8212 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 8213 8214 va += radv_buffer_get_va(buffer->bo); 8215 va += buffer->offset + counter_buffer_offset; 8216 8217 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 8218 } 8219 8220 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); 8221 radeon_emit(cs, S_411_SRC_SEL(append ? 
                          V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   radv_set_streamout_enable(cmd_buffer, true);
}

void
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                  const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
                                 pCounterBuffers, pCounterBufferOffsets);
   } else {
      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                                pCounterBufferOffsets);
   }
}

static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);       /* dst address lo */
         radeon_emit(cs, va >> 32); /* dst address hi */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is no buffer bound. This ensures
       * that the primitives-emitted query won't increment.
8293 */ 8294 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); 8295 8296 cmd_buffer->state.context_roll_without_scissor_emitted = true; 8297 } 8298 8299 radv_set_streamout_enable(cmd_buffer, false); 8300} 8301 8302static void 8303gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 8304 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 8305 const VkDeviceSize *pCounterBufferOffsets) 8306{ 8307 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 8308 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8309 8310 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); 8311 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 8312 8313 u_foreach_bit(i, so->enabled_mask) 8314 { 8315 int32_t counter_buffer_idx = i - firstCounterBuffer; 8316 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 8317 counter_buffer_idx = -1; 8318 8319 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 8320 /* The array of counters buffer is optional. */ 8321 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 8322 uint64_t va = radv_buffer_get_va(buffer->bo); 8323 uint64_t counter_buffer_offset = 0; 8324 8325 if (pCounterBufferOffsets) 8326 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 8327 8328 va += buffer->offset + counter_buffer_offset; 8329 8330 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class, 8331 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0, 8332 EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0); 8333 8334 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 8335 } 8336 } 8337 8338 radv_set_streamout_enable(cmd_buffer, false); 8339} 8340 8341void 8342radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, 8343 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 8344 const VkDeviceSize *pCounterBufferOffsets) 8345{ 8346 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8347 8348 if (cmd_buffer->device->physical_device->use_ngg_streamout) { 8349 gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers, 8350 pCounterBufferOffsets); 8351 } else { 8352 radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers, 8353 pCounterBufferOffsets); 8354 } 8355} 8356 8357void 8358radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, 8359 uint32_t firstInstance, VkBuffer _counterBuffer, 8360 VkDeviceSize counterBufferOffset, uint32_t counterOffset, 8361 uint32_t vertexStride) 8362{ 8363 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8364 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer); 8365 struct radv_draw_info info; 8366 8367 info.count = 0; 8368 info.instance_count = instanceCount; 8369 info.first_instance = firstInstance; 8370 info.strmout_buffer = counterBuffer; 8371 info.strmout_buffer_offset = counterBufferOffset; 8372 info.stride = vertexStride; 8373 info.indexed = false; 8374 info.indirect = NULL; 8375 8376 if (!radv_before_draw(cmd_buffer, &info, 1)) 8377 return; 8378 struct VkMultiDrawInfoEXT minfo = { 0, 0 }; 8379 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0); 8380 radv_after_draw(cmd_buffer); 8381} 8382 8383/* VK_AMD_buffer_marker */ 8384void 8385radv_CmdWriteBufferMarkerAMD(VkCommandBuffer 
commandBuffer, VkPipelineStageFlagBits pipelineStage, 8386 VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker) 8387{ 8388 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8389 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer); 8390 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8391 uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset; 8392 8393 si_emit_cache_flush(cmd_buffer); 8394 8395 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12); 8396 8397 if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) { 8398 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 8399 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 8400 COPY_DATA_WR_CONFIRM); 8401 radeon_emit(cs, marker); 8402 radeon_emit(cs, 0); 8403 radeon_emit(cs, va); 8404 radeon_emit(cs, va >> 32); 8405 } else { 8406 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class, 8407 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 8408 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker, 8409 cmd_buffer->gfx9_eop_bug_va); 8410 } 8411 8412 assert(cmd_buffer->cs->cdw <= cdw_max); 8413} 8414
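
/* Illustrative application-side usage of VK_AMD_buffer_marker (not driver
 * code; "cmd" and "marker_buf" are hypothetical handles). Markers written
 * with the top-of-pipe stage take the immediate COPY_DATA path above, while
 * any other stage goes through the bottom-of-pipe EOP event, e.g.:
 *
 *    vkCmdWriteBufferMarkerAMD(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
 *                              marker_buf, 0 * sizeof(uint32_t), 1);
 *    ... draws / dispatches ...
 *    vkCmdWriteBufferMarkerAMD(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
 *                              marker_buf, 1 * sizeof(uint32_t), 2);
 */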