1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based in part on anv driver which is: 6 * Copyright © 2015 Intel Corporation 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 
 */

#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "radv_cs.h"
#include "sid.h"
#include "gfx9d.h"
#include "vk_format.h"
#include "radv_debug.h"
#include "radv_meta.h"

#include "ac_debug.h"

/* Bitmask of pipeline resources that still have to be prefetched into the
 * L2 cache; consumed by radv_emit_prefetch_L2().
 */
enum {
	RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
	RADV_PREFETCH_VS = (1 << 1),
	RADV_PREFETCH_TCS = (1 << 2),
	RADV_PREFETCH_TES = (1 << 3),
	RADV_PREFETCH_GS = (1 << 4),
	RADV_PREFETCH_PS = (1 << 5),
	RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS |
				 RADV_PREFETCH_TCS |
				 RADV_PREFETCH_TES |
				 RADV_PREFETCH_GS |
				 RADV_PREFETCH_PS)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
					 struct radv_image *image,
					 VkImageLayout src_layout,
					 VkImageLayout dst_layout,
					 uint32_t src_family,
					 uint32_t dst_family,
					 const VkImageSubresourceRange *range);

/* Default values for every piece of Vulkan dynamic state. */
const struct radv_dynamic_state default_dynamic_state = {
	.viewport = {
		.count = 0,
	},
	.scissor = {
		.count = 0,
	},
	.line_width = 1.0f,
	.depth_bias = {
		.bias = 0.0f,
		.clamp = 0.0f,
		.slope = 0.0f,
	},
	.blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
	.depth_bounds = {
		.min = 0.0f,
		.max = 1.0f,
	},
	.stencil_compare_mask = {
		.front = ~0u,
		.back = ~0u,
	},
	.stencil_write_mask = {
		.front = ~0u,
		.back = ~0u,
	},
	.stencil_reference = {
		.front = 0u,
		.back = 0u,
	},
};

/* Copy the dynamic-state fields selected by src->mask into the command
 * buffer, setting a dirty bit only for fields whose value actually changed
 * (each field is memcmp'd/compared first to avoid redundant re-emission).
 */
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
			const struct radv_dynamic_state *src)
{
	struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
	uint32_t copy_mask = src->mask;
	uint32_t dest_mask = 0;

	/* Make sure to copy the number of viewports/scissors because they can
	 * only be specified at pipeline creation time.
	 */
	dest->viewport.count = src->viewport.count;
	dest->scissor.count = src->scissor.count;
	dest->discard_rectangle.count = src->discard_rectangle.count;

	if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
		if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
			   src->viewport.count * sizeof(VkViewport))) {
			typed_memcpy(dest->viewport.viewports,
				     src->viewport.viewports,
				     src->viewport.count);
			dest_mask |= RADV_DYNAMIC_VIEWPORT;
		}
	}

	if (copy_mask & RADV_DYNAMIC_SCISSOR) {
		if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
			   src->scissor.count * sizeof(VkRect2D))) {
			typed_memcpy(dest->scissor.scissors,
				     src->scissor.scissors, src->scissor.count);
			dest_mask |= RADV_DYNAMIC_SCISSOR;
		}
	}

	if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
		if (dest->line_width != src->line_width) {
			dest->line_width = src->line_width;
			dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
		if (memcmp(&dest->depth_bias, &src->depth_bias,
			   sizeof(src->depth_bias))) {
			dest->depth_bias = src->depth_bias;
			dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
		if (memcmp(&dest->blend_constants, &src->blend_constants,
			   sizeof(src->blend_constants))) {
			typed_memcpy(dest->blend_constants,
				     src->blend_constants, 4);
			dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
		if (memcmp(&dest->depth_bounds, &src->depth_bounds,
			   sizeof(src->depth_bounds))) {
			dest->depth_bounds = src->depth_bounds;
			dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
		if (memcmp(&dest->stencil_compare_mask,
			   &src->stencil_compare_mask,
			   sizeof(src->stencil_compare_mask))) {
			dest->stencil_compare_mask = src->stencil_compare_mask;
			dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
		if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
			   sizeof(src->stencil_write_mask))) {
			dest->stencil_write_mask = src->stencil_write_mask;
			dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
		if (memcmp(&dest->stencil_reference, &src->stencil_reference,
			   sizeof(src->stencil_reference))) {
			dest->stencil_reference = src->stencil_reference;
			dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
		if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
			   src->discard_rectangle.count * sizeof(VkRect2D))) {
			typed_memcpy(dest->discard_rectangle.rectangles,
				     src->discard_rectangle.rectangles,
				     src->discard_rectangle.count);
			dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
		}
	}

	cmd_buffer->state.dirty |= dest_mask;
}

/* Capture the streamout strides and the enabled-buffer mask from the
 * pipeline's streamout shader into the command-buffer streamout state.
 * No-op when the pipeline has no streamout shader.
 */
static void
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
			  struct radv_pipeline *pipeline)
{
	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
	struct radv_shader_info *info;

	if (!pipeline->streamout_shader)
		return;

	info = &pipeline->streamout_shader->info.info;
	for (int i = 0; i < MAX_SO_BUFFERS; i++)
		so->stride_in_dw[i] = info->so.strides[i];

	so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
}

/* True when this command buffer targets the compute queue on CIK or newer,
 * i.e. it is executed by the MEC microengine.
 */
bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
	return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
	       cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
}

/* Map a RADV queue family index to the winsys ring it submits to. */
enum ring_type radv_queue_family_to_ring(int f) {
	switch (f) {
	case RADV_QUEUE_GENERAL:
		return RING_GFX;
	case RADV_QUEUE_COMPUTE:
		return RING_COMPUTE;
	case RADV_QUEUE_TRANSFER:
		return RING_DMA;
	default:
		unreachable("Unknown queue family");
	}
}

/* Allocate and minimally initialize a command buffer: link it into the
 * pool (if any), create its winsys command stream for the right ring, and
 * init the upload-BO list.  Returns VK_ERROR_OUT_OF_HOST_MEMORY on
 * allocation or CS-creation failure.
 */
static VkResult radv_create_cmd_buffer(
	struct radv_device *                         device,
	struct radv_cmd_pool *                       pool,
	VkCommandBufferLevel                         level,
	VkCommandBuffer*                             pCommandBuffer)
{
	struct radv_cmd_buffer *cmd_buffer;
	unsigned ring;
	/* NOTE(review): pool is dereferenced here but tested for NULL below —
	 * confirm whether a NULL pool can actually reach this function.
	 */
	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if (cmd_buffer == NULL)
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

	cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
	cmd_buffer->device = device;
	cmd_buffer->pool = pool;
	cmd_buffer->level = level;

	if (pool) {
		list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
		cmd_buffer->queue_family_index = pool->queue_family_index;

	} else {
		/* Init the pool_link so we can safely call list_del when we destroy
		 * the command buffer
		 */
		list_inithead(&cmd_buffer->pool_link);
		cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
	}

	ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
	if (!cmd_buffer->cs) {
		vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
	}

	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

	list_inithead(&cmd_buffer->upload.list);

	return VK_SUCCESS;
}

/* Tear down a command buffer: unlink from the pool, destroy all retired
 * and current upload BOs, destroy the CS, free per-bind-point push
 * descriptor storage, then free the struct itself.
 */
static void
radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
{
	list_del(&cmd_buffer->pool_link);

	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
				 &cmd_buffer->upload.list, list) {
		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
		list_del(&up->list);
		free(up);
	}

	if (cmd_buffer->upload.upload_bo)
		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
		free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);

	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

/* Reset the command buffer to the INITIAL state: reset the CS, release
 * retired upload BOs (keeping the current one), clear all recorded sizing
 * needs and descriptor state, and on GFX9 general queues pre-allocate the
 * cache-flush fence and EOP-bug scratch areas from the upload buffer.
 */
static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
	cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
				 &cmd_buffer->upload.list, list) {
		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
		list_del(&up->list);
		free(up);
	}

	cmd_buffer->push_constant_stages = 0;
	cmd_buffer->scratch_size_needed = 0;
	cmd_buffer->compute_scratch_size_needed = 0;
	cmd_buffer->esgs_ring_size_needed = 0;
	cmd_buffer->gsvs_ring_size_needed = 0;
	cmd_buffer->tess_rings_needed = false;
	cmd_buffer->sample_positions_needed = false;

	if (cmd_buffer->upload.upload_bo)
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   cmd_buffer->upload.upload_bo);
	cmd_buffer->upload.offset = 0;

	cmd_buffer->record_result = VK_SUCCESS;

	memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
		cmd_buffer->descriptors[i].dirty = 0;
		cmd_buffer->descriptors[i].valid = 0;
		cmd_buffer->descriptors[i].push_dirty = false;
	}

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
	    cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
		unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
		unsigned fence_offset, eop_bug_offset;
		void *fence_ptr;

		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
					     &fence_ptr);

		cmd_buffer->gfx9_fence_va =
			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		cmd_buffer->gfx9_fence_va += fence_offset;

		/* Allocate a buffer for the EOP bug on GFX9. */
		radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
					     &eop_bug_offset, &fence_ptr);
		cmd_buffer->gfx9_eop_bug_va =
			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
	}

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

	return cmd_buffer->record_result;
}

/* Replace the upload BO with a larger one (at least min_needed, at least
 * 16 KiB, and at least double the current size).  The old BO, if any, is
 * kept alive on the upload.list until destroy/reset because previously
 * emitted packets may still reference it.  Returns false (and sets
 * record_result) on allocation or map failure.
 */
static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
				  uint64_t min_needed)
{
	uint64_t new_size;
	struct radeon_winsys_bo *bo;
	struct radv_cmd_buffer_upload *upload;
	struct radv_device *device = cmd_buffer->device;

	new_size = MAX2(min_needed, 16 * 1024);
	new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

	bo = device->ws->buffer_create(device->ws,
				       new_size, 4096,
				       RADEON_DOMAIN_GTT,
				       RADEON_FLAG_CPU_ACCESS|
				       RADEON_FLAG_NO_INTERPROCESS_SHARING |
				       RADEON_FLAG_32BIT,
				       RADV_BO_PRIORITY_UPLOAD_BUFFER);

	if (!bo) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
		return false;
	}

	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
	if (cmd_buffer->upload.upload_bo) {
		upload = malloc(sizeof(*upload));

		if (!upload) {
			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
			device->ws->buffer_destroy(bo);
			return false;
		}

		/* Park the old upload state on the list; it is freed when the
		 * command buffer is reset or destroyed.
		 */
		memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
		list_add(&upload->list, &cmd_buffer->upload.list);
	}

	cmd_buffer->upload.upload_bo = bo;
	cmd_buffer->upload.size = new_size;
	cmd_buffer->upload.offset = 0;
	cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

	if (!cmd_buffer->upload.map) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
		return false;
	}

	return true;
}

/* Sub-allocate `size` bytes (aligned to `alignment`, which must be a
 * power of two) from the upload BO, growing it if necessary.  On success
 * returns the BO-relative offset and a CPU pointer to the allocation.
 */
bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
			     unsigned size,
			     unsigned alignment,
			     unsigned *out_offset,
			     void **ptr)
{
	assert(util_is_power_of_two_nonzero(alignment));

	uint64_t offset = align(cmd_buffer->upload.offset, alignment);
	if (offset + size > cmd_buffer->upload.size) {
		/* Resize replaces the BO and resets the write cursor. */
		if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
			return false;
		offset = 0;
	}

	*out_offset = offset;
	*ptr = cmd_buffer->upload.map + offset;

	cmd_buffer->upload.offset = offset + size;
	return true;
}

/* Copy `data` into a fresh upload-buffer sub-allocation and return its
 * BO-relative offset.  Returns false if the allocation failed.
 */
bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
			    unsigned size, unsigned alignment,
			    const void *data, unsigned *out_offset)
{
	uint8_t *ptr;

	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
					  out_offset, (void **)&ptr))
		return false;

	if (ptr)
		memcpy(ptr, data, size);

	return true;
}

/* Emit a PKT3_WRITE_DATA packet that stores `count` dwords of `data` to
 * GPU memory at `va` via the ME engine, with write confirmation.
 */
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
			    unsigned count, const uint32_t *data)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;

	radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
		    S_370_WR_CONFIRM(1) |
		    S_370_ENGINE_SEL(V_370_ME));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit_array(cs, data, count);
}

/* Write an incrementing trace id to the trace BO (secondary command
 * buffers use a separate slot at +4) and emit a NOP trace-point marker
 * so hangs can be attributed to a command-stream position.
 */
void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_device *device = cmd_buffer->device;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va;

	va = radv_buffer_get_va(device->trace_bo);
	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
		va += 4;

	++cmd_buffer->state.trace_id;
	radv_emit_write_data_packet(cmd_buffer, va, 1,
				    &cmd_buffer->state.trace_id);

	radeon_check_space(cmd_buffer->device->ws, cs, 2);

	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
	radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

/* Post-draw/dispatch hook: with RADV_DEBUG_SYNC_SHADERS, force the engine
 * idle via a cache flush; with a trace BO, emit a trace point.
 */
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
			   enum radv_cmd_flush_bits flags)
{
	if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
		assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
				RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

		/* Force wait for graphics or compute engines to be idle. */
		si_cs_emit_cache_flush(cmd_buffer->cs,
				       cmd_buffer->device->physical_device->rad_info.chip_class,
				       &cmd_buffer->gfx9_fence_idx,
				       cmd_buffer->gfx9_fence_va,
				       radv_cmd_buffer_uses_mec(cmd_buffer),
				       flags, cmd_buffer->gfx9_eop_bug_va);
	}

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_cmd_buffer_trace_emit(cmd_buffer);
}

/* Record the bound pipeline's CPU pointer into the trace BO (GFX slot at
 * +8, compute slot at +16) for post-mortem hang debugging.
 */
static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
		   struct radv_pipeline *pipeline, enum ring_type ring)
{
	struct radv_device *device = cmd_buffer->device;
	uint32_t data[2];
	uint64_t va;

	va = radv_buffer_get_va(device->trace_bo);

	switch (ring) {
	case RING_GFX:
		va += 8;
		break;
	case RING_COMPUTE:
		va += 16;
		break;
	default:
		assert(!"invalid ring type");
	}

	data[0] = (uintptr_t)pipeline;
	data[1] = (uintptr_t)pipeline >> 32;

	radv_emit_write_data_packet(cmd_buffer, va, 2, data);
}

/* Bind descriptor set `set` at slot `idx` for the given bind point and
 * mark it both valid and dirty so it gets (re-)emitted.
 */
void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
			     VkPipelineBindPoint bind_point,
			     struct radv_descriptor_set *set,
			     unsigned idx)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);

	descriptors_state->sets[idx] = set;

	descriptors_state->valid |= (1u << idx); /* active descriptors */
	descriptors_state->dirty |= (1u << idx);
}

/* Record the CPU pointers of all valid descriptor sets into the trace BO
 * (at offset +24) for post-mortem hang debugging.
 */
static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
		      VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_device *device = cmd_buffer->device;
	uint32_t data[MAX_SETS * 2] = {};
	uint64_t va;
	unsigned i;
	va = radv_buffer_get_va(device->trace_bo) + 24;

	for_each_bit(i, descriptors_state->valid) {
		struct radv_descriptor_set *set = descriptors_state->sets[i];
		data[i * 2] = (uint64_t)(uintptr_t)set;
		data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
	}

	radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
}

/* Return the user-SGPR location entry `idx` for the shader running
 * `stage` in this pipeline.
 */
struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
		      gl_shader_stage stage,
		      int idx)
{
	struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
	return &shader->info.user_sgprs_locs.shader_data[idx];
}

/* Write a 64-bit address into the single user SGPR assigned to `idx` for
 * `stage`; no-op when the shader has no SGPR assigned for it.
 */
static void
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
			   struct radv_pipeline *pipeline,
			   gl_shader_stage stage,
			   int idx, uint64_t va)
{
	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
	uint32_t base_reg = pipeline->user_data_0[stage];
	if (loc->sgpr_idx == -1)
		return;

	assert(loc->num_sgprs == 1);

	radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
				 base_reg + loc->sgpr_idx * 4, va, false);
}

/* Emit shader pointers for every descriptor set that is enabled by the
 * stage, dirty, and valid, batching consecutive set indices into single
 * register-sequence writes.
 */
static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_pipeline *pipeline,
			      struct radv_descriptor_state *descriptors_state,
			      gl_shader_stage stage)
{
	struct radv_device *device = cmd_buffer->device;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint32_t sh_base = pipeline->user_data_0[stage];
	struct radv_userdata_locations *locs =
		&pipeline->shaders[stage]->info.user_sgprs_locs;
	unsigned mask = locs->descriptor_sets_enabled;

	mask &= descriptors_state->dirty & descriptors_state->valid;

	while (mask) {
		int start, count;

		u_bit_scan_consecutive_range(&mask, &start, &count);

		struct radv_userdata_info *loc = &locs->descriptor_sets[start];
		unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

		radv_emit_shader_pointer_head(cs, sh_offset, count, true);
		for (int i = 0; i < count; i++) {
			struct radv_descriptor_set *set =
				descriptors_state->sets[start + i];

			radv_emit_shader_pointer_body(device, cs, set->va, true);
		}
	}
}

/* Write `count` inline push-constant dwords into the consecutive user
 * SGPRs assigned to `idx` for `stage`; no-op when no SGPR is assigned.
 */
static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
			     struct radv_pipeline *pipeline,
			     gl_shader_stage stage,
			     int idx, int count, uint32_t *values)
{
	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
	uint32_t base_reg = pipeline->user_data_0[stage];
	if (loc->sgpr_idx == -1)
		return;

	assert(loc->num_sgprs == count);

	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
	radeon_emit_array(cmd_buffer->cs, values, count);
}

/* Emit MSAA registers and sample locations for the bound pipeline,
 * skipping the emission when the sample count did not change from the
 * previously emitted pipeline.
 */
static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_pipeline *pipeline)
{
	int num_samples = pipeline->graphics.ms.num_samples;
	struct radv_multisample_state *ms = &pipeline->graphics.ms;
	struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions)
		cmd_buffer->sample_positions_needed = true;

	if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
		return;

	radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
	radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);

	radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);

	radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);

	/* GFX9: Flush DFSM when the AA mode changes. */
	if (cmd_buffer->device->dfsm_allowed) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
	}

	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/* Prefetch a shader's code into L2 via CP DMA; no-op for a NULL shader. */
static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
			  struct radv_shader_variant *shader)
{
	uint64_t va;

	if (!shader)
		return;

	va = radv_buffer_get_va(shader->bo) + shader->bo_offset;

	si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

/* Prefetch pending pipeline resources (shaders, VBO descriptors) into L2
 * and clear the satisfied bits from state->prefetch_L2_mask.  With
 * vertex_stage_only, only the VS and the vertex-buffer descriptors are
 * prefetched so the first draw can start as early as possible.
 */
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
		      struct radv_pipeline *pipeline,
		      bool vertex_stage_only)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	uint32_t mask = state->prefetch_L2_mask;

	if (vertex_stage_only) {
		/* Fast prefetch path for starting draws as soon as possible. */
		mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
						  RADV_PREFETCH_VBO_DESCRIPTORS);
	}

	if (mask & RADV_PREFETCH_VS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_VERTEX]);

	if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
		si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);

	if (mask & RADV_PREFETCH_TCS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_TESS_CTRL]);

	if (mask & RADV_PREFETCH_TES)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_TESS_EVAL]);

	if (mask & RADV_PREFETCH_GS) {
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_GEOMETRY]);
		radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
	}

	if (mask & RADV_PREFETCH_PS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_FRAGMENT]);

	state->prefetch_L2_mask &= ~mask;
}

/* Program the RB+ (render-backend plus) downconvert/blend-optimization
 * registers from the current subpass attachments and pipeline export
 * formats; no-op when the device does not allow RB+.
 */
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
	if (!cmd_buffer->device->physical_device->rbplus_allowed)
		return;

	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;

	unsigned sx_ps_downconvert = 0;
	unsigned sx_blend_opt_epsilon = 0;
	unsigned sx_blend_opt_control = 0;

	/* Each color target contributes a 4-bit field at (i * 4) in the
	 * three SX registers.
	 */
	for (unsigned i = 0; i < subpass->color_count; ++i) {
		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
			continue;
		}

		int idx = subpass->color_attachments[i].attachment;
		struct radv_color_buffer_info *cb = &framebuffer->attachments[idx].cb;

		unsigned format = G_028C70_FORMAT(cb->cb_color_info);
		unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
		uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
		uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;

		bool has_alpha, has_rgb;

		/* Set if RGB and A are present. */
		has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);

		if (format == V_028C70_COLOR_8 ||
		    format == V_028C70_COLOR_16 ||
		    format == V_028C70_COLOR_32)
			has_rgb = !has_alpha;
		else
			has_rgb = true;

		/* Check the colormask and export format. */
		if (!(colormask & 0x7))
			has_rgb = false;
		if (!(colormask & 0x8))
			has_alpha = false;

		if (spi_format == V_028714_SPI_SHADER_ZERO) {
			has_rgb = false;
			has_alpha = false;
		}

		/* Disable value checking for disabled channels. */
		if (!has_rgb)
			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
		if (!has_alpha)
			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

		/* Enable down-conversion for 32bpp and smaller formats. */
		switch (format) {
		case V_028C70_COLOR_8:
		case V_028C70_COLOR_8_8:
		case V_028C70_COLOR_8_8_8_8:
			/* For 1 and 2-channel formats, use the superset thereof. */
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_5_6_5:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_1_5_5_5:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_4_4_4_4:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_32:
			if (swap == V_028C70_SWAP_STD &&
			    spi_format == V_028714_SPI_SHADER_32_R)
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
			else if (swap == V_028C70_SWAP_ALT_REV &&
				 spi_format == V_028714_SPI_SHADER_32_AR)
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
			break;

		case V_028C70_COLOR_16:
		case V_028C70_COLOR_16_16:
			/* For 1-channel formats, use the superset thereof. */
			if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
				if (swap == V_028C70_SWAP_STD ||
				    swap == V_028C70_SWAP_STD_REV)
					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
				else
					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
			}
			break;

		case V_028C70_COLOR_10_11_11:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_2_10_10_10:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
			}
			break;
		}
	}

	/* Disable the optimization for all MRT slots beyond the subpass. */
	for (unsigned i = subpass->color_count; i < 8; ++i) {
		sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
		sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
	}
	/* TODO: avoid redundantly setting context registers */
	radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
	radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
	radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
	radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/* Emit the bound graphics pipeline's pre-built register state, update
 * scratch requirements, add shader BOs to the CS, and clear the pipeline
 * dirty flag.  No-op if the same pipeline is already emitted.
 */
static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;

	if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
		return;

	radv_update_multisample_state(cmd_buffer, pipeline);

	cmd_buffer->scratch_size_needed =
		MAX2(cmd_buffer->scratch_size_needed,
		     pipeline->max_waves * pipeline->scratch_bytes_per_wave);
	/* Guardband usage affects scissor programming, so a change forces a
	 * scissor re-emit.
	 */
	if (!cmd_buffer->state.emitted_pipeline ||
	    cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
	    pipeline->graphics.can_use_guardband)
		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

	/* Only re-emit the context-register portion when it differs from
	 * what the previous pipeline emitted (size, hash, then full compare).
	 */
	if (!cmd_buffer->state.emitted_pipeline ||
	    cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
	    cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
	    memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
	           pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
		radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
		cmd_buffer->state.context_roll_without_scissor_emitted = true;
	}

	for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
		if (!pipeline->shaders[i])
			continue;

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   pipeline->shaders[i]->bo);
	}

	if (radv_pipeline_has_gs(pipeline))
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   pipeline->gs_copy_shader->bo);

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);

	cmd_buffer->state.emitted_pipeline = pipeline;

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
}

/* Emit the current dynamic viewports. */
static void
radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
{
	si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
			  cmd_buffer->state.dynamic.viewport.viewports);
}

/* Emit the current dynamic scissors (clamped against the viewports). */
static void
radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
{
	uint32_t count = cmd_buffer->state.dynamic.scissor.count;

	si_write_scissors(cmd_buffer->cs, 0, count,
			  cmd_buffer->state.dynamic.scissor.scissors,
			  cmd_buffer->state.dynamic.viewport.viewports,
			  cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);

	/* Scissors were just emitted, so any earlier context roll is now
	 * covered.
	 */
	cmd_buffer->state.context_roll_without_scissor_emitted = false;
}

/* Emit the PA_SC_CLIPRECT registers for VK_EXT_discard_rectangles. */
static void
radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
{
	if (!cmd_buffer->state.dynamic.discard_rectangle.count)
		return;

	radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
				   cmd_buffer->state.dynamic.discard_rectangle.count * 2);
	for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
		VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
		radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
		radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
					    S_028214_BR_Y(rect.offset.y + rect.extent.height));
	}
}

/* Emit the dynamic line width; hardware units are 1/8th of a pixel. */
static void
radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
{
	unsigned width = cmd_buffer->state.dynamic.line_width * 8;

	radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
			       S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
}

/* Emit the four dynamic blend constants (R, G, B, A as float bits). */
static void
radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

	radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
	radeon_emit(cmd_buffer->cs, (uint32_t *)d->blend_constants == NULL ? 0 : 0);
	radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
}

/* Emit front/back stencil reference, compare mask and write mask into the
 * DB_STENCILREFMASK pair.
 */
static void
radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

	radeon_set_context_reg_seq(cmd_buffer->cs,
				   R_028430_DB_STENCILREFMASK, 2);
	radeon_emit(cmd_buffer->cs,
		    S_028430_STENCILTESTVAL(d->stencil_reference.front) |
		    S_028430_STENCILMASK(d->stencil_compare_mask.front) |
		    S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
		    S_028430_STENCILOPVAL(1));
	radeon_emit(cmd_buffer->cs,
		    S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
		    S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
		    S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
		    S_028434_STENCILOPVAL_BF(1));
}

/* Emit the dynamic depth-bounds test limits (as raw float bits). */
static void
radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

	radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
			       fui(d->depth_bounds.min));
	radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
			       fui(d->depth_bounds.max));
}

/* Emit the dynamic depth-bias (polygon offset) state. The constant bias
 * is scaled by offset_scale, which depends on the bound depth format
 * (see the framebuffer emit path). The slope is scaled by 16, apparently
 * the register's fixed-point convention — TODO confirm.
 */
static void
radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
	unsigned slope = fui(d->depth_bias.slope * 16.0f);
	unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);


	/* Same scale/offset is applied to front- and back-facing polygons. */
	radeon_set_context_reg_seq(cmd_buffer->cs,
				   R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
	radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
	radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
	radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
	radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
	radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
}

/* Emit the color-buffer registers for one framebuffer attachment.
 *
 * index:  color attachment slot (0..7); each slot's registers are 0x3c
 *         bytes apart.
 * layout: the attachment's current image layout; DCC is disabled in
 *         CB_COLOR_INFO when the layout does not allow DCC compression.
 *
 * GFX9 uses a different register layout (64-bit bases split across
 * registers plus CB_MRT_EPITCH) than older generations.
 */
static void
radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
			 int index,
			 struct radv_attachment_info *att,
			 struct radv_image *image,
			 VkImageLayout layout)
{
	bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI;
	struct radv_color_buffer_info *cb = &att->cb;
	uint32_t cb_color_info = cb->cb_color_info;

	if (!radv_layout_dcc_compressed(image, layout,
					radv_image_queue_family_mask(image,
								     cmd_buffer->queue_family_index,
								     cmd_buffer->queue_family_index))) {
		cb_color_info &= C_028C70_DCC_ENABLE;
	}

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
		radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
		radeon_emit(cmd_buffer->cs, cb_color_info);
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
		radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
		radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
		radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));

		radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
				       cb->cb_mrt_epitch);
	} else {
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
		radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
		radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
		radeon_emit(cmd_buffer->cs, cb_color_info);
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);

		if (is_vi) { /* DCC BASE */
			radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
		}
	}

	if (radv_image_has_dcc(image)) {
		/* Drawing with DCC enabled also compresses colorbuffers. */
		radv_update_dcc_metadata(cmd_buffer, image, true);
	}
}

/* Re-emit DB_Z_INFO with the correct ZRANGE_PRECISION for TC-compatible
 * HTILE images (workaround for the TC-compat bug; only relevant when the
 * last depth clear was 0.0). The bit is cleared here and conditionally
 * re-set via metadata in memory.
 *
 * requires_cond_exec: wrap the register write in a COND_EXEC packet that
 * is predicated on the image's tc_compat_zrange metadata, for use when
 * the last fast-clear value is not known at record time.
 */
static void
radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
			     struct radv_ds_buffer_info *ds,
			     struct radv_image *image, VkImageLayout layout,
			     bool requires_cond_exec)
{
	uint32_t db_z_info = ds->db_z_info;
	uint32_t db_z_info_reg;

	if (!radv_image_is_tc_compat_htile(image))
		return;

	if (!radv_layout_has_htile(image, layout,
				   radv_image_queue_family_mask(image,
								cmd_buffer->queue_family_index,
								cmd_buffer->queue_family_index))) {
		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
	}

	db_z_info &= C_028040_ZRANGE_PRECISION;

	/* DB_Z_INFO lives at a different offset on GFX9. */
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		db_z_info_reg = R_028038_DB_Z_INFO;
	} else {
		db_z_info_reg = R_028040_DB_Z_INFO;
	}

	/* When we don't know the last fast clear value we need to emit a
	 * conditional packet that will eventually skip the following
	 * SET_CONTEXT_REG packet.
	 */
	if (requires_cond_exec) {
		uint64_t va = radv_buffer_get_va(image->bo);
		va += image->offset + image->tc_compat_zrange_offset;

		radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
		radeon_emit(cmd_buffer->cs, va);
		radeon_emit(cmd_buffer->cs, va >> 32);
		radeon_emit(cmd_buffer->cs, 0);
		radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
	}

	radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
}

/* Emit the depth/stencil-buffer registers for the bound depth/stencil
 * attachment. HTILE is disabled in the emitted state when the current
 * layout does not allow it. GFX9 uses 64-bit base addresses and an
 * extra Z_INFO2/STENCIL_INFO2 pair; older chips use the legacy 32-bit
 * register block starting at DB_DEPTH_INFO.
 */
static void
radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
		      struct radv_ds_buffer_info *ds,
		      struct radv_image *image,
		      VkImageLayout layout)
{
	uint32_t db_z_info = ds->db_z_info;
	uint32_t db_stencil_info = ds->db_stencil_info;

	if (!radv_layout_has_htile(image, layout,
				   radv_image_queue_family_mask(image,
								cmd_buffer->queue_family_index,
								cmd_buffer->queue_family_index))) {
		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
		db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
	}

	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);


	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
		radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
		radeon_emit(cmd_buffer->cs, ds->db_depth_size);

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
		radeon_emit(cmd_buffer->cs, db_z_info);			/* DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, db_stencil_info);		/* DB_STENCIL_INFO */
		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* DB_Z_READ_BASE */
		radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32));	/* DB_Z_READ_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* DB_STENCIL_READ_BASE */
		radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* DB_Z_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32));	/* DB_Z_WRITE_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* DB_STENCIL_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
		radeon_emit(cmd_buffer->cs, ds->db_z_info2);
		radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
	} else {
		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);

		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
		radeon_emit(cmd_buffer->cs, ds->db_depth_info);		/* R_02803C_DB_DEPTH_INFO */
		radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, db_stencil_info);		/* R_028044_DB_STENCIL_INFO */
		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_depth_size);		/* R_028058_DB_DEPTH_SIZE */
		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */

	}

	/* Update the ZRANGE_PRECISION value for the TC-compat bug. */
	radv_update_zrange_precision(cmd_buffer, ds, image, layout, true);

	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
			       ds->pa_su_poly_offset_db_fmt_cntl);
}

/**
 * Update the fast clear depth/stencil values if the image is bound as a
 * depth/stencil buffer.
 */
static void
radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
				struct radv_image *image,
				VkClearDepthStencilValue ds_clear_value,
				VkImageAspectFlags aspects)
{
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_attachment_info *att;
	uint32_t att_idx;

	/* Nothing bound (e.g. outside a render pass): nothing to patch. */
	if (!framebuffer || !subpass)
		return;

	if (!subpass->depth_stencil_attachment)
		return;

	att_idx = subpass->depth_stencil_attachment->attachment;
	att = &framebuffer->attachments[att_idx];
	if (att->attachment->image != image)
		return;

	radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
	radeon_emit(cs, ds_clear_value.stencil);
	radeon_emit(cs, fui(ds_clear_value.depth));

	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
	 * only needed when clearing Z to 0.0.
	 */
	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
	    ds_clear_value.depth == 0.0) {
		VkImageLayout layout = subpass->depth_stencil_attachment->layout;

		radv_update_zrange_precision(cmd_buffer, &att->ds, image,
					     layout, false);
	}

	/* The register writes above rolled the context without touching
	 * the scissor.
	 */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/**
 * Set the clear depth/stencil values to the image's metadata.
 */
static void
radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			   struct radv_image *image,
			   VkClearDepthStencilValue ds_clear_value,
			   VkImageAspectFlags aspects)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);
	unsigned reg_offset = 0, reg_count = 0;

	va += image->offset + image->clear_value_offset;

	/* The metadata stores stencil first, then depth. When the stencil
	 * aspect isn't written, skip its dword (reg_offset/va advance).
	 */
	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
		++reg_count;
	} else {
		++reg_offset;
		va += 4;
	}
	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
		++reg_count;

	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
			S_370_WR_CONFIRM(1) |
			S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
		radeon_emit(cs, ds_clear_value.stencil);
	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
		radeon_emit(cs, fui(ds_clear_value.depth));
}

/**
 * Update the TC-compat metadata value for this image.
 */
static void
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
				   struct radv_image *image,
				   uint32_t value)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);
	va += image->offset + image->tc_compat_zrange_offset;

	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
			S_370_WR_CONFIRM(1) |
			S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, value);
}

/* Derive and store the COND_EXEC predicate for the TC-compat ZRANGE
 * workaround from the last depth clear value.
 */
static void
radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
				      struct radv_image *image,
				      VkClearDepthStencilValue ds_clear_value)
{
	uint64_t va = radv_buffer_get_va(image->bo);
	va += image->offset + image->tc_compat_zrange_offset;
	uint32_t cond_val;

	/* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
	 * depth clear value is 0.0f.
	 */
	cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;

	radv_set_tc_compat_zrange_metadata(cmd_buffer, image, cond_val);
}

/**
 * Update the clear depth/stencil values for this image.
 */
void
radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_image *image,
			      VkClearDepthStencilValue ds_clear_value,
			      VkImageAspectFlags aspects)
{
	assert(radv_image_has_htile(image));

	radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects);

	if (radv_image_is_tc_compat_htile(image) &&
	    (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
		radv_update_tc_compat_zrange_metadata(cmd_buffer, image,
						      ds_clear_value);
	}

	/* Also patch the bound DB_*_CLEAR registers if this image is the
	 * current depth/stencil attachment.
	 */
	radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value,
					aspects);
}

/**
 * Load the clear depth/stencil values from the image's metadata.
 */
static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			    struct radv_image *image)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
	uint64_t va = radv_buffer_get_va(image->bo);
	unsigned reg_offset = 0, reg_count = 0;

	va += image->offset + image->clear_value_offset;

	if (!radv_image_has_htile(image))
		return;

	/* Same stencil-then-depth layout as radv_set_ds_clear_metadata. */
	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
		++reg_count;
	} else {
		++reg_offset;
		va += 4;
	}
	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
		++reg_count;

	uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;

	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
		/* Fast path: LOAD_CONTEXT_REG reads memory into the
		 * context registers directly.
		 */
		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
		radeon_emit(cs, reg_count);
	} else {
		/* Fallback: COPY_DATA mem->reg, then PFP_SYNC_ME so the
		 * PFP doesn't run ahead of the ME's register write.
		 */
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, reg >> 2);
		radeon_emit(cs, 0);

		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cs, 0);
	}
}

/*
 * With DCC some colors don't require CMASK elimination before being
 * used as a texture. This sets a predicate value to determine if the
 * cmask eliminate is required.
 */
void
radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
			 struct radv_image *image, bool value)
{
	uint64_t pred_val = value;
	uint64_t va = radv_buffer_get_va(image->bo);
	va += image->offset + image->fce_pred_offset;

	assert(radv_image_has_dcc(image));

	/* 64-bit predicate written as two dwords. */
	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
				    S_370_WR_CONFIRM(1) |
				    S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cmd_buffer->cs, va);
	radeon_emit(cmd_buffer->cs, va >> 32);
	radeon_emit(cmd_buffer->cs, pred_val);
	radeon_emit(cmd_buffer->cs, pred_val >> 32);
}

/**
 * Update the DCC predicate to reflect the compression state.
 */
void
radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
			 struct radv_image *image, bool value)
{
	uint64_t pred_val = value;
	uint64_t va = radv_buffer_get_va(image->bo);
	va += image->offset + image->dcc_pred_offset;

	assert(radv_image_has_dcc(image));

	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
				    S_370_WR_CONFIRM(1) |
				    S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cmd_buffer->cs, va);
	radeon_emit(cmd_buffer->cs, va >> 32);
	radeon_emit(cmd_buffer->cs, pred_val);
	radeon_emit(cmd_buffer->cs, pred_val >> 32);
}

/**
 * Update the fast clear color values if the image is bound as a color buffer.
 */
static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
				   struct radv_image *image,
				   int cb_idx,
				   uint32_t color_values[2])
{
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_attachment_info *att;
	uint32_t att_idx;

	if (!framebuffer || !subpass)
		return;

	att_idx = subpass->color_attachments[cb_idx].attachment;
	if (att_idx == VK_ATTACHMENT_UNUSED)
		return;

	att = &framebuffer->attachments[att_idx];
	if (att->attachment->image != image)
		return;

	radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
	radeon_emit(cs, color_values[0]);
	radeon_emit(cs, color_values[1]);

	/* The register writes rolled the context without a scissor emit. */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/**
 * Set the clear color values to the image's metadata.
 */
static void
radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_image *image,
			      uint32_t color_values[2])
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);

	va += image->offset + image->clear_value_offset;

	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));

	/* Write both clear-color dwords in one packet. */
	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, cmd_buffer->state.predicating));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
			S_370_WR_CONFIRM(1) |
			S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, color_values[0]);
	radeon_emit(cs, color_values[1]);
}

/**
 * Update the clear color values for this image.
 */
void
radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
				 struct radv_image *image,
				 int cb_idx,
				 uint32_t color_values[2])
{
	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));

	radv_set_color_clear_metadata(cmd_buffer, image, color_values);

	radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
					   color_values);
}

/**
 * Load the clear color values from the image's metadata.
 */
static void
radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			       struct radv_image *image,
			       int cb_idx)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);

	va += image->offset + image->clear_value_offset;

	if (!radv_image_has_cmask(image) && !radv_image_has_dcc(image))
		return;

	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
		radeon_emit(cs, 2);
	} else {
		/* COPY_DATA mem->reg plus PFP_SYNC_ME, as in the DS path. */
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				COPY_DATA_COUNT_SEL);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, reg >> 2);
		radeon_emit(cs, 0);

		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
		radeon_emit(cs, 0);
	}
}

/* Emit the full framebuffer state: all color attachments, the
 * depth/stencil attachment (or invalid markers when absent), the window
 * scissor, and the DCC overwrite-combiner watermark.
 */
static void
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
{
	int i;
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	unsigned
	num_bpp64_colorbufs = 0;

	/* this may happen for inherited secondary recording */
	if (!framebuffer)
		return;

	for (i = 0; i < 8; ++i) {
		/* Unused slots are marked with an invalid color format. */
		if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
			radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
			continue;
		}

		int idx = subpass->color_attachments[i].attachment;
		struct radv_attachment_info *att = &framebuffer->attachments[idx];
		struct radv_image *image = att->attachment->image;
		VkImageLayout layout = subpass->color_attachments[i].layout;

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);

		assert(att->attachment->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
						       VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
		radv_emit_fb_color_state(cmd_buffer, i, att, image, layout);

		radv_load_color_clear_metadata(cmd_buffer, image, i);

		/* Count 64bpp+ colorbuffers for the DCC watermark below. */
		if (image->planes[0].surface.bpe >= 8)
			num_bpp64_colorbufs++;
	}

	if (subpass->depth_stencil_attachment) {
		int idx = subpass->depth_stencil_attachment->attachment;
		VkImageLayout layout = subpass->depth_stencil_attachment->layout;
		struct radv_attachment_info *att = &framebuffer->attachments[idx];
		struct radv_image *image = att->attachment->image;
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
		MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image,
										cmd_buffer->queue_family_index,
										cmd_buffer->queue_family_index);
		/* We currently don't support writing decompressed HTILE */
		assert(radv_layout_has_htile(image, layout, queue_mask) ==
		       radv_layout_is_htile_compressed(image, layout, queue_mask));

		radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout);

		/* Depth bias scaling depends on the depth format; re-emit
		 * it when the attachment's offset scale changes.
		 */
		if (att->ds.offset_scale != cmd_buffer->state.offset_scale) {
			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
			cmd_buffer->state.offset_scale = att->ds.offset_scale;
		}
		radv_load_ds_clear_metadata(cmd_buffer, image);
	} else {
		/* No depth/stencil attachment: program invalid formats.
		 * GFX9 moved DB_Z_INFO to a different offset.
		 */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
		else
			radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);

		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID));	/* DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID));	/* DB_STENCIL_INFO */
	}
	radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
			       S_028208_BR_X(framebuffer->width) |
			       S_028208_BR_Y(framebuffer->height));

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
		uint8_t watermark = 4; /* Default value for VI. */

		/* For optimal DCC performance. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			if (num_bpp64_colorbufs >= 5) {
				watermark = 8;
			} else {
				watermark = 6;
			}
		}

		radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
				       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
				       S_028424_OVERWRITE_COMBINER_WATERMARK(watermark));
	}

	if (cmd_buffer->device->dfsm_allowed) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
	}

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
}

/* Emit the index buffer binding (type, base address, size) and clear the
 * corresponding dirty bit.
 */
static void
radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_cmd_state *state = &cmd_buffer->state;

	/* Only re-emit the index type when it changed. */
	if (state->index_type != state->last_index_type) {
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
						   2, state->index_type);
		} else {
			radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
			radeon_emit(cs, state->index_type);
		}

		state->last_index_type = state->index_type;
	}

	radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
	radeon_emit(cs, state->index_va);
	radeon_emit(cs, state->index_va >> 32);

	radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
	radeon_emit(cs, state->max_index_count);

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
}

/* Program DB_COUNT_CONTROL for the current occlusion-query state, and
 * toggle out-of-order rasterization around perfect occlusion queries.
 */
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
{
	bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
	uint32_t pa_sc_mode_cntl_1 =
		pipeline ?
		pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
	uint32_t db_count_control;

	if(!cmd_buffer->state.active_occlusion_queries) {
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
			    has_perfect_queries) {
				/* Re-enable out-of-order rasterization if the
				 * bound pipeline supports it and if it's has
				 * been disabled before starting any perfect
				 * occlusion queries.
				 */
				radeon_set_context_reg(cmd_buffer->cs,
						       R_028A4C_PA_SC_MODE_CNTL_1,
						       pa_sc_mode_cntl_1);
			}
		}
		db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
	} else {
		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
		uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;

		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			db_count_control =
				S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
				S_028004_SAMPLE_RATE(sample_rate) |
				S_028004_ZPASS_ENABLE(1) |
				S_028004_SLICE_EVEN_ENABLE(1) |
				S_028004_SLICE_ODD_ENABLE(1);

			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
			    has_perfect_queries) {
				/* If the bound pipeline has enabled
				 * out-of-order rasterization, we should
				 * disable it before starting any perfect
				 * occlusion queries.
				 */
				pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;

				radeon_set_context_reg(cmd_buffer->cs,
						       R_028A4C_PA_SC_MODE_CNTL_1,
						       pa_sc_mode_cntl_1);
			}
		} else {
			db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
				S_028004_SAMPLE_RATE(sample_rate);
		}
	}

	radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);

	/* Context registers were written without a scissor emit. */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/* Emit all dirty dynamic state that the bound pipeline actually uses,
 * then clear those dirty bits.
 */
static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
{
	uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;

	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
		radv_emit_viewport(cmd_buffer);

	/* Scissors depend on viewports; skipped on chips with the scissor
	 * bug (handled elsewhere).
	 */
	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
	    !cmd_buffer->device->physical_device->has_scissor_bug)
		radv_emit_scissor(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
		radv_emit_line_width(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
		radv_emit_blend_constants(cmd_buffer);

	if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
		      RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
		      RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
		radv_emit_stencil(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
		radv_emit_depth_bounds(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
		radv_emit_depth_bias(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
		radv_emit_discard_rectangle(cmd_buffer);

	cmd_buffer->state.dirty &= ~states;
}

/* Upload the CPU-side push descriptor set to GPU memory and record its
 * new GPU address.
 */
static void
radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
			    VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer,
bind_point); 1787 struct radv_descriptor_set *set = &descriptors_state->push_set.set; 1788 unsigned bo_offset; 1789 1790 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32, 1791 set->mapped_ptr, 1792 &bo_offset)) 1793 return; 1794 1795 set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 1796 set->va += bo_offset; 1797} 1798 1799static void 1800radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, 1801 VkPipelineBindPoint bind_point) 1802{ 1803 struct radv_descriptor_state *descriptors_state = 1804 radv_get_descriptors_state(cmd_buffer, bind_point); 1805 uint32_t size = MAX_SETS * 4; 1806 uint32_t offset; 1807 void *ptr; 1808 1809 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, 1810 256, &offset, &ptr)) 1811 return; 1812 1813 for (unsigned i = 0; i < MAX_SETS; i++) { 1814 uint32_t *uptr = ((uint32_t *)ptr) + i; 1815 uint64_t set_va = 0; 1816 struct radv_descriptor_set *set = descriptors_state->sets[i]; 1817 if (descriptors_state->valid & (1u << i)) 1818 set_va = set->va; 1819 uptr[0] = set_va & 0xffffffff; 1820 } 1821 1822 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 1823 va += offset; 1824 1825 if (cmd_buffer->state.pipeline) { 1826 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]) 1827 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, 1828 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1829 1830 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) 1831 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT, 1832 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1833 1834 if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) 1835 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, 1836 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1837 1838 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) 1839 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL, 1840 
AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1841 1842 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) 1843 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL, 1844 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1845 } 1846 1847 if (cmd_buffer->state.compute_pipeline) 1848 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE, 1849 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 1850} 1851 1852static void 1853radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, 1854 VkShaderStageFlags stages) 1855{ 1856 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ? 1857 VK_PIPELINE_BIND_POINT_COMPUTE : 1858 VK_PIPELINE_BIND_POINT_GRAPHICS; 1859 struct radv_descriptor_state *descriptors_state = 1860 radv_get_descriptors_state(cmd_buffer, bind_point); 1861 struct radv_cmd_state *state = &cmd_buffer->state; 1862 bool flush_indirect_descriptors; 1863 1864 if (!descriptors_state->dirty) 1865 return; 1866 1867 if (descriptors_state->push_dirty) 1868 radv_flush_push_descriptors(cmd_buffer, bind_point); 1869 1870 flush_indirect_descriptors = 1871 (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS && 1872 state->pipeline && state->pipeline->need_indirect_descriptor_sets) || 1873 (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE && 1874 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets); 1875 1876 if (flush_indirect_descriptors) 1877 radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point); 1878 1879 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, 1880 cmd_buffer->cs, 1881 MAX_SETS * MESA_SHADER_STAGES * 4); 1882 1883 if (cmd_buffer->state.pipeline) { 1884 radv_foreach_stage(stage, stages) { 1885 if (!cmd_buffer->state.pipeline->shaders[stage]) 1886 continue; 1887 1888 radv_emit_descriptor_pointers(cmd_buffer, 1889 cmd_buffer->state.pipeline, 1890 descriptors_state, stage); 1891 } 1892 } 1893 1894 if (cmd_buffer->state.compute_pipeline && 1895 
	    (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
		radv_emit_descriptor_pointers(cmd_buffer,
					      cmd_buffer->state.compute_pipeline,
					      descriptors_state,
					      MESA_SHADER_COMPUTE);
	}

	descriptors_state->dirty = 0;
	descriptors_state->push_dirty = false;

	assert(cmd_buffer->cs->cdw <= cdw_max);

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_descriptors(cmd_buffer, bind_point);
}

/* Upload the push-constant block (plus dynamic-offset buffer descriptors)
 * for the stages in 'stages' and emit the resulting GPU address and any
 * inline push constants into the shaders' user SGPRs.
 */
static void
radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
		     VkShaderStageFlags stages)
{
	struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
					 ? cmd_buffer->state.compute_pipeline
					 : cmd_buffer->state.pipeline;
	VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
					 VK_PIPELINE_BIND_POINT_COMPUTE :
					 VK_PIPELINE_BIND_POINT_GRAPHICS;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_pipeline_layout *layout = pipeline->layout;
	struct radv_shader_variant *shader, *prev_shader;
	bool need_push_constants = false;
	unsigned offset;
	void *ptr;
	uint64_t va;

	/* Only flush stages whose push constants actually changed. */
	stages &= cmd_buffer->push_constant_stages;
	if (!stages ||
	    (!layout->push_constant_size && !layout->dynamic_offset_count))
		return;

	radv_foreach_stage(stage, stages) {
		if (!pipeline->shaders[stage])
			continue;

		/* A full upload is only needed if some stage loads push
		 * constants or dynamic offsets from memory (rather than
		 * taking them all inline in user SGPRs).
		 */
		need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
		need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;

		uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
		uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;

		radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
					     AC_UD_INLINE_PUSH_CONSTANTS,
					     count,
					     (uint32_t *)&cmd_buffer->push_constants[base * 4]);
	}

	if (need_push_constants) {
		/* Push constants are followed by the dynamic-offset buffer
		 * descriptors (16 bytes each) in one allocation.
		 */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
						  16 * layout->dynamic_offset_count,
						  256, &offset, &ptr))
			return;

		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
		memcpy((char*)ptr + layout->push_constant_size,
		       descriptors_state->dynamic_buffers,
		       16 * layout->dynamic_offset_count);

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += offset;

		MAYBE_UNUSED unsigned cdw_max =
			radeon_check_space(cmd_buffer->device->ws,
					   cmd_buffer->cs, MESA_SHADER_STAGES * 4);

		prev_shader = NULL;
		radv_foreach_stage(stage, stages) {
			shader = radv_get_shader(pipeline, stage);

			/* Avoid redundantly emitting the address for merged stages. */
			if (shader && shader != prev_shader) {
				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
							   AC_UD_PUSH_CONSTANTS, va);

				prev_shader = shader;
			}
		}
		assert(cmd_buffer->cs->cdw <= cdw_max);
	}

	cmd_buffer->push_constant_stages &= ~stages;
}

/* Build and upload the vertex-buffer descriptor table when the pipeline
 * or the bound vertex buffers changed, and point the vertex shader's
 * AC_UD_VS_VERTEX_BUFFERS user SGPR at it.
 */
static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
			      bool pipeline_is_dirty)
{
	if ((pipeline_is_dirty ||
	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
	    cmd_buffer->state.pipeline->num_vertex_bindings &&
	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
		unsigned vb_offset;
		void *vb_ptr;
		uint32_t i = 0;
		uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
		uint64_t va;

		/* allocate some descriptor state for vertex buffers */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
						  &vb_offset, &vb_ptr))
			return;

		for (i = 0; i < count; i++) {
			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
			uint32_t offset;
			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];

			/* Unbound slots keep a zeroed descriptor. */
			if (!buffer)
				continue;

			va = radv_buffer_get_va(buffer->bo);

			offset = cmd_buffer->vertex_bindings[i].offset;
			va += offset + buffer->offset;
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
			/* On pre-GFX9 parts with a non-zero stride, NUM_RECORDS
			 * is in units of stride; otherwise it is in bytes.
			 */
			if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
				desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
			else
				desc[2] = buffer->size - offset;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
		}

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += vb_offset;

		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
					   AC_UD_VS_VERTEX_BUFFERS, va);

		/* Remember the table location so it can be prefetched into L2. */
		cmd_buffer->state.vb_va = va;
		cmd_buffer->state.vb_size = count * 16;
		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
	}
	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
}

/* Write the streamout-buffer descriptor table address 'va' into the
 * AC_UD_STREAMOUT_BUFFERS user SGPR of every stage that has one, plus
 * the GS copy shader if present.
 */
static void
radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
	struct radv_userdata_info *loc;
	uint32_t base_reg;

	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
		if (!radv_get_shader(pipeline, stage))
			continue;

		loc = radv_lookup_user_sgpr(pipeline, stage,
					    AC_UD_STREAMOUT_BUFFERS);
		if (loc->sgpr_idx == -1)
			continue;

		base_reg = pipeline->user_data_0[stage];

		radv_emit_shader_pointer(cmd_buffer->device,
					 cmd_buffer->cs,
					 base_reg + loc->sgpr_idx * 4, va, false);
	}

	/* The GS copy shader is not part of the main shader array; its
	 * user SGPRs live at the hardware VS slot.
	 */
	if (pipeline->gs_copy_shader) {
		loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
		if (loc->sgpr_idx != -1) {
			base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;

			radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
						 base_reg + loc->sgpr_idx * 4, va, false);
		}
	}
}

/* Rebuild and upload the streamout-buffer descriptor table when the
 * bound transform-feedback buffers changed.
 */
static void
radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
{
	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
		struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
		struct radv_streamout_state *so = &cmd_buffer->state.streamout;
		unsigned so_offset;
		void *so_ptr;
		uint64_t va;

		/* Allocate some descriptor state for streamout buffers. */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
						  MAX_SO_BUFFERS * 16, 256,
						  &so_offset, &so_ptr))
			return;

		for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
			struct radv_buffer *buffer = sb[i].buffer;
			uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];

			if (!(so->enabled_mask & (1 << i)))
				continue;

			va = radv_buffer_get_va(buffer->bo) + buffer->offset;

			va += sb[i].offset;

			/* Set the descriptor.
			 *
			 * On VI, the format must be non-INVALID, otherwise
			 * the buffer will be considered not bound and store
			 * instructions will be no-ops.
			 */
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff; /* unbounded: range checking is not used for SO stores */
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
		}

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += so_offset;

		radv_emit_streamout_buffers(cmd_buffer, va);
	}

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}

/* Upload every piece of descriptor state a graphics draw depends on:
 * vertex buffers, streamout buffers, descriptor sets and push constants.
 */
static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
	radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
	radv_flush_streamout_descriptors(cmd_buffer);
	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
}

/* Gathered parameters for a single draw, shared by the direct, indexed,
 * indirect and transform-feedback draw paths.
 */
struct radv_draw_info {
	/**
	 * Number of vertices.
	 */
	uint32_t count;

	/**
	 * Index of the first vertex.
	 */
	int32_t vertex_offset;

	/**
	 * First instance id.
	 */
	uint32_t first_instance;

	/**
	 * Number of instances.
	 */
	uint32_t instance_count;

	/**
	 * First index (indexed draws only).
	 */
	uint32_t first_index;

	/**
	 * Whether it's an indexed draw.
	 */
	bool indexed;

	/**
	 * Indirect draw parameters resource.
	 */
	struct radv_buffer *indirect;
	uint64_t indirect_offset;
	uint32_t stride;

	/**
	 * Draw count parameters resource.
	 */
	struct radv_buffer *count_buffer;
	uint64_t count_buffer_offset;

	/**
	 * Stream output parameters resource.
	 */
	struct radv_buffer *strmout_buffer;
	uint64_t strmout_buffer_offset;
};

/* Emit the per-draw register state: IA_MULTI_VGT_PARAM, primitive-restart
 * enable/index, and for vkCmdDrawIndirectByteCountEXT the draw-opaque
 * registers copied from the streamout counter buffer.  Registers are only
 * written when their value changed since the last draw.
 */
static void
radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
			 const struct radv_draw_info *draw_info)
{
	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
	struct radv_cmd_state *state = &cmd_buffer->state;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint32_t ia_multi_vgt_param;
	int32_t primitive_reset_en;

	/* Draw state. */
	ia_multi_vgt_param =
		si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
					  draw_info->indirect,
					  !!draw_info->strmout_buffer,
					  draw_info->indirect ? 0 : draw_info->count);

	if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
		/* Register location and packet type differ per generation. */
		if (info->chip_class >= GFX9) {
			radeon_set_uconfig_reg_idx(cs,
						   R_030960_IA_MULTI_VGT_PARAM,
						   4, ia_multi_vgt_param);
		} else if (info->chip_class >= CIK) {
			radeon_set_context_reg_idx(cs,
						   R_028AA8_IA_MULTI_VGT_PARAM,
						   1, ia_multi_vgt_param);
		} else {
			radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
					       ia_multi_vgt_param);
		}
		state->last_ia_multi_vgt_param = ia_multi_vgt_param;
	}

	/* Primitive restart. */
	primitive_reset_en =
		draw_info->indexed && state->pipeline->graphics.prim_restart_enable;

	if (primitive_reset_en != state->last_primitive_reset_en) {
		state->last_primitive_reset_en = primitive_reset_en;
		if (info->chip_class >= GFX9) {
			radeon_set_uconfig_reg(cs,
					       R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
					       primitive_reset_en);
		} else {
			radeon_set_context_reg(cs,
					       R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
					       primitive_reset_en);
		}
	}

	if (primitive_reset_en) {
		/* All-ones restart index for the current index size
		 * (index_type is non-zero for 32-bit indices).
		 */
		uint32_t primitive_reset_index =
			state->index_type ?
			0xffffffffu : 0xffffu;

		if (primitive_reset_index != state->last_primitive_reset_index) {
			radeon_set_context_reg(cs,
					       R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
					       primitive_reset_index);
			state->last_primitive_reset_index = primitive_reset_index;
		}
	}

	if (draw_info->strmout_buffer) {
		/* vkCmdDrawIndirectByteCountEXT: copy the byte count written
		 * by streamout into the draw-opaque registers so the hardware
		 * derives the vertex count.
		 */
		uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);

		va += draw_info->strmout_buffer->offset +
		      draw_info->strmout_buffer_offset;

		radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
				       draw_info->stride);

		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				COPY_DATA_WR_CONFIRM);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
		radeon_emit(cs, 0); /* unused */

		radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
	}
}

/* Translate a barrier's source pipeline-stage mask into the partial
 * flushes (CS/PS/VS) needed before later work may proceed.
 */
static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
			     VkPipelineStageFlags src_stage_mask)
{
	if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
			      VK_PIPELINE_STAGE_TRANSFER_BIT |
			      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
			      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
	}

	if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
			      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
			      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
			      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
			      VK_PIPELINE_STAGE_TRANSFER_BIT |
			      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
			      VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
			      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
		/* A PS flush also waits for all earlier geometry stages. */
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
	} else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
				     VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
				     VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
				     VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
				     VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
				     VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
				     VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
	}
}

/* Translate a barrier's source access mask into cache flush bits that
 * make prior writes visible.  'image' (may be NULL) lets us skip
 * metadata flushes the image does not need.
 */
static enum radv_cmd_flush_bits
radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
		      VkAccessFlags src_flags,
		      struct radv_image *image)
{
	bool flush_CB_meta = true, flush_DB_meta = true;
	enum radv_cmd_flush_bits flush_bits = 0;
	uint32_t b;

	if (image) {
		if (!radv_image_has_CB_metadata(image))
			flush_CB_meta = false;
		if (!radv_image_has_htile(image))
			flush_DB_meta = false;
	}

	for_each_bit(b, src_flags) {
		switch ((VkAccessFlagBits)(1 << b)) {
		case VK_ACCESS_SHADER_WRITE_BIT:
		case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
			flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
			break;
		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
			if (flush_CB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
			break;
		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
			if (flush_DB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
			break;
		case VK_ACCESS_TRANSFER_WRITE_BIT:
			/* Transfers may go through either the CB/DB or
			 * compute paths, so flush both plus L2.
			 */
			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
				      RADV_CMD_FLAG_FLUSH_AND_INV_DB |
				      RADV_CMD_FLAG_INV_GLOBAL_L2;

			if (flush_CB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
			if (flush_DB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
			break;
		default:
			break;
		}
	}
	return flush_bits;
}

static enum radv_cmd_flush_bits
radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
		      VkAccessFlags dst_flags,
		      struct radv_image *image)
{
	/* Translate a barrier's destination access mask into the cache
	 * invalidations needed before the destination may read/write.
	 * 'image' (may be NULL) lets us skip flushes it does not need.
	 */
	bool flush_CB_meta = true, flush_DB_meta = true;
	enum radv_cmd_flush_bits flush_bits = 0;
	bool flush_CB = true, flush_DB = true;
	bool image_is_coherent = false;
	uint32_t b;

	if (image) {
		/* Attachment caches only need flushing if the image can be
		 * written through the CB/DB paths and read as storage.
		 */
		if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
			flush_CB = false;
			flush_DB = false;
		}

		if (!radv_image_has_CB_metadata(image))
			flush_CB_meta = false;
		if (!radv_image_has_htile(image))
			flush_DB_meta = false;

		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			if (image->info.samples == 1 &&
			    (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
					     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
			    !vk_format_is_stencil(image->vk_format)) {
				/* Single-sample color and single-sample depth
				 * (not stencil) are coherent with shaders on
				 * GFX9.
				 */
				image_is_coherent = true;
			}
		}
	}

	for_each_bit(b, dst_flags) {
		switch ((VkAccessFlagBits)(1 << b)) {
		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
		case VK_ACCESS_INDEX_READ_BIT:
		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
			/* No cache invalidation needed for these on the
			 * destination side.
			 */
			break;
		case VK_ACCESS_UNIFORM_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
			break;
		case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
		case VK_ACCESS_TRANSFER_READ_BIT:
		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 |
				      RADV_CMD_FLAG_INV_GLOBAL_L2;
			break;
		case VK_ACCESS_SHADER_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1;

			if (!image_is_coherent)
				flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2;
			break;
		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
			if (flush_CB)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
			if (flush_CB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
			break;
		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
			if (flush_DB)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
			if (flush_DB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
			break;
		default:
			break;
		}
	}
	return flush_bits;
}

/* Apply a render-pass subpass dependency: flush source accesses, wait on
 * source stages, then invalidate for destination accesses.
 */
void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
			  const struct radv_subpass_barrier *barrier)
{
	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
							      NULL);
	radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
	cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
							      NULL);
}

/* Transition one subpass attachment from its currently tracked layout to
 * the layout the subpass requires, then record the new layout.
 */
static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
						 struct radv_subpass_attachment att)
{
	unsigned idx = att.attachment;
	struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment;
	VkImageSubresourceRange range;
	range.aspectMask = 0;
	range.baseMipLevel = view->base_mip;
	range.levelCount = 1;
	range.baseArrayLayer = view->base_layer;
	range.layerCount = cmd_buffer->state.framebuffer->layers;

	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
		/* If the current subpass uses multiview, the driver might have
		 * performed a fast color/depth clear to the whole image
		 * (including all layers). To make sure the driver will
		 * decompress the image correctly (if needed), we have to
		 * account for the "real" number of layers. If the view mask is
		 * sparse, this will decompress more layers than needed.
		 */
		range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
	}

	radv_handle_image_transition(cmd_buffer,
				     view->image,
				     cmd_buffer->state.attachments[idx].current_layout,
				     att.layout, 0, 0, &range);

	cmd_buffer->state.attachments[idx].current_layout = att.layout;


}

/* Switch the command buffer to a new subpass; framebuffer-derived state
 * must be re-emitted afterwards.
 */
void
radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
			    const struct radv_subpass *subpass)
{
	cmd_buffer->state.subpass = subpass;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
}

/* Allocate and initialize per-attachment state (pending clears, clear
 * values, initial layouts) for a render-pass instance.  'info' may be
 * NULL (secondary command buffers) in which case no clear values are
 * recorded.  Returns VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static VkResult
radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
				 struct radv_render_pass *pass,
				 const VkRenderPassBeginInfo *info)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (pass->attachment_count == 0) {
		state->attachments = NULL;
		return VK_SUCCESS;
	}

	state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
				      pass->attachment_count *
				      sizeof(state->attachments[0]),
				      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if (state->attachments == NULL) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
		return cmd_buffer->record_result;
	}

	for (uint32_t i = 0; i < pass->attachment_count; ++i) {
		struct radv_render_pass_attachment *att = &pass->attachments[i];
		VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
		VkImageAspectFlags clear_aspects = 0;

		if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
			/* color attachment */
			if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
			}
		} else {
			/* depthstencil attachment */
			if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
			    att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
				/* When depth is cleared and stencil is
				 * DONT_CARE, clearing stencil too is free
				 * and keeps the fast-clear path simple.
				 */
				if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
				    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
					clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
			}
			if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
			    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
			}
		}

		state->attachments[i].pending_clear_aspects = clear_aspects;
		state->attachments[i].cleared_views = 0;
		if (clear_aspects && info) {
			assert(info->clearValueCount > i);
			state->attachments[i].clear_value = info->pClearValues[i];
		}

		state->attachments[i].current_layout = att->initial_layout;
	}

	return VK_SUCCESS;
}

VkResult radv_AllocateCommandBuffers(
	VkDevice _device,
	const VkCommandBufferAllocateInfo *pAllocateInfo,
	VkCommandBuffer *pCommandBuffers)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);

	VkResult result = VK_SUCCESS;
	uint32_t i;

	for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

		/* Reuse a previously freed command buffer from the pool's
		 * free list when possible instead of allocating a new one.
		 */
		if (!list_empty(&pool->free_cmd_buffers)) {
			struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);

			list_del(&cmd_buffer->pool_link);
			list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

			result = radv_reset_cmd_buffer(cmd_buffer);
			cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
			cmd_buffer->level = pAllocateInfo->level;

			pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
		} else {
			result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
							&pCommandBuffers[i]);
		}
		if (result != VK_SUCCESS)
			break;
	}

	if (result != VK_SUCCESS) {
		radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
					i, pCommandBuffers);

		/* From the Vulkan 1.0.66 spec:
		 *
		 * "vkAllocateCommandBuffers can be used to create multiple
		 * command buffers. If the creation of any of those command
		 * buffers fails, the implementation must destroy all
		 * successfully created command buffer objects from this
		 * command, set all entries of the pCommandBuffers array to
		 * NULL and return the error."
		 */
		memset(pCommandBuffers, 0,
		       sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
	}

	return result;
}

void radv_FreeCommandBuffers(
	VkDevice device,
	VkCommandPool commandPool,
	uint32_t commandBufferCount,
	const VkCommandBuffer *pCommandBuffers)
{
	for (uint32_t i = 0; i < commandBufferCount; i++) {
		RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

		if (cmd_buffer) {
			/* Pooled command buffers are recycled via the pool's
			 * free list rather than destroyed outright.
			 */
			if (cmd_buffer->pool) {
				list_del(&cmd_buffer->pool_link);
				list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
			} else
				radv_cmd_buffer_destroy(cmd_buffer);

		}
	}
}

VkResult radv_ResetCommandBuffer(
	VkCommandBuffer commandBuffer,
	VkCommandBufferResetFlags flags)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	return radv_reset_cmd_buffer(cmd_buffer);
}

VkResult radv_BeginCommandBuffer(
	VkCommandBuffer commandBuffer,
	const VkCommandBufferBeginInfo *pBeginInfo)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	VkResult result = VK_SUCCESS;

	if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
		/* If the command buffer has already been reset with
		 * vkResetCommandBuffer, no need to do it again.
		 */
		result = radv_reset_cmd_buffer(cmd_buffer);
		if (result != VK_SUCCESS)
			return result;
	}

	/* -1 sentinels force the first draw to emit this tracked state. */
	memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
	cmd_buffer->state.last_primitive_reset_en = -1;
	cmd_buffer->state.last_index_type = -1;
	cmd_buffer->state.last_num_instances = -1;
	cmd_buffer->state.last_vertex_offset = -1;
	cmd_buffer->state.last_first_instance = -1;
	cmd_buffer->state.predication_type = -1;
	cmd_buffer->usage_flags = pBeginInfo->flags;

	/* Secondary command buffers that continue a render pass inherit the
	 * framebuffer, render pass and subpass from pInheritanceInfo.
	 */
	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
	    (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
		assert(pBeginInfo->pInheritanceInfo);
		cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
		cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);

		struct radv_subpass *subpass =
			&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];

		result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
		if (result != VK_SUCCESS)
			return result;

		radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
	}

	if (unlikely(cmd_buffer->device->trace_bo)) {
		struct radv_device *device = cmd_buffer->device;

		radv_cs_add_buffer(device->ws, cmd_buffer->cs,
				   device->trace_bo);

		radv_cmd_buffer_trace_emit(cmd_buffer);
	}

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;

	return result;
}

void radv_CmdBindVertexBuffers(
	VkCommandBuffer commandBuffer,
	uint32_t firstBinding,
	uint32_t bindingCount,
	const VkBuffer* pBuffers,
	const VkDeviceSize* pOffsets)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
	bool changed = false;

	/* We have to
	 * defer setting up vertex buffer since we need the buffer
	 * stride from the pipeline. */

	assert(firstBinding + bindingCount <= MAX_VBS);
	for (uint32_t i = 0; i < bindingCount; i++) {
		uint32_t idx = firstBinding + i;

		if (!changed &&
		    (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
		     vb[idx].offset != pOffsets[i])) {
			changed = true;
		}

		vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
		vb[idx].offset = pOffsets[i];

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   vb[idx].buffer->bo);
	}

	if (!changed) {
		/* No state changes. */
		return;
	}

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
}

void radv_CmdBindIndexBuffer(
	VkCommandBuffer commandBuffer,
	VkBuffer buffer,
	VkDeviceSize offset,
	VkIndexType indexType)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);

	if (cmd_buffer->state.index_buffer == index_buffer &&
	    cmd_buffer->state.index_offset == offset &&
	    cmd_buffer->state.index_type == indexType) {
		/* No state changes. */
		return;
	}

	cmd_buffer->state.index_buffer = index_buffer;
	cmd_buffer->state.index_offset = offset;
	cmd_buffer->state.index_type = indexType; /* vk matches hw */
	cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
	cmd_buffer->state.index_va += index_buffer->offset + offset;

	/* index_type is non-zero for 32-bit (4-byte) indices, zero for
	 * 16-bit (2-byte) indices.
	 */
	int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
	cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
}


/* Record a descriptor set at slot 'idx' and reference all BOs it points
 * at so the kernel keeps them resident for this submission.
 */
static void
radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
			 VkPipelineBindPoint bind_point,
			 struct radv_descriptor_set *set, unsigned idx)
{
	struct radeon_winsys *ws = cmd_buffer->device->ws;

	radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);

	assert(set);
	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));

	/* With the global BO list all buffers are already resident; no
	 * per-descriptor references are needed.
	 */
	if (!cmd_buffer->device->use_global_bo_list) {
		for (unsigned j = 0; j < set->layout->buffer_count; ++j)
			if (set->descriptors[j])
				radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
	}

	if(set->bo)
		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
}

void radv_CmdBindDescriptorSets(
	VkCommandBuffer commandBuffer,
	VkPipelineBindPoint pipelineBindPoint,
	VkPipelineLayout _layout,
	uint32_t firstSet,
	uint32_t descriptorSetCount,
	const VkDescriptorSet* pDescriptorSets,
	uint32_t dynamicOffsetCount,
	const uint32_t* pDynamicOffsets)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	unsigned dyn_idx = 0;

	const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);

	for (unsigned i = 0; i < descriptorSetCount; ++i) {
		unsigned idx = i + firstSet;
		RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
		radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, idx);

		/* Build a buffer descriptor for each dynamic
		 * uniform/storage buffer, applying the caller's offsets.
		 */
		for(unsigned j = 0; j <
		    set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
			unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
			uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
			assert(dyn_idx < dynamicOffsetCount);

			struct radv_descriptor_range *range = set->dynamic_descriptors + j;
			uint64_t va = range->va + pDynamicOffsets[dyn_idx];
			dst[0] = va;
			dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			/* RADV_DEBUG_NO_DYNAMIC_BOUNDS disables range checking
			 * by making the descriptor unbounded.
			 */
			dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
			dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
				 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
			/* Dynamic offsets are uploaded with the push
			 * constants, so mark those stages dirty.
			 */
			cmd_buffer->push_constant_stages |=
				set->layout->dynamic_shader_stages;
		}
	}
}

/* Size (and lazily grow) the CPU-side storage backing the command
 * buffer's push descriptor set for the given bind point.  Returns false
 * and records VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
					  struct radv_descriptor_set *set,
					  struct radv_descriptor_set_layout *layout,
					  VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	set->size = layout->size;
	set->layout = layout;

	if (descriptors_state->push_set.capacity < set->size) {
		/* Grow geometrically (at least 1 KiB, at most the maximum
		 * push-descriptor payload) to amortize reallocations.
		 */
		size_t new_size = MAX2(set->size, 1024);
		new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
		new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);

		free(set->mapped_ptr);
		set->mapped_ptr = malloc(new_size);

		if (!set->mapped_ptr) {
			descriptors_state->push_set.capacity = 0;
			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
			return false;
		}

		descriptors_state->push_set.capacity = new_size;
	}

	return true;
}

/* Internal (meta operation) variant of vkCmdPushDescriptorSetKHR: writes
 * the descriptors straight into the upload BO and binds them at set 0.
 */
void radv_meta_push_descriptor_set(
	struct radv_cmd_buffer*              cmd_buffer,
	VkPipelineBindPoint                  pipelineBindPoint,
	VkPipelineLayout                     _layout,
	uint32_t                             set,
	uint32_t                             descriptorWriteCount,
	const VkWriteDescriptorSet*          pDescriptorWrites)
{
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
	unsigned bo_offset;

	assert(set == 0);
	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	push_set->size = layout->set[set].layout->size;
	push_set->layout = layout->set[set].layout;

	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
	                                  &bo_offset,
	                                  (void**) &push_set->mapped_ptr))
		return;

	push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
	push_set->va += bo_offset;

	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
	                            radv_descriptor_set_to_handle(push_set),
	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);

	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
}

void radv_CmdPushDescriptorSetKHR(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipelineLayout                            _layout,
	uint32_t                                    set,
	uint32_t                                    descriptorWriteCount,
	const VkWriteDescriptorSet*                 pDescriptorWrites)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;

	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
					   layout->set[set].layout,
					   pipelineBindPoint))
		return;

	/* Descriptors are written to the CPU shadow now and uploaded to the
	 * GPU later by radv_flush_push_descriptors().
	 */
	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
				    radv_descriptor_set_to_handle(push_set),
				    descriptorWriteCount, pDescriptorWrites, 0, NULL);

	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
	descriptors_state->push_dirty = true;
}

void radv_CmdPushDescriptorSetWithTemplateKHR(
	VkCommandBuffer                             commandBuffer,
	VkDescriptorUpdateTemplate                  descriptorUpdateTemplate,
	VkPipelineLayout                            _layout,
	uint32_t                                    set,
	const void*                                 pData)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, templ->bind_point);
	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;

	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
					   layout->set[set].layout,
					   templ->bind_point))
		return;

	radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
						 descriptorUpdateTemplate, pData);

	radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
	descriptors_state->push_dirty = true;
}

void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
			   VkPipelineLayout layout,
			   VkShaderStageFlags stageFlags,
			   uint32_t offset,
			   uint32_t size,
			   const void* pValues)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	/* Stash the values in the CPU shadow; they are uploaded on the next
	 * flush for the affected stages.
	 */
	memcpy(cmd_buffer->push_constants + offset, pValues, size);
	cmd_buffer->push_constant_stages |= stageFlags;
}

VkResult radv_EndCommandBuffer(
	VkCommandBuffer commandBuffer)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
		/* SI needs an extra idle + L2 writeback at the end of the IB;
		 * later chips only need the generic cache flush below. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class == SI)
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
		si_emit_cache_flush(cmd_buffer);
	}

	/* Make sure CP DMA is idle at the end of IBs because the kernel
	 * doesn't wait for it.
	 */
	si_cp_dma_wait_for_idle(cmd_buffer);

	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
		return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

	/* Surface any error recorded while building (e.g. host OOM). */
	return cmd_buffer->record_result;
}

/* Emit the bound compute pipeline's pre-built register state into the CS,
 * if it differs from what was last emitted, and track its scratch needs. */
static void
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;

	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
		return;

	/* Compute pipelines carry no context registers. */
	assert(!pipeline->ctx_cs.cdw);

	cmd_buffer->state.emitted_compute_pipeline = pipeline;

	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

	cmd_buffer->compute_scratch_size_needed =
		MAX2(cmd_buffer->compute_scratch_size_needed,
		     pipeline->max_waves * pipeline->scratch_bytes_per_wave);

	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
			   pipeline->shaders[MESA_SHADER_COMPUTE]->bo);

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
}

/* Re-flag every currently valid descriptor set of a bind point as dirty so
 * its pointers are re-emitted (e.g. after a pipeline switch). */
static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
					    VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);

	descriptors_state->dirty |= descriptors_state->valid;
}

/* vkCmdBindPipeline: record the new pipeline and dirty the dependent state.
 * Graphics binds also refresh dynamic/streamout state, ring-size needs and
 * schedule a shader prefetch; compute binds are much lighter. */
void radv_CmdBindPipeline(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipeline                                  _pipeline)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

	switch (pipelineBindPoint) {
	case VK_PIPELINE_BIND_POINT_COMPUTE:
		if (cmd_buffer->state.compute_pipeline == pipeline)
			return;
		radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

		cmd_buffer->state.compute_pipeline = pipeline;
		cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
		break;
	case VK_PIPELINE_BIND_POINT_GRAPHICS:
		if (cmd_buffer->state.pipeline == pipeline)
			return;
		radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

		cmd_buffer->state.pipeline = pipeline;
		if (!pipeline)
			break;

		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
		cmd_buffer->push_constant_stages |= pipeline->active_stages;

		/* the new vertex shader might not have the same user regs */
		cmd_buffer->state.last_first_instance = -1;
		cmd_buffer->state.last_vertex_offset = -1;

		/* Prefetch all pipeline shaders at first draw time. */
		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

		radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
		radv_bind_streamout_state(cmd_buffer, pipeline);

		/* Ring sizes only ever grow over the command buffer's life. */
		if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
			cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
		if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
			cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;

		if (radv_pipeline_has_tess(pipeline))
			cmd_buffer->tess_rings_needed = true;
		break;
	default:
		assert(!"invalid bind point");
		break;
	}
}

/* vkCmdSetViewport: copy the viewports into dynamic state, skipping the
 * dirty bit when nothing actually changed. */
void radv_CmdSetViewport(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstViewport,
	uint32_t                                    viewportCount,
	const VkViewport*                           pViewports)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	MAYBE_UNUSED const uint32_t total_count = firstViewport + viewportCount;

	assert(firstViewport < MAX_VIEWPORTS);
	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);

	if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
		    pViewports, viewportCount * sizeof(*pViewports))) {
		return;
	}

	memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
	       viewportCount * sizeof(*pViewports));

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
}

/* vkCmdSetScissor: same no-op-elision pattern as radv_CmdSetViewport. */
void radv_CmdSetScissor(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstScissor,
	uint32_t                                    scissorCount,
	const VkRect2D*                             pScissors)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	MAYBE_UNUSED const uint32_t total_count = firstScissor + scissorCount;

	assert(firstScissor < MAX_SCISSORS);
	assert(total_count >= 1 && total_count <= MAX_SCISSORS);

	if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
		    scissorCount * sizeof(*pScissors))) {
		return;
	}

	memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
	       scissorCount * sizeof(*pScissors));

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
}

/* vkCmdSetLineWidth. */
void radv_CmdSetLineWidth(
	VkCommandBuffer                             commandBuffer,
	float                                       lineWidth)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	if (cmd_buffer->state.dynamic.line_width == lineWidth)
		return;

	cmd_buffer->state.dynamic.line_width = lineWidth;
	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
}

/* vkCmdSetDepthBias. */
void radv_CmdSetDepthBias(
	VkCommandBuffer                             commandBuffer,
	float                                       depthBiasConstantFactor,
	float                                       depthBiasClamp,
	float                                       depthBiasSlopeFactor)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
	    state->dynamic.depth_bias.clamp == depthBiasClamp &&
	    state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
		return;
	}

	state->dynamic.depth_bias.bias = depthBiasConstantFactor;
	state->dynamic.depth_bias.clamp = depthBiasClamp;
	state->dynamic.depth_bias.slope = depthBiasSlopeFactor;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
}

/* vkCmdSetBlendConstants. */
void radv_CmdSetBlendConstants(
	VkCommandBuffer                             commandBuffer,
	const float                                 blendConstants[4])
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
		return;

	memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
}

/* vkCmdSetDepthBounds. */
void radv_CmdSetDepthBounds(
	VkCommandBuffer                             commandBuffer,
	float                                       minDepthBounds,
	float                                       maxDepthBounds)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.depth_bounds.min == minDepthBounds &&
	    state->dynamic.depth_bounds.max == maxDepthBounds) {
		return;
	}

	state->dynamic.depth_bounds.min = minDepthBounds;
	state->dynamic.depth_bounds.max = maxDepthBounds;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
}

/* vkCmdSetStencilCompareMask: update only the faces selected by faceMask,
 * skipping the dirty bit if the selected faces already hold this value. */
void radv_CmdSetStencilCompareMask(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    compareMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
	bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;

	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
		return;
	}

	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
		state->dynamic.stencil_compare_mask.front = compareMask;
	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
		state->dynamic.stencil_compare_mask.back = compareMask;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
}

/* vkCmdSetStencilWriteMask: same per-face pattern as the compare mask. */
void radv_CmdSetStencilWriteMask(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    writeMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
	bool back_same = state->dynamic.stencil_write_mask.back == writeMask;

	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
		return;
	}

	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
		state->dynamic.stencil_write_mask.front = writeMask;
	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
		state->dynamic.stencil_write_mask.back = writeMask;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
}

/* vkCmdSetStencilReference: same per-face pattern as the masks above. */
void radv_CmdSetStencilReference(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    reference)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	bool front_same = state->dynamic.stencil_reference.front == reference;
	bool back_same = state->dynamic.stencil_reference.back == reference;

	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
		return;
	}

	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
		cmd_buffer->state.dynamic.stencil_reference.front = reference;
	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
		cmd_buffer->state.dynamic.stencil_reference.back = reference;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
}

/* vkCmdSetDiscardRectangleEXT.
 * NOTE(review): body continues in the next chunk of this file. */
void radv_CmdSetDiscardRectangleEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstDiscardRectangle,
	uint32_t                                    discardRectangleCount,
	const VkRect2D*                             pDiscardRectangles)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	MAYBE_UNUSED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;

	assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
	assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);

	if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
		    pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
		return;
	}

	/* Tail of radv_CmdSetDiscardRectangleEXT (head is in the previous
	 * chunk): commit the new rectangles and mark the state dirty. */
	typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
	             pDiscardRectangles, discardRectangleCount);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}

/* vkCmdExecuteCommands: chain secondary command buffers into the primary,
 * merging their resource requirements and draw-state tracking, then dirty
 * everything the secondaries may have clobbered. */
void radv_CmdExecuteCommands(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    commandBufferCount,
	const VkCommandBuffer*                      pCmdBuffers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

	assert(commandBufferCount > 0);

	/* Emit pending flushes on primary prior to executing secondary */
	si_emit_cache_flush(primary);

	for (uint32_t i = 0; i < commandBufferCount; i++) {
		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);

		/* Scratch/ring requirements are the max over all buffers. */
		primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
		                                    secondary->scratch_size_needed);
		primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
		                                            secondary->compute_scratch_size_needed);

		if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
			primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
		if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
			primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
		if (secondary->tess_rings_needed)
			primary->tess_rings_needed = true;
		if (secondary->sample_positions_needed)
			primary->sample_positions_needed = true;

		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);


		/* When the secondary command buffer is compute only we don't
		 * need to re-emit the current graphics pipeline.
		 */
		if (secondary->state.emitted_pipeline) {
			primary->state.emitted_pipeline =
				secondary->state.emitted_pipeline;
		}

		/* When the secondary command buffer is graphics only we don't
		 * need to re-emit the current compute pipeline.
		 */
		if (secondary->state.emitted_compute_pipeline) {
			primary->state.emitted_compute_pipeline =
				secondary->state.emitted_compute_pipeline;
		}

		/* Only re-emit the draw packets when needed. */
		if (secondary->state.last_primitive_reset_en != -1) {
			primary->state.last_primitive_reset_en =
				secondary->state.last_primitive_reset_en;
		}

		if (secondary->state.last_primitive_reset_index) {
			primary->state.last_primitive_reset_index =
				secondary->state.last_primitive_reset_index;
		}

		if (secondary->state.last_ia_multi_vgt_param) {
			primary->state.last_ia_multi_vgt_param =
				secondary->state.last_ia_multi_vgt_param;
		}

		primary->state.last_first_instance = secondary->state.last_first_instance;
		primary->state.last_num_instances = secondary->state.last_num_instances;
		primary->state.last_vertex_offset = secondary->state.last_vertex_offset;

		if (secondary->state.last_index_type != -1) {
			primary->state.last_index_type =
				secondary->state.last_index_type;
		}
	}

	/* After executing commands from secondary buffers we have to dirty
	 * some states.
	 */
	primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
				RADV_CMD_DIRTY_INDEX_BUFFER |
				RADV_CMD_DIRTY_DYNAMIC_ALL;
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}

/* vkCreateCommandPool: allocate the pool and initialize its buffer lists. */
VkResult radv_CreateCommandPool(
	VkDevice                                    _device,
	const VkCommandPoolCreateInfo*              pCreateInfo,
	const VkAllocationCallbacks*                pAllocator,
	VkCommandPool*                              pCmdPool)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	struct radv_cmd_pool *pool;

	pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
			 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if (pool == NULL)
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

	if (pAllocator)
		pool->alloc = *pAllocator;
	else
		pool->alloc = device->alloc;

	list_inithead(&pool->cmd_buffers);
	list_inithead(&pool->free_cmd_buffers);

	pool->queue_family_index = pCreateInfo->queueFamilyIndex;

	*pCmdPool = radv_cmd_pool_to_handle(pool);

	return VK_SUCCESS;

}

/* vkDestroyCommandPool: destroy all live and cached command buffers, then
 * free the pool itself. */
void radv_DestroyCommandPool(
	VkDevice                                    _device,
	VkCommandPool                               commandPool,
	const VkAllocationCallbacks*                pAllocator)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

	if (!pool)
		return;

	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
				 &pool->cmd_buffers, pool_link) {
		radv_cmd_buffer_destroy(cmd_buffer);
	}

	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
				 &pool->free_cmd_buffers, pool_link) {
		radv_cmd_buffer_destroy(cmd_buffer);
	}

	vk_free2(&device->alloc, pAllocator, pool);
}

/* vkResetCommandPool: reset every command buffer in the pool, stopping at
 * the first failure. */
VkResult radv_ResetCommandPool(
	VkDevice                                    device,
	VkCommandPool                               commandPool,
	VkCommandPoolResetFlags                     flags)
{
	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
	VkResult result;

	list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
			    &pool->cmd_buffers, pool_link) {
		result = radv_reset_cmd_buffer(cmd_buffer);
		if (result != VK_SUCCESS)
			return result;
	}

	return VK_SUCCESS;
}

/* vkTrimCommandPool: release the cached (freed) command buffers only. */
void radv_TrimCommandPool(
	VkDevice                                    device,
	VkCommandPool                               commandPool,
	VkCommandPoolTrimFlags                      flags)
{
	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

	if (!pool)
		return;

	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
				 &pool->free_cmd_buffers, pool_link) {
		radv_cmd_buffer_destroy(cmd_buffer);
	}
}

/* Index of the current subpass, derived from pointer arithmetic on the
 * render pass's subpass array. */
static uint32_t
radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	uint32_t subpass_id = state->subpass - state->pass->subpasses;

	/* The id of this subpass shouldn't exceed the number of subpasses in
	 * this render pass minus 1.
	 */
	assert(subpass_id < state->pass->subpass_count);
	return subpass_id;
}

/* Enter subpass `subpass_id`: emit its start barrier, transition every used
 * attachment to its subpass layout, switch state and perform clears. */
static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
			      uint32_t subpass_id)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
							   cmd_buffer->cs, 4096);

	radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);

	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
		const uint32_t a = subpass->attachments[i].attachment;
		if (a == VK_ATTACHMENT_UNUSED)
			continue;

		radv_handle_subpass_image_transition(cmd_buffer,
						     subpass->attachments[i]);
	}

	radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
	radv_cmd_buffer_clear_subpass(cmd_buffer);

	assert(cmd_buffer->cs->cdw <= cdw_max);
}

/* Leave the current subpass: run resolves, then transition attachments whose
 * last use is this subpass to their final layouts. */
static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	const struct radv_subpass *subpass = state->subpass;
	uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);

	radv_cmd_buffer_resolve_subpass(cmd_buffer);

	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
		const uint32_t a = subpass->attachments[i].attachment;
		if (a == VK_ATTACHMENT_UNUSED)
			continue;

		/* Only final-transition attachments this subpass uses last. */
		if (state->pass->attachments[a].last_subpass_idx != subpass_id)
			continue;

		VkImageLayout layout = state->pass->attachments[a].final_layout;
		radv_handle_subpass_image_transition(cmd_buffer,
		                                     (struct radv_subpass_attachment){a, layout});
	}
}

/* vkCmdBeginRenderPass: record pass/framebuffer state, set up attachment
 * tracking, and enter subpass 0. */
void radv_CmdBeginRenderPass(
	VkCommandBuffer                             commandBuffer,
	const VkRenderPassBeginInfo*                pRenderPassBegin,
	VkSubpassContents                           contents)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
	RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
	VkResult result;

	cmd_buffer->state.framebuffer = framebuffer;
	cmd_buffer->state.pass = pass;
	cmd_buffer->state.render_area = pRenderPassBegin->renderArea;

	/* On failure the error is recorded on the command buffer. */
	result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
	if (result != VK_SUCCESS)
		return;

	radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
}

/* VK_KHR_create_renderpass2 entry point; forwards to the v1 path. */
void radv_CmdBeginRenderPass2KHR(
	VkCommandBuffer                             commandBuffer,
	const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
	const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
{
	radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
				pSubpassBeginInfo->contents);
}

/* vkCmdNextSubpass.
 * NOTE(review): body continues in the next chunk of this file. */
void radv_CmdNextSubpass(
	VkCommandBuffer                             commandBuffer,
	VkSubpassContents                           contents)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
commandBuffer); 3539 3540 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer); 3541 radv_cmd_buffer_end_subpass(cmd_buffer); 3542 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); 3543} 3544 3545void radv_CmdNextSubpass2KHR( 3546 VkCommandBuffer commandBuffer, 3547 const VkSubpassBeginInfoKHR* pSubpassBeginInfo, 3548 const VkSubpassEndInfoKHR* pSubpassEndInfo) 3549{ 3550 radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents); 3551} 3552 3553static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) 3554{ 3555 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 3556 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { 3557 if (!radv_get_shader(pipeline, stage)) 3558 continue; 3559 3560 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX); 3561 if (loc->sgpr_idx == -1) 3562 continue; 3563 uint32_t base_reg = pipeline->user_data_0[stage]; 3564 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); 3565 3566 } 3567 if (pipeline->gs_copy_shader) { 3568 struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX]; 3569 if (loc->sgpr_idx != -1) { 3570 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; 3571 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); 3572 } 3573 } 3574} 3575 3576static void 3577radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, 3578 uint32_t vertex_count, 3579 bool use_opaque) 3580{ 3581 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating)); 3582 radeon_emit(cmd_buffer->cs, vertex_count); 3583 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | 3584 S_0287F0_USE_OPAQUE(use_opaque)); 3585} 3586 3587static void 3588radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, 3589 uint64_t index_va, 3590 uint32_t index_count) 3591{ 3592 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, 
cmd_buffer->state.predicating)); 3593 radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count); 3594 radeon_emit(cmd_buffer->cs, index_va); 3595 radeon_emit(cmd_buffer->cs, index_va >> 32); 3596 radeon_emit(cmd_buffer->cs, index_count); 3597 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); 3598} 3599 3600static void 3601radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, 3602 bool indexed, 3603 uint32_t draw_count, 3604 uint64_t count_va, 3605 uint32_t stride) 3606{ 3607 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3608 unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA 3609 : V_0287F0_DI_SRC_SEL_AUTO_INDEX; 3610 bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id; 3611 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; 3612 bool predicating = cmd_buffer->state.predicating; 3613 assert(base_reg); 3614 3615 /* just reset draw state for vertex data */ 3616 cmd_buffer->state.last_first_instance = -1; 3617 cmd_buffer->state.last_num_instances = -1; 3618 cmd_buffer->state.last_vertex_offset = -1; 3619 3620 if (draw_count == 1 && !count_va && !draw_id_enable) { 3621 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : 3622 PKT3_DRAW_INDIRECT, 3, predicating)); 3623 radeon_emit(cs, 0); 3624 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); 3625 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); 3626 radeon_emit(cs, di_src_sel); 3627 } else { 3628 radeon_emit(cs, PKT3(indexed ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : 3629 PKT3_DRAW_INDIRECT_MULTI, 3630 8, predicating)); 3631 radeon_emit(cs, 0); 3632 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); 3633 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); 3634 radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) | 3635 S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | 3636 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); 3637 radeon_emit(cs, draw_count); /* count */ 3638 radeon_emit(cs, count_va); /* count_addr */ 3639 radeon_emit(cs, count_va >> 32); 3640 radeon_emit(cs, stride); /* stride */ 3641 radeon_emit(cs, di_src_sel); 3642 } 3643} 3644 3645static void 3646radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, 3647 const struct radv_draw_info *info) 3648{ 3649 struct radv_cmd_state *state = &cmd_buffer->state; 3650 struct radeon_winsys *ws = cmd_buffer->device->ws; 3651 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3652 3653 if (info->indirect) { 3654 uint64_t va = radv_buffer_get_va(info->indirect->bo); 3655 uint64_t count_va = 0; 3656 3657 va += info->indirect->offset + info->indirect_offset; 3658 3659 radv_cs_add_buffer(ws, cs, info->indirect->bo); 3660 3661 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); 3662 radeon_emit(cs, 1); 3663 radeon_emit(cs, va); 3664 radeon_emit(cs, va >> 32); 3665 3666 if (info->count_buffer) { 3667 count_va = radv_buffer_get_va(info->count_buffer->bo); 3668 count_va += info->count_buffer->offset + 3669 info->count_buffer_offset; 3670 3671 radv_cs_add_buffer(ws, cs, info->count_buffer->bo); 3672 } 3673 3674 if (!state->subpass->view_mask) { 3675 radv_cs_emit_indirect_draw_packet(cmd_buffer, 3676 info->indexed, 3677 info->count, 3678 count_va, 3679 info->stride); 3680 } else { 3681 unsigned i; 3682 for_each_bit(i, state->subpass->view_mask) { 3683 radv_emit_view_index(cmd_buffer, i); 3684 3685 radv_cs_emit_indirect_draw_packet(cmd_buffer, 3686 info->indexed, 3687 info->count, 3688 count_va, 3689 info->stride); 3690 } 3691 } 3692 } else { 3693 
assert(state->pipeline->graphics.vtx_base_sgpr); 3694 3695 if (info->vertex_offset != state->last_vertex_offset || 3696 info->first_instance != state->last_first_instance) { 3697 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 3698 state->pipeline->graphics.vtx_emit_num); 3699 3700 radeon_emit(cs, info->vertex_offset); 3701 radeon_emit(cs, info->first_instance); 3702 if (state->pipeline->graphics.vtx_emit_num == 3) 3703 radeon_emit(cs, 0); 3704 state->last_first_instance = info->first_instance; 3705 state->last_vertex_offset = info->vertex_offset; 3706 } 3707 3708 if (state->last_num_instances != info->instance_count) { 3709 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false)); 3710 radeon_emit(cs, info->instance_count); 3711 state->last_num_instances = info->instance_count; 3712 } 3713 3714 if (info->indexed) { 3715 int index_size = state->index_type ? 4 : 2; 3716 uint64_t index_va; 3717 3718 index_va = state->index_va; 3719 index_va += info->first_index * index_size; 3720 3721 if (!state->subpass->view_mask) { 3722 radv_cs_emit_draw_indexed_packet(cmd_buffer, 3723 index_va, 3724 info->count); 3725 } else { 3726 unsigned i; 3727 for_each_bit(i, state->subpass->view_mask) { 3728 radv_emit_view_index(cmd_buffer, i); 3729 3730 radv_cs_emit_draw_indexed_packet(cmd_buffer, 3731 index_va, 3732 info->count); 3733 } 3734 } 3735 } else { 3736 if (!state->subpass->view_mask) { 3737 radv_cs_emit_draw_packet(cmd_buffer, 3738 info->count, 3739 !!info->strmout_buffer); 3740 } else { 3741 unsigned i; 3742 for_each_bit(i, state->subpass->view_mask) { 3743 radv_emit_view_index(cmd_buffer, i); 3744 3745 radv_cs_emit_draw_packet(cmd_buffer, 3746 info->count, 3747 !!info->strmout_buffer); 3748 } 3749 } 3750 } 3751 } 3752} 3753 3754/* 3755 * Vega and raven have a bug which triggers if there are multiple context 3756 * register contexts active at the same time with different scissor values. 
 *
 * There are two possible workarounds:
 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
 *    there is only ever 1 active set of scissor values at the same time.
 *
 * 2) Whenever the hardware switches contexts we have to set the scissor
 *    registers again even if it is a noop. That way the new context gets
 *    the correct scissor values.
 *
 * This implements option 2. radv_need_late_scissor_emission needs to
 * return true on affected HW if radv_emit_all_graphics_states sets
 * any context registers.
 */
static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
					    const struct radv_draw_info *info)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	/* Only affected HW (see has_scissor_bug) needs the late re-emit. */
	if (!cmd_buffer->device->physical_device->has_scissor_bug)
		return false;

	/* A context roll already happened without a scissor emit, or a
	 * streamout draw packet (which rolls the context) is coming. */
	if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
		return true;

	uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;

	/* Index, vertex and streamout buffers don't change context regs, and
	 * pipeline is already handled.
	 */
	used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
			 RADV_CMD_DIRTY_VERTEX_BUFFER |
			 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
			 RADV_CMD_DIRTY_PIPELINE);

	if (cmd_buffer->state.dirty & used_states)
		return true;

	/* Primitive restart index re-emission (0xffffffff for 32-bit
	 * indices, 0xffff for 16-bit) also writes a context register. */
	if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
	    (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
		return true;

	return false;
}

/* Emit every piece of graphics state that is dirty for this draw:
 * RB+ state, pipeline, framebuffer, index buffer, dynamic state and
 * draw registers, plus the late scissor re-emit workaround. */
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
			      const struct radv_draw_info *info)
{
	bool late_scissor_emission;

	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
		radv_emit_rbplus_state(cmd_buffer);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
		radv_emit_graphics_pipeline(cmd_buffer);

	/* This should be before the cmd_buffer->state.dirty is cleared
	 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
	 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
	late_scissor_emission =
		radv_need_late_scissor_emission(cmd_buffer, info);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
		radv_emit_framebuffer_state(cmd_buffer);

	if (info->indexed) {
		if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
			radv_emit_index_buffer(cmd_buffer);
	} else {
		/* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
		 * so the state must be re-emitted before the next indexed
		 * draw.
		 */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			cmd_buffer->state.last_index_type = -1;
			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
		}
	}

	radv_cmd_buffer_flush_dynamic_state(cmd_buffer);

	radv_emit_draw_registers(cmd_buffer, info);

	if (late_scissor_emission)
		radv_emit_scissor(cmd_buffer);
}

/* Common draw path for all vkCmdDraw* entry points: flushes state and
 * caches, uploads descriptors, emits the draw packets and schedules
 * shader/VBO prefetches in the order that minimizes CU idle time. */
static void
radv_draw(struct radv_cmd_buffer *cmd_buffer,
	  const struct radv_draw_info *info)
{
	struct radeon_info *rad_info =
		&cmd_buffer->device->physical_device->rad_info;
	bool has_prefetch =
		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
	bool pipeline_is_dirty =
		(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
		cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;

	MAYBE_UNUSED unsigned cdw_max =
		radeon_check_space(cmd_buffer->device->ws,
				   cmd_buffer->cs, 4096);

	if (likely(!info->indirect)) {
		/* SI-CI treat instance_count==0 as instance_count==1. There is
		 * no workaround for indirect draws, but we can at least skip
		 * direct draws.
		 */
		if (unlikely(!info->instance_count))
			return;

		/* Handle count == 0. */
		if (unlikely(!info->count && !info->strmout_buffer))
			return;
	}

	/* Use optimal packet order based on whether we need to sync the
	 * pipeline.
	 */
	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
		/* If we have to wait for idle, set all states first, so that
		 * all SET packets are processed in parallel with previous draw
		 * calls. Then upload descriptors, set shader pointers, and
		 * draw, and prefetch at the end. This ensures that the time
		 * the CUs are idle is very short. (there are only SET_SH
		 * packets between the wait and the draw)
		 */
		radv_emit_all_graphics_states(cmd_buffer, info);
		si_emit_cache_flush(cmd_buffer);
		/* <-- CUs are idle here --> */

		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

		radv_emit_draw_packets(cmd_buffer, info);
		/* <-- CUs are busy here --> */

		/* Start prefetches after the draw has been started. Both will
		 * run in parallel, but starting the draw first is more
		 * important.
		 */
		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, false);
		}
	} else {
		/* If we don't wait for idle, start prefetches first, then set
		 * states, and draw at the end.
		 */
		si_emit_cache_flush(cmd_buffer);

		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			/* Only prefetch the vertex shader and VBO descriptors
			 * in order to start the draw as soon as possible.
			 */
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, true);
		}

		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

		radv_emit_all_graphics_states(cmd_buffer, info);
		radv_emit_draw_packets(cmd_buffer, info);

		/* Prefetch the remaining shaders after the draw has been
		 * started.
		 */
		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, false);
		}
	}

	/* Workaround for a VGT hang when streamout is enabled.
	 * It must be done after drawing.
	 */
	if (cmd_buffer->state.streamout.streamout_enabled &&
	    (rad_info->family == CHIP_HAWAII ||
	     rad_info->family == CHIP_TONGA ||
	     rad_info->family == CHIP_FIJI)) {
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}

/* vkCmdDraw: direct, non-indexed draw forwarded to the common path. */
void radv_CmdDraw(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    vertexCount,
	uint32_t                                    instanceCount,
	uint32_t                                    firstVertex,
	uint32_t                                    firstInstance)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_draw_info info = {};

	info.count = vertexCount;
	info.instance_count = instanceCount;
	info.first_instance = firstInstance;
	info.vertex_offset = firstVertex;

	radv_draw(cmd_buffer, &info);
}

/* vkCmdDrawIndexed: direct, indexed draw forwarded to the common path. */
void radv_CmdDrawIndexed(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    indexCount,
	uint32_t                                    instanceCount,
	uint32_t                                    firstIndex,
	int32_t                                     vertexOffset,
	uint32_t                                    firstInstance)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_draw_info info = {};

	info.indexed = true;
	info.count = indexCount;
	info.instance_count = instanceCount;
	info.first_index = firstIndex;
	info.vertex_offset = vertexOffset;
	info.first_instance = firstInstance;

	radv_draw(cmd_buffer, &info);
}

/* vkCmdDrawIndirect: non-indexed indirect draw (parameters in a buffer). */
void radv_CmdDrawIndirect(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset,
	uint32_t                                    drawCount,
	uint32_t                                    stride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	struct radv_draw_info info = {};

	info.count = drawCount;
	info.indirect = buffer;
	info.indirect_offset = offset;
	info.stride = stride;

	radv_draw(cmd_buffer, &info);
}

void
radv_CmdDrawIndexedIndirect( 4006 VkCommandBuffer commandBuffer, 4007 VkBuffer _buffer, 4008 VkDeviceSize offset, 4009 uint32_t drawCount, 4010 uint32_t stride) 4011{ 4012 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4013 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4014 struct radv_draw_info info = {}; 4015 4016 info.indexed = true; 4017 info.count = drawCount; 4018 info.indirect = buffer; 4019 info.indirect_offset = offset; 4020 info.stride = stride; 4021 4022 radv_draw(cmd_buffer, &info); 4023} 4024 4025void radv_CmdDrawIndirectCountAMD( 4026 VkCommandBuffer commandBuffer, 4027 VkBuffer _buffer, 4028 VkDeviceSize offset, 4029 VkBuffer _countBuffer, 4030 VkDeviceSize countBufferOffset, 4031 uint32_t maxDrawCount, 4032 uint32_t stride) 4033{ 4034 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4035 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4036 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 4037 struct radv_draw_info info = {}; 4038 4039 info.count = maxDrawCount; 4040 info.indirect = buffer; 4041 info.indirect_offset = offset; 4042 info.count_buffer = count_buffer; 4043 info.count_buffer_offset = countBufferOffset; 4044 info.stride = stride; 4045 4046 radv_draw(cmd_buffer, &info); 4047} 4048 4049void radv_CmdDrawIndexedIndirectCountAMD( 4050 VkCommandBuffer commandBuffer, 4051 VkBuffer _buffer, 4052 VkDeviceSize offset, 4053 VkBuffer _countBuffer, 4054 VkDeviceSize countBufferOffset, 4055 uint32_t maxDrawCount, 4056 uint32_t stride) 4057{ 4058 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4059 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4060 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 4061 struct radv_draw_info info = {}; 4062 4063 info.indexed = true; 4064 info.count = maxDrawCount; 4065 info.indirect = buffer; 4066 info.indirect_offset = offset; 4067 info.count_buffer = count_buffer; 4068 info.count_buffer_offset = countBufferOffset; 4069 info.stride = stride; 4070 4071 
radv_draw(cmd_buffer, &info); 4072} 4073 4074void radv_CmdDrawIndirectCountKHR( 4075 VkCommandBuffer commandBuffer, 4076 VkBuffer _buffer, 4077 VkDeviceSize offset, 4078 VkBuffer _countBuffer, 4079 VkDeviceSize countBufferOffset, 4080 uint32_t maxDrawCount, 4081 uint32_t stride) 4082{ 4083 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4084 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4085 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 4086 struct radv_draw_info info = {}; 4087 4088 info.count = maxDrawCount; 4089 info.indirect = buffer; 4090 info.indirect_offset = offset; 4091 info.count_buffer = count_buffer; 4092 info.count_buffer_offset = countBufferOffset; 4093 info.stride = stride; 4094 4095 radv_draw(cmd_buffer, &info); 4096} 4097 4098void radv_CmdDrawIndexedIndirectCountKHR( 4099 VkCommandBuffer commandBuffer, 4100 VkBuffer _buffer, 4101 VkDeviceSize offset, 4102 VkBuffer _countBuffer, 4103 VkDeviceSize countBufferOffset, 4104 uint32_t maxDrawCount, 4105 uint32_t stride) 4106{ 4107 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4108 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4109 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 4110 struct radv_draw_info info = {}; 4111 4112 info.indexed = true; 4113 info.count = maxDrawCount; 4114 info.indirect = buffer; 4115 info.indirect_offset = offset; 4116 info.count_buffer = count_buffer; 4117 info.count_buffer_offset = countBufferOffset; 4118 info.stride = stride; 4119 4120 radv_draw(cmd_buffer, &info); 4121} 4122 4123struct radv_dispatch_info { 4124 /** 4125 * Determine the layout of the grid (in block units) to be used. 4126 */ 4127 uint32_t blocks[3]; 4128 4129 /** 4130 * A starting offset for the grid. If unaligned is set, the offset 4131 * must still be aligned. 4132 */ 4133 uint32_t offsets[3]; 4134 /** 4135 * Whether it's an unaligned compute dispatch. 4136 */ 4137 bool unaligned; 4138 4139 /** 4140 * Indirect compute parameters resource. 
	 */
	struct radv_buffer *indirect;
	uint64_t indirect_offset;
};

/* Emit the PM4 packets for one compute dispatch: grid-size user SGPRs,
 * partial-thread-group setup for unaligned dispatches, start offsets,
 * and finally DISPATCH_DIRECT or DISPATCH_INDIRECT. */
static void
radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
			   const struct radv_dispatch_info *info)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
	struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
	unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
	struct radeon_winsys *ws = cmd_buffer->device->ws;
	bool predicating = cmd_buffer->state.predicating;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_userdata_info *loc;

	/* Location of the grid-size user SGPRs, if the shader needs them. */
	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
				    AC_UD_CS_GRID_SIZE);

	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);

	if (info->indirect) {
		uint64_t va = radv_buffer_get_va(info->indirect->bo);

		va += info->indirect->offset + info->indirect_offset;

		radv_cs_add_buffer(ws, cs, info->indirect->bo);

		if (loc->sgpr_idx != -1) {
			/* Copy the three grid-size dwords from the indirect
			 * buffer into the shader's user SGPRs via the CP. */
			for (unsigned i = 0; i < 3; ++i) {
				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
						COPY_DATA_DST_SEL(COPY_DATA_REG));
				radeon_emit(cs, (va + 4 * i));
				radeon_emit(cs, (va + 4 * i) >> 32);
				radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
						 + loc->sgpr_idx * 4) >> 2) + i);
				radeon_emit(cs, 0);
			}
		}

		if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
			/* MEC (compute queue) takes the VA directly. */
			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			radeon_emit(cs, dispatch_initiator);
		} else {
			/* On the GFX queue, set the indirect base first, then
			 * dispatch with a zero offset from that base. */
			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, 1);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);

			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, 0);
			radeon_emit(cs, dispatch_initiator);
		}
	} else {
		unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
		unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };

		if (info->unaligned) {
			unsigned *cs_block_size = compute_shader->info.cs.block_size;
			unsigned remainder[3];

			/* If aligned, these should be an entire block size,
			 * not 0.
			 */
			remainder[0] = blocks[0] + cs_block_size[0] -
				       align_u32_npot(blocks[0], cs_block_size[0]);
			remainder[1] = blocks[1] + cs_block_size[1] -
				       align_u32_npot(blocks[1], cs_block_size[1]);
			remainder[2] = blocks[2] + cs_block_size[2] -
				       align_u32_npot(blocks[2], cs_block_size[2]);

			blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
			blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
			blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);

			for(unsigned i = 0; i < 3; ++i) {
				assert(offsets[i] % cs_block_size[i] == 0);
				offsets[i] /= cs_block_size[i];
			}

			/* Program full and partial thread counts per axis so
			 * the last (partial) thread group is sized correctly. */
			radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));

			dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
		}

		if (loc->sgpr_idx != -1) {
			assert(loc->num_sgprs == 3);

			/* Grid size goes straight into the user SGPRs. */
			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
					      loc->sgpr_idx * 4, 3);
			radeon_emit(cs, blocks[0]);
			radeon_emit(cs, blocks[1]);
			radeon_emit(cs, blocks[2]);
		}

		if (offsets[0] || offsets[1] || offsets[2]) {
			radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
			radeon_emit(cs, offsets[0]);
			radeon_emit(cs, offsets[1]);
			radeon_emit(cs, offsets[2]);

			/* The blocks in the packet are not counts but end values. */
			for (unsigned i = 0; i < 3; ++i)
				blocks[i] += offsets[i];
		} else {
			dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
		}

		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, blocks[0]);
		radeon_emit(cs, blocks[1]);
		radeon_emit(cs, blocks[2]);
		radeon_emit(cs, dispatch_initiator);
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
}

/* Flush compute descriptors and push constants before a dispatch. */
static void
radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
{
	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
}

/* Common dispatch path: orders state emission, cache flush, descriptor
 * upload, dispatch packets and shader prefetch to minimize CU idle time
 * (mirrors the packet-ordering logic in radv_draw). */
static void
radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
	      const struct radv_dispatch_info *info)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
	bool has_prefetch =
		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
	bool pipeline_is_dirty = pipeline &&
				 pipeline != cmd_buffer->state.emitted_compute_pipeline;

	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
		/* If we have to wait for idle, set all states first, so that
		 * all SET packets are processed in parallel with previous draw
		 * calls. Then upload descriptors, set shader pointers, and
		 * dispatch, and prefetch at the end. This ensures that the
		 * time the CUs are idle is very short. (there are only SET_SH
		 * packets between the wait and the draw)
		 */
		radv_emit_compute_pipeline(cmd_buffer);
		si_emit_cache_flush(cmd_buffer);
		/* <-- CUs are idle here --> */

		radv_upload_compute_shader_descriptors(cmd_buffer);

		radv_emit_dispatch_packets(cmd_buffer, info);
		/* <-- CUs are busy here --> */

		/* Start prefetches after the dispatch has been started. Both
		 * will run in parallel, but starting the dispatch first is
		 * more important.
		 */
		if (has_prefetch && pipeline_is_dirty) {
			radv_emit_shader_prefetch(cmd_buffer,
						  pipeline->shaders[MESA_SHADER_COMPUTE]);
		}
	} else {
		/* If we don't wait for idle, start prefetches first, then set
		 * states, and dispatch at the end.
		 */
		si_emit_cache_flush(cmd_buffer);

		if (has_prefetch && pipeline_is_dirty) {
			radv_emit_shader_prefetch(cmd_buffer,
						  pipeline->shaders[MESA_SHADER_COMPUTE]);
		}

		radv_upload_compute_shader_descriptors(cmd_buffer);

		radv_emit_compute_pipeline(cmd_buffer);
		radv_emit_dispatch_packets(cmd_buffer, info);
	}

	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
}

/* vkCmdDispatchBase: dispatch with a non-zero grid base offset. */
void radv_CmdDispatchBase(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    base_x,
	uint32_t                                    base_y,
	uint32_t                                    base_z,
	uint32_t                                    x,
	uint32_t                                    y,
	uint32_t                                    z)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_dispatch_info info = {};

	info.blocks[0] = x;
	info.blocks[1] = y;
	info.blocks[2] = z;

	info.offsets[0] = base_x;
	info.offsets[1] = base_y;
	info.offsets[2] = base_z;
	radv_dispatch(cmd_buffer, &info);
}

/* vkCmdDispatch: plain dispatch == dispatch with a zero base. */
void radv_CmdDispatch(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    x,
	uint32_t                                    y,
	uint32_t                                    z)
{
	radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}

void radv_CmdDispatchIndirect(
VkCommandBuffer commandBuffer, 4374 VkBuffer _buffer, 4375 VkDeviceSize offset) 4376{ 4377 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4378 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 4379 struct radv_dispatch_info info = {}; 4380 4381 info.indirect = buffer; 4382 info.indirect_offset = offset; 4383 4384 radv_dispatch(cmd_buffer, &info); 4385} 4386 4387void radv_unaligned_dispatch( 4388 struct radv_cmd_buffer *cmd_buffer, 4389 uint32_t x, 4390 uint32_t y, 4391 uint32_t z) 4392{ 4393 struct radv_dispatch_info info = {}; 4394 4395 info.blocks[0] = x; 4396 info.blocks[1] = y; 4397 info.blocks[2] = z; 4398 info.unaligned = 1; 4399 4400 radv_dispatch(cmd_buffer, &info); 4401} 4402 4403void radv_CmdEndRenderPass( 4404 VkCommandBuffer commandBuffer) 4405{ 4406 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4407 4408 radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); 4409 4410 radv_cmd_buffer_end_subpass(cmd_buffer); 4411 4412 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); 4413 4414 cmd_buffer->state.pass = NULL; 4415 cmd_buffer->state.subpass = NULL; 4416 cmd_buffer->state.attachments = NULL; 4417 cmd_buffer->state.framebuffer = NULL; 4418} 4419 4420void radv_CmdEndRenderPass2KHR( 4421 VkCommandBuffer commandBuffer, 4422 const VkSubpassEndInfoKHR* pSubpassEndInfo) 4423{ 4424 radv_CmdEndRenderPass(commandBuffer); 4425} 4426 4427/* 4428 * For HTILE we have the following interesting clear words: 4429 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE 4430 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE. 
4431 * 0xfffffff0: Clear depth to 1.0 4432 * 0x00000000: Clear depth to 0.0 4433 */ 4434static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, 4435 struct radv_image *image, 4436 const VkImageSubresourceRange *range, 4437 uint32_t clear_word) 4438{ 4439 assert(range->baseMipLevel == 0); 4440 assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS); 4441 unsigned layer_count = radv_get_layerCount(image, range); 4442 uint64_t size = image->planes[0].surface.htile_slice_size * layer_count; 4443 VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT; 4444 uint64_t offset = image->offset + image->htile_offset + 4445 image->planes[0].surface.htile_slice_size * range->baseArrayLayer; 4446 struct radv_cmd_state *state = &cmd_buffer->state; 4447 VkClearDepthStencilValue value = {}; 4448 4449 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | 4450 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4451 4452 state->flush_bits |= radv_fill_buffer(cmd_buffer, image->bo, offset, 4453 size, clear_word); 4454 4455 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4456 4457 if (vk_format_is_stencil(image->vk_format)) 4458 aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 4459 4460 radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects); 4461 4462 if (radv_image_is_tc_compat_htile(image)) { 4463 /* Initialize the TC-compat metada value to 0 because by 4464 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only 4465 * need have to conditionally update its value when performing 4466 * a fast depth clear. 
		 */
		radv_set_tc_compat_zrange_metadata(cmd_buffer, image, 0);
	}
}

/* Handle depth/stencil layout transitions for HTILE: initialize HTILE
 * on first use, re-initialize when entering a compressed layout, and
 * decompress in place when leaving one. */
static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
					       struct radv_image *image,
					       VkImageLayout src_layout,
					       VkImageLayout dst_layout,
					       unsigned src_queue_mask,
					       unsigned dst_queue_mask,
					       const VkImageSubresourceRange *range)
{
	if (!radv_image_has_htile(image))
		return;

	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
		/* First use: pick the uncompressed full-range clear word
		 * (see the HTILE clear-word table above). */
		uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;

		if (radv_layout_is_htile_compressed(image, dst_layout,
						    dst_queue_mask)) {
			clear_value = 0;
		}

		radv_initialize_htile(cmd_buffer, image, range, clear_value);
	} else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
	           radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
		/* Uncompressed -> compressed: re-initialize HTILE. */
		uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
		radv_initialize_htile(cmd_buffer, image, range, clear_value);
	} else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
	           !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
		/* Compressed -> uncompressed: decompress the depth aspect
		 * in place (mip 0 only). */
		VkImageSubresourceRange local_range = *range;
		local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
		local_range.baseMipLevel = 0;
		local_range.levelCount = 1;

		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;

		radv_decompress_depth_image_inplace(cmd_buffer, image, &local_range);

		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
	}
}

/* Fill the image's CMASK with 'value', flushing CB caches around it. */
static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
				  struct radv_image *image, uint32_t value)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	state->flush_bits |= radv_clear_cmask(cmd_buffer, image, value);

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}

/* Initialize FMASK to the identity pattern for the image's sample count. */
void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
			   struct radv_image *image)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	/* One identity pattern per log2(samples): 1x, 2x, 4x, 8x. */
	static const uint32_t fmask_clear_values[4] = {
		0x00000000,
		0x02020202,
		0xE4E4E4E4,
		0x76543210
	};
	uint32_t log2_samples = util_logbase2(image->info.samples);
	uint32_t value = fmask_clear_values[log2_samples];

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	state->flush_bits |= radv_clear_fmask(cmd_buffer, image, value);

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}

/* Fill the image's DCC metadata with 'value', flushing CB caches around it. */
void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
			 struct radv_image *image, uint32_t value)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	state->flush_bits |= radv_clear_dcc(cmd_buffer, image, value);

	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}

/**
 * Initialize DCC/FMASK/CMASK metadata for a color image.
 */
static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
					   struct radv_image *image,
					   VkImageLayout src_layout,
					   VkImageLayout dst_layout,
					   unsigned src_queue_mask,
					   unsigned dst_queue_mask)
{
	if (radv_image_has_cmask(image)) {
		uint32_t value = 0xffffffffu; /* Fully expanded mode. */

		/* TODO: clarify this. */
		if (radv_image_has_fmask(image)) {
			value = 0xccccccccu;
		}

		radv_initialise_cmask(cmd_buffer, image, value);
	}

	if (radv_image_has_fmask(image)) {
		radv_initialize_fmask(cmd_buffer, image);
	}

	if (radv_image_has_dcc(image)) {
		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
		bool need_decompress_pass = false;

		if (radv_layout_dcc_compressed(image, dst_layout,
					       dst_queue_mask)) {
			value = 0x20202020u;
			need_decompress_pass = true;
		}

		radv_initialize_dcc(cmd_buffer, image, value);

		/* Record whether a fast-clear-eliminate pass is needed. */
		radv_update_fce_metadata(cmd_buffer, image,
					 need_decompress_pass);
	}

	if (radv_image_has_cmask(image) || radv_image_has_dcc(image)) {
		uint32_t color_values[2] = {};
		radv_set_color_clear_metadata(cmd_buffer, image, color_values);
	}
}

/**
 * Handle color image transitions for DCC/FMASK/CMASK.
 */
static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
					       struct radv_image *image,
					       VkImageLayout src_layout,
					       VkImageLayout dst_layout,
					       unsigned src_queue_mask,
					       unsigned dst_queue_mask,
					       const VkImageSubresourceRange *range)
{
	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
		/* First use: (re)initialize all color metadata. */
		radv_init_color_image_metadata(cmd_buffer, image,
					       src_layout, dst_layout,
					       src_queue_mask, dst_queue_mask);
		return;
	}

	if (radv_image_has_dcc(image)) {
		if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
			radv_initialize_dcc(cmd_buffer, image, 0xffffffffu);
		} else if (radv_layout_dcc_compressed(image, src_layout, src_queue_mask) &&
		           !radv_layout_dcc_compressed(image, dst_layout, dst_queue_mask)) {
			/* Leaving a DCC-compressed layout: decompress. */
			radv_decompress_dcc(cmd_buffer, image, range);
		} else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
		           !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
		}
	} else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
		if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
		    !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
		}

		if (radv_image_has_fmask(image)) {
			if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
			    dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
				/* FMASK must be expanded before shaders may
				 * access the image as general storage. */
				radv_expand_fmask_image_inplace(cmd_buffer, image, range);
			}
		}
	}
}

/* Dispatch an image layout/queue-family transition to the depth or
 * color handler, skipping ownership-transfer halves that this queue
 * should not (or cannot) perform. */
static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
					 struct radv_image *image,
					 VkImageLayout src_layout,
					 VkImageLayout dst_layout,
					 uint32_t src_family,
					 uint32_t dst_family,
					 const VkImageSubresourceRange *range)
{
	if (image->exclusive && src_family != dst_family) {
		/* This is an acquire or a release operation and there will be
		 * a corresponding release/acquire. Do the transition in the
		 * most flexible queue. */

		assert(src_family == cmd_buffer->queue_family_index ||
		       dst_family == cmd_buffer->queue_family_index);

		if (src_family == VK_QUEUE_FAMILY_EXTERNAL)
			return;

		if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
			return;

		if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
		    (src_family == RADV_QUEUE_GENERAL ||
		     dst_family == RADV_QUEUE_GENERAL))
			return;
	}

	if (src_layout == dst_layout)
		return;

	unsigned src_queue_mask =
		radv_image_queue_family_mask(image, src_family,
					     cmd_buffer->queue_family_index);
	unsigned dst_queue_mask =
		radv_image_queue_family_mask(image, dst_family,
					     cmd_buffer->queue_family_index);

	if (vk_format_is_depth(image->vk_format)) {
		radv_handle_depth_image_transition(cmd_buffer, image,
						   src_layout, dst_layout,
						   src_queue_mask, dst_queue_mask,
						   range);
	} else {
		radv_handle_color_image_transition(cmd_buffer, image,
						   src_layout, dst_layout,
						   src_queue_mask, dst_queue_mask,
						   range);
	}
}

/* Shared parameters for vkCmdPipelineBarrier and vkCmdWaitEvents. */
struct radv_barrier_info {
	uint32_t eventCount;
	const VkEvent *pEvents;
	VkPipelineStageFlags srcStageMask;
	VkPipelineStageFlags dstStageMask;
};

/* Common barrier implementation: wait on events, translate access masks
 * to cache-flush bits, flush stages, and run image layout transitions. */
static void
radv_barrier(struct radv_cmd_buffer *cmd_buffer,
	     uint32_t memoryBarrierCount,
	     const VkMemoryBarrier *pMemoryBarriers,
	     uint32_t bufferMemoryBarrierCount,
	     const VkBufferMemoryBarrier *pBufferMemoryBarriers,
	     uint32_t imageMemoryBarrierCount,
	     const VkImageMemoryBarrier *pImageMemoryBarriers,
	     const struct radv_barrier_info *info)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	enum radv_cmd_flush_bits src_flush_bits = 0;
	enum radv_cmd_flush_bits dst_flush_bits = 0;

	for (unsigned i = 0; i < info->eventCount; ++i) {
		RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
		uint64_t va = radv_buffer_get_va(event->bo);

		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);

		/* Wait until the event's BO contains 1 (set state). */
		radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
		assert(cmd_buffer->cs->cdw <= cdw_max);
	}

	/* Accumulate the cache flushes implied by each barrier's access
	 * masks; a NULL image means the flush is not image-specific. */
	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
							NULL);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
							NULL);
	}

	for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
							NULL);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
							NULL);
	}

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);

		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
							image);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
							image);
	}

	/* The Vulkan spec 1.1.98 says:
	 *
	 * "An execution dependency with only
	 *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
	 *  will only prevent that stage from executing in subsequently
	 *  submitted commands. As this stage does not perform any actual
	 *  execution, this is not observable - in effect, it does not delay
	 *  processing of subsequent commands. Similarly an execution dependency
	 *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
	 *  will effectively not wait for any prior commands to complete."
	 */
	if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
		radv_stage_flush(cmd_buffer, info->srcStageMask);
	cmd_buffer->state.flush_bits |= src_flush_bits;

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
		radv_handle_image_transition(cmd_buffer, image,
					     pImageMemoryBarriers[i].oldLayout,
					     pImageMemoryBarriers[i].newLayout,
					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
					     &pImageMemoryBarriers[i].subresourceRange);
	}

	/* Make sure CP DMA is idle because the driver might have performed a
	 * DMA operation for copying or filling buffers/images.
	 */
	if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
				  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
		si_cp_dma_wait_for_idle(cmd_buffer);

	cmd_buffer->state.flush_bits |= dst_flush_bits;
}

/* vkCmdPipelineBarrier: a barrier with no events to wait on. */
void radv_CmdPipelineBarrier(
	VkCommandBuffer                             commandBuffer,
	VkPipelineStageFlags                        srcStageMask,
	VkPipelineStageFlags                        destStageMask,
	VkBool32                                    byRegion,
	uint32_t                                    memoryBarrierCount,
	const VkMemoryBarrier*                      pMemoryBarriers,
	uint32_t                                    bufferMemoryBarrierCount,
	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
	uint32_t                                    imageMemoryBarrierCount,
	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_barrier_info info;

	info.eventCount = 0;
	info.pEvents = NULL;
	info.srcStageMask = srcStageMask;
	info.dstStageMask = destStageMask;

	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}


static void write_event(struct radv_cmd_buffer *cmd_buffer,
			struct radv_event *event,
			VkPipelineStageFlags
stageMask, 4822 unsigned value) 4823{ 4824 struct radeon_cmdbuf *cs = cmd_buffer->cs; 4825 uint64_t va = radv_buffer_get_va(event->bo); 4826 4827 si_emit_cache_flush(cmd_buffer); 4828 4829 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 4830 4831 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21); 4832 4833 /* Flags that only require a top-of-pipe event. */ 4834 VkPipelineStageFlags top_of_pipe_flags = 4835 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; 4836 4837 /* Flags that only require a post-index-fetch event. */ 4838 VkPipelineStageFlags post_index_fetch_flags = 4839 top_of_pipe_flags | 4840 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | 4841 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; 4842 4843 /* Make sure CP DMA is idle because the driver might have performed a 4844 * DMA operation for copying or filling buffers/images. 4845 */ 4846 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | 4847 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)) 4848 si_cp_dma_wait_for_idle(cmd_buffer); 4849 4850 /* TODO: Emit EOS events for syncing PS/CS stages. */ 4851 4852 if (!(stageMask & ~top_of_pipe_flags)) { 4853 /* Just need to sync the PFP engine. */ 4854 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 4855 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | 4856 S_370_WR_CONFIRM(1) | 4857 S_370_ENGINE_SEL(V_370_PFP)); 4858 radeon_emit(cs, va); 4859 radeon_emit(cs, va >> 32); 4860 radeon_emit(cs, value); 4861 } else if (!(stageMask & ~post_index_fetch_flags)) { 4862 /* Sync ME because PFP reads index and indirect buffers. */ 4863 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 4864 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | 4865 S_370_WR_CONFIRM(1) | 4866 S_370_ENGINE_SEL(V_370_ME)); 4867 radeon_emit(cs, va); 4868 radeon_emit(cs, va >> 32); 4869 radeon_emit(cs, value); 4870 } else { 4871 /* Otherwise, sync all prior GPU work using an EOP event. 
*/ 4872 si_cs_emit_write_event_eop(cs, 4873 cmd_buffer->device->physical_device->rad_info.chip_class, 4874 radv_cmd_buffer_uses_mec(cmd_buffer), 4875 V_028A90_BOTTOM_OF_PIPE_TS, 0, 4876 EOP_DATA_SEL_VALUE_32BIT, va, value, 4877 cmd_buffer->gfx9_eop_bug_va); 4878 } 4879 4880 assert(cmd_buffer->cs->cdw <= cdw_max); 4881} 4882 4883void radv_CmdSetEvent(VkCommandBuffer commandBuffer, 4884 VkEvent _event, 4885 VkPipelineStageFlags stageMask) 4886{ 4887 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4888 RADV_FROM_HANDLE(radv_event, event, _event); 4889 4890 write_event(cmd_buffer, event, stageMask, 1); 4891} 4892 4893void radv_CmdResetEvent(VkCommandBuffer commandBuffer, 4894 VkEvent _event, 4895 VkPipelineStageFlags stageMask) 4896{ 4897 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4898 RADV_FROM_HANDLE(radv_event, event, _event); 4899 4900 write_event(cmd_buffer, event, stageMask, 0); 4901} 4902 4903void radv_CmdWaitEvents(VkCommandBuffer commandBuffer, 4904 uint32_t eventCount, 4905 const VkEvent* pEvents, 4906 VkPipelineStageFlags srcStageMask, 4907 VkPipelineStageFlags dstStageMask, 4908 uint32_t memoryBarrierCount, 4909 const VkMemoryBarrier* pMemoryBarriers, 4910 uint32_t bufferMemoryBarrierCount, 4911 const VkBufferMemoryBarrier* pBufferMemoryBarriers, 4912 uint32_t imageMemoryBarrierCount, 4913 const VkImageMemoryBarrier* pImageMemoryBarriers) 4914{ 4915 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4916 struct radv_barrier_info info; 4917 4918 info.eventCount = eventCount; 4919 info.pEvents = pEvents; 4920 info.srcStageMask = 0; 4921 4922 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, 4923 bufferMemoryBarrierCount, pBufferMemoryBarriers, 4924 imageMemoryBarrierCount, pImageMemoryBarriers, &info); 4925} 4926 4927 4928void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, 4929 uint32_t deviceMask) 4930{ 4931 /* No-op */ 4932} 4933 4934/* VK_EXT_conditional_rendering */ 4935void 
radv_CmdBeginConditionalRenderingEXT(
	VkCommandBuffer                             commandBuffer,
	const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	bool draw_visible = true;
	uint64_t pred_value = 0;
	uint64_t va, new_va;
	unsigned pred_offset;

	va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;

	/* By default, if the 32-bit value at offset in buffer memory is zero,
	 * then the rendering commands are discarded, otherwise they are
	 * executed as normal. If the inverted flag is set, all commands are
	 * discarded if the value is non zero.
	 */
	if (pConditionalRenderingBegin->flags &
	    VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
		draw_visible = false;
	}

	si_emit_cache_flush(cmd_buffer);

	/* From the Vulkan spec 1.1.107:
	 *
	 *    "If the 32-bit value at offset in buffer memory is zero, then the
	 *    rendering commands are discarded, otherwise they are executed as
	 *    normal. If the value of the predicate in buffer memory changes while
	 *    conditional rendering is active, the rendering commands may be
	 *    discarded in an implementation-dependent way. Some implementations
	 *    may latch the value of the predicate upon beginning conditional
	 *    rendering while others may read it before every rendering command."
	 *
	 * But, the AMD hardware treats the predicate as a 64-bit value which
	 * means we need a workaround in the driver. Luckily, it's not required
	 * to support if the value changes when predication is active.
	 *
	 * The workaround is as follows:
	 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
	 * 2) copy the 32-bit predicate value to the upload BO
	 * 3) use the new allocated VA address for predication
	 *
	 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
	 * in ME (+ sync PFP) instead of PFP.
	 */
	radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);

	new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

	/* Copy the low 32 bits of the user predicate over the zeroed 64-bit
	 * slot (step 2 of the workaround above). */
	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
			COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
			COPY_DATA_WR_CONFIRM);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, new_va);
	radeon_emit(cs, new_va >> 32);

	/* Make PFP wait until the ME copy above has landed. */
	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
	radeon_emit(cs, 0);

	/* Enable predication for this command buffer. */
	si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
	cmd_buffer->state.predicating = true;

	/* Store conditional rendering user info. */
	cmd_buffer->state.predication_type = draw_visible;
	cmd_buffer->state.predication_va = new_va;
}

void radv_CmdEndConditionalRenderingEXT(
	VkCommandBuffer                             commandBuffer)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	/* Disable predication for this command buffer. */
	si_emit_set_predication_state(cmd_buffer, false, 0);
	cmd_buffer->state.predicating = false;

	/* Reset conditional rendering user info. */
	cmd_buffer->state.predication_type = -1;
	cmd_buffer->state.predication_va = 0;
}

/* VK_EXT_transform_feedback */
void radv_CmdBindTransformFeedbackBuffersEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstBinding,
	uint32_t                                    bindingCount,
	const VkBuffer*                             pBuffers,
	const VkDeviceSize*                         pOffsets,
	const VkDeviceSize*                         pSizes)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
	uint8_t enabled_mask = 0;

	assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
	for (uint32_t i = 0; i < bindingCount; i++) {
		uint32_t idx = firstBinding + i;

		sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
		sb[idx].offset = pOffsets[i];
		sb[idx].size = pSizes[i];

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   sb[idx].buffer->bo);

		enabled_mask |= 1 << idx;
	}

	cmd_buffer->state.streamout.enabled_mask |= enabled_mask;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}

/* Emit the VGT streamout config registers from the current streamout state. */
static void
radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;

	radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
	radeon_emit(cs,
		    S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
		    S_028B94_RAST_STREAM(0) |
		    S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
		    S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
		    S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
	radeon_emit(cs, so->hw_enabled_mask &
			so->enabled_stream_buffers_mask);

	/* Writing a context register rolls the context. */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/* Update the streamout enable state and re-emit the config registers if
 * anything changed. */
static void
radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
{
	struct radv_streamout_state *so = 
&cmd_buffer->state.streamout; 5077 bool old_streamout_enabled = so->streamout_enabled; 5078 uint32_t old_hw_enabled_mask = so->hw_enabled_mask; 5079 5080 so->streamout_enabled = enable; 5081 5082 so->hw_enabled_mask = so->enabled_mask | 5083 (so->enabled_mask << 4) | 5084 (so->enabled_mask << 8) | 5085 (so->enabled_mask << 12); 5086 5087 if ((old_streamout_enabled != so->streamout_enabled) || 5088 (old_hw_enabled_mask != so->hw_enabled_mask)) 5089 radv_emit_streamout_enable(cmd_buffer); 5090} 5091 5092static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) 5093{ 5094 struct radeon_cmdbuf *cs = cmd_buffer->cs; 5095 unsigned reg_strmout_cntl; 5096 5097 /* The register is at different places on different ASICs. */ 5098 if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { 5099 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 5100 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); 5101 } else { 5102 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; 5103 radeon_set_config_reg(cs, reg_strmout_cntl, 0); 5104 } 5105 5106 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 5107 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); 5108 5109 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); 5110 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ 5111 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ 5112 radeon_emit(cs, 0); 5113 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ 5114 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ 5115 radeon_emit(cs, 4); /* poll interval */ 5116} 5117 5118void radv_CmdBeginTransformFeedbackEXT( 5119 VkCommandBuffer commandBuffer, 5120 uint32_t firstCounterBuffer, 5121 uint32_t counterBufferCount, 5122 const VkBuffer* pCounterBuffers, 5123 const VkDeviceSize* pCounterBufferOffsets) 5124{ 5125 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5126 struct radv_streamout_binding *sb = 
cmd_buffer->streamout_bindings; 5127 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 5128 struct radeon_cmdbuf *cs = cmd_buffer->cs; 5129 uint32_t i; 5130 5131 radv_flush_vgt_streamout(cmd_buffer); 5132 5133 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 5134 for_each_bit(i, so->enabled_mask) { 5135 int32_t counter_buffer_idx = i - firstCounterBuffer; 5136 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 5137 counter_buffer_idx = -1; 5138 5139 /* SI binds streamout buffers as shader resources. 5140 * VGT only counts primitives and tells the shader through 5141 * SGPRs what to do. 5142 */ 5143 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); 5144 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */ 5145 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */ 5146 5147 cmd_buffer->state.context_roll_without_scissor_emitted = true; 5148 5149 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 5150 /* The array of counter buffers is optional. */ 5151 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 5152 uint64_t va = radv_buffer_get_va(buffer->bo); 5153 5154 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; 5155 5156 /* Append */ 5157 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 5158 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | 5159 STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 5160 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ 5161 radeon_emit(cs, 0); /* unused */ 5162 radeon_emit(cs, 0); /* unused */ 5163 radeon_emit(cs, va); /* src address lo */ 5164 radeon_emit(cs, va >> 32); /* src address hi */ 5165 5166 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 5167 } else { 5168 /* Start from the beginning. 
*/ 5169 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 5170 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | 5171 STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 5172 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ 5173 radeon_emit(cs, 0); /* unused */ 5174 radeon_emit(cs, 0); /* unused */ 5175 radeon_emit(cs, 0); /* unused */ 5176 radeon_emit(cs, 0); /* unused */ 5177 } 5178 } 5179 5180 radv_set_streamout_enable(cmd_buffer, true); 5181} 5182 5183void radv_CmdEndTransformFeedbackEXT( 5184 VkCommandBuffer commandBuffer, 5185 uint32_t firstCounterBuffer, 5186 uint32_t counterBufferCount, 5187 const VkBuffer* pCounterBuffers, 5188 const VkDeviceSize* pCounterBufferOffsets) 5189{ 5190 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5191 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 5192 struct radeon_cmdbuf *cs = cmd_buffer->cs; 5193 uint32_t i; 5194 5195 radv_flush_vgt_streamout(cmd_buffer); 5196 5197 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 5198 for_each_bit(i, so->enabled_mask) { 5199 int32_t counter_buffer_idx = i - firstCounterBuffer; 5200 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 5201 counter_buffer_idx = -1; 5202 5203 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 5204 /* The array of counters buffer is optional. 
*/ 5205 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 5206 uint64_t va = radv_buffer_get_va(buffer->bo); 5207 5208 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; 5209 5210 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 5211 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | 5212 STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 5213 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | 5214 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ 5215 radeon_emit(cs, va); /* dst address lo */ 5216 radeon_emit(cs, va >> 32); /* dst address hi */ 5217 radeon_emit(cs, 0); /* unused */ 5218 radeon_emit(cs, 0); /* unused */ 5219 5220 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 5221 } 5222 5223 /* Deactivate transform feedback by zeroing the buffer size. 5224 * The counters (primitives generated, primitives emitted) may 5225 * be enabled even if there is not buffer bound. This ensures 5226 * that the primitives-emitted query won't increment. 5227 */ 5228 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); 5229 5230 cmd_buffer->state.context_roll_without_scissor_emitted = true; 5231 } 5232 5233 radv_set_streamout_enable(cmd_buffer, false); 5234} 5235 5236void radv_CmdDrawIndirectByteCountEXT( 5237 VkCommandBuffer commandBuffer, 5238 uint32_t instanceCount, 5239 uint32_t firstInstance, 5240 VkBuffer _counterBuffer, 5241 VkDeviceSize counterBufferOffset, 5242 uint32_t counterOffset, 5243 uint32_t vertexStride) 5244{ 5245 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5246 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer); 5247 struct radv_draw_info info = {}; 5248 5249 info.instance_count = instanceCount; 5250 info.first_instance = firstInstance; 5251 info.strmout_buffer = counterBuffer; 5252 info.strmout_buffer_offset = counterBufferOffset; 5253 info.stride = vertexStride; 5254 5255 radv_draw(cmd_buffer, &info); 5256} 5257