v3dv_cmd_buffer.c revision 7ec681f3
/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "util/u_pack_color.h"
#include "vk_format_info.h"
#include "vk_util.h"

const struct v3dv_dynamic_state default_dynamic_state = {
   .viewport = {
      .count = 0,
   },
   .scissor = {
      .count = 0,
   },
   .stencil_compare_mask =
   {
      .front = ~0u,
      .back = ~0u,
   },
   .stencil_write_mask =
   {
      .front = ~0u,
      .back = ~0u,
   },
   .stencil_reference =
   {
      .front = 0u,
      .back = 0u,
   },
   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
   .depth_bias = {
      .constant_factor = 0.0f,
      .depth_bias_clamp = 0.0f,
      .slope_factor = 0.0f,
   },
   .line_width = 1.0f,
   .color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1,
};

void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
{
   if (!bo)
      return;

   if (job->bo_handle_mask & bo->handle_bit) {
      if (_mesa_set_search(job->bos, bo))
         return;
   }

   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

void
v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
{
   assert(bo);
   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateCommandPool(VkDevice _device,
                       const VkCommandPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator,
                       VkCommandPool *pCmdPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_cmd_pool *pool;

   /* We only support one queue */
   assert(pCreateInfo->queueFamilyIndex == 0);

   pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                           VK_OBJECT_TYPE_COMMAND_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pAllocator)
      pool->alloc = *pAllocator;
   else
      pool->alloc = device->vk.alloc;

   list_inithead(&pool->cmd_buffers);

   *pCmdPool = v3dv_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_device *device,
                struct v3dv_cmd_pool *pool,
                VkCommandBufferLevel level)
{
   /* Do not reset the base object! If we are calling this from a command
    * buffer reset that would reset the loader's dispatch table for the
    * command buffer, and any other relevant info from vk_object_base
    */
   const uint32_t base_size = sizeof(struct vk_command_buffer);
   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_inithead(&cmd_buffer->private_objs);
   list_inithead(&cmd_buffer->jobs);
   list_inithead(&cmd_buffer->list_link);

   assert(pool);
   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

   cmd_buffer->state.subpass_idx = -1;
   cmd_buffer->state.meta.subpass_idx = -1;

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
}

static VkResult
cmd_buffer_create(struct v3dv_device *device,
                  struct v3dv_cmd_pool *pool,
                  VkCommandBufferLevel level,
                  VkCommandBuffer *pCommandBuffer)
{
   struct v3dv_cmd_buffer *cmd_buffer;
   cmd_buffer = vk_zalloc2(&device->vk.alloc,
                           &pool->alloc,
                           sizeof(*cmd_buffer),
                           8,
                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result;
   result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer_init(cmd_buffer, device, pool, level);

   *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   v3dv_cl_destroy(&job->bcl);
   v3dv_cl_destroy(&job->rcl);
   v3dv_cl_destroy(&job->indirect);

   /* Since we don't ref BOs when we add them to the command buffer, don't
    * unref them here either. BOs will be freed when their corresponding API
    * objects are destroyed.
    */
   _mesa_set_destroy(job->bos, NULL);

   v3dv_bo_free(job->device, job->tile_alloc);
   v3dv_bo_free(job->device, job->tile_state);
}

static void
job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL);

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }
}

static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
   assert(job->cmd_buffer);

   v3dv_cl_destroy(&job->indirect);

   _mesa_set_destroy(job->bos, NULL);

   if (job->csd.shared_memory)
      v3dv_bo_free(job->device, job->csd.shared_memory);
}

static void
job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(job->cmd_buffer);
   vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
}

static void
job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   assert(job->cmd_buffer);
   v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
}

void
v3dv_job_destroy(struct v3dv_job *job)
{
   assert(job);

   list_del(&job->list_link);

   /* Cloned jobs don't make deep copies of the original jobs, so they don't
    * own any of their resources. However, they do allocate clones of BO
    * structs, so make sure we free those.
    */
   if (!job->is_clone) {
      switch (job->type) {
      case V3DV_JOB_TYPE_GPU_CL:
      case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
         job_destroy_gpu_cl_resources(job);
         break;
      case V3DV_JOB_TYPE_GPU_CSD:
         job_destroy_gpu_csd_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
         job_destroy_cpu_wait_events_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
         job_destroy_cpu_csd_indirect_resources(job);
         break;
      default:
         break;
      }
   } else {
      /* Cloned jobs */
      if (job->type == V3DV_JOB_TYPE_GPU_CL)
         job_destroy_cloned_gpu_cl_resources(job);
   }

   vk_free(&job->device->vk.alloc, job);
}

void
v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                uint64_t obj,
                                v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
{
   struct v3dv_cmd_buffer_private_obj *pobj =
      vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!pobj) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }

   pobj->obj = obj;
   pobj->destroy_cb = destroy_cb;

   list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
}

static void
cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_cmd_buffer_private_obj *pobj)
{
   assert(pobj && pobj->obj && pobj->destroy_cb);
   pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                    pobj->obj,
                    &cmd_buffer->device->vk.alloc);
   list_del(&pobj->list_link);
   vk_free(&cmd_buffer->device->vk.alloc, pobj);
}

static void
cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      v3dv_job_destroy(job);
   }

   if (cmd_buffer->state.job)
      v3dv_job_destroy(cmd_buffer->state.job);

   if (cmd_buffer->state.attachments)
      vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   if (cmd_buffer->state.query.end.alloc_count > 0)
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);

   if (cmd_buffer->push_constants_resource.bo)
      v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);

   list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
                            &cmd_buffer->private_objs, list_link) {
      cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
   }

   if (cmd_buffer->state.meta.attachments) {
      assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
   }
}

static void
cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);
   cmd_buffer_free_resources(cmd_buffer);
   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
            cmd_buffer);
}

static bool
attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count,
                          struct v3dv_subpass_attachment *l2, uint32_t l2_count)
{
   for (uint32_t i = 0; i < l1_count; i++) {
      uint32_t attachment_idx = l1[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      uint32_t j;
      for (j = 0; j < l2_count; j++) {
         if (l2[j].attachment == attachment_idx)
            break;
      }
      if (j == l2_count)
         return false;
   }

   return true;
}

static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t subpass_idx)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->pass);

   const struct v3dv_physical_device *physical_device =
      &cmd_buffer->device->instance->physicalDevice;

   if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      return false;

   if (!cmd_buffer->state.job)
      return false;

   if (cmd_buffer->state.job->always_flush)
      return false;

   if (!physical_device->options.merge_jobs)
      return false;

   /* Each render pass starts a new job */
   if (subpass_idx == 0)
      return false;

   /* Two subpasses can be merged in the same job if we can emit a single RCL
    * for them (since the RCL includes the END_OF_RENDERING command that
    * triggers the "render job finished" interrupt). We can do this so long
    * as both subpasses render against the same attachments.
    */
   assert(state->subpass_idx == subpass_idx - 1);
   struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
   struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];

   /* Don't merge if the subpasses have different view masks, since in that
    * case the framebuffer setup is different and we need to emit different
    * RCLs.
    */
   if (subpass->view_mask != prev_subpass->view_mask)
      return false;

   /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
    * we need to check that for each subpass all its used attachments are
    * used by the other subpass.
    */
   bool compatible =
      attachment_list_is_subset(prev_subpass->color_attachments,
                                prev_subpass->color_count,
                                subpass->color_attachments,
                                subpass->color_count);
   if (!compatible)
      return false;

   compatible =
      attachment_list_is_subset(subpass->color_attachments,
                                subpass->color_count,
                                prev_subpass->color_attachments,
                                prev_subpass->color_count);
   if (!compatible)
      return false;

   if (subpass->ds_attachment.attachment !=
       prev_subpass->ds_attachment.attachment)
      return false;

   /* FIXME: Since some attachment formats can't be resolved using the TLB we
    * need to emit separate resolve jobs for them and that would not be
    * compatible with subpass merges. We could fix that by testing if any of
    * the attachments to resolve doesn't support TLB resolves.
    */
   if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
      return false;

   return true;
}

/**
 * Computes and sets the job frame tiling information required to setup frame
 * binning and rendering.
 */
static struct v3dv_frame_tiling *
job_compute_frame_tiling(struct v3dv_job *job,
                         uint32_t width,
                         uint32_t height,
                         uint32_t layers,
                         uint32_t render_target_count,
                         uint8_t max_internal_bpp,
                         bool msaa)
{
   static const uint8_t tile_sizes[] = {
      64, 64,
      64, 32,
      32, 32,
      32, 16,
      16, 16,
      16, 8,
      8, 8
   };

   assert(job);
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   tiling->width = width;
   tiling->height = height;
   tiling->layers = layers;
   tiling->render_target_count = render_target_count;
   tiling->msaa = msaa;

   uint32_t tile_size_index = 0;

   if (render_target_count > 2)
      tile_size_index += 2;
   else if (render_target_count > 1)
      tile_size_index += 1;

   if (msaa)
      tile_size_index += 2;

   tiling->internal_bpp = max_internal_bpp;
   tile_size_index += tiling->internal_bpp;
   assert(tile_size_index < ARRAY_SIZE(tile_sizes) / 2);

   tiling->tile_width = tile_sizes[tile_size_index * 2];
   tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];

   tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
   tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);

   /* Size up our supertiles until we get under the limit */
   const uint32_t max_supertiles = 256;
   tiling->supertile_width = 1;
   tiling->supertile_height = 1;
   for (;;) {
      tiling->frame_width_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
      tiling->frame_height_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
      const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
                                      tiling->frame_height_in_supertiles;
      if (num_supertiles < max_supertiles)
         break;

      if (tiling->supertile_width < tiling->supertile_height)
         tiling->supertile_width++;
      else
         tiling->supertile_height++;
   }

   return tiling;
}

void
v3dv_job_start_frame(struct v3dv_job *job,
                     uint32_t width,
                     uint32_t height,
                     uint32_t layers,
                     bool allocate_tile_state_for_all_layers,
                     uint32_t render_target_count,
                     uint8_t max_internal_bpp,
                     bool msaa)
{
   assert(job);

   /* Start by computing frame tiling spec for this job */
   const struct v3dv_frame_tiling *tiling =
      job_compute_frame_tiling(job,
                               width, height, layers,
                               render_target_count, max_internal_bpp, msaa);

   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
   v3dv_return_if_oom(NULL, job);

   /* We only need to allocate tile state for all layers if the binner
    * writes primitives to layers other than the first. This can only be
    * done using layered rendering (writing gl_Layer from a geometry shader),
    * so for other cases of multilayered framebuffers (typically with
    * meta copy/clear operations) that won't use layered rendering, we only
    * need one layer worth of tile state for the binner.
    */
   if (!allocate_tile_state_for_all_layers)
      layers = 1;

   /* The PTB will request the tile alloc initial size per tile at start
    * of tile binning.
    */
   uint32_t tile_alloc_size = 64 * tiling->layers *
                              tiling->draw_tiles_x *
                              tiling->draw_tiles_y;

   /* The PTB allocates in aligned 4k chunks after the initial setup. */
   tile_alloc_size = align(tile_alloc_size, 4096);

   /* Include the first two chunk allocations that the PTB does so that
    * we definitely clear the OOM condition before triggering one (the HW
    * won't trigger OOM during the first allocations).
    */
   tile_alloc_size += 8192;

   /* For performance, allocate some extra initial memory after the PTB's
    * minimal allocations, so that we hopefully don't have to block the
    * GPU on the kernel handling an OOM signal.
    */
   tile_alloc_size += 512 * 1024;

   job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
                                   "tile_alloc", true);
   if (!job->tile_alloc) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_alloc);

   const uint32_t tsda_per_tile_size = 256;
   const uint32_t tile_state_size = tiling->layers *
                                    tiling->draw_tiles_x *
                                    tiling->draw_tiles_y *
                                    tsda_per_tile_size;
   job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
   if (!job->tile_state) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_state);

   v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);

   job->ez_state = V3D_EZ_UNDECIDED;
   job->first_ez_state = V3D_EZ_UNDECIDED;
}

static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);

   /* Typically, we have a single job for each subpass and we emit the job's RCL
    * here when we are ending the frame for the subpass. However, some commands
    * such as vkCmdClearAttachments need to run in their own separate job and
    * they emit their own RCL even if they execute inside a subpass. In this
    * scenario, we don't want to emit subpass RCL when we end the frame for
    * those jobs, so we only emit the subpass RCL if the job has not recorded
    * any RCL commands of its own.
    */
   if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
      v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);

   v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
}

struct v3dv_job *
v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                               enum v3dv_job_type type,
                               struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
   return job;
}

static void
cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->query.end.used_count > 0) {
      const uint32_t query_count = state->query.end.used_count;
      for (uint32_t i = 0; i < query_count; i++) {
         assert(i < state->query.end.used_count);
         struct v3dv_job *job =
            v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                           V3DV_JOB_TYPE_CPU_END_QUERY,
                                           cmd_buffer, -1);
         v3dv_return_if_oom(cmd_buffer, NULL);

         job->cpu.query_end = state->query.end.states[i];
         list_addtail(&job->list_link, &cmd_buffer->jobs);
      }
   }
}

void
v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job)
      return;

   if (cmd_buffer->state.oom) {
      v3dv_job_destroy(job);
      cmd_buffer->state.job = NULL;
      return;
   }

   /* If we have created a job for a command buffer then we should have
    * recorded something into it: if the job was started in a render pass, it
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception is secondary command buffers
    * inside a render pass.
    */
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          v3dv_cl_offset(&job->bcl) > 0);

   /* When we merge multiple subpasses into the same job we must only emit one
    * RCL, so we do that here, when we decide that we need to finish the job.
    * Any rendering that happens outside a render pass is never merged, so
    * the RCL should have been emitted by the time we got here.
    */
   assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);

   /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
    *    so we may need to generate an RCL and add a binning flush.
    *
    * 2. It is a partial CL recorded in a secondary command buffer, in which
    *    case we are not submitting it directly to the GPU but rather branch to
    *    it from a primary command buffer. In this case we just want to end
    *    the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
    *    will be the primary job that branches to this CL.
    */
   if (cmd_buffer->state.pass) {
      if (job->type == V3DV_JOB_TYPE_GPU_CL) {
         cmd_buffer_end_render_pass_frame(cmd_buffer);
      } else {
         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
      }
   }

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires us to
    * emit CPU jobs after the job is completed, add them now. The only
    * exception is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
       !cmd_buffer->state.pass) {
      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
   }
}

static bool
job_type_is_gpu(struct v3dv_job *job)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
   case V3DV_JOB_TYPE_GPU_TFU:
   case V3DV_JOB_TYPE_GPU_CSD:
      return true;
   default:
      return false;
   }
}

static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   if (!cmd_buffer->state.has_barrier)
      return;

   /* Serialization only affects GPU jobs; CPU jobs are always automatically
    * serialized.
    */
   if (!job_type_is_gpu(job))
      return;

   job->serialize = true;
   if (cmd_buffer->state.has_bcl_barrier &&
       (job->type == V3DV_JOB_TYPE_GPU_CL ||
        job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
      job->needs_bcl_sync = true;
   }

   cmd_buffer->state.has_barrier = false;
   cmd_buffer->state.has_bcl_barrier = false;
}

void
v3dv_job_init(struct v3dv_job *job,
              enum v3dv_job_type type,
              struct v3dv_device *device,
              struct v3dv_cmd_buffer *cmd_buffer,
              int32_t subpass_idx)
{
   assert(job);

   /* Make sure we haven't made this new job current before calling here */
   assert(!cmd_buffer || cmd_buffer->state.job != job);

   job->type = type;

   job->device = device;
   job->cmd_buffer = cmd_buffer;

   list_inithead(&job->list_link);

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
       type == V3DV_JOB_TYPE_GPU_CSD) {
      job->bos =
         _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
      job->bo_count = 0;

      v3dv_cl_init(job, &job->indirect);

      if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH))
         job->always_flush = true;
   }

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
      v3dv_cl_init(job, &job->bcl);
      v3dv_cl_init(job, &job->rcl);
   }

   if (cmd_buffer) {
      /* Flag all state as dirty. Generally, we need to re-emit state for each
       * new job.
       *
       * FIXME: there may be some exceptions, in which case we could skip some
       * bits.
       */
      cmd_buffer->state.dirty = ~0;
      cmd_buffer->state.dirty_descriptor_stages = ~0;

      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      }

      /* Keep track of the first subpass that we are recording in this new job.
       * We will use this when we emit the RCL to decide how to emit our loads
       * and stores.
       */
      if (cmd_buffer->state.pass)
         job->first_subpass = subpass_idx;

      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
   }
}

struct v3dv_job *
v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                          int32_t subpass_idx,
                          enum v3dv_job_type type)
{
   /* Don't create a new job if we can merge the current subpass into
    * the current job.
    */
   if (cmd_buffer->state.pass &&
       subpass_idx != -1 &&
       cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
      cmd_buffer->state.job->is_subpass_finish = false;
      return cmd_buffer->state.job;
   }

   /* Ensure we are not starting a new job without finishing a previous one */
   if (cmd_buffer->state.job != NULL)
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   assert(cmd_buffer->state.job == NULL);
   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!job) {
      fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
   cmd_buffer->state.job = job;

   return job;
}

static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                 VkCommandBufferResetFlags flags)
{
   vk_command_buffer_reset(&cmd_buffer->vk);
   if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
      struct v3dv_device *device = cmd_buffer->device;
      struct v3dv_cmd_pool *pool = cmd_buffer->pool;
      VkCommandBufferLevel level = cmd_buffer->level;

      /* cmd_buffer_init below will re-add the command buffer to the pool
       * so remove it here so we don't end up adding it again.
       */
      list_del(&cmd_buffer->pool_link);

      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
         cmd_buffer_free_resources(cmd_buffer);

      cmd_buffer_init(cmd_buffer, device, pool, level);
   }

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AllocateCommandBuffers(VkDevice _device,
                            const VkCommandBufferAllocateInfo *pAllocateInfo,
                            VkCommandBuffer *pCommandBuffers)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = cmd_buffer_create(device, pool, pAllocateInfo->level,
                                 &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
                              i, pCommandBuffers);
      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_FreeCommandBuffers(VkDevice device,
                        VkCommandPool commandPool,
                        uint32_t commandBufferCount,
                        const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (!cmd_buffer)
         continue;

      cmd_buffer_destroy(cmd_buffer);
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyCommandPool(VkDevice _device,
                        VkCommandPool commandPool,
                        const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_destroy(cmd_buffer);
   }

   vk_object_free(&device->vk, pAllocator, pool);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_TrimCommandPool(VkDevice device,
                     VkCommandPool commandPool,
                     VkCommandPoolTrimFlags flags)
{
   /* We don't need to do anything here; our command pools never hold on to
    * any resources from command buffers that are freed or reset.
    */
}


static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[cmd_buffer->state.subpass_idx];

   if (!subpass->resolve_attachments)
      return;

   struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;

   /* At this point we have already ended the current subpass and now we are
    * about to emit vkCmdResolveImage calls to get the resolves we can't
    * handle in the subpass RCL.
    *
    * vkCmdResolveImage is not supposed to be called inside a render pass so
    * before we call that we need to make sure our command buffer state reflects
    * that we are no longer in a subpass by finishing the current job and
    * resetting the framebuffer and render pass state temporarily and then
    * restoring it after we are done with the resolves.
    */
   if (cmd_buffer->state.job)
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass_idx = -1;

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t src_attachment_idx =
         subpass->color_attachments[i].attachment;
      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      if (pass->attachments[src_attachment_idx].use_tlb_resolve)
         continue;

      const uint32_t dst_attachment_idx =
         subpass->resolve_attachments[i].attachment;
      if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
      struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];

      VkImageResolve2KHR region = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR,
         .srcSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            src_iview->vk.base_mip_level,
            src_iview->vk.base_array_layer,
            src_iview->vk.layer_count,
         },
         .srcOffset = { 0, 0, 0 },
         .dstSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            dst_iview->vk.base_mip_level,
            dst_iview->vk.base_array_layer,
            dst_iview->vk.layer_count,
         },
         .dstOffset = { 0, 0, 0 },
         .extent = src_iview->vk.image->extent,
      };

      struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
      struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
      VkResolveImageInfo2KHR resolve_info = {
         .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR,
         .srcImage = v3dv_image_to_handle(src_image),
         .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .dstImage = v3dv_image_to_handle(dst_image),
         .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .regionCount = 1,
         .pRegions = &region,
      };
      v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
   }

   cmd_buffer->state.framebuffer = restore_fb;
   cmd_buffer->state.pass = restore_pass;
   cmd_buffer->state.subpass_idx = restore_subpass_idx;
}

static VkResult
cmd_buffer_begin_render_pass_secondary(
   struct v3dv_cmd_buffer *cmd_buffer,
   const VkCommandBufferInheritanceInfo *inheritance_info)
{
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
   assert(inheritance_info);

   cmd_buffer->state.pass =
      v3dv_render_pass_from_handle(inheritance_info->renderPass);
   assert(cmd_buffer->state.pass);

   cmd_buffer->state.framebuffer =
      v3dv_framebuffer_from_handle(inheritance_info->framebuffer);

   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
   cmd_buffer->state.subpass_idx = inheritance_info->subpass;

   cmd_buffer->state.inheritance.occlusion_query_enable =
      inheritance_info->occlusionQueryEnable;

   /* Secondaries that execute inside a render pass won't start subpasses
    * so we want to create a job for them here.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
                                V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Secondary command buffers don't know about the render area, but our
    * scissor setup accounts for it, so let's make sure we make it large
    * enough that it doesn't actually constrain any rendering. This should
    * be fine, since the Vulkan spec states:
    *
    *    "The application must ensure (using scissor if necessary) that all
    *     rendering is contained within the render area."
    *
    * FIXME: setup constants for the max framebuffer dimensions and use them
    * here and when filling in VkPhysicalDeviceLimits.
    */
   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   cmd_buffer->state.render_area.offset.x = 0;
   cmd_buffer->state.render_area.offset.y = 0;
   cmd_buffer->state.render_area.extent.width =
      framebuffer ? framebuffer->width : 4096;
   cmd_buffer->state.render_area.extent.height =
      framebuffer ? framebuffer->height : 4096;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must initialize the
    * command buffer's state. Otherwise, we must reset its state. In both
    * cases we reset it.
    */
   VkResult result = cmd_buffer_reset(cmd_buffer, 0);
   if (result != VK_SUCCESS)
      return result;

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         result =
            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
                                                   pBeginInfo->pInheritanceInfo);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                        VkCommandBufferResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   return cmd_buffer_reset(cmd_buffer, flags);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetCommandPool(VkDevice device,
                      VkCommandPool commandPool,
                      VkCommandPoolResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   VkCommandBufferResetFlags reset_flags = 0;
   if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT)
      reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT;
   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_reset(cmd_buffer, reset_flags);
   }

   return VK_SUCCESS;
}

static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* Render areas and scissor/viewport are only relevant inside render passes,
    * otherwise we are dealing with transfer operations where these elements
    * don't apply.
    */
   assert(cmd_buffer->state.pass);
   const VkRect2D *rect = &cmd_buffer->state.render_area;

   /* We should only call this at the beginning of a subpass so we should
    * always have framebuffer information available.
    */
   assert(cmd_buffer->state.framebuffer);
   cmd_buffer->state.tile_aligned_render_area =
      v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
                                        cmd_buffer->state.framebuffer,
                                        cmd_buffer->state.pass,
                                        cmd_buffer->state.subpass_idx);

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area for subpass %d of render pass %p doesn't "
                 "match render pass granularity.\n",
                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
   }
}

static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                            uint32_t attachment_idx,
                                            const VkClearColorValue *color)
{
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);

   const struct v3dv_render_pass_attachment *attachment =
      &cmd_buffer->state.pass->attachments[attachment_idx];

   uint32_t internal_type, internal_bpp;
   const struct v3dv_format *format =
      v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);

   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
      (format->rt_type, &internal_type, &internal_bpp);

   uint32_t internal_size = 4 << internal_bpp;

   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   v3dv_X(cmd_buffer->device, get_hw_clear_color)
      (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);

   attachment_state->vk_clear_value.color = *color;
}

static void
cmd_buffer_state_set_attachment_clear_depth_stencil(
   struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t attachment_idx,
   bool clear_depth, bool clear_stencil,
   const VkClearDepthStencilValue *ds)
{
   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   if (clear_depth)
      attachment_state->clear_value.z = ds->depth;

   if (clear_stencil)
      attachment_state->clear_value.s = ds->stencil;

   attachment_state->vk_clear_value.depthStencil = *ds;
}

static void
cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t count, const VkClearValue *values)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   /* There could be fewer clear values than attachments in the render pass, in
    * which case we only want to process as many as we have, or there could be
    * more, in which case we want to ignore those for which we don't have a
    * corresponding attachment.
    */
   count = MIN2(count, pass->attachment_count);
   for (uint32_t i = 0; i < count; i++) {
      const struct v3dv_render_pass_attachment *attachment =
         &pass->attachments[i];

      if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
      if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
                                                     &values[i].color);
      } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                            VK_IMAGE_ASPECT_STENCIL_BIT)) {
         cmd_buffer_state_set_attachment_clear_depth_stencil(
            cmd_buffer, i,
            aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
            aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
            &values[i].depthStencil);
      }
   }
}

static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
                                             const VkRenderPassBeginInfo *pRenderPassBegin)
{
   cmd_buffer_state_set_clear_values(cmd_buffer,
                                     pRenderPassBegin->clearValueCount,
                                     pRenderPassBegin->pClearValues);
}

static void
cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   if (state->attachment_alloc_count < pass->attachment_count) {
      if (state->attachments) {
         assert(state->attachment_alloc_count > 0);
         vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
      }

      uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
                      pass->attachment_count;
      state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!state->attachments) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }
      state->attachment_alloc_count = pass->attachment_count;
   }

   assert(state->attachment_alloc_count >= pass->attachment_count);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
                        const VkRenderPassBeginInfo *pRenderPassBegin,
                        VkSubpassContents contents)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = pass;
   state->framebuffer = framebuffer;

   cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
   v3dv_return_if_oom(cmd_buffer, NULL);

   cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);

   state->render_area = pRenderPassBegin->renderArea;

   /* If our render area is smaller than the current clip window we will have
    * to emit a new clip window to constrain it to the render area.
    */
   uint32_t min_render_x = state->render_area.offset.x;
   uint32_t min_render_y = state->render_area.offset.y;
   uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
   uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
   uint32_t min_clip_x = state->clip_window.offset.x;
   uint32_t min_clip_y = state->clip_window.offset.y;
   uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
   uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
   if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
       max_render_x < max_clip_x || max_render_y < max_clip_y) {
      state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
   }

   /* Setup for first subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx < state->pass->subpass_count - 1);

   /* Finish the previous subpass */
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* Start the next subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
}

static void
cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   assert(cmd_buffer->state.pass);
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   /* We only need to emit subpass clears as draw calls when the render
    * area is not aligned to tile boundaries or for GFXH-1461.
    */
   if (cmd_buffer->state.tile_aligned_render_area &&
       !subpass->do_depth_clear_with_draw &&
       !subpass->do_stencil_clear_with_draw) {
      return;
   }

   uint32_t att_count = 0;
   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */

   /* We only need to emit subpass clears as draw calls for color attachments
    * if the render area is not aligned to tile boundaries.
    */
   if (!cmd_buffer->state.tile_aligned_render_area) {
      for (uint32_t i = 0; i < subpass->color_count; i++) {
         const uint32_t att_idx = subpass->color_attachments[i].attachment;
         if (att_idx == VK_ATTACHMENT_UNUSED)
            continue;

         struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
            continue;

         if (state->subpass_idx != att->first_subpass)
            continue;

         atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
         atts[att_count].colorAttachment = i;
         atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
         att_count++;
      }
   }

   /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
   const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
   if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
      struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
      if (state->subpass_idx == att->first_subpass) {
         VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_depth_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
         }
         if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_stencil_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
         }
         if (aspects) {
            atts[att_count].aspectMask = aspects;
            atts[att_count].colorAttachment = 0; /* Ignored */
            atts[att_count].clearValue =
               state->attachments[ds_att_idx].vk_clear_value;
            att_count++;
         }
      }
   }

   if (att_count == 0)
      return;

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area doesn't match render pass granularity, falling "
                 "back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   } else if (subpass->do_depth_clear_with_draw ||
              subpass->do_stencil_clear_with_draw) {
      perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
                 "falling back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   }

   /* From the Vulkan 1.0 spec:
    *
    *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
    *     render area will be cleared to a uniform value, which is specified
    *     when a render pass instance is begun."
    *
    * So the clear is only constrained by the render area and not by pipeline
    * state such as scissor or viewport; these are the semantics of
    * vkCmdClearAttachments as well.
    */
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
   VkClearRect rect = {
      .rect = state->render_area,
      .baseArrayLayer = 0,
      .layerCount = 1,
   };
   v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
}

static struct v3dv_job *
cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx,
                              enum v3dv_job_type type)
{
   assert(type == V3DV_JOB_TYPE_GPU_CL ||
          type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(subpass_idx < state->pass->subpass_count);

   /* Starting a new job can trigger a finish of the current one, so don't
    * change the command buffer state for the new job until we are done creating
    * the new job.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
   if (!job)
      return NULL;

   state->subpass_idx = subpass_idx;

   /* If we are starting a new job we need to setup binning. We only do this
    * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
    * jobs are not submitted to the GPU directly, and are instead meant to be
    * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
    */
   if (type == V3DV_JOB_TYPE_GPU_CL &&
       job->first_subpass == state->subpass_idx) {
      const struct v3dv_subpass *subpass =
         &state->pass->subpasses[state->subpass_idx];

      const struct v3dv_framebuffer *framebuffer = state->framebuffer;

      uint8_t internal_bpp;
      bool msaa;
      v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
         (framebuffer, subpass, &internal_bpp, &msaa);

      /* From the Vulkan spec:
       *
       *    "If the render pass uses multiview, then layers must be one and
       *     each attachment requires a number of layers that is greater than
       *     the maximum bit index set in the view mask in the subpasses in
       *     which it is used."
       *
       * So when multiview is enabled, we take the number of layers from the
       * last bit set in the view mask.
       */
      uint32_t layers = framebuffer->layers;
      if (subpass->view_mask != 0) {
         assert(framebuffer->layers == 1);
         layers = util_last_bit(subpass->view_mask);
      }

      v3dv_job_start_frame(job,
                           framebuffer->width,
                           framebuffer->height,
                           layers,
                           true,
                           subpass->color_count,
                           internal_bpp,
                           msaa);
   }

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job =
      cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                    V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return NULL;

   /* Check if our render area is aligned to tile boundaries. We have to do
    * this in each subpass because the subset of attachments used can change
    * and with that the tile size selected by the hardware can change too.
    */
   cmd_buffer_update_tile_alignment(cmd_buffer);

   /* If we can't use TLB clears then we need to emit draw clears for any
    * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
    * Depth/Stencil clears if we hit GFXH-1461.
    *
    * Secondary command buffers don't start subpasses (and may not even have
    * framebuffer state), so we only care about this in primaries. The only
    * exception could be a secondary running inside a subpass that needs to
    * record a meta operation (with its own render pass) that relies on
    * attachment load clears, but we don't have any instances of that right
    * now.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer_emit_subpass_clears(cmd_buffer);

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job;
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL);
   } else {
      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   }

   if (!job)
      return NULL;

   job->is_subpass_continue = true;

   return job;
}

void
v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* We can end up here without a job if the last command recorded into the
    * subpass already finished the job (for example a pipeline barrier). In
    * that case we don't set the is_subpass_finish flag, but that is not
    * required for proper behavior.
    */
   struct v3dv_job *job = cmd_buffer->state.job;
   if (job)
      job->is_subpass_finish = true;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* Finalize last subpass */
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx == state->pass->subpass_count - 1);
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   v3dv_cmd_buffer_finish_job(cmd_buffer);

   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* We are no longer inside a render pass */
   state->framebuffer = NULL;
   state->pass = NULL;
   state->subpass_idx = -1;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->state.oom)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Primaries should have ended any recording jobs by the time they hit
    * vkEndRenderPass (if we are inside a render pass). Commands outside
    * a render pass instance (for both primaries and secondaries) spawn
    * complete jobs too. So the only case where we can get here without
    * finishing a recording job is when we are recording a secondary
    * inside a render pass.
    */
   if (cmd_buffer->state.job) {
      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
             cmd_buffer->state.pass);
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}

static void
clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
              struct list_head *dst,
              struct list_head *src)
{
   assert(cmd_buffer);

   list_inithead(dst);
   list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
      struct v3dv_bo *clone_bo =
         vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!clone_bo) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }

      *clone_bo = *bo;
      list_addtail(&clone_bo->list_link, dst);
   }
}

/* Clones a job for inclusion in the given command buffer. Note that this
 * doesn't make a deep copy, so the cloned job doesn't own any resources.
 * Useful when we need to have a job in more than one list, which happens
 * for jobs recorded in secondary command buffers when we want to execute
 * them in primaries.
 */
struct v3dv_job *
v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
                             struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
                                         sizeof(struct v3dv_job), 8,
                                         VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!clone_job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   /* Cloned jobs don't duplicate resources! */
   *clone_job = *job;
   clone_job->is_clone = true;
   clone_job->cmd_buffer = cmd_buffer;
   list_addtail(&clone_job->list_link, &cmd_buffer->jobs);

   /* We need to regen the BO lists so that they point to the BO list in the
    * cloned job. Otherwise functions like list_length() will loop forever.
    */
   if (job->type == V3DV_JOB_TYPE_GPU_CL) {
      clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
                    &job->indirect.bo_list);
   }

   return clone_job;
}

static void
cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
                                uint32_t cmd_buffer_count,
                                const VkCommandBuffer *cmd_buffers)
{
   bool pending_barrier = false;
   bool pending_bcl_barrier = false;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);

      assert(!(secondary->usage_flags &
               VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));

      /* Secondary command buffers that execute outside a render pass create
       * complete jobs with an RCL and tile setup, so we simply want to merge
       * their job list into the primary's. However, because they may be
       * executed into multiple primaries at the same time and we only have a
       * single list_link in each job, we can't just add them to the primary's
       * job list and we instead have to clone them first.
       *
       * Alternatively, we could create an "execute secondary" CPU job that,
       * when executed in a queue, would submit all the jobs in the referenced
       * secondary command buffer. However, this would raise some challenges
       * to make it work with the implementation of wait threads in the queue
       * which we use for event waits, for example.
1753 */ 1754 list_for_each_entry(struct v3dv_job, secondary_job, 1755 &secondary->jobs, list_link) { 1756 /* These can only happen inside a render pass */ 1757 assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY); 1758 struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary); 1759 if (!job) 1760 return; 1761 1762 if (pending_barrier) { 1763 job->serialize = true; 1764 if (pending_bcl_barrier) 1765 job->needs_bcl_sync = true; 1766 pending_barrier = false; 1767 pending_bcl_barrier = false; 1768 } 1769 } 1770 1771 /* If this secondary had any pending barrier state we will need that 1772 * barrier state consumed with whatever comes after it (first job in 1773 * the next secondary or the primary, if this was the last secondary). 1774 */ 1775 assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier); 1776 pending_barrier = secondary->state.has_barrier; 1777 pending_bcl_barrier = secondary->state.has_bcl_barrier; 1778 } 1779 1780 if (pending_barrier) { 1781 primary->state.has_barrier = true; 1782 primary->state.has_bcl_barrier |= pending_bcl_barrier; 1783 } 1784} 1785 1786VKAPI_ATTR void VKAPI_CALL 1787v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer, 1788 uint32_t commandBufferCount, 1789 const VkCommandBuffer *pCommandBuffers) 1790{ 1791 V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer); 1792 1793 if (primary->state.pass != NULL) { 1794 v3dv_X(primary->device, cmd_buffer_execute_inside_pass) 1795 (primary, commandBufferCount, pCommandBuffers); 1796 } else { 1797 cmd_buffer_execute_outside_pass(primary, 1798 commandBufferCount, pCommandBuffers); 1799 } 1800} 1801 1802/* This goes through the list of possible dynamic states in the pipeline and, 1803 * for those that are not configured as dynamic, copies relevant state into 1804 * the command buffer. 
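 * For example, if the pipeline was not created with VK_DYNAMIC_STATE_VIEWPORT, the viewport state baked into the pipeline overwrites whatever vkCmdSetViewport recorded and the corresponding dirty bit is set so it is re-emitted on the next draw.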
1805 */ 1806static void 1807cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, 1808 const struct v3dv_dynamic_state *src) 1809{ 1810 struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic; 1811 uint32_t dynamic_mask = src->mask; 1812 uint32_t dirty = 0; 1813 1814 if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) { 1815 dest->viewport.count = src->viewport.count; 1816 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, 1817 src->viewport.count * sizeof(VkViewport))) { 1818 typed_memcpy(dest->viewport.viewports, 1819 src->viewport.viewports, 1820 src->viewport.count); 1821 typed_memcpy(dest->viewport.scale, src->viewport.scale, 1822 src->viewport.count); 1823 typed_memcpy(dest->viewport.translate, src->viewport.translate, 1824 src->viewport.count); 1825 dirty |= V3DV_CMD_DIRTY_VIEWPORT; 1826 } 1827 } 1828 1829 if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) { 1830 dest->scissor.count = src->scissor.count; 1831 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors, 1832 src->scissor.count * sizeof(VkRect2D))) { 1833 typed_memcpy(dest->scissor.scissors, 1834 src->scissor.scissors, src->scissor.count); 1835 dirty |= V3DV_CMD_DIRTY_SCISSOR; 1836 } 1837 } 1838 1839 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { 1840 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask, 1841 sizeof(src->stencil_compare_mask))) { 1842 dest->stencil_compare_mask = src->stencil_compare_mask; 1843 dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; 1844 } 1845 } 1846 1847 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { 1848 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask, 1849 sizeof(src->stencil_write_mask))) { 1850 dest->stencil_write_mask = src->stencil_write_mask; 1851 dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; 1852 } 1853 } 1854 1855 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) { 1856 if (memcmp(&dest->stencil_reference, &src->stencil_reference, 1857 sizeof(src->stencil_reference))) { 1858 dest->stencil_reference = src->stencil_reference; 1859 dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; 1860 } 1861 } 1862 1863 if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) { 1864 if (memcmp(dest->blend_constants, src->blend_constants, 1865 sizeof(src->blend_constants))) { 1866 memcpy(dest->blend_constants, src->blend_constants, 1867 sizeof(src->blend_constants)); 1868 dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; 1869 } 1870 } 1871 1872 if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) { 1873 if (memcmp(&dest->depth_bias, &src->depth_bias, 1874 sizeof(src->depth_bias))) { 1875 memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias)); 1876 dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; 1877 } 1878 } 1879 1880 if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { 1881 if (dest->line_width != src->line_width) { 1882 dest->line_width = src->line_width; 1883 dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; 1884 } 1885 } 1886 1887 if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { 1888 if (dest->color_write_enable != src->color_write_enable) { 1889 dest->color_write_enable = src->color_write_enable; 1890 dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; 1891 } 1892 } 1893 1894 cmd_buffer->state.dynamic.mask = dynamic_mask; 1895 cmd_buffer->state.dirty |= dirty; 1896} 1897 1898static void 1899bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer, 1900 struct v3dv_pipeline *pipeline) 1901{ 1902 assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); 1903 if (cmd_buffer->state.gfx.pipeline == pipeline) 1904 return; 1905 1906 
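 /* Record the new pipeline, fold its static (non-dynamic) state into the command buffer state, and flag the pipeline dirty so it is emitted on the next draw. */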
cmd_buffer->state.gfx.pipeline = pipeline; 1907 1908 cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state); 1909 1910 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE; 1911} 1912 1913static void 1914bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer, 1915 struct v3dv_pipeline *pipeline) 1916{ 1917 assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); 1918 1919 if (cmd_buffer->state.compute.pipeline == pipeline) 1920 return; 1921 1922 cmd_buffer->state.compute.pipeline = pipeline; 1923 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE; 1924} 1925 1926VKAPI_ATTR void VKAPI_CALL 1927v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, 1928 VkPipelineBindPoint pipelineBindPoint, 1929 VkPipeline _pipeline) 1930{ 1931 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 1932 V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline); 1933 1934 switch (pipelineBindPoint) { 1935 case VK_PIPELINE_BIND_POINT_COMPUTE: 1936 bind_compute_pipeline(cmd_buffer, pipeline); 1937 break; 1938 1939 case VK_PIPELINE_BIND_POINT_GRAPHICS: 1940 bind_graphics_pipeline(cmd_buffer, pipeline); 1941 break; 1942 1943 default: 1944 assert(!"invalid bind point"); 1945 break; 1946 } 1947} 1948 1949/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ 1950void 1951v3dv_viewport_compute_xform(const VkViewport *viewport, 1952 float scale[3], 1953 float translate[3]) 1954{ 1955 float x = viewport->x; 1956 float y = viewport->y; 1957 float half_width = 0.5f * viewport->width; 1958 float half_height = 0.5f * viewport->height; 1959 double n = viewport->minDepth; 1960 double f = viewport->maxDepth; 1961 1962 scale[0] = half_width; 1963 translate[0] = half_width + x; 1964 scale[1] = half_height; 1965 translate[1] = half_height + y; 1966 1967 scale[2] = (f - n); 1968 translate[2] = n; 1969 1970 /* It seems that if the scale is small enough the hardware won't clip 1971 * correctly so we work around this my choosing the smallest scale that 1972 * seems to work. 1973 * 1974 * This case is exercised by CTS: 1975 * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero 1976 */ 1977 const float min_abs_scale = 0.000009f; 1978 if (fabs(scale[2]) < min_abs_scale) 1979 scale[2] = min_abs_scale * (scale[2] < 0 ? 
-1.0f : 1.0f); 1980} 1981 1982VKAPI_ATTR void VKAPI_CALL 1983v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, 1984 uint32_t firstViewport, 1985 uint32_t viewportCount, 1986 const VkViewport *pViewports) 1987{ 1988 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 1989 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 1990 const uint32_t total_count = firstViewport + viewportCount; 1991 1992 assert(firstViewport < MAX_VIEWPORTS); 1993 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); 1994 1995 if (state->dynamic.viewport.count < total_count) 1996 state->dynamic.viewport.count = total_count; 1997 1998 if (!memcmp(state->dynamic.viewport.viewports + firstViewport, 1999 pViewports, viewportCount * sizeof(*pViewports))) { 2000 return; 2001 } 2002 2003 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, 2004 viewportCount * sizeof(*pViewports)); 2005 2006 for (uint32_t i = firstViewport; i < total_count; i++) { 2007 v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], 2008 state->dynamic.viewport.scale[i], 2009 state->dynamic.viewport.translate[i]); 2010 } 2011 2012 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; 2013} 2014 2015VKAPI_ATTR void VKAPI_CALL 2016v3dv_CmdSetScissor(VkCommandBuffer commandBuffer, 2017 uint32_t firstScissor, 2018 uint32_t scissorCount, 2019 const VkRect2D *pScissors) 2020{ 2021 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2022 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2023 2024 assert(firstScissor < MAX_SCISSORS); 2025 assert(firstScissor + scissorCount >= 1 && 2026 firstScissor + scissorCount <= MAX_SCISSORS); 2027 2028 if (state->dynamic.scissor.count < firstScissor + scissorCount) 2029 state->dynamic.scissor.count = firstScissor + scissorCount; 2030 2031 if (!memcmp(state->dynamic.scissor.scissors + firstScissor, 2032 pScissors, scissorCount * sizeof(*pScissors))) { 2033 return; 2034 } 2035 2036 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, 2037 scissorCount * sizeof(*pScissors)); 2038 2039 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR; 2040} 2041 2042static void 2043emit_scissor(struct v3dv_cmd_buffer *cmd_buffer) 2044{ 2045 if (cmd_buffer->state.dynamic.viewport.count == 0) 2046 return; 2047 2048 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; 2049 2050 /* FIXME: right now we only support one viewport. viewporst[0] would work 2051 * now, but would need to change if we allow multiple viewports. 2052 */ 2053 float *vptranslate = dynamic->viewport.translate[0]; 2054 float *vpscale = dynamic->viewport.scale[0]; 2055 2056 float vp_minx = -fabsf(vpscale[0]) + vptranslate[0]; 2057 float vp_maxx = fabsf(vpscale[0]) + vptranslate[0]; 2058 float vp_miny = -fabsf(vpscale[1]) + vptranslate[1]; 2059 float vp_maxy = fabsf(vpscale[1]) + vptranslate[1]; 2060 2061 /* Quoting from v3dx_emit: 2062 * "Clip to the scissor if it's enabled, but still clip to the 2063 * drawable regardless since that controls where the binner 2064 * tries to put things. 2065 * 2066 * Additionally, always clip the rendering to the viewport, 2067 * since the hardware does guardband clipping, meaning 2068 * primitives would rasterize outside of the view volume." 2069 */ 2070 uint32_t minx, miny, maxx, maxy; 2071 2072 /* From the Vulkan spec: 2073 * 2074 * "The application must ensure (using scissor if necessary) that all 2075 * rendering is contained within the render area. The render area must be 2076 * contained within the framebuffer dimensions." 
2077 * 2078 * So it is the application's responsibility to ensure this. Still, we can 2079 * help by automatically restricting the scissor rect to the render area. 2080 */ 2081 minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x); 2082 miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y); 2083 maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x + 2084 cmd_buffer->state.render_area.extent.width); 2085 maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y + 2086 cmd_buffer->state.render_area.extent.height); 2087 2093 /* Clip against user provided scissor if needed. 2094 * 2095 * FIXME: right now we only allow one scissor. Below would need to be 2096 * updated if we support more 2097 */ 2098 if (dynamic->scissor.count > 0) { 2099 VkRect2D *scissor = &dynamic->scissor.scissors[0]; 2100 minx = MAX2(minx, scissor->offset.x); 2101 miny = MAX2(miny, scissor->offset.y); 2102 maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width); 2103 maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height); 2104 } 2105 2106 /* If the scissor is outside the viewport area we end up with 2107 * min{x,y} > max{x,y}. 2108 */ 2109 if (minx > maxx) 2110 maxx = minx; 2111 if (miny > maxy) 2112 maxy = miny; 2113 2114 cmd_buffer->state.clip_window.offset.x = minx; 2115 cmd_buffer->state.clip_window.offset.y = miny; 2116 cmd_buffer->state.clip_window.extent.width = maxx - minx; 2117 cmd_buffer->state.clip_window.extent.height = maxy - miny; 2118 2119 v3dv_X(cmd_buffer->device, job_emit_clip_window) 2120 (cmd_buffer->state.job, &cmd_buffer->state.clip_window); 2121 2122 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR; 2123} 2124 2125static void 2126update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, 2127 uint32_t dirty_uniform_state) 2128{ 2129 /* We need to update uniform streams if any piece of state that is passed 2130 * to the shader as a uniform may have changed. 2131 * 2132 * If only descriptor sets are dirty then we can safely ignore updates 2133 * for shader stages that don't access descriptors. 
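 * For example, if only a descriptor set visible to the fragment shader changed, the per-stage checks below skip rewriting the vertex, vertex-bin and geometry uniform streams.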
2134 */ 2135 2136 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 2137 assert(pipeline); 2138 2139 const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE; 2140 const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT; 2141 const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS; 2142 const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; 2143 const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX; 2144 2145 /* VK_SHADER_STAGE_FRAGMENT_BIT */ 2146 const bool has_new_descriptors_fs = 2147 has_new_descriptors && 2148 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT); 2149 2150 const bool has_new_push_constants_fs = 2151 has_new_push_constants && 2152 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT); 2153 2154 const bool needs_fs_update = has_new_pipeline || 2155 has_new_view_index || 2156 has_new_push_constants_fs || 2157 has_new_descriptors_fs || 2158 has_new_view_index; 2159 2160 if (needs_fs_update) { 2161 struct v3dv_shader_variant *fs_variant = 2162 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; 2163 2164 cmd_buffer->state.uniforms.fs = 2165 v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant); 2166 } 2167 2168 /* VK_SHADER_STAGE_GEOMETRY_BIT */ 2169 if (pipeline->has_gs) { 2170 const bool has_new_descriptors_gs = 2171 has_new_descriptors && 2172 (cmd_buffer->state.dirty_descriptor_stages & 2173 VK_SHADER_STAGE_GEOMETRY_BIT); 2174 2175 const bool has_new_push_constants_gs = 2176 has_new_push_constants && 2177 (cmd_buffer->state.dirty_push_constants_stages & 2178 VK_SHADER_STAGE_GEOMETRY_BIT); 2179 2180 const bool needs_gs_update = has_new_viewport || 2181 has_new_view_index || 2182 has_new_pipeline || 2183 has_new_push_constants_gs || 2184 has_new_descriptors_gs; 2185 2186 if (needs_gs_update) { 2187 struct v3dv_shader_variant *gs_variant = 2188 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; 2189 2190 struct v3dv_shader_variant *gs_bin_variant = 2191 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; 2192 2193 cmd_buffer->state.uniforms.gs = 2194 v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant); 2195 2196 cmd_buffer->state.uniforms.gs_bin = 2197 v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant); 2198 } 2199 } 2200 2201 /* VK_SHADER_STAGE_VERTEX_BIT */ 2202 const bool has_new_descriptors_vs = 2203 has_new_descriptors && 2204 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT); 2205 2206 const bool has_new_push_constants_vs = 2207 has_new_push_constants && 2208 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT); 2209 2210 const bool needs_vs_update = has_new_viewport || 2211 has_new_view_index || 2212 has_new_pipeline || 2213 has_new_push_constants_vs || 2214 has_new_descriptors_vs; 2215 2216 if (needs_vs_update) { 2217 struct v3dv_shader_variant *vs_variant = 2218 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; 2219 2220 struct v3dv_shader_variant *vs_bin_variant = 2221 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; 2222 2223 cmd_buffer->state.uniforms.vs = 2224 v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant); 2225 2226 cmd_buffer->state.uniforms.vs_bin = 2227 v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant); 2228 } 2229 2230 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX; 2231} 2232 2233/* This stores command buffer state that we might be about to 
stomp for 2234 * a meta operation. 2235 */ 2236void 2237v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, 2238 bool push_descriptor_state) 2239{ 2240 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2241 2242 if (state->subpass_idx != -1) { 2243 state->meta.subpass_idx = state->subpass_idx; 2244 state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer); 2245 state->meta.pass = v3dv_render_pass_to_handle(state->pass); 2246 2247 const uint32_t attachment_state_item_size = 2248 sizeof(struct v3dv_cmd_buffer_attachment_state); 2249 const uint32_t attachment_state_total_size = 2250 attachment_state_item_size * state->attachment_alloc_count; 2251 if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { 2252 if (state->meta.attachment_alloc_count > 0) 2253 vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); 2254 2255 state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, 2256 attachment_state_total_size, 8, 2257 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 2258 if (!state->meta.attachments) { 2259 v3dv_flag_oom(cmd_buffer, NULL); 2260 return; 2261 } 2262 state->meta.attachment_alloc_count = state->attachment_alloc_count; 2263 } 2264 state->meta.attachment_count = state->attachment_alloc_count; 2265 memcpy(state->meta.attachments, state->attachments, 2266 attachment_state_total_size); 2267 2268 state->meta.tile_aligned_render_area = state->tile_aligned_render_area; 2269 memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D)); 2270 } 2271 2272 /* We expect that meta operations are graphics-only, so we only take into 2273 * account the graphics pipeline, and the graphics state 2274 */ 2275 state->meta.gfx.pipeline = state->gfx.pipeline; 2276 memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic)); 2277 2278 struct v3dv_descriptor_state *gfx_descriptor_state = 2279 &cmd_buffer->state.gfx.descriptor_state; 2280 2281 if (push_descriptor_state) { 2282 if (gfx_descriptor_state->valid != 0) { 2283 memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state, 2284 sizeof(state->gfx.descriptor_state)); 2285 } 2286 state->meta.has_descriptor_state = true; 2287 } else { 2288 state->meta.has_descriptor_state = false; 2289 } 2290 2291 /* FIXME: if we keep track of whether we have bound any push constant state 2292 * at all we could restrict this only to cases where it is actually 2293 * necessary. 
2294 */ 2295 memcpy(state->meta.push_constants, cmd_buffer->push_constants_data, 2296 sizeof(state->meta.push_constants)); 2297} 2298 2299/* This restores command buffer state after a meta operation. 2300 */ 2301void 2302v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, 2303 uint32_t dirty_dynamic_state, 2304 bool needs_subpass_resume) 2305{ 2306 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2307 2308 if (state->meta.subpass_idx != -1) { 2309 state->pass = v3dv_render_pass_from_handle(state->meta.pass); 2310 state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer); 2311 2312 assert(state->meta.attachment_count <= state->attachment_alloc_count); 2313 const uint32_t attachment_state_item_size = 2314 sizeof(struct v3dv_cmd_buffer_attachment_state); 2315 const uint32_t attachment_state_total_size = 2316 attachment_state_item_size * state->meta.attachment_count; 2317 memcpy(state->attachments, state->meta.attachments, 2318 attachment_state_total_size); 2319 2320 state->tile_aligned_render_area = state->meta.tile_aligned_render_area; 2321 memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D)); 2322 2323 /* If needs_subpass_resume is true it means that we emitted the meta 2324 * operation in its own job (possibly with an RT config that is 2325 * incompatible with the current subpass), so resuming subpass execution 2326 * after it requires that we create a new job with the subpass RT setup. 2327 */ 2328 if (needs_subpass_resume) 2329 v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx); 2330 } else { 2331 state->subpass_idx = -1; 2332 } 2333 2334 if (state->meta.gfx.pipeline != NULL) { 2335 struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline; 2336 VkPipelineBindPoint pipeline_binding = 2337 v3dv_pipeline_get_binding_point(pipeline); 2338 v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer), 2339 pipeline_binding, 2340 v3dv_pipeline_to_handle(state->meta.gfx.pipeline)); 2341 } else { 2342 state->gfx.pipeline = NULL; 2343 } 2344 2345 if (dirty_dynamic_state) { 2346 memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic)); 2347 state->dirty |= dirty_dynamic_state; 2348 } 2349 2350 if (state->meta.has_descriptor_state) { 2351 if (state->meta.gfx.descriptor_state.valid != 0) { 2352 memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state, 2353 sizeof(state->gfx.descriptor_state)); 2354 } else { 2355 state->gfx.descriptor_state.valid = 0; 2356 } 2357 } 2358 2359 memcpy(cmd_buffer->push_constants_data, state->meta.push_constants, 2360 sizeof(state->meta.push_constants)); 2361 2362 state->meta.gfx.pipeline = NULL; 2363 state->meta.framebuffer = VK_NULL_HANDLE; 2364 state->meta.pass = VK_NULL_HANDLE; 2365 state->meta.subpass_idx = -1; 2366 state->meta.has_descriptor_state = false; 2367} 2368 2369static struct v3dv_job * 2370cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer) 2371{ 2372 struct v3dv_job *job = cmd_buffer->state.job; 2373 assert(job); 2374 2375 /* If the job has been flagged with 'always_flush' and it has already 2376 * recorded any draw calls then we need to start a new job for it. 2377 */ 2378 if (job->always_flush && job->draw_count > 0) { 2379 assert(cmd_buffer->state.pass); 2380 /* First, flag the current job as not being the last in the 2381 * current subpass 2382 */ 2383 job->is_subpass_finish = false; 2384 2385 /* Now start a new job in the same subpass and flag it as continuing 2386 * the current subpass. 
2387 */ 2388 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, 2389 cmd_buffer->state.subpass_idx); 2390 assert(job->draw_count == 0); 2391 2392 /* Inherit the 'always flush' behavior */ 2393 job->always_flush = true; 2394 } 2395 2396 assert(job->draw_count == 0 || !job->always_flush); 2397 return job; 2398} 2399 2400/** 2401 * The Vulkan spec states: 2402 * 2403 * "It is legal for a subpass to use no color or depth/stencil 2404 * attachments (...) This kind of subpass can use shader side effects such 2405 * as image stores and atomics to produce an output. In this case, the 2406 * subpass continues to use the width, height, and layers of the framebuffer 2407 * to define the dimensions of the rendering area, and the 2408 * rasterizationSamples from each pipeline’s 2409 * VkPipelineMultisampleStateCreateInfo to define the number of samples used 2410 * in rasterization." 2411 * 2412 * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we 2413 * emit when we start a new frame at the begining of a subpass. At that point, 2414 * if the framebuffer doesn't have any attachments we won't enable MSAA and 2415 * the job won't be valid in the scenario described by the spec. 2416 * 2417 * This function is intended to be called before a draw call and will test if 2418 * we are in that scenario, in which case, it will restart the current job 2419 * with MSAA enabled. 2420 */ 2421static void 2422cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) 2423{ 2424 assert(cmd_buffer->state.job); 2425 2426 /* We don't support variableMultisampleRate so we know that all pipelines 2427 * bound in the same subpass must have matching number of samples, so we 2428 * can do this check only on the first draw call. 2429 */ 2430 if (cmd_buffer->state.job->draw_count > 0) 2431 return; 2432 2433 /* We only need to restart the frame if the pipeline requires MSAA but 2434 * our frame tiling didn't enable it. 2435 */ 2436 if (!cmd_buffer->state.gfx.pipeline->msaa || 2437 cmd_buffer->state.job->frame_tiling.msaa) { 2438 return; 2439 } 2440 2441 /* FIXME: Secondary command buffers don't start frames. Instead, they are 2442 * recorded into primary jobs that start them. For secondaries, we should 2443 * still handle this scenario, but we should do that when we record them 2444 * into primaries by testing if any of the secondaries has multisampled 2445 * draw calls in them, and then using that info to decide if we need to 2446 * restart the primary job into which they are being recorded. 
2447 */ 2448 if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) 2449 return; 2450 2451 /* Drop the current job and restart it with MSAA enabled */ 2452 struct v3dv_job *old_job = cmd_buffer->state.job; 2453 cmd_buffer->state.job = NULL; 2454 2455 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc, 2456 sizeof(struct v3dv_job), 8, 2457 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 2458 if (!job) { 2459 v3dv_flag_oom(cmd_buffer, NULL); 2460 return; 2461 } 2462 2463 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer, 2464 cmd_buffer->state.subpass_idx); 2465 cmd_buffer->state.job = job; 2466 2467 v3dv_job_start_frame(job, 2468 old_job->frame_tiling.width, 2469 old_job->frame_tiling.height, 2470 old_job->frame_tiling.layers, 2471 true, 2472 old_job->frame_tiling.render_target_count, 2473 old_job->frame_tiling.internal_bpp, 2474 true /* msaa */); 2475 2476 v3dv_job_destroy(old_job); 2477} 2478 2479void 2480v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer) 2481{ 2482 assert(cmd_buffer->state.gfx.pipeline); 2483 assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); 2484 2485 /* If we emitted a pipeline barrier right before this draw we won't have 2486 * an active job. In that case, create a new job continuing the current 2487 * subpass. 2488 */ 2489 if (!cmd_buffer->state.job) { 2490 v3dv_cmd_buffer_subpass_resume(cmd_buffer, 2491 cmd_buffer->state.subpass_idx); 2492 } 2493 2494 /* Restart single sample job for MSAA pipeline if needed */ 2495 cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer); 2496 2497 /* If the job is configured to flush on every draw call we need to create 2498 * a new job now. 2499 */ 2500 struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer); 2501 job->draw_count++; 2502 2503 /* GL shader state binds shaders, uniform and vertex attribute state. The 2504 * compiler injects uniforms to handle some descriptor types (such as 2505 * textures), so we need to regenerate that state when descriptor state changes. 2506 * 2507 * We also need to emit new shader state if we have a dirty viewport since 2508 * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*. 
2509 */ 2510 uint32_t *dirty = &cmd_buffer->state.dirty; 2511 2512 const uint32_t dirty_uniform_state = 2513 *dirty & (V3DV_CMD_DIRTY_PIPELINE | 2514 V3DV_CMD_DIRTY_PUSH_CONSTANTS | 2515 V3DV_CMD_DIRTY_DESCRIPTOR_SETS | 2516 V3DV_CMD_DIRTY_VIEWPORT | 2517 V3DV_CMD_DIRTY_VIEW_INDEX); 2518 2519 if (dirty_uniform_state) 2520 update_gfx_uniform_state(cmd_buffer, dirty_uniform_state); 2521 2522 struct v3dv_device *device = cmd_buffer->device; 2523 2524 if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER)) 2525 v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer); 2526 2527 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { 2528 v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer); 2529 v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer); 2530 } 2531 2532 if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) { 2533 emit_scissor(cmd_buffer); 2534 } 2535 2536 if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) { 2537 v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer); 2538 } 2539 2540 if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER) 2541 v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer); 2542 2543 const uint32_t dynamic_stencil_dirty_flags = 2544 V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | 2545 V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | 2546 V3DV_CMD_DIRTY_STENCIL_REFERENCE; 2547 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags)) 2548 v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer); 2549 2550 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) 2551 v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); 2552 2553 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) 2554 v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); 2555 2556 if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY) 2557 v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer); 2558 2559 if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH) 2560 v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer); 2561 2562 if (*dirty & V3DV_CMD_DIRTY_PIPELINE) 2563 v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer); 2564 2565 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE)) 2566 v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer); 2567 2568 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE; 2569} 2570 2571static inline void 2572cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer, 2573 uint32_t view_index) 2574{ 2575 cmd_buffer->state.view_index = view_index; 2576 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; 2577} 2578 2579static void 2580cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, 2581 struct v3dv_draw_info *info) 2582{ 2583 2584 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2585 if (likely(!pass->multiview_enabled)) { 2586 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2587 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); 2588 return; 2589 } 2590 2591 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2592 while (view_mask) { 2593 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2594 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2595 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); 2596 } 2597} 2598 2599VKAPI_ATTR void VKAPI_CALL 2600v3dv_CmdDraw(VkCommandBuffer commandBuffer, 2601 uint32_t vertexCount, 2602 uint32_t instanceCount, 2603 uint32_t firstVertex, 2604 uint32_t firstInstance) 2605{ 2606 if (vertexCount == 0 || instanceCount == 0) 2607 return; 2608 2609 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2610 struct 
v3dv_draw_info info = {}; 2611 info.vertex_count = vertexCount; 2612 info.instance_count = instanceCount; 2613 info.first_instance = firstInstance; 2614 info.first_vertex = firstVertex; 2615 2616 cmd_buffer_draw(cmd_buffer, &info); 2617} 2618 2619VKAPI_ATTR void VKAPI_CALL 2620v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, 2621 uint32_t indexCount, 2622 uint32_t instanceCount, 2623 uint32_t firstIndex, 2624 int32_t vertexOffset, 2625 uint32_t firstInstance) 2626{ 2627 if (indexCount == 0 || instanceCount == 0) 2628 return; 2629 2630 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2631 2632 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2633 if (likely(!pass->multiview_enabled)) { 2634 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2635 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) 2636 (cmd_buffer, indexCount, instanceCount, 2637 firstIndex, vertexOffset, firstInstance); 2638 return; 2639 } 2640 2641 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2642 while (view_mask) { 2643 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2644 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2645 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) 2646 (cmd_buffer, indexCount, instanceCount, 2647 firstIndex, vertexOffset, firstInstance); 2648 } 2649} 2650 2651VKAPI_ATTR void VKAPI_CALL 2652v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, 2653 VkBuffer _buffer, 2654 VkDeviceSize offset, 2655 uint32_t drawCount, 2656 uint32_t stride) 2657{ 2658 /* drawCount is the number of draws to execute, and can be zero. */ 2659 if (drawCount == 0) 2660 return; 2661 2662 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2663 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 2664 2665 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2666 if (likely(!pass->multiview_enabled)) { 2667 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2668 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) 2669 (cmd_buffer, buffer, offset, drawCount, stride); 2670 return; 2671 } 2672 2673 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2674 while (view_mask) { 2675 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2676 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2677 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) 2678 (cmd_buffer, buffer, offset, drawCount, stride); 2679 } 2680} 2681 2682VKAPI_ATTR void VKAPI_CALL 2683v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, 2684 VkBuffer _buffer, 2685 VkDeviceSize offset, 2686 uint32_t drawCount, 2687 uint32_t stride) 2688{ 2689 /* drawCount is the number of draws to execute, and can be zero. 
*/ 2690 if (drawCount == 0) 2691 return; 2692 2693 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2694 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 2695 2696 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2697 if (likely(!pass->multiview_enabled)) { 2698 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2699 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) 2700 (cmd_buffer, buffer, offset, drawCount, stride); 2701 return; 2702 } 2703 2704 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2705 while (view_mask) { 2706 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2707 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer); 2708 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) 2709 (cmd_buffer, buffer, offset, drawCount, stride); 2710 } 2711} 2712 2713VKAPI_ATTR void VKAPI_CALL 2714v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, 2715 VkPipelineStageFlags srcStageMask, 2716 VkPipelineStageFlags dstStageMask, 2717 VkDependencyFlags dependencyFlags, 2718 uint32_t memoryBarrierCount, 2719 const VkMemoryBarrier *pMemoryBarriers, 2720 uint32_t bufferBarrierCount, 2721 const VkBufferMemoryBarrier *pBufferBarriers, 2722 uint32_t imageBarrierCount, 2723 const VkImageMemoryBarrier *pImageBarriers) 2724{ 2725 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2726 2727 /* We only care about barriers between GPU jobs */ 2728 if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT || 2729 dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) { 2730 return; 2731 } 2732 2733 /* If we have a recording job, finish it here */ 2734 struct v3dv_job *job = cmd_buffer->state.job; 2735 if (job) 2736 v3dv_cmd_buffer_finish_job(cmd_buffer); 2737 2738 cmd_buffer->state.has_barrier = true; 2739 if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | 2740 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 2741 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 2742 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 2743 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | 2744 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) { 2745 cmd_buffer->state.has_bcl_barrier = true; 2746 } 2747} 2748 2749VKAPI_ATTR void VKAPI_CALL 2750v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, 2751 uint32_t firstBinding, 2752 uint32_t bindingCount, 2753 const VkBuffer *pBuffers, 2754 const VkDeviceSize *pOffsets) 2755{ 2756 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2757 struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; 2758 2759 /* We have to defer setting up vertex buffer since we need the buffer 2760 * stride from the pipeline. 
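 * Here we only record the buffer/offset pairs and set V3DV_CMD_DIRTY_VERTEX_BUFFER; the actual vertex attribute state is emitted later with the GL shader state at draw time.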
2761 */ 2762 2763 assert(firstBinding + bindingCount <= MAX_VBS); 2764 bool vb_state_changed = false; 2765 for (uint32_t i = 0; i < bindingCount; i++) { 2766 if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) { 2767 vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]); 2768 vb_state_changed = true; 2769 } 2770 if (vb[firstBinding + i].offset != pOffsets[i]) { 2771 vb[firstBinding + i].offset = pOffsets[i]; 2772 vb_state_changed = true; 2773 } 2774 } 2775 2776 if (vb_state_changed) 2777 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER; 2778} 2779 2780static uint32_t 2781get_index_size(VkIndexType index_type) 2782{ 2783 switch (index_type) { 2784 case VK_INDEX_TYPE_UINT8_EXT: 2785 return 1; 2786 break; 2787 case VK_INDEX_TYPE_UINT16: 2788 return 2; 2789 break; 2790 case VK_INDEX_TYPE_UINT32: 2791 return 4; 2792 break; 2793 default: 2794 unreachable("Unsupported index type"); 2795 } 2796} 2797 2798VKAPI_ATTR void VKAPI_CALL 2799v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, 2800 VkBuffer buffer, 2801 VkDeviceSize offset, 2802 VkIndexType indexType) 2803{ 2804 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2805 2806 const uint32_t index_size = get_index_size(indexType); 2807 if (buffer == cmd_buffer->state.index_buffer.buffer && 2808 offset == cmd_buffer->state.index_buffer.offset && 2809 index_size == cmd_buffer->state.index_buffer.index_size) { 2810 return; 2811 } 2812 2813 cmd_buffer->state.index_buffer.buffer = buffer; 2814 cmd_buffer->state.index_buffer.offset = offset; 2815 cmd_buffer->state.index_buffer.index_size = index_size; 2816 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER; 2817} 2818 2819VKAPI_ATTR void VKAPI_CALL 2820v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, 2821 VkStencilFaceFlags faceMask, 2822 uint32_t compareMask) 2823{ 2824 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2825 2826 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 2827 cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff; 2828 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 2829 cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff; 2830 2831 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; 2832} 2833 2834VKAPI_ATTR void VKAPI_CALL 2835v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, 2836 VkStencilFaceFlags faceMask, 2837 uint32_t writeMask) 2838{ 2839 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2840 2841 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 2842 cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff; 2843 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 2844 cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff; 2845 2846 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; 2847} 2848 2849VKAPI_ATTR void VKAPI_CALL 2850v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer, 2851 VkStencilFaceFlags faceMask, 2852 uint32_t reference) 2853{ 2854 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2855 2856 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 2857 cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff; 2858 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 2859 cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff; 2860 2861 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; 2862} 2863 2864VKAPI_ATTR void VKAPI_CALL 2865v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer, 2866 float depthBiasConstantFactor, 2867 float depthBiasClamp, 2868 float 
depthBiasSlopeFactor) 2869{ 2870 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2871 2872 cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor; 2873 cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp; 2874 cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor; 2875 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; 2876} 2877 2878VKAPI_ATTR void VKAPI_CALL 2879v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, 2880 float minDepthBounds, 2881 float maxDepthBounds) 2882{ 2883 /* We do not support depth bounds testing so we just ingore this. We are 2884 * already asserting that pipelines don't enable the feature anyway. 2885 */ 2886} 2887 2888VKAPI_ATTR void VKAPI_CALL 2889v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer, 2890 float lineWidth) 2891{ 2892 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2893 2894 cmd_buffer->state.dynamic.line_width = lineWidth; 2895 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; 2896} 2897 2898VKAPI_ATTR void VKAPI_CALL 2899v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, 2900 VkPipelineBindPoint pipelineBindPoint, 2901 VkPipelineLayout _layout, 2902 uint32_t firstSet, 2903 uint32_t descriptorSetCount, 2904 const VkDescriptorSet *pDescriptorSets, 2905 uint32_t dynamicOffsetCount, 2906 const uint32_t *pDynamicOffsets) 2907{ 2908 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2909 V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout); 2910 2911 uint32_t dyn_index = 0; 2912 2913 assert(firstSet + descriptorSetCount <= MAX_SETS); 2914 2915 struct v3dv_descriptor_state *descriptor_state = 2916 pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ? 2917 &cmd_buffer->state.compute.descriptor_state : 2918 &cmd_buffer->state.gfx.descriptor_state; 2919 2920 VkShaderStageFlags dirty_stages = 0; 2921 bool descriptor_state_changed = false; 2922 for (uint32_t i = 0; i < descriptorSetCount; i++) { 2923 V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]); 2924 uint32_t index = firstSet + i; 2925 2926 descriptor_state->valid |= (1u << index); 2927 if (descriptor_state->descriptor_sets[index] != set) { 2928 descriptor_state->descriptor_sets[index] = set; 2929 dirty_stages |= set->layout->shader_stages; 2930 descriptor_state_changed = true; 2931 } 2932 2933 for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) { 2934 uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start; 2935 2936 if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) { 2937 descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index]; 2938 dirty_stages |= set->layout->shader_stages; 2939 descriptor_state_changed = true; 2940 } 2941 } 2942 } 2943 2944 if (descriptor_state_changed) { 2945 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { 2946 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS; 2947 cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 2948 } else { 2949 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS; 2950 cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT; 2951 } 2952 } 2953} 2954 2955VKAPI_ATTR void VKAPI_CALL 2956v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, 2957 VkPipelineLayout layout, 2958 VkShaderStageFlags stageFlags, 2959 uint32_t offset, 2960 uint32_t size, 2961 const void *pValues) 2962{ 2963 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2964 2965 if (!memcmp((uint8_t 
*) cmd_buffer->push_constants_data + offset, pValues, size)) 2966 return; 2967 2968 memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size); 2969 2970 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS; 2971 cmd_buffer->state.dirty_push_constants_stages |= stageFlags; 2972} 2973 2974VKAPI_ATTR void VKAPI_CALL 2975v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, 2976 const float blendConstants[4]) 2977{ 2978 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2979 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2980 2981 if (!memcmp(state->dynamic.blend_constants, blendConstants, 2982 sizeof(state->dynamic.blend_constants))) { 2983 return; 2984 } 2985 2986 memcpy(state->dynamic.blend_constants, blendConstants, 2987 sizeof(state->dynamic.blend_constants)); 2988 2989 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; 2990} 2991 2992VKAPI_ATTR void VKAPI_CALL 2993v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, 2994 uint32_t attachmentCount, 2995 const VkBool32 *pColorWriteEnables) 2996{ 2997 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2998 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2999 uint32_t color_write_enable = 0; 3000 3001 for (uint32_t i = 0; i < attachmentCount; i++) 3002 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0; 3003 3004 if (state->dynamic.color_write_enable == color_write_enable) 3005 return; 3006 3007 state->dynamic.color_write_enable = color_write_enable; 3008 3009 state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; 3010} 3011 3012void 3013v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, 3014 struct v3dv_query_pool *pool, 3015 uint32_t first, 3016 uint32_t count) 3017{ 3018 /* Resets can only happen outside a render pass instance so we should not 3019 * be in the middle of job recording. 
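 * Instead of resetting anything here, we record a CPU job (V3DV_JOB_TYPE_CPU_RESET_QUERIES) that the queue processes at submit time.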
3020 */ 3021 assert(cmd_buffer->state.pass == NULL); 3022 assert(cmd_buffer->state.job == NULL); 3023 3024 assert(first < pool->query_count); 3025 assert(first + count <= pool->query_count); 3026 3027 struct v3dv_job *job = 3028 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3029 V3DV_JOB_TYPE_CPU_RESET_QUERIES, 3030 cmd_buffer, -1); 3031 v3dv_return_if_oom(cmd_buffer, NULL); 3032 3033 job->cpu.query_reset.pool = pool; 3034 job->cpu.query_reset.first = first; 3035 job->cpu.query_reset.count = count; 3036 3037 list_addtail(&job->list_link, &cmd_buffer->jobs); 3038} 3039 3040void 3041v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, 3042 uint32_t slot_size, 3043 uint32_t used_count, 3044 uint32_t *alloc_count, 3045 void **ptr) 3046{ 3047 if (used_count >= *alloc_count) { 3048 const uint32_t prev_slot_count = *alloc_count; 3049 void *old_buffer = *ptr; 3050 3051 const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4); 3052 const uint32_t bytes = new_slot_count * slot_size; 3053 *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8, 3054 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3055 if (*ptr == NULL) { 3056 fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n"); 3057 v3dv_flag_oom(cmd_buffer, NULL); 3058 return; 3059 } 3060 3061 memcpy(*ptr, old_buffer, prev_slot_count * slot_size); 3062 *alloc_count = new_slot_count; 3063 } 3064 assert(used_count < *alloc_count); 3065} 3066 3067void 3068v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, 3069 struct v3dv_query_pool *pool, 3070 uint32_t query, 3071 VkQueryControlFlags flags) 3072{ 3073 /* FIXME: we only support one active query for now */ 3074 assert(cmd_buffer->state.query.active_query.bo == NULL); 3075 assert(query < pool->query_count); 3076 3077 cmd_buffer->state.query.active_query.bo = pool->queries[query].bo; 3078 cmd_buffer->state.query.active_query.offset = pool->queries[query].offset; 3079 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; 3080} 3081 3082void 3083v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer, 3084 struct v3dv_query_pool *pool, 3085 uint32_t query) 3086{ 3087 assert(query < pool->query_count); 3088 assert(cmd_buffer->state.query.active_query.bo != NULL); 3089 3090 if (cmd_buffer->state.pass) { 3091 /* Queue the EndQuery in the command buffer state, we will create a CPU 3092 * job to flag all of these queries as possibly available right after the 3093 * render pass job in which they have been recorded. 3094 */ 3095 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 3096 v3dv_cmd_buffer_ensure_array_state(cmd_buffer, 3097 sizeof(struct v3dv_end_query_cpu_job_info), 3098 state->query.end.used_count, 3099 &state->query.end.alloc_count, 3100 (void **) &state->query.end.states); 3101 v3dv_return_if_oom(cmd_buffer, NULL); 3102 3103 struct v3dv_end_query_cpu_job_info *info = 3104 &state->query.end.states[state->query.end.used_count++]; 3105 3106 info->pool = pool; 3107 info->query = query; 3108 3109 /* From the Vulkan spec: 3110 * 3111 * "If queries are used while executing a render pass instance that has 3112 * multiview enabled, the query uses N consecutive query indices in 3113 * the query pool (starting at query) where N is the number of bits set 3114 * in the view mask in the subpass the query is used in. How the 3115 * numerical results of the query are distributed among the queries is 3116 * implementation-dependent." 
3117 * 3118 * In our case, only the first query is used but this means we still need 3119 * to flag the other queries as available so we don't emit errors when 3120 * the applications attempt to retrive values from them. 3121 */ 3122 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 3123 if (!pass->multiview_enabled) { 3124 info->count = 1; 3125 } else { 3126 struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; 3127 info->count = util_bitcount(subpass->view_mask); 3128 } 3129 } else { 3130 /* Otherwise, schedule the CPU job immediately */ 3131 struct v3dv_job *job = 3132 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3133 V3DV_JOB_TYPE_CPU_END_QUERY, 3134 cmd_buffer, -1); 3135 v3dv_return_if_oom(cmd_buffer, NULL); 3136 3137 job->cpu.query_end.pool = pool; 3138 job->cpu.query_end.query = query; 3139 3140 /* Multiview queries cannot cross subpass boundaries */ 3141 job->cpu.query_end.count = 1; 3142 3143 list_addtail(&job->list_link, &cmd_buffer->jobs); 3144 } 3145 3146 cmd_buffer->state.query.active_query.bo = NULL; 3147 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; 3148} 3149 3150void 3151v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, 3152 struct v3dv_query_pool *pool, 3153 uint32_t first, 3154 uint32_t count, 3155 struct v3dv_buffer *dst, 3156 uint32_t offset, 3157 uint32_t stride, 3158 VkQueryResultFlags flags) 3159{ 3160 /* Copies can only happen outside a render pass instance so we should not 3161 * be in the middle of job recording. 3162 */ 3163 assert(cmd_buffer->state.pass == NULL); 3164 assert(cmd_buffer->state.job == NULL); 3165 3166 assert(first < pool->query_count); 3167 assert(first + count <= pool->query_count); 3168 3169 struct v3dv_job *job = 3170 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3171 V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, 3172 cmd_buffer, -1); 3173 v3dv_return_if_oom(cmd_buffer, NULL); 3174 3175 job->cpu.query_copy_results.pool = pool; 3176 job->cpu.query_copy_results.first = first; 3177 job->cpu.query_copy_results.count = count; 3178 job->cpu.query_copy_results.dst = dst; 3179 job->cpu.query_copy_results.offset = offset; 3180 job->cpu.query_copy_results.stride = stride; 3181 job->cpu.query_copy_results.flags = flags; 3182 3183 list_addtail(&job->list_link, &cmd_buffer->jobs); 3184} 3185 3186void 3187v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, 3188 struct drm_v3d_submit_tfu *tfu) 3189{ 3190 struct v3dv_device *device = cmd_buffer->device; 3191 struct v3dv_job *job = vk_zalloc(&device->vk.alloc, 3192 sizeof(struct v3dv_job), 8, 3193 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3194 if (!job) { 3195 v3dv_flag_oom(cmd_buffer, NULL); 3196 return; 3197 } 3198 3199 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1); 3200 job->tfu = *tfu; 3201 list_addtail(&job->list_link, &cmd_buffer->jobs); 3202} 3203 3204VKAPI_ATTR void VKAPI_CALL 3205v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, 3206 VkEvent _event, 3207 VkPipelineStageFlags stageMask) 3208{ 3209 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3210 V3DV_FROM_HANDLE(v3dv_event, event, _event); 3211 3212 /* Event (re)sets can only happen outside a render pass instance so we 3213 * should not be in the middle of job recording. 
3214 */ 3215 assert(cmd_buffer->state.pass == NULL); 3216 assert(cmd_buffer->state.job == NULL); 3217 3218 struct v3dv_job *job = 3219 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3220 V3DV_JOB_TYPE_CPU_SET_EVENT, 3221 cmd_buffer, -1); 3222 v3dv_return_if_oom(cmd_buffer, NULL); 3223 3224 job->cpu.event_set.event = event; 3225 job->cpu.event_set.state = 1; 3226 3227 list_addtail(&job->list_link, &cmd_buffer->jobs); 3228} 3229 3230VKAPI_ATTR void VKAPI_CALL 3231v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, 3232 VkEvent _event, 3233 VkPipelineStageFlags stageMask) 3234{ 3235 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3236 V3DV_FROM_HANDLE(v3dv_event, event, _event); 3237 3238 /* Event (re)sets can only happen outside a render pass instance so we 3239 * should not be in the middle of job recording. 3240 */ 3241 assert(cmd_buffer->state.pass == NULL); 3242 assert(cmd_buffer->state.job == NULL); 3243 3244 struct v3dv_job *job = 3245 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3246 V3DV_JOB_TYPE_CPU_SET_EVENT, 3247 cmd_buffer, -1); 3248 v3dv_return_if_oom(cmd_buffer, NULL); 3249 3250 job->cpu.event_set.event = event; 3251 job->cpu.event_set.state = 0; 3252 3253 list_addtail(&job->list_link, &cmd_buffer->jobs); 3254} 3255 3256VKAPI_ATTR void VKAPI_CALL 3257v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, 3258 uint32_t eventCount, 3259 const VkEvent *pEvents, 3260 VkPipelineStageFlags srcStageMask, 3261 VkPipelineStageFlags dstStageMask, 3262 uint32_t memoryBarrierCount, 3263 const VkMemoryBarrier *pMemoryBarriers, 3264 uint32_t bufferMemoryBarrierCount, 3265 const VkBufferMemoryBarrier *pBufferMemoryBarriers, 3266 uint32_t imageMemoryBarrierCount, 3267 const VkImageMemoryBarrier *pImageMemoryBarriers) 3268{ 3269 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3270 3271 assert(eventCount > 0); 3272 3273 struct v3dv_job *job = 3274 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3275 V3DV_JOB_TYPE_CPU_WAIT_EVENTS, 3276 cmd_buffer, -1); 3277 v3dv_return_if_oom(cmd_buffer, NULL); 3278 3279 const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount; 3280 3281 job->cpu.event_wait.events = 3282 vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8, 3283 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3284 if (!job->cpu.event_wait.events) { 3285 v3dv_flag_oom(cmd_buffer, NULL); 3286 return; 3287 } 3288 job->cpu.event_wait.event_count = eventCount; 3289 3290 for (uint32_t i = 0; i < eventCount; i++) 3291 job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]); 3292 3293 /* vkCmdWaitEvents can be recorded inside a render pass, so we might have 3294 * an active job. 3295 * 3296 * If we are inside a render pass, because we vkCmd(Re)SetEvent can't happen 3297 * inside a render pass, it is safe to move the wait job so it happens right 3298 * before the current job we are currently recording for the subpass, if any 3299 * (it would actually be safe to move it all the way back to right before 3300 * the start of the render pass). 3301 * 3302 * If we are outside a render pass then we should not have any on-going job 3303 * and we are free to just add the wait job without restrictions. 
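 * In both cases it is enough to append the wait job to the command buffer's job list, which is what we do below.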
3304 */ 3305 assert(cmd_buffer->state.pass || !cmd_buffer->state.job); 3306 list_addtail(&job->list_link, &cmd_buffer->jobs); 3307} 3308 3309VKAPI_ATTR void VKAPI_CALL 3310v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, 3311 VkPipelineStageFlagBits pipelineStage, 3312 VkQueryPool queryPool, 3313 uint32_t query) 3314{ 3315 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3316 V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool); 3317 3318 /* If this is called inside a render pass we need to finish the current 3319 * job here... 3320 */ 3321 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 3322 if (pass) 3323 v3dv_cmd_buffer_finish_job(cmd_buffer); 3324 3325 struct v3dv_job *job = 3326 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3327 V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, 3328 cmd_buffer, -1); 3329 v3dv_return_if_oom(cmd_buffer, NULL); 3330 3331 job->cpu.query_timestamp.pool = query_pool; 3332 job->cpu.query_timestamp.query = query; 3333 3334 if (!pass || !pass->multiview_enabled) { 3335 job->cpu.query_timestamp.count = 1; 3336 } else { 3337 struct v3dv_subpass *subpass = 3338 &pass->subpasses[cmd_buffer->state.subpass_idx]; 3339 job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask); 3340 } 3341 3342 list_addtail(&job->list_link, &cmd_buffer->jobs); 3343 cmd_buffer->state.job = NULL; 3344 3345 /* ...and resume the subpass after the timestamp */ 3346 if (cmd_buffer->state.pass) 3347 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx); 3348} 3349 3350static void 3351cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) 3352{ 3353 assert(cmd_buffer->state.compute.pipeline); 3354 assert(cmd_buffer->state.compute.pipeline->active_stages == 3355 VK_SHADER_STAGE_COMPUTE_BIT); 3356 3357 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE | 3358 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS); 3359 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3360 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3361} 3362 3363#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 3364#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 3365/* Allow this dispatch to start while the last one is still running. */ 3366#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) 3367/* Maximum supergroup ID. 6 bits. */ 3368#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 3369/* Batches per supergroup minus 1. 8 bits. 
*/ 3370#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 3371/* Workgroups per supergroup, 0 means 16 */ 3372#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 3373#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 3374 3375#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) 3376#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) 3377#define V3D_CSD_CFG5_THREADING (1 << 0) 3378 3379void 3380v3dv_cmd_buffer_rewrite_indirect_csd_job( 3381 struct v3dv_csd_indirect_cpu_job_info *info, 3382 const uint32_t *wg_counts) 3383{ 3384 assert(info->csd_job); 3385 struct v3dv_job *job = info->csd_job; 3386 3387 assert(job->type == V3DV_JOB_TYPE_GPU_CSD); 3388 assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0); 3389 3390 struct drm_v3d_submit_csd *submit = &job->csd.submit; 3391 3392 job->csd.wg_count[0] = wg_counts[0]; 3393 job->csd.wg_count[1] = wg_counts[1]; 3394 job->csd.wg_count[2] = wg_counts[2]; 3395 3396 submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3397 submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3398 submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3399 3400 submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * 3401 (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; 3402 assert(submit->cfg[4] != ~0); 3403 3404 if (info->needs_wg_uniform_rewrite) { 3405 /* Make sure the GPU is not currently accessing the indirect CL for this 3406 * job, since we are about to overwrite some of the uniform data. 3407 */ 3408 v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE); 3409 3410 for (uint32_t i = 0; i < 3; i++) { 3411 if (info->wg_uniform_offsets[i]) { 3412 /* Sanity check that our uniform pointers are within the allocated 3413 * BO space for our indirect CL. 3414 */ 3415 assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base); 3416 assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next); 3417 *(info->wg_uniform_offsets[i]) = wg_counts[i]; 3418 } 3419 } 3420 } 3421} 3422 3423static struct v3dv_job * 3424cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, 3425 uint32_t base_offset_x, 3426 uint32_t base_offset_y, 3427 uint32_t base_offset_z, 3428 uint32_t group_count_x, 3429 uint32_t group_count_y, 3430 uint32_t group_count_z, 3431 uint32_t **wg_uniform_offsets_out, 3432 uint32_t *wg_size_out) 3433{ 3434 struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; 3435 assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); 3436 struct v3dv_shader_variant *cs_variant = 3437 pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]; 3438 3439 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc, 3440 sizeof(struct v3dv_job), 8, 3441 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3442 if (!job) { 3443 v3dv_flag_oom(cmd_buffer, NULL); 3444 return NULL; 3445 } 3446 3447 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1); 3448 cmd_buffer->state.job = job; 3449 3450 struct drm_v3d_submit_csd *submit = &job->csd.submit; 3451 3452 job->csd.wg_count[0] = group_count_x; 3453 job->csd.wg_count[1] = group_count_y; 3454 job->csd.wg_count[2] = group_count_z; 3455 3456 job->csd.wg_base[0] = base_offset_x; 3457 job->csd.wg_base[1] = base_offset_y; 3458 job->csd.wg_base[2] = base_offset_z; 3459 3460 submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3461 submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3462 submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3463 3464 const struct v3d_compute_prog_data *cpd = 3465 cs_variant->prog_data.cs; 3466 3467 
const uint32_t num_wgs = group_count_x * group_count_y * group_count_z; 3468 const uint32_t wg_size = cpd->local_size[0] * 3469 cpd->local_size[1] * 3470 cpd->local_size[2]; 3471 3472 uint32_t wgs_per_sg = 3473 v3d_csd_choose_workgroups_per_supergroup( 3474 &cmd_buffer->device->devinfo, 3475 cs_variant->prog_data.cs->has_subgroups, 3476 cs_variant->prog_data.cs->base.has_control_barrier, 3477 cs_variant->prog_data.cs->base.threads, 3478 num_wgs, wg_size); 3479 3480 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); 3481 uint32_t whole_sgs = num_wgs / wgs_per_sg; 3482 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; 3483 uint32_t num_batches = batches_per_sg * whole_sgs + 3484 DIV_ROUND_UP(rem_wgs * wg_size, 16); 3485 3486 submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; 3487 submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; 3488 submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; 3489 if (wg_size_out) 3490 *wg_size_out = wg_size; 3491 3492 submit->cfg[4] = num_batches - 1; 3493 assert(submit->cfg[4] != ~0); 3494 3495 assert(pipeline->shared_data->assembly_bo); 3496 struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; 3497 3498 submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; 3499 submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; 3500 if (cs_variant->prog_data.base->single_seg) 3501 submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; 3502 if (cs_variant->prog_data.base->threads == 4) 3503 submit->cfg[5] |= V3D_CSD_CFG5_THREADING; 3504 3505 if (cs_variant->prog_data.cs->shared_size > 0) { 3506 job->csd.shared_memory = 3507 v3dv_bo_alloc(cmd_buffer->device, 3508 cs_variant->prog_data.cs->shared_size * wgs_per_sg, 3509 "shared_vars", true); 3510 if (!job->csd.shared_memory) { 3511 v3dv_flag_oom(cmd_buffer, NULL); 3512 return job; 3513 } 3514 } 3515 3516 v3dv_job_add_bo_unchecked(job, cs_assembly_bo); 3517 struct v3dv_cl_reloc uniforms = 3518 v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, 3519 cs_variant, 3520 wg_uniform_offsets_out); 3521 submit->cfg[6] = uniforms.bo->offset + uniforms.offset; 3522 3523 v3dv_job_add_bo(job, uniforms.bo); 3524 3525 return job; 3526} 3527 3528static void 3529cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer, 3530 uint32_t base_offset_x, 3531 uint32_t base_offset_y, 3532 uint32_t base_offset_z, 3533 uint32_t group_count_x, 3534 uint32_t group_count_y, 3535 uint32_t group_count_z) 3536{ 3537 if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0) 3538 return; 3539 3540 struct v3dv_job *job = 3541 cmd_buffer_create_csd_job(cmd_buffer, 3542 base_offset_x, 3543 base_offset_y, 3544 base_offset_z, 3545 group_count_x, 3546 group_count_y, 3547 group_count_z, 3548 NULL, NULL); 3549 3550 list_addtail(&job->list_link, &cmd_buffer->jobs); 3551 cmd_buffer->state.job = NULL; 3552} 3553 3554VKAPI_ATTR void VKAPI_CALL 3555v3dv_CmdDispatch(VkCommandBuffer commandBuffer, 3556 uint32_t groupCountX, 3557 uint32_t groupCountY, 3558 uint32_t groupCountZ) 3559{ 3560 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3561 3562 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3563 cmd_buffer_dispatch(cmd_buffer, 0, 0, 0, 3564 groupCountX, groupCountY, groupCountZ); 3565} 3566 3567VKAPI_ATTR void VKAPI_CALL 3568v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer, 3569 uint32_t baseGroupX, 3570 uint32_t baseGroupY, 3571 uint32_t baseGroupZ, 3572 uint32_t groupCountX, 3573 uint32_t groupCountY, 3574 uint32_t groupCountZ) 3575{ 3576 
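   /* vkCmdDispatchBase (Vulkan 1.1) is the same as vkCmdDispatch above except
    * that the dispatch starts at a non-zero base workgroup; vkCmdDispatch(cb,
    * x, y, z) is equivalent to vkCmdDispatchBase(cb, 0, 0, 0, x, y, z). The
    * base offsets are recorded in job->csd.wg_base by
    * cmd_buffer_create_csd_job() and are presumably applied through the
    * V3D_CSD_CFG012_WG_OFFSET_SHIFT fields when the job is submitted.
    */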
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3577 3578 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3579 cmd_buffer_dispatch(cmd_buffer, 3580 baseGroupX, baseGroupY, baseGroupZ, 3581 groupCountX, groupCountY, groupCountZ); 3582} 3583 3584 3585static void 3586cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer, 3587 struct v3dv_buffer *buffer, 3588 uint32_t offset) 3589{ 3590 /* We can't do indirect dispatches, so instead we record a CPU job that, 3591 * when executed in the queue, will map the indirect buffer, read the 3592 * dispatch parameters, and submit a regular dispatch. 3593 */ 3594 struct v3dv_job *job = 3595 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3596 V3DV_JOB_TYPE_CPU_CSD_INDIRECT, 3597 cmd_buffer, -1); 3598 v3dv_return_if_oom(cmd_buffer, NULL); 3599 3600 /* We need to create a CSD job now, even if we still don't know the actual 3601 * dispatch parameters, because the job setup needs to be done using the 3602 * current command buffer state (i.e. pipeline, descriptor sets, push 3603 * constants, etc.). So we create the job with default dispatch parameters 3604 * and we will rewrite the parts we need at submit time if the indirect 3605 * parameters don't match the ones we used to setup the job. 3606 */ 3607 struct v3dv_job *csd_job = 3608 cmd_buffer_create_csd_job(cmd_buffer, 3609 0, 0, 0, 3610 1, 1, 1, 3611 &job->cpu.csd_indirect.wg_uniform_offsets[0], 3612 &job->cpu.csd_indirect.wg_size); 3613 v3dv_return_if_oom(cmd_buffer, NULL); 3614 assert(csd_job); 3615 3616 job->cpu.csd_indirect.buffer = buffer; 3617 job->cpu.csd_indirect.offset = offset; 3618 job->cpu.csd_indirect.csd_job = csd_job; 3619 3620 /* If the compute shader reads the workgroup sizes we will also need to 3621 * rewrite the corresponding uniforms. 3622 */ 3623 job->cpu.csd_indirect.needs_wg_uniform_rewrite = 3624 job->cpu.csd_indirect.wg_uniform_offsets[0] || 3625 job->cpu.csd_indirect.wg_uniform_offsets[1] || 3626 job->cpu.csd_indirect.wg_uniform_offsets[2]; 3627 3628 list_addtail(&job->list_link, &cmd_buffer->jobs); 3629 cmd_buffer->state.job = NULL; 3630} 3631 3632VKAPI_ATTR void VKAPI_CALL 3633v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, 3634 VkBuffer _buffer, 3635 VkDeviceSize offset) 3636{ 3637 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3638 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 3639 3640 assert(offset <= UINT32_MAX); 3641 3642 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3643 cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset); 3644} 3645 3646VKAPI_ATTR void VKAPI_CALL 3647v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) 3648{ 3649 /* Nothing to do here since we only support a single device */ 3650 assert(deviceMask == 0x1); 3651} 3652
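/* For reference, the V3DV_JOB_TYPE_CPU_CSD_INDIRECT job recorded by
 * cmd_buffer_dispatch_indirect() above is handled when the command buffer is
 * submitted to the queue. A simplified sketch of what that handler
 * conceptually does (not the actual queue code; buffer_map stands for a
 * hypothetical CPU mapping of the indirect buffer's memory):
 *
 *    // VkDispatchIndirectCommand is three consecutive uint32_t group counts
 *    const uint32_t *wg_counts =
 *       (const uint32_t *) (buffer_map + info->offset);
 *    if (wg_counts[0] != info->csd_job->csd.wg_count[0] ||
 *        wg_counts[1] != info->csd_job->csd.wg_count[1] ||
 *        wg_counts[2] != info->csd_job->csd.wg_count[2]) {
 *       v3dv_cmd_buffer_rewrite_indirect_csd_job(info, wg_counts);
 *    }
 *
 * where info is the job's struct v3dv_csd_indirect_cpu_job_info. After the
 * rewrite, if any, the pre-built CSD job is submitted as a regular compute
 * dispatch.
 */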