1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_format.h"
#include "vk_util.h"
#include "util/fast_idiv_by_const.h"

#include "common/intel_aux_map.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "nir/nir_xfb_info.h"

/* We reserve :
 *    - GPR 14 for secondary command buffer returns
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t pipeline);

/* Translate the enable bits of an already-packed PIPE_CONTROL into the
 * driver's anv_pipe_bits flag representation.  Used only for debug dumping
 * (see anv_debug_dump_pc below); bits with no anv_pipe_bits equivalent are
 * simply not reported.
 */
static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
   enum anv_pipe_bits bits = 0;
   bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
   bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
#if GFX_VER >= 12
   /* Tile cache and HDC pipeline flush bits only exist on gfx12+. */
   bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
   bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
   bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
   bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
   bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
   bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
   return bits;
}

/* Dump a just-packed PIPE_CONTROL to stderr when INTEL_DEBUG=pc is set,
 * together with the emitting function's name as the "reason".
 *
 * NOTE(review): this expands to a bare `if` with no do/while(0) wrapper, so
 * invoking it as the body of an outer if/else could mis-bind the else —
 * current call sites use it inside braced blocks only; keep it that way.
 */
#define anv_debug_dump_pc(pc) \
   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
      fputs("pc: emit PC=( ", stderr); \
      anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
      fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
   }

/* Whether this command buffer's pool was created on a queue family with
 * graphics capability (as opposed to a compute/transfer-only family).
 */
static bool
is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_queue_family *queue_family = cmd_buffer->pool->queue_family;
   return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
}

/* Emit STATE_BASE_ADDRESS with the driver's state pool base addresses,
 * bracketed by the PIPE_CONTROLs required to make the change safe:
 * a flush/stall before, and texture/constant/state cache invalidation after.
 * Also dirties all descriptors since binding table offsets are relative to
 * the (possibly moved) surface state base.
 */
void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   UNUSED const struct intel_device_info *devinfo = &device->info;
   uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base adress.  Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 12
      pc.HDCPipelineFlushEnable = true;
#else
      pc.DCFlushEnable = true;
#endif
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
#if GFX_VER == 12
      /* Wa_1606662791:
       *
       *   Software must program PIPE_CONTROL command with "HDC Pipeline
       *   Flush" prior to programming of the below two non-pipeline state :
       *      * STATE_BASE_ADDRESS
       *      * 3DSTATE_BINDING_TABLE_POOL_ALLOC
       */
      if (devinfo->revision == 0 /* A0 */)
         pc.HDCPipelineFlushEnable = true;
#endif
      anv_debug_dump_pc(pc);
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
    * mode by putting the pipeline temporarily in 3D mode.
    */
   uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
   genX(flush_pipeline_select_3d)(cmd_buffer);
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;

#  if (GFX_VER >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields.  However, since we will be growing the BO's live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      if (anv_use_softpin(device->physical)) {
         /* With softpin, we use fixed addresses so we actually know how big
          * our base addresses are.
          */
         sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
         sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
      } else {
         sba.DynamicStateBufferSize = 0xfffff;
         sba.InstructionBufferSize = 0xfffff;
      }
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#  else
      /* On gfx7, we have upper bounds instead.  According to the docs,
       * setting an upper bound of zero means that no bounds checking is
       * performed so, in theory, we should be able to leave them zero.
       * However, border color is broken and the GPU bounds-checks anyway.
       * To avoid this and other potential problems, we may as well set it
       * for everything.
       */
      sba.GeneralStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.InstructionAccessUpperBoundModifyEnable = true;
#  endif
#  if (GFX_VER >= 9)
      if (anv_use_softpin(device->physical)) {
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .bo = device->surface_state_pool.block_pool.bo,
            .offset = 0,
         };
         /* Size field is in units of 64-byte surface state entries. */
         sba.BindlessSurfaceStateSize = (1 << 20) - 1;
      } else {
         sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
         sba.BindlessSurfaceStateSize = 0;
      }
      sba.BindlessSurfaceStateMOCS = mocs;
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#  endif
#  if (GFX_VER >= 10)
      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
#  endif
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Put the pipeline back into its current mode.
    */
   if (gfx12_wa_pipeline != UINT32_MAX)
      genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, shows that flush the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      anv_debug_dump_pc(pc);
   }
}

/* Record a relocation for the surface address embedded in a SURFACE_STATE.
 * With softpin, addresses are fixed so we only track BO usage; otherwise we
 * record a patch location at the surface state's address field.  On failure
 * the command buffer is put into the error state.
 */
static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   VkResult result;

   if (anv_use_softpin(cmd_buffer->device->physical)) {
      result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
                                     &cmd_buffer->pool->alloc,
                                     addr.bo);
   } else {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
                                  &cmd_buffer->pool->alloc,
                                  state.offset + isl_dev->ss.addr_offset,
                                  addr.bo, addr.offset, NULL);
   }

   if (unlikely(result != VK_SUCCESS))
      anv_batch_set_error(&cmd_buffer->batch, result);
}

/* Record relocations for all addresses a SURFACE_STATE may embed: the main
 * surface address plus, when present, the auxiliary surface address and the
 * clear color address.  Any failure puts the command buffer in error state.
 */
static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo,
                            state.aux_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo,
                            state.clear_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

/* Return true if resolving `color` through the view's format/swizzle would
 * produce different packed bits than packing it directly with the surface
 * format — i.e. a fast-clear resolve would need a real format conversion.
 */
static bool
isl_color_value_requires_conversion(union isl_color_value color,
                                    const struct isl_surf *surf,
                                    const struct isl_view *view)
{
   if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
      return false;

   uint32_t surf_pack[4] = { 0, 0, 0, 0 };
   isl_color_value_pack(&color, surf->format, surf_pack);

   uint32_t view_pack[4] = { 0, 0, 0, 0 };
   union isl_color_value swiz_color =
      isl_color_value_swizzle_inv(color, view->swizzle);
   isl_color_value_pack(&swiz_color, view->format, view_pack);

   return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
}

/* Decide whether a color attachment clear can be performed as a fast clear.
 * Checks aux availability for the view's layer/level, the layout's supported
 * fast-clear type, full-render-area coverage, hardware clear-color limits,
 * format-conversion requirements, and the first-slice-only restriction.
 * May emit perf warnings when a fast clear is rejected.
 */
static bool
anv_can_fast_clear_color_view(struct anv_device * device,
                              struct anv_image_view *iview,
                              VkImageLayout layout,
                              union isl_color_value clear_color,
                              uint32_t num_layers,
                              VkRect2D render_area)
{
   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level))
      return false;

   /* Start by getting the fast clear type.  We use the first subpass
    * layout here because we don't want to fast-clear if the first subpass
    * to use the attachment can't handle fast-clears.
    */
   enum anv_fast_clear_type fast_clear_type =
      anv_layout_to_fast_clear_type(&device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    layout);
   switch (fast_clear_type) {
   case ANV_FAST_CLEAR_NONE:
      return false;
   case ANV_FAST_CLEAR_DEFAULT_VALUE:
      if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
         return false;
      break;
   case ANV_FAST_CLEAR_ANY:
      break;
   }

   /* Potentially, we could do partial fast-clears but doing so has crazy
    * alignment restrictions.  It's easier to just restrict to full size
    * fast clears for now.
    */
   if (render_area.offset.x != 0 ||
       render_area.offset.y != 0 ||
       render_area.extent.width != iview->vk.extent.width ||
       render_area.extent.height != iview->vk.extent.height)
      return false;

   /* On Broadwell and earlier, we can only handle 0/1 clear colors */
   if (GFX_VER <= 8 &&
       !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
      return false;

   /* If the clear color is one that would require non-trivial format
    * conversion on resolve, we don't bother with the fast clear.  This
    * shouldn't be common as most clear colors are 0/1 and the most common
    * format re-interpretation is for sRGB.
    */
   if (isl_color_value_requires_conversion(clear_color,
                                           &iview->image->planes[0].primary_surface.isl,
                                           &iview->planes[0].isl)) {
      anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
                    "Cannot fast-clear to colors which would require "
                    "format conversion on resolve");
      return false;
   }

   /* We only allow fast clears to the first slice of an image (level 0,
    * layer 0) and only for the entire slice.  This guarantees us that, at
    * any given time, there is only one clear color on any given image at
    * any given time.  At the time of our testing (Jan 17, 2018), there
    * were no known applications which would benefit from fast-clearing
    * more than just the first slice.
    */
   if (iview->planes[0].isl.base_level > 0 ||
       iview->planes[0].isl.base_array_layer > 0) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering with multi-lod or multi-layer framebuffer "
                    "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                    "baseArrayLayer > 0.  Not fast clearing.");
      return false;
   }

   if (num_layers > 1) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering to a multi-layer framebuffer with "
                    "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
   }

   return true;
}

/* Decide whether a depth/stencil clear can be performed via HiZ (HZ_OP).
 * Stencil-only clears always qualify (past gfx7); depth clears additionally
 * require a HiZ-capable aux usage for the target layout, BLORP support for
 * the clear rectangle, and the special ANV_HZ_FC_VAL clear value.
 */
static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
                          struct anv_image_view *iview,
                          VkImageLayout layout,
                          VkImageAspectFlags clear_aspects,
                          float depth_clear_value,
                          VkRect2D render_area)
{
   /* We don't do any HiZ or depth fast-clears on gfx7 yet */
   if (GFX_VER == 7)
      return false;

   /* If we're just clearing stencil, we can always HiZ clear */
   if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return true;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

   const enum isl_aux_usage clear_aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                              layout);
   if (!blorp_can_hiz_clear_depth(&device->info,
                                  &iview->image->planes[0].primary_surface.isl,
                                  clear_aux_usage,
                                  iview->planes[0].isl.base_level,
                                  iview->planes[0].isl.base_array_layer,
                                  render_area.offset.x,
                                  render_area.offset.y,
                                  render_area.offset.x +
                                  render_area.extent.width,
                                  render_area.offset.y +
                                  render_area.extent.height))
      return false;

   if (depth_clear_value != ANV_HZ_FC_VAL)
      return false;

   /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
    * portion of a HiZ buffer.  Testing has revealed that Gfx8 only supports
    * returning 0.0f.  Gens prior to gfx8 do not support this feature at all.
    */
   if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
      return false;

   /* If we got here, then we can fast clear */
   return true;
}

/* Force a single, non-cached read of x (prevents the compiler from folding
 * or duplicating the load — used when reading live AUX-TT entries).
 */
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#if GFX_VER == 12
/* Live-update the gfx12 AUX translation table entries covering the given
 * level/layer range of an image plane so they carry the surface's format
 * bits (and the CCS valid bit when the plane uses CCS).  Fully stalls the
 * GPU before the update and requests an AUX-TT invalidate afterwards.
 */
static void
anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect,
                      uint32_t base_level, uint32_t level_count,
                      uint32_t base_layer, uint32_t layer_count)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   const struct anv_surface *surface = &image->planes[plane].primary_surface;
   uint64_t base_address =
      anv_address_physical(anv_image_address(image, &surface->memory_range));

   const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
   uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);

   /* We're about to live-update the AUX-TT.  We really don't want anyone else
    * trying to read it while we're doing this.  We could probably get away
    * with not having this stall in some cases if we were really careful but
    * it's better to play it safe.  Full stall the GPU.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "before update AUX-TT");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   for (uint32_t a = 0; a < layer_count; a++) {
      const uint32_t layer = base_layer + a;

      /* Accumulate the byte range spanned by this layer across all levels. */
      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;

         uint32_t logical_array_layer, logical_z_offset_px;
         if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
            logical_array_layer = 0;

            /* If the given miplevel does not have this layer, then any higher
             * miplevels won't either because miplevels only get smaller the
             * higher the LOD.
             */
            assert(layer < image->vk.extent.depth);
            if (layer >= anv_minify(image->vk.extent.depth, level))
               break;
            logical_z_offset_px = layer;
         } else {
            assert(layer < image->vk.array_layers);
            logical_array_layer = layer;
            logical_z_offset_px = 0;
         }

         uint64_t slice_start_offset_B, slice_end_offset_B;
         isl_surf_get_image_range_B_tile(isl_surf, level,
                                         logical_array_layer,
                                         logical_z_offset_px,
                                         &slice_start_offset_B,
                                         &slice_end_offset_B);

         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
      }

      /* Aux operates 64K at a time */
      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
      end_offset_B = align_u64(end_offset_B, 64 * 1024);

      for (uint64_t offset = start_offset_B;
           offset < end_offset_B; offset += 64 * 1024) {
         uint64_t address = base_address + offset;

         uint64_t aux_entry_addr64, *aux_entry_map;
         aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
                                                 address, &aux_entry_addr64);

         /* The entry is patched via its GPU address; only valid with
          * softpin where CPU and GPU views agree.
          */
         assert(anv_use_softpin(cmd_buffer->device->physical));
         struct anv_address aux_entry_address = {
            .bo = NULL,
            .offset = aux_entry_addr64,
         };

         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
         uint64_t new_aux_entry =
            (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;

         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
            new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;

         mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                             "after update AUX-TT");
}
#endif /* GFX_VER == 12 */

/* Transitions a HiZ-enabled depth buffer from one layout to another.  Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        bool will_full_fast_clear)
{
   const uint32_t depth_plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

#if GFX_VER == 12
   /* Coming out of UNDEFINED/PREINITIALIZED with implicit CCS, the AUX-TT
    * entries must be (re)programmed before the aux data can be used.
    */
   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                            0, 1, base_layer, layer_count);
   }
#endif

   /* If will_full_fast_clear is set, the caller promises to fast-clear the
    * largest portion of the specified range as it can.  For depth images,
    * that means the entire image because we don't support multi-LOD HiZ.
    */
   assert(image->planes[0].primary_surface.isl.levels == 1);
   if (will_full_fast_clear)
      return;

   const enum isl_aux_state initial_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              initial_layout);
   const enum isl_aux_state final_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              final_layout);

   const bool initial_depth_valid =
      isl_aux_state_has_valid_primary(initial_state);
   const bool initial_hiz_valid =
      isl_aux_state_has_valid_aux(initial_state);
   const bool final_needs_depth =
      isl_aux_state_has_valid_primary(final_state);
   const bool final_needs_hiz =
      isl_aux_state_has_valid_aux(final_state);

   /* Getting into the pass-through state for Depth is tricky and involves
    * both a resolve and an ambiguate.  We don't handle that state right now
    * as anv_layout_to_aux_state never returns it.
    */
   assert(final_state != ISL_AUX_STATE_PASS_THROUGH);

   if (final_needs_depth && !initial_depth_valid) {
      assert(initial_hiz_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
   } else if (final_needs_hiz && !initial_hiz_valid) {
      assert(initial_depth_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
   }
}

/* Layouts in which stencil writes are permitted (see the gfx7 shadow-copy
 * comment in transition_stencil_buffer below for why this matters).
 */
static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
}

/* Transitions the stencil buffer from one layout to another.
 * Unless the initial layout is undefined, the stencil data is preserved
 * across this operation.
 *
 * On gfx7 this maintains the texturable shadow copy; on gfx12 it programs
 * the AUX-TT and performs the required initializing HZ_OP stencil clears.
 * Other gens need no work here.
 */
static void
transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_image *image,
                          uint32_t base_level, uint32_t level_count,
                          uint32_t base_layer, uint32_t layer_count,
                          VkImageLayout initial_layout,
                          VkImageLayout final_layout,
                          bool will_full_fast_clear)
{
#if GFX_VER == 7
   const uint32_t plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);

   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points. Stencil writes are only allowed in following
    * layouts:
    *
    *  - VK_IMAGE_LAYOUT_GENERAL
    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
    *
    * For general, we have no nice opportunity to transition so we do the copy
    * to the shadow unconditionally at the end of the subpass. For transfer
    * destinations, we can update it as part of the transfer op. For the other
    * layouts, we delay the copy until a transition into some other layout.
    */
   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       vk_image_layout_stencil_write_optimal(initial_layout) &&
       !vk_image_layout_stencil_write_optimal(final_layout)) {
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_STENCIL_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }
#elif GFX_VER == 12
   const uint32_t plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
   if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                            base_level, level_count, base_layer, layer_count);

      /* If will_full_fast_clear is set, the caller promises to fast-clear the
       * largest portion of the specified range as it can.
       */
      if (will_full_fast_clear)
         return;

      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;
         const VkRect2D clear_rect = {
            .offset.x = 0,
            .offset.y = 0,
            .extent.width = anv_minify(image->vk.extent.width, level),
            .extent.height = anv_minify(image->vk.extent.height, level),
         };

         uint32_t aux_layers =
            anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
         uint32_t level_layer_count =
            MIN2(layer_count, aux_layers - base_layer);

         /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
          * Enable:
          *
          *    "When enabled, Stencil Buffer needs to be initialized via
          *    stencil clear (HZ_OP) before any renderpass."
          */
         anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                             level, base_layer, level_layer_count,
                             clear_rect, 0 /* Stencil clear value */);
      }
   }
#endif
}

/* MMIO offsets of the MI_PREDICATE source/result registers used by the
 * predicated-resolve helpers below.
 */
#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

/* Write the per-slice "compressed" tracking dword (all-ones or zero) for
 * each layer in [base_layer, base_layer + layer_count) at the given level.
 * Only meaningful for CCS_E images; a no-op for every other aux usage.
 */
static void
set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
                         VkImageAspectFlagBits aspect,
                         uint32_t level,
                         uint32_t base_layer, uint32_t layer_count,
                         bool compressed)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   /* We only have compression tracking for CCS_E */
   if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
      return;

   for (uint32_t a = 0; a < layer_count; a++) {
      uint32_t layer = base_layer + a;
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
                                                            image, aspect,
                                                            level, layer);
         sdi.ImmediateData = compressed ? UINT32_MAX : 0;
      }
   }
}

/* Store the image's fast-clear type tracking dword, and — for any non-NONE
 * type — also mark slice (level 0, layer 0) as compressed, the invariant
 * the resolve predicates rely on.
 */
static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
                           const struct anv_image *image,
                           VkImageAspectFlagBits aspect,
                           enum anv_fast_clear_type fast_clear)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                       image, aspect);
      sdi.ImmediateData = fast_clear;
   }

   /* Whenever we have fast-clear, we consider that slice to be compressed.
    * This makes building predicates much easier.
    */
   if (fast_clear != ANV_FAST_CLEAR_NONE)
      set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
}

/* This is only really practical on haswell and above because it requires
 * MI math in order to get it correct.
 */
#if GFX_VERx10 >= 75
/* Program MI_PREDICATE so that the resolve emitted afterwards only executes
 * when it is actually needed, based on the image's tracked fast-clear type
 * and per-slice compression state.  With SRC1 = 0 and a LOADINV/SRCS_EQUAL
 * compare, predicated commands run when SRC0 != 0.
 */
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                  const struct anv_image *image,
                                  VkImageAspectFlagBits aspect,
                                  uint32_t level, uint32_t array_layer,
                                  enum isl_aux_op resolve_op,
                                  enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   const struct mi_value fast_clear_type =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
      /* In this case, we're doing a full resolve which means we want the
       * resolve to happen if any compression (including fast-clears) is
       * present.
       *
       * In order to simplify the logic a bit, we make the assumption that,
       * if the first slice has been fast-cleared, it is also marked as
       * compressed.  See also set_image_fast_clear_state.
       */
      const struct mi_value compression_state =
         mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
                                                       image, aspect,
                                                       level, array_layer));
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
      mi_store(&b, compression_state, mi_imm(0));

      if (level == 0 && array_layer == 0) {
         /* If the predicate is true, we want to write 0 to the fast clear type
          * and, if it's false, leave it alone.  We can do this by writing
          *
          *    clear_type = clear_type & ~predicate;
          */
         struct mi_value new_fast_clear_type =
            mi_iand(&b, fast_clear_type,
                        mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
         mi_store(&b, fast_clear_type, new_fast_clear_type);
      }
   } else if (level == 0 && array_layer == 0) {
      /* In this case, we are doing a partial resolve to get rid of fast-clear
       * colors.  We don't care about the compression state but we do care
       * about how much fast clear is allowed by the final layout.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);

      /* We need to compute (fast_clear_supported < image->fast_clear) */
      struct mi_value pred =
         mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));

      /* If the predicate is true, we want to write 0 to the fast clear type
       * and, if it's false, leave it alone.  We can do this by writing
       *
       *    clear_type = clear_type & ~predicate;
       */
      struct mi_value new_fast_clear_type =
         mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
      mi_store(&b, fast_clear_type, new_fast_clear_type);
   } else {
      /* In this case, we're trying to do a partial resolve on a slice that
       * doesn't have clear color.  There's nothing to do.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      return;
   }

   /* Set src1 to 0 and use a != condition */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VERx10 >= 75 */

#if GFX_VER <= 8
/* Simpler predicate setup that needs no MI math: load the fast-clear state
 * directly into the predicate source register.
 */
static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 const struct anv_image *image,
                                 VkImageAspectFlagBits aspect,
                                 uint32_t level, uint32_t array_layer,
                                 enum isl_aux_op resolve_op,
                                 enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value fast_clear_type_mem =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   /* This only works for partial resolves and only when the clear color is
    * all or nothing.
On the upside, this emits less command streamer code
    * and works on Ivybridge and Bay Trail.
    */
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);

   /* We don't support fast clears on anything other than the first slice. */
   if (level > 0 || array_layer > 0)
      return;

   /* On gfx8, we don't have a concept of default clear colors because we
    * can't sample from CCS surfaces.  It's enough to just load the fast clear
    * state into the predicate register.
    */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   mi_store(&b, fast_clear_type_mem, mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VER <= 8 */

/* Emit a predicated CCS resolve of a single (level, layer) slice.  The
 * predicate programmed by the helpers above decides at execution time
 * whether the resolve actually runs.
 */
static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t level, uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

#if GFX_VER >= 9
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, level, array_layer,
                                     resolve_op, fast_clear_supported);
#else /* GFX_VER <= 8 */
   anv_cmd_simple_resolve_predicate(cmd_buffer, image,
                                    aspect, level, array_layer,
                                    resolve_op, fast_clear_supported);
#endif

   /* CCS_D only supports full resolves and BLORP will assert on us if we try
    * to do a partial resolve on a CCS_D surface.
    */
   if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
       image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
                    level, array_layer, 1, resolve_op, NULL, true);
}

/* Emit a predicated MCS partial resolve of a single array layer.  Requires
 * the MI-math predicate path, hence gfx7.5+ only.
 */
static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);

#if GFX_VERx10 >= 75
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, 0, array_layer,
                                     resolve_op, fast_clear_supported);

   anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
                    array_layer, 1, resolve_op, NULL, true);
#else
   unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
#endif
}

/* Mark the given subresource range as written with compression, so later
 * layout transitions know a resolve may be required.
 */
void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
                                    const struct anv_image *image,
                                    VkImageAspectFlagBits aspect,
                                    enum isl_aux_usage aux_usage,
                                    uint32_t level,
                                    uint32_t base_layer,
                                    uint32_t layer_count)
{
   /* The aspect must be exactly one of the image aspects. */
   assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));

   /* The only compression types with more than just fast-clears are MCS,
    * CCS_E, and HiZ.  With HiZ we just trust the layout and don't actually
    * track the current fast-clear and compression state.  This leaves us
    * with just MCS and CCS_E.
 */
   if (aux_usage != ISL_AUX_USAGE_CCS_E &&
       aux_usage != ISL_AUX_USAGE_MCS)
      return;

   set_image_compressed_bit(cmd_buffer, image, aspect,
                            level, base_layer, layer_count, true);
}

/* Reset a color image's fast-clear tracking to "no fast clear" and
 * initialize its clear-color dword(s) so the HW restrictions on those fields
 * are satisfied before any fast-clear happens.
 */
static void
init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect)
{
   assert(cmd_buffer && image);
   assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   set_image_fast_clear_state(cmd_buffer, image, aspect,
                              ANV_FAST_CLEAR_NONE);

   /* Initialize the struct fields that are accessed for fast-clears so that
    * the HW restrictions on the field values are satisfied.
    */
   struct anv_address addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);

   if (GFX_VER >= 9) {
      /* On SKL+ the clear color is a multi-dword state; zero all of it. */
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      const unsigned num_dwords = GFX_VER >= 10 ?
                                  isl_dev->ss.clear_color_state_size / 4 :
                                  isl_dev->ss.clear_value_size / 4;
      for (unsigned i = 0; i < num_dwords; i++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = addr;
            sdi.Address.offset += i * 4;
            sdi.ImmediateData = 0;
         }
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = addr;
         if (GFX_VERx10 >= 75) {
            /* Pre-SKL, the dword containing the clear values also contains
             * other fields, so we need to initialize those fields to match
             * the values that would be in a color attachment.
             */
            sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                                ISL_CHANNEL_SELECT_GREEN << 22 |
                                ISL_CHANNEL_SELECT_BLUE  << 19 |
                                ISL_CHANNEL_SELECT_ALPHA << 16;
         } else if (GFX_VER == 7) {
            /* On IVB, the dword containing the clear values also contains
             * other fields that must be zero or can be zero.
             */
            sdi.ImmediateData = 0;
         }
      }
   }
}

/* Copy the fast-clear value dword(s) between a surface state object and an
 * image's fast clear state buffer.
 */
static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_state surface_state,
                             const struct anv_image *image,
                             VkImageAspectFlagBits aspect,
                             bool copy_from_surface_state)
{
   assert(cmd_buffer && image);
   assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   struct anv_address ss_clear_addr = {
      .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = surface_state.offset +
                cmd_buffer->device->isl_dev.ss.clear_value_offset,
   };
   const struct anv_address entry_addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;

#if GFX_VER == 7
   /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM
    * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
    * in-flight when they are issued even if the memory touched is not
    * currently active for rendering.  The weird bit is that it is not the
    * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
    * rendering hangs such that the next stalling command after the
    * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
    *
    * It is unclear exactly why this hang occurs.  Both MI commands come with
    * warnings about the 3D pipeline but that doesn't seem to fully explain
    * it.  My (Jason's) best theory is that it has something to do with the
    * fact that we're using a GPU state register as our temporary and that
    * something with reading/writing it is causing problems.
    *
    * In order to work around this issue, we emit a PIPE_CONTROL with the
    * command streamer stall bit set.
 */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "after copy_fast_clear_dwords. Avoid potential hang");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (copy_from_surface_state) {
      /* Image fast-clear buffer <- surface state object */
      mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
   } else {
      /* Surface state object <- image fast-clear buffer */
      mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);

      /* Updating a surface state object may require that the state cache be
       * invalidated. From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       *
       * In testing, SKL doesn't actually seem to need this, but HSW does.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
                                "after copy_fast_clear_dwords surface state update");
   }
}

/**
 * @brief Transitions a color buffer from one layout to another.
 *
 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
 * more information.
 *
 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
 *                    this represents the maximum layers to transition at each
 *                    specified miplevel.
 */
static void
transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        VkImageAspectFlagBits aspect,
                        const uint32_t base_level, uint32_t level_count,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        uint64_t src_queue_family,
                        uint64_t dst_queue_family,
                        bool will_full_fast_clear)
{
   struct anv_device *device = cmd_buffer->device;
   const struct intel_device_info *devinfo = &device->info;
   /* Validate the inputs. */
   assert(cmd_buffer);
   assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
   /* These values aren't supported for simplicity's sake. */
   assert(level_count != VK_REMAINING_MIP_LEVELS &&
          layer_count != VK_REMAINING_ARRAY_LAYERS);
   /* Ensure the subresource range is valid. */
   UNUSED uint64_t last_level_num = base_level + level_count;
   const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
   UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
   assert((uint64_t)base_layer + layer_count <= image_layers);
   assert(last_level_num <= image->vk.mip_levels);
   /* If there is a layout transfer, the final layout cannot be undefined or
    * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
    */
   assert(initial_layout == final_layout ||
          (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
           final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
   const struct isl_drm_modifier_info *isl_mod_info =
      image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
      ?
isl_drm_modifier_get_info(image->vk.drm_format_mod)
      : NULL;

   const bool src_queue_external =
      src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   const bool dst_queue_external =
      dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   /* Simultaneous acquire and release on external queues is illegal. */
   assert(!src_queue_external || !dst_queue_external);

   /* Ownership transition on an external queue requires special action if the
    * image has a DRM format modifier because we store image data in
    * a driver-private bo which is inaccessible to the external queue.
    */
   const bool mod_acquire =
      src_queue_external &&
      image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;

   const bool mod_release =
      dst_queue_external &&
      image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;

   if (initial_layout == final_layout &&
       !mod_acquire && !mod_release) {
      /* No work is needed. */
      return;
   }

   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
      /* This surface is a linear compressed image with a tiled shadow surface
       * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
       * we need to ensure the shadow copy is up-to-date.
       */
      assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
      assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
      assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
      assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
      assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
      assert(plane == 0);
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_COLOR_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }

   /* Nothing to do if none of the requested layers have aux data. */
   if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
      return;

   assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);

   /* The following layouts are equivalent for non-linear images. */
   const bool initial_layout_undefined =
      initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
      initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;

   bool must_init_fast_clear_state = false;
   bool must_init_aux_surface = false;

   if (initial_layout_undefined) {
      /* The subresource may have been aliased and populated with arbitrary
       * data.
       */
      must_init_fast_clear_state = true;
      must_init_aux_surface = true;
   } else if (mod_acquire) {
      /* The fast clear state lives in a driver-private bo, and therefore the
       * external/foreign queue is unaware of it.
       *
       * If this is the first time we are accessing the image, then the fast
       * clear state is uninitialized.
       *
       * If this is NOT the first time we are accessing the image, then the fast
       * clear state may still be valid and correct due to the resolve during
       * our most recent ownership release.  However, we do not track the aux
       * state with MI stores, and therefore must assume the worst-case: that
       * this is the first time we are accessing the image.
       */
      assert(image->planes[plane].fast_clear_memory_range.binding ==
             ANV_IMAGE_MEMORY_BINDING_PRIVATE);
      must_init_fast_clear_state = true;

      if (image->planes[plane].aux_surface.memory_range.binding ==
          ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
         assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);

         /* The aux surface, like the fast clear state, lives in
          * a driver-private bo.  We must initialize the aux surface for the
          * same reasons we must initialize the fast clear state.
          */
         must_init_aux_surface = true;
      } else {
         assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);

         /* The aux surface, unlike the fast clear state, lives in
          * application-visible VkDeviceMemory and is shared with the
          * external/foreign queue. Therefore, when we acquire ownership of the
          * image with a defined VkImageLayout, the aux surface is valid and has
          * the aux state required by the modifier.
          */
         must_init_aux_surface = false;
      }
   }

#if GFX_VER == 12
   /* We do not yet support modifiers with aux on gen12. */
   assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);

   if (initial_layout_undefined) {
      if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
         anv_image_init_aux_tt(cmd_buffer, image, aspect,
                               base_level, level_count,
                               base_layer, layer_count);
      }
   }
#else
   assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
#endif

   /* Fast-clear state is only tracked for the first LOD and array slice. */
   if (must_init_fast_clear_state) {
      if (base_level == 0 && base_layer == 0)
         init_fast_clear_color(cmd_buffer, image, aspect);
   }

   if (must_init_aux_surface) {
      assert(must_init_fast_clear_state);

      /* Initialize the aux buffers to enable correct rendering.  In order to
       * ensure that things such as storage images work correctly, aux buffers
       * need to be initialized to valid data.
 *
       * Having an aux buffer with invalid data is a problem for two reasons:
       *
       *  1) Having an invalid value in the buffer can confuse the hardware.
       *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
       *     invalid and leads to the hardware doing strange things.  It
       *     doesn't hang as far as we can tell but rendering corruption can
       *     occur.
       *
       *  2) If this transition is into the GENERAL layout and we then use the
       *     image as a storage image, then we must have the aux buffer in the
       *     pass-through state so that, if we then go to texture from the
       *     image, we get the results of our storage image writes and not the
       *     fast clear color or other random data.
       *
       * For CCS both of the problems above are real demonstrable issues.  In
       * that case, the only thing we can do is to perform an ambiguate to
       * transition the aux surface into the pass-through state.
       *
       * For MCS, (2) is never an issue because we don't support multisampled
       * storage images.  In theory, issue (1) is a problem with MCS but we've
       * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
       * theory, be interpreted as something but we don't know that all bit
       * patterns are actually valid.  For 2x and 8x, you could easily end up
       * with the MCS referring to an invalid plane because not all bits of
       * the MCS value are actually used.  Even though we've never seen issues
       * in the wild, it's best to play it safe and initialize the MCS.  We
       * can use a fast-clear for MCS because we only ever touch from render
       * and texture (no image load store).
       */
      if (image->vk.samples == 1) {
         for (uint32_t l = 0; l < level_count; l++) {
            const uint32_t level = base_level + l;

            uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
            if (base_layer >= aux_layers)
               break; /* We will only get fewer layers as level increases */
            uint32_t level_layer_count =
               MIN2(layer_count, aux_layers - base_layer);

            /* If will_full_fast_clear is set, the caller promises to
             * fast-clear the largest portion of the specified range as it can.
             * For color images, that means only the first LOD and array slice.
             */
            if (level == 0 && base_layer == 0 && will_full_fast_clear) {
               base_layer++;
               level_layer_count--;
               if (level_layer_count == 0)
                  continue;
            }

            anv_image_ccs_op(cmd_buffer, image,
                             image->planes[plane].primary_surface.isl.format,
                             ISL_SWIZZLE_IDENTITY,
                             aspect, level, base_layer, level_layer_count,
                             ISL_AUX_OP_AMBIGUATE, NULL, false);

            if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
               set_image_compressed_bit(cmd_buffer, image, aspect,
                                        level, base_layer, level_layer_count,
                                        false);
            }
         }
      } else {
         if (image->vk.samples == 4 || image->vk.samples == 16) {
            anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
                          "Doing a potentially unnecessary fast-clear to "
                          "define an MCS buffer.");
         }

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can.
          */
         if (will_full_fast_clear)
            return;

         assert(base_level == 0 && level_count == 1);
         anv_image_mcs_op(cmd_buffer, image,
                          image->planes[plane].primary_surface.isl.format,
                          ISL_SWIZZLE_IDENTITY,
                          aspect, base_layer, layer_count,
                          ISL_AUX_OP_FAST_CLEAR, NULL, false);
      }
      return;
   }

   enum isl_aux_usage initial_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
   enum isl_aux_usage final_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);

   /* We must override the anv_layout_to_* functions because they are unaware of
    * acquire/release direction.
    */
   if (mod_acquire) {
      initial_aux_usage = isl_mod_info->aux_usage;
   } else if (mod_release) {
      final_aux_usage = isl_mod_info->aux_usage;
   }

   /* The current code assumes that there is no mixing of CCS_E and CCS_D.
    * We can handle transitions between CCS_D/E to and from NONE.  What we
    * don't yet handle is switching between CCS_E and CCS_D within a given
    * image.  Doing so in a performant way requires more detailed aux state
    * tracking such as what is done in i965.  For now, just assume that we
    * only have one type of compression.
    */
   assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
          final_aux_usage == ISL_AUX_USAGE_NONE ||
          initial_aux_usage == final_aux_usage);

   /* If initial aux usage is NONE, there is nothing to resolve */
   if (initial_aux_usage == ISL_AUX_USAGE_NONE)
      return;

   enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;

   /* If the initial layout supports more fast clear than the final layout
    * then we need at least a partial resolve.
 */
   const enum anv_fast_clear_type initial_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
   const enum anv_fast_clear_type final_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
   if (final_fast_clear < initial_fast_clear)
      resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;

   /* Leaving CCS_E compression behind requires a full resolve. */
   if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
       final_aux_usage != ISL_AUX_USAGE_CCS_E)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   if (resolve_op == ISL_AUX_OP_NONE)
      return;

   /* Perform a resolve to synchronize data between the main and aux buffer.
    * Before we begin, we must satisfy the cache flushing requirement specified
    * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
    *
    *    Any transition from any value in {Clear, Render, Resolve} to a
    *    different value in {Clear, Render, Resolve} requires end of pipe
    *    synchronization.
    *
    * We perform a flush of the write cache before and after the clear and
    * resolve operations to meet this requirement.
    *
    * Unlike other drawing, fast clear operations are not properly
    * synchronized.  The first PIPE_CONTROL here likely ensures that the
    * contents of the previous render or clear hit the render target before we
    * resolve and the second likely ensures that the resolve is complete before
    * we do any more rendering or clearing.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");

   for (uint32_t l = 0; l < level_count; l++) {
      uint32_t level = base_level + l;

      uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
      if (base_layer >= aux_layers)
         break; /* We will only get fewer layers as level increases */
      uint32_t level_layer_count =
         MIN2(layer_count, aux_layers - base_layer);

      for (uint32_t a = 0; a < level_layer_count; a++) {
         uint32_t array_layer = base_layer + a;

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can.  For color
          * images, that means only the first LOD and array slice.
          */
         if (level == 0 && array_layer == 0 && will_full_fast_clear)
            continue;

         if (image->vk.samples == 1) {
            anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, level, array_layer, resolve_op,
                                           final_fast_clear);
         } else {
            /* We only support fast-clear on the first layer so partial
             * resolves should not be used on other layers as they will use
             * the clear color stored in memory that is only valid for layer0.
             */
            if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
                array_layer != 0)
               continue;

            anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, array_layer, resolve_op,
                                           final_fast_clear);
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");
}

/* Set up anv_cmd_state::attachments from the render pass, framebuffer, and
 * (optionally) VkRenderPassBeginInfo: resolve each attachment's image view,
 * record initial layouts, pending clear/load aspects and clear values, and
 * determine whether a fast (CCS/HiZ) clear can be used.
 */
static VkResult
genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                                   const struct anv_render_pass *pass,
                                   const struct anv_framebuffer *framebuffer,
                                   const VkRenderPassBeginInfo *begin)
{
   struct anv_cmd_state *state = &cmd_buffer->state;

   vk_free(&cmd_buffer->pool->alloc, state->attachments);

   if (pass->attachment_count > 0) {
      state->attachments = vk_zalloc(&cmd_buffer->pool->alloc,
                                     pass->attachment_count *
                                          sizeof(state->attachments[0]),
                                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (state->attachments == NULL) {
         /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
         return anv_batch_set_error(&cmd_buffer->batch,
                                    VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   } else {
      state->attachments = NULL;
   }

   const VkRenderPassAttachmentBeginInfoKHR *attach_begin =
      vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
   if (begin && !attach_begin)
      assert(pass->attachment_count == framebuffer->attachment_count);

   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
      if (attach_begin && attach_begin->attachmentCount != 0) {
         /* Imageless framebuffer: views come from the begin info. */
         assert(attach_begin->attachmentCount == pass->attachment_count);
         ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]);
         state->attachments[i].image_view = iview;
      } else if (framebuffer && i < framebuffer->attachment_count) {
         state->attachments[i].image_view =
framebuffer->attachments[i];
      } else {
         state->attachments[i].image_view = NULL;
      }
   }

   if (begin) {
      for (uint32_t i = 0; i < pass->attachment_count; ++i) {
         const struct anv_render_pass_attachment *pass_att = &pass->attachments[i];
         struct anv_attachment_state *att_state = &state->attachments[i];
         VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format);
         VkImageAspectFlags clear_aspects = 0;
         VkImageAspectFlags load_aspects = 0;

         if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            /* color attachment */
            if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
               clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
               load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            }
         } else {
            /* depthstencil attachment */
            if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
               if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               }
            }
            if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
               if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               }
            }
         }

         att_state->current_layout = pass_att->initial_layout;
         att_state->current_stencil_layout = pass_att->stencil_initial_layout;
         att_state->pending_clear_aspects = clear_aspects;
         att_state->pending_load_aspects = load_aspects;
         if (clear_aspects)
            att_state->clear_value = begin->pClearValues[i];

         struct anv_image_view *iview = state->attachments[i].image_view;

         /* One pending-clear bit per array layer in the view. */
         const uint32_t num_layers = iview->planes[0].isl.array_len;
         att_state->pending_clear_views = (1 << num_layers) - 1;

         /* This will be initialized after the first subpass transition. */
         att_state->aux_usage = ISL_AUX_USAGE_NONE;

         att_state->fast_clear = false;
         if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
            att_state->fast_clear =
               anv_can_fast_clear_color_view(cmd_buffer->device, iview,
                                             pass_att->first_subpass_layout,
                                             vk_to_isl_color(att_state->clear_value.color),
                                             framebuffer->layers,
                                             begin->renderArea);
         } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                     VK_IMAGE_ASPECT_STENCIL_BIT)) {
            att_state->fast_clear =
               anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,
                                         pass_att->first_subpass_layout,
                                         clear_aspects,
                                         att_state->clear_value.depthStencil.depth,
                                         begin->renderArea);
         }
      }
   }

   return VK_SUCCESS;
}

/**
 * Allocate the per-attachment surface states (plus one NULL surface state)
 * for the given subpass out of the command buffer's surface state stream.
 */
static VkResult
genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,
                                       const struct anv_render_pass *pass,
                                       const struct anv_subpass *subpass)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
   struct anv_cmd_state *state = &cmd_buffer->state;

   /* Reserve one for the NULL state. */
   unsigned num_states = 1;
   for (uint32_t i = 0; i < subpass->attachment_count; i++) {
      uint32_t att = subpass->attachments[i].attachment;
      if (att == VK_ATTACHMENT_UNUSED)
         continue;

      assert(att < pass->attachment_count);
      if (!vk_format_is_color(pass->attachments[att].format))
         continue;

      const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
      assert(util_bitcount(att_usage) == 1);

      if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||
          att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
         num_states++;
   }

   const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
   state->attachment_states =
      anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
                             num_states * ss_stride, isl_dev->ss.align);
   if (state->attachment_states.map == NULL) {
      return anv_batch_set_error(&cmd_buffer->batch,
                                 VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   struct anv_state next_state = state->attachment_states;
   next_state.alloc_size = isl_dev->ss.size;

   state->null_surface_state = next_state;
   next_state.offset += ss_stride;
   next_state.map += ss_stride;

   /* Hand out one surface state per color or input attachment use. */
   for (uint32_t i = 0; i < subpass->attachment_count; i++) {
      uint32_t att = subpass->attachments[i].attachment;
      if (att == VK_ATTACHMENT_UNUSED)
         continue;

      assert(att < pass->attachment_count);
      if (!vk_format_is_color(pass->attachments[att].format))
         continue;

      const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
      assert(util_bitcount(att_usage) == 1);

      if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
         state->attachments[att].color.state = next_state;
      else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
         state->attachments[att].input.state = next_state;
      else
         continue;

      next_state.offset += ss_stride;
      next_state.map += ss_stride;
   }
assert(next_state.offset == state->attachment_states.offset + 1714 state->attachment_states.alloc_size); 1715 1716 return VK_SUCCESS; 1717} 1718 1719VkResult 1720genX(BeginCommandBuffer)( 1721 VkCommandBuffer commandBuffer, 1722 const VkCommandBufferBeginInfo* pBeginInfo) 1723{ 1724 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1725 1726 /* If this is the first vkBeginCommandBuffer, we must *initialize* the 1727 * command buffer's state. Otherwise, we must *reset* its state. In both 1728 * cases we reset it. 1729 * 1730 * From the Vulkan 1.0 spec: 1731 * 1732 * If a command buffer is in the executable state and the command buffer 1733 * was allocated from a command pool with the 1734 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then 1735 * vkBeginCommandBuffer implicitly resets the command buffer, behaving 1736 * as if vkResetCommandBuffer had been called with 1737 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts 1738 * the command buffer in the recording state. 1739 */ 1740 anv_cmd_buffer_reset(cmd_buffer); 1741 1742 cmd_buffer->usage_flags = pBeginInfo->flags; 1743 1744 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for 1745 * primary level command buffers. 1746 * 1747 * From the Vulkan 1.0 spec: 1748 * 1749 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a 1750 * secondary command buffer is considered to be entirely inside a render 1751 * pass. If this is a primary command buffer, then this bit is ignored. 1752 */ 1753 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) 1754 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; 1755 1756 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 1757 1758 /* We sometimes store vertex data in the dynamic state buffer for blorp 1759 * operations and our dynamic state stream may re-use data from previous 1760 * command buffers. In order to prevent stale cache data, we flush the VF 1761 * cache. 
We could do this on every blorp call but that's not really
    * needed as all of the data will get written by the CPU prior to the GPU
    * executing anything.  The chances are fairly high that they will use
    * blorp at least once per primary command buffer so it shouldn't be
    * wasted.
    *
    * There is also a workaround on gfx8 which requires us to invalidate the
    * VF cache occasionally.  It's easier if we can assume we start with a
    * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                             "new cmd buffer");

   /* Re-emit the aux table register in every command buffer.  This way we're
    * ensured that we have the table even if this command buffer doesn't
    * initialize any images.
    */
   if (cmd_buffer->device->info.has_aux_map) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                                "new cmd buffer with aux-tt");
   }

   /* We send an "Indirect State Pointers Disable" packet at
    * EndCommandBuffer, so all push constant packets are ignored during a
    * context restore. Documentation says after that command, we need to
    * emit push constants again before any rendering operation. So we
    * flag them dirty here to make sure they get emitted.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;

   VkResult result = VK_SUCCESS;
   /* For a secondary continuing a render pass, inherit the pass/subpass and
    * set up attachment state so recording can proceed as if inside the pass.
    */
   if (cmd_buffer->usage_flags &
       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
      assert(pBeginInfo->pInheritanceInfo);
      ANV_FROM_HANDLE(anv_render_pass, pass,
                      pBeginInfo->pInheritanceInfo->renderPass);
      struct anv_subpass *subpass =
         &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
                      pBeginInfo->pInheritanceInfo->framebuffer);

      cmd_buffer->state.pass = pass;
      cmd_buffer->state.subpass = subpass;

      /* This is optional in the inheritance info. */
      cmd_buffer->state.framebuffer = framebuffer;

      result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                                  framebuffer, NULL);
      if (result != VK_SUCCESS)
         return result;

      result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
                                                      subpass);
      if (result != VK_SUCCESS)
         return result;

      /* Record that HiZ is enabled if we can.
       */
      if (cmd_buffer->state.framebuffer) {
         const struct anv_image_view * const iview =
            anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);

         if (iview) {
            /* NOTE(review): assumes subpass->depth_stencil_attachment is
             * non-NULL whenever a depth/stencil view exists — confirm
             * against anv_cmd_buffer_get_depth_stencil_view().
             */
            VkImageLayout layout =
                cmd_buffer->state.subpass->depth_stencil_attachment->layout;

            enum isl_aux_usage aux_usage =
               anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
                                       VK_IMAGE_ASPECT_DEPTH_BIT,
                                       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                                       layout);

            cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);
         }
      }

      cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
   }

#if GFX_VERx10 >= 75
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
         vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);

      /* If secondary buffer supports conditional rendering
       * we should emit commands as if conditional rendering is enabled.
       */
      cmd_buffer->state.conditional_render_enabled =
         conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
   }
#endif

   return result;
}

/* From the PRM, Volume 2a:
 *
 *    "Indirect State Pointers Disable
 *
 *    At the completion of the post-sync operation associated with this pipe
 *    control packet, the indirect state pointers in the hardware are
 *    considered invalid; the indirect pointers are not saved in the context.
 *    If any new indirect state commands are executed in the command stream
 *    while the pipe control is pending, the new indirect state commands are
 *    preserved.
 *
 *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
 *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
 *    commands are only considered as Indirect State Pointers. Once ISP is
 *    issued in a context, SW must initialize by programming push constant
 *    commands for all the shaders (at least to zero length) before attempting
 *    any rendering operation for the same context."
 *
 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
 * even though they point to a BO that has been already unreferenced at
 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by these scratch page (every address not covered by
 * a BO should be pointing to the scratch page). But on CNL, it is
 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
 * instruction.
 *
 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
 * context restore, so the mentioned hang doesn't happen. However,
 * software must program push constant commands for all stages prior to
 * rendering anything. So we flag them dirty in BeginCommandBuffer.
 *
 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disable the push constants
 * so that it doesn't hang a previous 3DPRIMITIVE.
1893 */ 1894static void 1895emit_isp_disable(struct anv_cmd_buffer *cmd_buffer) 1896{ 1897 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1898 pc.StallAtPixelScoreboard = true; 1899 pc.CommandStreamerStallEnable = true; 1900 anv_debug_dump_pc(pc); 1901 } 1902 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1903 pc.IndirectStatePointersDisable = true; 1904 pc.CommandStreamerStallEnable = true; 1905 anv_debug_dump_pc(pc); 1906 } 1907} 1908 1909VkResult 1910genX(EndCommandBuffer)( 1911 VkCommandBuffer commandBuffer) 1912{ 1913 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1914 1915 if (anv_batch_has_error(&cmd_buffer->batch)) 1916 return cmd_buffer->batch.status; 1917 1918 anv_measure_endcommandbuffer(cmd_buffer); 1919 1920 /* We want every command buffer to start with the PMA fix in a known state, 1921 * so we disable it at the end of the command buffer. 1922 */ 1923 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); 1924 1925 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1926 1927 emit_isp_disable(cmd_buffer); 1928 1929 anv_cmd_buffer_end_batch_buffer(cmd_buffer); 1930 1931 return VK_SUCCESS; 1932} 1933 1934void 1935genX(CmdExecuteCommands)( 1936 VkCommandBuffer commandBuffer, 1937 uint32_t commandBufferCount, 1938 const VkCommandBuffer* pCmdBuffers) 1939{ 1940 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer); 1941 1942 assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 1943 1944 if (anv_batch_has_error(&primary->batch)) 1945 return; 1946 1947 /* The secondary command buffers will assume that the PMA fix is disabled 1948 * when they begin executing. Make sure this is true. 1949 */ 1950 genX(cmd_buffer_enable_pma_fix)(primary, false); 1951 1952 /* The secondary command buffer doesn't know which textures etc. have been 1953 * flushed prior to their execution. Apply those flushes now. 
1954 */ 1955 genX(cmd_buffer_apply_pipe_flushes)(primary); 1956 1957 for (uint32_t i = 0; i < commandBufferCount; i++) { 1958 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); 1959 1960 assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); 1961 assert(!anv_batch_has_error(&secondary->batch)); 1962 1963#if GFX_VERx10 >= 75 1964 if (secondary->state.conditional_render_enabled) { 1965 if (!primary->state.conditional_render_enabled) { 1966 /* Secondary buffer is constructed as if it will be executed 1967 * with conditional rendering, we should satisfy this dependency 1968 * regardless of conditional rendering being enabled in primary. 1969 */ 1970 struct mi_builder b; 1971 mi_builder_init(&b, &primary->device->info, &primary->batch); 1972 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), 1973 mi_imm(UINT64_MAX)); 1974 } 1975 } 1976#endif 1977 1978 if (secondary->usage_flags & 1979 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { 1980 /* If we're continuing a render pass from the primary, we need to 1981 * copy the surface states for the current subpass into the storage 1982 * we allocated for them in BeginCommandBuffer. 
1983 */ 1984 struct anv_bo *ss_bo = 1985 primary->device->surface_state_pool.block_pool.bo; 1986 struct anv_state src_state = primary->state.attachment_states; 1987 struct anv_state dst_state = secondary->state.attachment_states; 1988 assert(src_state.alloc_size == dst_state.alloc_size); 1989 1990 genX(cmd_buffer_so_memcpy)(primary, 1991 (struct anv_address) { 1992 .bo = ss_bo, 1993 .offset = dst_state.offset, 1994 }, 1995 (struct anv_address) { 1996 .bo = ss_bo, 1997 .offset = src_state.offset, 1998 }, 1999 src_state.alloc_size); 2000 } 2001 2002 anv_cmd_buffer_add_secondary(primary, secondary); 2003 2004 assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL || 2005 secondary->perf_query_pool == primary->perf_query_pool); 2006 if (secondary->perf_query_pool) 2007 primary->perf_query_pool = secondary->perf_query_pool; 2008 2009#if GFX_VERx10 == 120 2010 if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN) 2011 primary->state.depth_reg_mode = secondary->state.depth_reg_mode; 2012#endif 2013 } 2014 2015 /* The secondary isn't counted in our VF cache tracking so we need to 2016 * invalidate the whole thing. 2017 */ 2018 if (GFX_VER >= 8 && GFX_VER <= 9) { 2019 anv_add_pending_pipe_bits(primary, 2020 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, 2021 "Secondary cmd buffer not tracked in VF cache"); 2022 } 2023 2024 /* The secondary may have selected a different pipeline (3D or compute) and 2025 * may have changed the current L3$ configuration. Reset our tracking 2026 * variables to invalid values to ensure that we re-emit these in the case 2027 * where we do any draws or compute dispatches from the primary after the 2028 * secondary has returned. 2029 */ 2030 primary->state.current_pipeline = UINT32_MAX; 2031 primary->state.current_l3_config = NULL; 2032 primary->state.current_hash_scale = 0; 2033 2034 /* Each of the secondary command buffers will use its own state base 2035 * address. 
We need to re-emit state base address for the primary after 2036 * all of the secondaries are done. 2037 * 2038 * TODO: Maybe we want to make this a dirty bit to avoid extra state base 2039 * address calls? 2040 */ 2041 genX(cmd_buffer_emit_state_base_address)(primary); 2042} 2043 2044/** 2045 * Program the hardware to use the specified L3 configuration. 2046 */ 2047void 2048genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, 2049 const struct intel_l3_config *cfg) 2050{ 2051 assert(cfg || GFX_VER >= 12); 2052 if (cfg == cmd_buffer->state.current_l3_config) 2053 return; 2054 2055#if GFX_VER >= 11 2056 /* On Gfx11+ we use only one config, so verify it remains the same and skip 2057 * the stalling programming entirely. 2058 */ 2059 assert(cfg == cmd_buffer->device->l3_config); 2060#else 2061 if (INTEL_DEBUG(DEBUG_L3)) { 2062 mesa_logd("L3 config transition: "); 2063 intel_dump_l3_config(cfg, stderr); 2064 } 2065 2066 /* According to the hardware docs, the L3 partitioning can only be changed 2067 * while the pipeline is completely drained and the caches are flushed, 2068 * which involves a first PIPE_CONTROL flush which stalls the pipeline... 2069 */ 2070 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2071 pc.DCFlushEnable = true; 2072 pc.PostSyncOperation = NoWrite; 2073 pc.CommandStreamerStallEnable = true; 2074 anv_debug_dump_pc(pc); 2075 } 2076 2077 /* ...followed by a second pipelined PIPE_CONTROL that initiates 2078 * invalidation of the relevant caches. Note that because RO invalidation 2079 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL 2080 * command is processed by the CS) we cannot combine it with the previous 2081 * stalling flush as the hardware documentation suggests, because that 2082 * would cause the CS to stall on previous rendering *after* RO 2083 * invalidation and wouldn't prevent the RO caches from being polluted by 2084 * concurrent rendering before the stall completes. 
This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.InstructionCacheInvalidateEnable = true;
      pc.StateCacheInvalidationEnable = true;
      pc.PostSyncOperation = NoWrite;
      anv_debug_dump_pc(pc);
   }

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DCFlushEnable = true;
      pc.PostSyncOperation = NoWrite;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }

   genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
#endif /* GFX_VER >= 11 */
   cmd_buffer->state.current_l3_config = cfg;
}

/* Resolve the command buffer's accumulated pending pipe bits by emitting the
 * required PIPE_CONTROL(s), honoring the various per-generation hardware
 * workarounds.  Any bits that could not be resolved yet remain pending.
 */
void
genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
{
   UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
   enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;

   if (unlikely(cmd_buffer->device->physical->always_flush_cache))
      bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
   else if (bits == 0)
      return;

   /*
    * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
    *
    *    Write synchronization is a special case of end-of-pipe
    *    synchronization that requires that the render cache and/or depth
    *    related caches are flushed to memory, where the data will become
    *    globally visible.  This type of synchronization is required prior to
    *    SW (CPU) actually reading the result data from memory, or initiating
    *    an operation that will use as a read surface (such as a texture
    *    surface) a previous render target and/or depth/stencil buffer
    *
    *
    * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
    *
    *    Exercising the write cache flush bits (Render Target Cache Flush
    *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
    *    ensures the write caches are flushed and doesn't guarantee the data
    *    is globally visible.
    *
    *    SW can track the completion of the end-of-pipe-synchronization by
    *    using "Notify Enable" and "PostSync Operation - Write Immediate
    *    Data" in the PIPE_CONTROL command.
    *
    * In other words, flushes are pipelined while invalidations are handled
    * immediately.  Therefore, if we're flushing anything then we need to
    * schedule an end-of-pipe sync before any invalidations can happen.
    */
   if (bits & ANV_PIPE_FLUSH_BITS)
      bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;


   /* HSD 1209978178: docs say that before programming the aux table:
    *
    *    "Driver must ensure that the engine is IDLE but ensure it doesn't
    *    add extra flushes in the case it knows that the engine is already
    *    IDLE."
    */
   if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
      bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;

   /* If we're going to do an invalidate and we have a pending end-of-pipe
    * sync that has yet to be resolved, we do the end-of-pipe sync now.
    */
   if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
       (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
      bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
      bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
   }

   /* Wa_1409226450, Wait for EU to be idle before pipe control which
    * invalidates the instruction cache
    */
   if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
      bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;

   if ((GFX_VER >= 8 && GFX_VER <= 9) &&
       (bits & ANV_PIPE_CS_STALL_BIT) &&
       (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
      /* If we are doing a VF cache invalidate AND a CS stall (it must be
       * both) then we can reset our vertex cache tracking.
       */
      memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
             sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
      memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
             sizeof(cmd_buffer->state.gfx.ib_dirty_range));
   }

   /* Project: SKL / Argument: LRI Post Sync Operation [23]
    *
    * "PIPECONTROL command with “Command Streamer Stall Enable” must be
    *  programmed prior to programming a PIPECONTROL command with "LRI
    *  Post Sync Operation" in GPGPU mode of operation (i.e when
    *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
    *
    * The same text exists a few rows below for Post Sync Op.
    *
    * On Gfx12 this is Wa_1607156449.
2203 */ 2204 if (bits & ANV_PIPE_POST_SYNC_BIT) { 2205 if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) && 2206 cmd_buffer->state.current_pipeline == GPGPU) 2207 bits |= ANV_PIPE_CS_STALL_BIT; 2208 bits &= ~ANV_PIPE_POST_SYNC_BIT; 2209 } 2210 2211 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | 2212 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { 2213 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 2214#if GFX_VER >= 12 2215 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; 2216 pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; 2217#else 2218 /* Flushing HDC pipeline requires DC Flush on earlier HW. */ 2219 pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; 2220#endif 2221 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; 2222 pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; 2223 pipe.RenderTargetCacheFlushEnable = 2224 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; 2225 2226 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must 2227 * be set with any PIPE_CONTROL with Depth Flush Enable bit set. 2228 */ 2229#if GFX_VER >= 12 2230 pipe.DepthStallEnable = 2231 pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); 2232#else 2233 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; 2234#endif 2235 2236 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; 2237 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; 2238 2239 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": 2240 * 2241 * "The most common action to perform upon reaching a 2242 * synchronization point is to write a value out to memory. An 2243 * immediate value (included with the synchronization command) may 2244 * be written." 
2245 * 2246 * 2247 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": 2248 * 2249 * "In case the data flushed out by the render engine is to be 2250 * read back in to the render engine in coherent manner, then the 2251 * render engine has to wait for the fence completion before 2252 * accessing the flushed data. This can be achieved by following 2253 * means on various products: PIPE_CONTROL command with CS Stall 2254 * and the required write caches flushed with Post-Sync-Operation 2255 * as Write Immediate Data. 2256 * 2257 * Example: 2258 * - Workload-1 (3D/GPGPU/MEDIA) 2259 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write 2260 * Immediate Data, Required Write Cache Flush bits set) 2261 * - Workload-2 (Can use the data produce or output by 2262 * Workload-1) 2263 */ 2264 if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) { 2265 pipe.CommandStreamerStallEnable = true; 2266 pipe.PostSyncOperation = WriteImmediateData; 2267 pipe.Address = cmd_buffer->device->workaround_address; 2268 } 2269 2270 /* 2271 * According to the Broadwell documentation, any PIPE_CONTROL with the 2272 * "Command Streamer Stall" bit set must also have another bit set, 2273 * with five different options: 2274 * 2275 * - Render Target Cache Flush 2276 * - Depth Cache Flush 2277 * - Stall at Pixel Scoreboard 2278 * - Post-Sync Operation 2279 * - Depth Stall 2280 * - DC Flush Enable 2281 * 2282 * I chose "Stall at Pixel Scoreboard" since that's what we use in 2283 * mesa and it seems to work fine. The choice is fairly arbitrary. 
2284 */ 2285 if (pipe.CommandStreamerStallEnable && 2286 !pipe.RenderTargetCacheFlushEnable && 2287 !pipe.DepthCacheFlushEnable && 2288 !pipe.StallAtPixelScoreboard && 2289 !pipe.PostSyncOperation && 2290 !pipe.DepthStallEnable && 2291 !pipe.DCFlushEnable) 2292 pipe.StallAtPixelScoreboard = true; 2293 anv_debug_dump_pc(pipe); 2294 } 2295 2296 /* If a render target flush was emitted, then we can toggle off the bit 2297 * saying that render target writes are ongoing. 2298 */ 2299 if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) 2300 bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); 2301 2302 if (GFX_VERx10 == 75) { 2303 /* Haswell needs addition work-arounds: 2304 * 2305 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": 2306 * 2307 * Option 1: 2308 * PIPE_CONTROL command with the CS Stall and the required write 2309 * caches flushed with Post-SyncOperation as Write Immediate Data 2310 * followed by eight dummy MI_STORE_DATA_IMM (write to scratch 2311 * spce) commands. 2312 * 2313 * Example: 2314 * - Workload-1 2315 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write 2316 * Immediate Data, Required Write Cache Flush bits set) 2317 * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address) 2318 * - Workload-2 (Can use the data produce or output by 2319 * Workload-1) 2320 * 2321 * Unfortunately, both the PRMs and the internal docs are a bit 2322 * out-of-date in this regard. What the windows driver does (and 2323 * this appears to actually work) is to emit a register read from the 2324 * memory address written by the pipe control above. 2325 * 2326 * What register we load into doesn't matter. We choose an indirect 2327 * rendering register because we know it always exists and it's one 2328 * of the first registers the command parser allows us to write. If 2329 * you don't have command parser support in your kernel (pre-4.2), 2330 * this will get turned into MI_NOOP and you won't get the 2331 * workaround. 
Unfortunately, there's just not much we can do in
          * that case.  This register is perfectly safe to write since we
          * always re-load all of the indirect draw registers right before
          * 3DPRIMITIVE when needed anyway.
          */
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
            lrm.MemoryAddress = cmd_buffer->device->workaround_address;
         }
      }

      /* These bits have now been resolved by the PIPE_CONTROL above. */
      bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
                ANV_PIPE_END_OF_PIPE_SYNC_BIT);
   }

   if (bits & ANV_PIPE_INVALIDATE_BITS) {
      /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
       *
       *    "If the VF Cache Invalidation Enable is set to a 1 in a
       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
       *     0, with the VF Cache Invalidation Enable set to 0 needs to be sent
       *     prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
       *     a 1."
       *
       * This appears to hang Broadwell, so we restrict it to just gfx9.
       */
      if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
         anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
         pipe.StateCacheInvalidationEnable =
            bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
         pipe.ConstantCacheInvalidationEnable =
            bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
#if GFX_VER >= 12
         /* Invalidates the L3 cache part in which index & vertex data is loaded
          * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
          */
         pipe.L3ReadOnlyCacheInvalidationEnable =
            bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
#endif
         pipe.VFCacheInvalidationEnable =
            bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
         pipe.TextureCacheInvalidationEnable =
            bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
         pipe.InstructionCacheInvalidateEnable =
            bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;

         /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
          *
          *    "When VF Cache Invalidate is set “Post Sync Operation” must be
          *     enabled to “Write Immediate Data” or “Write PS Depth Count” or
          *     “Write Timestamp”.
          */
         if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
            pipe.PostSyncOperation = WriteImmediateData;
            pipe.Address = cmd_buffer->device->workaround_address;
         }
         anv_debug_dump_pc(pipe);
      }

#if GFX_VER == 12
      /* Trigger an aux-map invalidation by writing the dedicated register
       * (see the HSD note earlier: engine must be idle at this point).
       */
      if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
          cmd_buffer->device->info.has_aux_map) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
            lri.DataDWord = 1;
         }
      }
#endif

      bits &= ~ANV_PIPE_INVALIDATE_BITS;
   }

   /* Whatever wasn't handled above stays pending for a later call. */
   cmd_buffer->state.pending_pipe_bits = bits;
}

/* Accumulate the flush/invalidate bits implied by a VkDependencyInfoKHR and
 * perform any required image layout transitions.
 */
static void
cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
                   const VkDependencyInfoKHR *dep_info,
                   const char *reason)
{
   /* XXX: Right now, we're really dumb and just flush whatever categories
    * the app asks for.  One of these days we may make this a bit better
    * but right now that's all the hardware allows for in most areas.
2416 */ 2417 VkAccessFlags2KHR src_flags = 0; 2418 VkAccessFlags2KHR dst_flags = 0; 2419 2420 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { 2421 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask; 2422 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask; 2423 } 2424 2425 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { 2426 src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask; 2427 dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask; 2428 } 2429 2430 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { 2431 const VkImageMemoryBarrier2KHR *img_barrier = 2432 &dep_info->pImageMemoryBarriers[i]; 2433 2434 src_flags |= img_barrier->srcAccessMask; 2435 dst_flags |= img_barrier->dstAccessMask; 2436 2437 ANV_FROM_HANDLE(anv_image, image, img_barrier->image); 2438 const VkImageSubresourceRange *range = &img_barrier->subresourceRange; 2439 2440 uint32_t base_layer, layer_count; 2441 if (image->vk.image_type == VK_IMAGE_TYPE_3D) { 2442 base_layer = 0; 2443 layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel); 2444 } else { 2445 base_layer = range->baseArrayLayer; 2446 layer_count = vk_image_subresource_layer_count(&image->vk, range); 2447 } 2448 const uint32_t level_count = 2449 vk_image_subresource_level_count(&image->vk, range); 2450 2451 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { 2452 transition_depth_buffer(cmd_buffer, image, 2453 base_layer, layer_count, 2454 img_barrier->oldLayout, 2455 img_barrier->newLayout, 2456 false /* will_full_fast_clear */); 2457 } 2458 2459 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { 2460 transition_stencil_buffer(cmd_buffer, image, 2461 range->baseMipLevel, level_count, 2462 base_layer, layer_count, 2463 img_barrier->oldLayout, 2464 img_barrier->newLayout, 2465 false /* will_full_fast_clear */); 2466 } 2467 2468 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 2469 VkImageAspectFlags color_aspects = 2470 
vk_image_expand_aspect_mask(&image->vk, range->aspectMask); 2471 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { 2472 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, 2473 range->baseMipLevel, level_count, 2474 base_layer, layer_count, 2475 img_barrier->oldLayout, 2476 img_barrier->newLayout, 2477 img_barrier->srcQueueFamilyIndex, 2478 img_barrier->dstQueueFamilyIndex, 2479 false /* will_full_fast_clear */); 2480 } 2481 } 2482 } 2483 2484 enum anv_pipe_bits bits = 2485 anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | 2486 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); 2487 2488 anv_add_pending_pipe_bits(cmd_buffer, bits, reason); 2489} 2490 2491void genX(CmdPipelineBarrier2KHR)( 2492 VkCommandBuffer commandBuffer, 2493 const VkDependencyInfoKHR* pDependencyInfo) 2494{ 2495 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2496 2497 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier"); 2498} 2499 2500static void 2501cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) 2502{ 2503 assert(anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline)); 2504 2505 VkShaderStageFlags stages = 2506 cmd_buffer->state.gfx.pipeline->active_stages; 2507 2508 /* In order to avoid thrash, we assume that vertex and fragment stages 2509 * always exist. In the rare case where one is missing *and* the other 2510 * uses push concstants, this may be suboptimal. However, avoiding stalls 2511 * seems more important. 
2512 */ 2513 stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; 2514 2515 if (stages == cmd_buffer->state.gfx.push_constant_stages) 2516 return; 2517 2518 const unsigned push_constant_kb = 2519 cmd_buffer->device->info.max_constant_urb_size_kb; 2520 2521 const unsigned num_stages = 2522 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); 2523 unsigned size_per_stage = push_constant_kb / num_stages; 2524 2525 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in 2526 * units of 2KB. Incidentally, these are the same platforms that have 2527 * 32KB worth of push constant space. 2528 */ 2529 if (push_constant_kb == 32) 2530 size_per_stage &= ~1u; 2531 2532 uint32_t kb_used = 0; 2533 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) { 2534 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0; 2535 anv_batch_emit(&cmd_buffer->batch, 2536 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { 2537 alloc._3DCommandSubOpcode = 18 + i; 2538 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0; 2539 alloc.ConstantBufferSize = push_size; 2540 } 2541 kb_used += push_size; 2542 } 2543 2544 anv_batch_emit(&cmd_buffer->batch, 2545 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { 2546 alloc.ConstantBufferOffset = kb_used; 2547 alloc.ConstantBufferSize = push_constant_kb - kb_used; 2548 } 2549 2550 cmd_buffer->state.gfx.push_constant_stages = stages; 2551 2552 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS: 2553 * 2554 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to 2555 * the next 3DPRIMITIVE command after programming the 2556 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS" 2557 * 2558 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of 2559 * pipeline setup, we need to dirty push constants. 
2560 */ 2561 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; 2562} 2563 2564static VkResult 2565emit_binding_table(struct anv_cmd_buffer *cmd_buffer, 2566 struct anv_cmd_pipeline_state *pipe_state, 2567 struct anv_shader_bin *shader, 2568 struct anv_state *bt_state) 2569{ 2570 struct anv_subpass *subpass = cmd_buffer->state.subpass; 2571 uint32_t state_offset; 2572 2573 struct anv_pipeline_bind_map *map = &shader->bind_map; 2574 if (map->surface_count == 0) { 2575 *bt_state = (struct anv_state) { 0, }; 2576 return VK_SUCCESS; 2577 } 2578 2579 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, 2580 map->surface_count, 2581 &state_offset); 2582 uint32_t *bt_map = bt_state->map; 2583 2584 if (bt_state->map == NULL) 2585 return VK_ERROR_OUT_OF_DEVICE_MEMORY; 2586 2587 /* We only need to emit relocs if we're not using softpin. If we are using 2588 * softpin then we always keep all user-allocated memory objects resident. 2589 */ 2590 const bool need_client_mem_relocs = 2591 !anv_use_softpin(cmd_buffer->device->physical); 2592 struct anv_push_constants *push = &pipe_state->push_constants; 2593 2594 for (uint32_t s = 0; s < map->surface_count; s++) { 2595 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; 2596 2597 struct anv_state surface_state; 2598 2599 switch (binding->set) { 2600 case ANV_DESCRIPTOR_SET_NULL: 2601 bt_map[s] = 0; 2602 break; 2603 2604 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: 2605 /* Color attachment binding */ 2606 assert(shader->stage == MESA_SHADER_FRAGMENT); 2607 if (binding->index < subpass->color_count) { 2608 const unsigned att = 2609 subpass->color_attachments[binding->index].attachment; 2610 2611 /* From the Vulkan 1.0.46 spec: 2612 * 2613 * "If any color or depth/stencil attachments are 2614 * VK_ATTACHMENT_UNUSED, then no writes occur for those 2615 * attachments." 
2616 */ 2617 if (att == VK_ATTACHMENT_UNUSED) { 2618 surface_state = cmd_buffer->state.null_surface_state; 2619 } else { 2620 surface_state = cmd_buffer->state.attachments[att].color.state; 2621 } 2622 } else { 2623 surface_state = cmd_buffer->state.null_surface_state; 2624 } 2625 2626 assert(surface_state.map); 2627 bt_map[s] = surface_state.offset + state_offset; 2628 break; 2629 2630 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { 2631 struct anv_state surface_state = 2632 anv_cmd_buffer_alloc_surface_state(cmd_buffer); 2633 2634 struct anv_address constant_data = { 2635 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo, 2636 .offset = shader->kernel.offset + 2637 shader->prog_data->const_data_offset, 2638 }; 2639 unsigned constant_data_size = shader->prog_data->const_data_size; 2640 2641 const enum isl_format format = 2642 anv_isl_format_for_descriptor_type(cmd_buffer->device, 2643 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); 2644 anv_fill_buffer_surface_state(cmd_buffer->device, 2645 surface_state, format, 2646 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, 2647 constant_data, constant_data_size, 1); 2648 2649 assert(surface_state.map); 2650 bt_map[s] = surface_state.offset + state_offset; 2651 add_surface_reloc(cmd_buffer, surface_state, constant_data); 2652 break; 2653 } 2654 2655 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { 2656 /* This is always the first binding for compute shaders */ 2657 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0); 2658 2659 struct anv_state surface_state = 2660 anv_cmd_buffer_alloc_surface_state(cmd_buffer); 2661 2662 const enum isl_format format = 2663 anv_isl_format_for_descriptor_type(cmd_buffer->device, 2664 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); 2665 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, 2666 format, 2667 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, 2668 cmd_buffer->state.compute.num_workgroups, 2669 12, 1); 2670 2671 assert(surface_state.map); 2672 bt_map[s] = surface_state.offset + state_offset; 2673 if 
(need_client_mem_relocs) { 2674 add_surface_reloc(cmd_buffer, surface_state, 2675 cmd_buffer->state.compute.num_workgroups); 2676 } 2677 break; 2678 } 2679 2680 case ANV_DESCRIPTOR_SET_DESCRIPTORS: { 2681 /* This is a descriptor set buffer so the set index is actually 2682 * given by binding->binding. (Yes, that's confusing.) 2683 */ 2684 struct anv_descriptor_set *set = 2685 pipe_state->descriptors[binding->index]; 2686 assert(set->desc_mem.alloc_size); 2687 assert(set->desc_surface_state.alloc_size); 2688 bt_map[s] = set->desc_surface_state.offset + state_offset; 2689 add_surface_reloc(cmd_buffer, set->desc_surface_state, 2690 anv_descriptor_set_address(set)); 2691 break; 2692 } 2693 2694 default: { 2695 assert(binding->set < MAX_SETS); 2696 const struct anv_descriptor_set *set = 2697 pipe_state->descriptors[binding->set]; 2698 if (binding->index >= set->descriptor_count) { 2699 /* From the Vulkan spec section entitled "DescriptorSet and 2700 * Binding Assignment": 2701 * 2702 * "If the array is runtime-sized, then array elements greater 2703 * than or equal to the size of that binding in the bound 2704 * descriptor set must not be used." 2705 * 2706 * Unfortunately, the compiler isn't smart enough to figure out 2707 * when a dynamic binding isn't used so it may grab the whole 2708 * array and stick it in the binding table. In this case, it's 2709 * safe to just skip those bindings that are OOB. 2710 */ 2711 assert(binding->index < set->layout->descriptor_count); 2712 continue; 2713 } 2714 const struct anv_descriptor *desc = &set->descriptors[binding->index]; 2715 2716 switch (desc->type) { 2717 case VK_DESCRIPTOR_TYPE_SAMPLER: 2718 /* Nothing for us to do here */ 2719 continue; 2720 2721 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: 2722 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { 2723 if (desc->image_view) { 2724 struct anv_surface_state sstate = 2725 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 
2726 desc->image_view->planes[binding->plane].general_sampler_surface_state : 2727 desc->image_view->planes[binding->plane].optimal_sampler_surface_state; 2728 surface_state = sstate.state; 2729 assert(surface_state.alloc_size); 2730 if (need_client_mem_relocs) 2731 add_surface_state_relocs(cmd_buffer, sstate); 2732 } else { 2733 surface_state = cmd_buffer->device->null_surface_state; 2734 } 2735 break; 2736 } 2737 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 2738 assert(shader->stage == MESA_SHADER_FRAGMENT); 2739 assert(desc->image_view != NULL); 2740 if ((desc->image_view->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { 2741 /* For depth and stencil input attachments, we treat it like any 2742 * old texture that a user may have bound. 2743 */ 2744 assert(desc->image_view->n_planes == 1); 2745 struct anv_surface_state sstate = 2746 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 2747 desc->image_view->planes[0].general_sampler_surface_state : 2748 desc->image_view->planes[0].optimal_sampler_surface_state; 2749 surface_state = sstate.state; 2750 assert(surface_state.alloc_size); 2751 if (need_client_mem_relocs) 2752 add_surface_state_relocs(cmd_buffer, sstate); 2753 } else { 2754 /* For color input attachments, we create the surface state at 2755 * vkBeginRenderPass time so that we can include aux and clear 2756 * color information. 2757 */ 2758 assert(binding->input_attachment_index < subpass->input_count); 2759 const unsigned subpass_att = binding->input_attachment_index; 2760 const unsigned att = subpass->input_attachments[subpass_att].attachment; 2761 surface_state = cmd_buffer->state.attachments[att].input.state; 2762 } 2763 break; 2764 2765 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { 2766 if (desc->image_view) { 2767 struct anv_surface_state sstate = 2768 binding->lowered_storage_surface 2769 ? 
desc->image_view->planes[binding->plane].lowered_storage_surface_state 2770 : desc->image_view->planes[binding->plane].storage_surface_state; 2771 surface_state = sstate.state; 2772 assert(surface_state.alloc_size); 2773 if (surface_state.offset == 0) { 2774 mesa_loge("Bound a image to a descriptor where the " 2775 "descriptor does not have NonReadable " 2776 "set and the image does not have a " 2777 "corresponding SPIR-V format enum."); 2778 vk_debug_report(&cmd_buffer->device->physical->instance->vk, 2779 VK_DEBUG_REPORT_ERROR_BIT_EXT, 2780 &desc->image_view->vk.base, 2781 __LINE__, 0, "anv", 2782 "Bound a image to a descriptor where the " 2783 "descriptor does not have NonReadable " 2784 "set and the image does not have a " 2785 "corresponding SPIR-V format enum."); 2786 } 2787 if (surface_state.offset && need_client_mem_relocs) 2788 add_surface_state_relocs(cmd_buffer, sstate); 2789 } else { 2790 surface_state = cmd_buffer->device->null_surface_state; 2791 } 2792 break; 2793 } 2794 2795 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 2796 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 2797 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 2798 if (desc->buffer_view) { 2799 surface_state = desc->buffer_view->surface_state; 2800 assert(surface_state.alloc_size); 2801 if (need_client_mem_relocs) { 2802 add_surface_reloc(cmd_buffer, surface_state, 2803 desc->buffer_view->address); 2804 } 2805 } else { 2806 surface_state = cmd_buffer->device->null_surface_state; 2807 } 2808 break; 2809 2810 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 2811 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { 2812 if (desc->buffer) { 2813 /* Compute the offset within the buffer */ 2814 uint32_t dynamic_offset = 2815 push->dynamic_offsets[binding->dynamic_offset_index]; 2816 uint64_t offset = desc->offset + dynamic_offset; 2817 /* Clamp to the buffer size */ 2818 offset = MIN2(offset, desc->buffer->size); 2819 /* Clamp the range to the buffer size */ 2820 uint32_t range = MIN2(desc->range, 
desc->buffer->size - offset); 2821 2822 /* Align the range for consistency */ 2823 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) 2824 range = align_u32(range, ANV_UBO_ALIGNMENT); 2825 2826 struct anv_address address = 2827 anv_address_add(desc->buffer->address, offset); 2828 2829 surface_state = 2830 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); 2831 enum isl_format format = 2832 anv_isl_format_for_descriptor_type(cmd_buffer->device, 2833 desc->type); 2834 2835 isl_surf_usage_flags_t usage = 2836 desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ? 2837 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT : 2838 ISL_SURF_USAGE_STORAGE_BIT; 2839 2840 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, 2841 format, usage, address, range, 1); 2842 if (need_client_mem_relocs) 2843 add_surface_reloc(cmd_buffer, surface_state, address); 2844 } else { 2845 surface_state = cmd_buffer->device->null_surface_state; 2846 } 2847 break; 2848 } 2849 2850 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 2851 if (desc->buffer_view) { 2852 surface_state = binding->lowered_storage_surface 2853 ? 
desc->buffer_view->lowered_storage_surface_state 2854 : desc->buffer_view->storage_surface_state; 2855 assert(surface_state.alloc_size); 2856 if (need_client_mem_relocs) { 2857 add_surface_reloc(cmd_buffer, surface_state, 2858 desc->buffer_view->address); 2859 } 2860 } else { 2861 surface_state = cmd_buffer->device->null_surface_state; 2862 } 2863 break; 2864 2865 default: 2866 assert(!"Invalid descriptor type"); 2867 continue; 2868 } 2869 assert(surface_state.map); 2870 bt_map[s] = surface_state.offset + state_offset; 2871 break; 2872 } 2873 } 2874 } 2875 2876 return VK_SUCCESS; 2877} 2878 2879static VkResult 2880emit_samplers(struct anv_cmd_buffer *cmd_buffer, 2881 struct anv_cmd_pipeline_state *pipe_state, 2882 struct anv_shader_bin *shader, 2883 struct anv_state *state) 2884{ 2885 struct anv_pipeline_bind_map *map = &shader->bind_map; 2886 if (map->sampler_count == 0) { 2887 *state = (struct anv_state) { 0, }; 2888 return VK_SUCCESS; 2889 } 2890 2891 uint32_t size = map->sampler_count * 16; 2892 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32); 2893 2894 if (state->map == NULL) 2895 return VK_ERROR_OUT_OF_DEVICE_MEMORY; 2896 2897 for (uint32_t s = 0; s < map->sampler_count; s++) { 2898 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; 2899 const struct anv_descriptor *desc = 2900 &pipe_state->descriptors[binding->set]->descriptors[binding->index]; 2901 2902 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && 2903 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) 2904 continue; 2905 2906 struct anv_sampler *sampler = desc->sampler; 2907 2908 /* This can happen if we have an unfilled slot since TYPE_SAMPLER 2909 * happens to be zero. 
       */
      if (sampler == NULL)
         continue;

      memcpy(state->map + (s * 16),
             sampler->state[binding->plane], sizeof(sampler->state[0]));
   }

   return VK_SUCCESS;
}

/* Emit samplers and binding tables for every dirty shader stage.
 *
 * On VK_ERROR_OUT_OF_DEVICE_MEMORY, allocates a fresh binding-table block,
 * re-emits STATE_BASE_ADDRESS, and retries all stages once.  Returns the
 * mask of stages actually flushed (0 on unrecoverable failure, with the
 * batch marked in error).
 */
static uint32_t
flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_cmd_pipeline_state *pipe_state,
                      const VkShaderStageFlags dirty,
                      struct anv_shader_bin **shaders,
                      uint32_t num_shaders)
{
   VkShaderStageFlags flushed = 0;

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < num_shaders; i++) {
      if (!shaders[i])
         continue;

      gl_shader_stage stage = shaders[i]->stage;
      VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
      if ((vk_stage & dirty) == 0)
         continue;

      assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
      result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                             &cmd_buffer->state.samplers[stage]);
      if (result != VK_SUCCESS)
         break;

      assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
      result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
                                  &cmd_buffer->state.binding_tables[stage]);
      if (result != VK_SUCCESS)
         break;

      flushed |= vk_stage;
   }

   if (result != VK_SUCCESS) {
      /* The only recoverable failure is running out of binding-table
       * space; anything else would have been caught above.
       */
      assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);

      result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
      if (result != VK_SUCCESS)
         return 0;

      /* Re-emit state base addresses so we get the new surface state base
       * address before we start emitting binding tables etc.
       */
      genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

      /* Re-emit all active binding tables */
      flushed = 0;

      for (uint32_t i = 0; i < num_shaders; i++) {
         if (!shaders[i])
            continue;

         gl_shader_stage stage = shaders[i]->stage;

         result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                                &cmd_buffer->state.samplers[stage]);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return 0;
         }
         result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
                                     &cmd_buffer->state.binding_tables[stage]);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return 0;
         }

         flushed |= mesa_to_vk_shader_stage(stage);
      }
   }

   return flushed;
}

/* Point the hardware at the freshly emitted sampler and binding-table
 * state for each requested stage.  The per-stage command sub-opcodes are
 * hard-coded tables (0 == stage not valid here, checked by assert).
 */
static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t stages)
{
   static const uint32_t sampler_state_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 43,
      [MESA_SHADER_TESS_CTRL] = 44, /* HS */
      [MESA_SHADER_TESS_EVAL] = 45, /* DS */
      [MESA_SHADER_GEOMETRY]  = 46,
      [MESA_SHADER_FRAGMENT]  = 47,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   static const uint32_t binding_table_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 38,
      [MESA_SHADER_TESS_CTRL] = 39,
      [MESA_SHADER_TESS_EVAL] = 40,
      [MESA_SHADER_GEOMETRY]  = 41,
      [MESA_SHADER_FRAGMENT]  = 42,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   anv_foreach_stage(s, stages) {
      assert(s < ARRAY_SIZE(binding_table_opcodes));
      assert(binding_table_opcodes[s] > 0);

      if (cmd_buffer->state.samplers[s].alloc_size > 0) {
         anv_batch_emit(&cmd_buffer->batch,
                        GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
            ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
            ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
         }
      }

      /* Always emit binding table pointers if we're asked to, since on SKL
       * this is what flushes push constants. */
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
         btp._3DCommandSubOpcode = binding_table_opcodes[s];
         btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
      }
   }
}

/* Resolve a shader push range to the GPU address it should be read from:
 * descriptor-set buffer, driver push-constant state, embedded shader
 * constants, or a (possibly dynamically offset) client UBO.
 */
static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
                       const struct anv_shader_bin *shader,
                       const struct anv_push_range *range)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      /* This is a descriptor set buffer so the set index is
       * actually given by binding->binding.  (Yes, that's
       * confusing.)
       */
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      return anv_descriptor_set_address(set);
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
      /* Lazily snapshot the CPU-side push constants into dynamic state the
       * first time any range asks for them this flush.
       */
      if (gfx_state->base.push_constants_state.alloc_size == 0) {
         gfx_state->base.push_constants_state =
            anv_cmd_buffer_gfx_push_constants(cmd_buffer);
      }
      return (struct anv_address) {
         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         .offset = gfx_state->base.push_constants_state.offset,
      };
   }

   case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
      return (struct anv_address) {
         .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
         .offset = shader->kernel.offset +
                   shader->prog_data->const_data_offset,
      };

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (desc->buffer_view)
            return desc->buffer_view->address;
      } else {
         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         if (desc->buffer) {
            const struct anv_push_constants *push =
               &gfx_state->base.push_constants;
            uint32_t dynamic_offset =
               push->dynamic_offsets[range->dynamic_offset_index];
            return anv_address_add(desc->buffer->address,
                                   desc->offset + dynamic_offset);
         }
      }

      /* For NULL UBOs, we just return an address in the workaround BO.  We do
       * writes to it for workarounds but always at the bottom.  The higher
       * bytes should be all zeros.
       */
      assert(range->length * 32 <= 2048);
      return (struct anv_address) {
         .bo = cmd_buffer->device->workaround_bo,
         .offset = 1024,
      };
   }
   }
}


/** Returns the size in bytes of the bound buffer
 *
 * The range is relative to the start of the buffer, not the start of the
 * range.  The returned range may be smaller than
 *
 *    (range->start + range->length) * 32;
 */
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_shader_bin *shader,
                          const struct anv_push_range *range)
{
   assert(shader->stage != MESA_SHADER_COMPUTE);
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      assert(range->start * 32 < set->desc_mem.alloc_size);
      assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
      return set->desc_mem.alloc_size;
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
      /* Driver push constants are always fully backed. */
      return (range->start + range->length) * 32;

   case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
      return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct
anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (!desc->buffer_view)
            return 0;

         if (range->start * 32 > desc->buffer_view->range)
            return 0;

         return desc->buffer_view->range;
      } else {
         if (!desc->buffer)
            return 0;

         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         /* Compute the offset within the buffer */
         const struct anv_push_constants *push =
            &gfx_state->base.push_constants;
         uint32_t dynamic_offset =
            push->dynamic_offsets[range->dynamic_offset_index];
         uint64_t offset = desc->offset + dynamic_offset;
         /* Clamp to the buffer size */
         offset = MIN2(offset, desc->buffer->size);
         /* Clamp the range to the buffer size */
         uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);

         /* Align the range for consistency */
         bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);

         return bound_range;
      }
   }
   }
}

/* Emit one 3DSTATE_CONSTANT_* packet for a graphics stage, binding up to
 * four push-constant buffer ranges gathered by the caller.  buffers[]
 * parallels bind_map->push_ranges[] and holds their GPU addresses.
 */
static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
                              gl_shader_stage stage,
                              struct anv_address *buffers,
                              unsigned buffer_count)
{
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;

   /* 3DSTATE_CONSTANT_* sub-opcodes per stage (0 == invalid here). */
   static const uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 21,
      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
      [MESA_SHADER_GEOMETRY]  = 22,
      [MESA_SHADER_FRAGMENT]  = 23,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   assert(stage < ARRAY_SIZE(push_constant_opcodes));
   assert(push_constant_opcodes[stage] > 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
      c._3DCommandSubOpcode = push_constant_opcodes[stage];

      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_pipeline_bind_map *bind_map =
            &pipeline->shaders[stage]->bind_map;

#if GFX_VER >= 9
         /* This field exists since Gfx8.  However, the Broadwell PRM says:
          *
          *    "Constant Buffer Object Control State must be always programmed
          *    to zero."
          *
          * This restriction does not exist on any newer platforms.
          *
          * We only have one MOCS field for the whole packet, not one per
          * buffer.  We could go out of our way here to walk over all of the
          * buffers and see if any of them are used externally and use the
          * external MOCS.  However, the notion that someone would use the
          * same bit of memory for both scanout and a UBO is nuts.  Let's not
          * bother and assume it's all internal.
          */
         c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
#endif

#if GFX_VERx10 >= 75
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         assert(buffer_count <= 4);
         const unsigned shift = 4 - buffer_count;
         for (unsigned i = 0; i < buffer_count; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];

            /* At this point we only have non-empty ranges */
            assert(range->length > 0);

            /* For Ivy Bridge, make sure we only set the first range (actual
             * push constants)
             */
            assert((GFX_VERx10 >= 75) || i == 0);

            c.ConstantBody.ReadLength[i + shift] = range->length;
            c.ConstantBody.Buffer[i + shift] =
               anv_address_add(buffers[i], range->start * 32);
         }
#else
         /* For Ivy Bridge, push constants are relative to dynamic state
          * base address and we only ever push actual push constants.
          */
         if (bind_map->push_ranges[0].length > 0) {
            assert(buffer_count == 1);
            assert(bind_map->push_ranges[0].set ==
                   ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
            assert(buffers[0].bo ==
                   cmd_buffer->device->dynamic_state_pool.block_pool.bo);
            c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
            /* NULL bo makes the packed address an offset from dynamic state
             * base rather than a full graphics address.
             */
            c.ConstantBody.Buffer[0].bo = NULL;
            c.ConstantBody.Buffer[0].offset = buffers[0].offset;
         }
         assert(bind_map->push_ranges[1].length == 0);
         assert(bind_map->push_ranges[2].length == 0);
         assert(bind_map->push_ranges[3].length == 0);
#endif
      }
   }
}

#if GFX_VER >= 12
/* Gfx12+: emit a single 3DSTATE_CONSTANT_ALL packet covering all stages in
 * shader_mask.  With buffer_count == 0 this just clears constants for those
 * stages; otherwise shader_mask must name exactly one stage whose bind map
 * supplies the ranges.
 */
static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t shader_mask,
                                  struct anv_address *buffers,
                                  uint32_t buffer_count)
{
   if (buffer_count == 0) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
         c.ShaderUpdateEnable = shader_mask;
         c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
      }
      return;
   }

   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;

   /* Only consulted by the asserts below. */
   static const UNUSED uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 21,
      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
      [MESA_SHADER_GEOMETRY]  = 22,
      [MESA_SHADER_FRAGMENT]  = 23,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
   assert(stage < ARRAY_SIZE(push_constant_opcodes));
   assert(push_constant_opcodes[stage] > 0);

   const struct anv_pipeline_bind_map *bind_map =
      &pipeline->shaders[stage]->bind_map;

   uint32_t *dw;
   const uint32_t buffer_mask = (1 << buffer_count) - 1;
   /* 2 header dwords plus one CONSTANT_ALL_DATA pair per buffer. */
   const uint32_t num_dwords = 2 + 2 * buffer_count;

   dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                        GENX(3DSTATE_CONSTANT_ALL),
                        .ShaderUpdateEnable = shader_mask,
                        .PointerBufferMask = buffer_mask,
                        .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));

   for (int i = 0; i < buffer_count; i++) {
      const struct anv_push_range *range = &bind_map->push_ranges[i];
      GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
         &cmd_buffer->batch, dw + 2 + i * 2,
         &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
            .PointerToConstantBuffer =
               anv_address_add(buffers[i], range->start * 32),
            .ConstantBufferReadLength = range->length,
         });
   }
}
#endif

/* Re-emit push constants for every stage in dirty_stages, gathering the
 * backing buffer addresses per stage and clearing those stages from
 * push_constants_dirty on success.
 */
static void
cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
                                VkShaderStageFlags dirty_stages)
{
   VkShaderStageFlags flushed = 0;
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;

#if GFX_VER >= 12
   uint32_t nobuffer_stages = 0;
#endif

   /* Compute robust pushed register access mask for each stage.
    */
   if (cmd_buffer->device->robust_buffer_access) {
      anv_foreach_stage(stage, dirty_stages) {
         if (!anv_pipeline_has_stage(pipeline, stage))
            continue;

         const struct anv_shader_bin *shader = pipeline->shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx_state->base.push_constants;

         push->push_reg_mask[stage] = 0;
         /* Start of the current range in the shader, relative to the start of
          * push constants in the shader.
          */
         unsigned range_start_reg = 0;
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               continue;

            unsigned bound_size =
               get_push_range_bound_size(cmd_buffer, shader, range);
            if (bound_size >= range->start * 32) {
               /* Mark only the registers actually backed by buffer memory;
                * the shader uses this mask to zero out-of-bounds reads.
                */
               unsigned bound_regs =
                  MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
                       range->length);
               assert(range_start_reg + bound_regs <= 64);
               push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
                                                              bound_regs);
            }

            /* The mask lives in the push constants themselves, so changing
             * it dirties the stage's push constants again.
             */
            cmd_buffer->state.push_constants_dirty |=
               mesa_to_vk_shader_stage(stage);

            range_start_reg += range->length;
         }
      }
   }

   /* Resets the push constant state so that we allocate a new one if
    * needed.
    */
   gfx_state->base.push_constants_state = ANV_STATE_NULL;

   anv_foreach_stage(stage, dirty_stages) {
      unsigned buffer_count = 0;
      flushed |= mesa_to_vk_shader_stage(stage);
      UNUSED uint32_t max_push_range = 0;

      struct anv_address buffers[4] = {};
      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_shader_bin *shader = pipeline->shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

         /* We have to gather buffer addresses as a second step because the
          * loop above puts data into the push constant area and the call to
          * get_push_range_address is what locks our push constants and copies
          * them into the actual GPU buffer.  If we did the two loops at the
          * same time, we'd risk only having some of the sizes in the push
          * constant buffer when we did the copy.
          */
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               break;

            buffers[i] = get_push_range_address(cmd_buffer, shader, range);
            max_push_range = MAX2(max_push_range, range->length);
            buffer_count++;
         }

         /* We have at most 4 buffers but they should be tightly packed */
         for (unsigned i = buffer_count; i < 4; i++)
            assert(bind_map->push_ranges[i].length == 0);
      }

#if GFX_VER >= 12
      /* If this stage doesn't have any push constants, emit it later in a
       * single CONSTANT_ALL packet.
       */
      if (buffer_count == 0) {
         nobuffer_stages |= 1 << stage;
         continue;
      }

      /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
       * contains only 5 bits, so we can only use it for buffers smaller than
       * 32.
       */
      if (max_push_range < 32) {
         cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
                                           buffers, buffer_count);
         continue;
      }
#endif

      cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
   }

#if GFX_VER >= 12
   if (nobuffer_stages)
      cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
#endif

   cmd_buffer->state.push_constants_dirty &= ~flushed;
}

/* Emit 3DSTATE_CLIP, merging dynamic state (topology, viewport count, and on
 * gfx7- front face/cull mode) into the pipeline's pre-packed clip dwords.
 * Skipped entirely when none of the relevant dirty bits are set.
 */
static void
cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
{
   const uint32_t clip_states =
#if GFX_VER <= 7
      ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
      ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
#endif
      ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
      ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
      ANV_CMD_DIRTY_PIPELINE;

   if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
      return;

   /* Take dynamic primitive topology in to account with
    *    3DSTATE_CLIP::ViewportXYClipTestEnable
    */
   bool xy_clip_test_enable = 0;

   if (cmd_buffer->state.gfx.pipeline->dynamic_states &
       ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      VkPrimitiveTopology primitive_topology =
         cmd_buffer->state.gfx.dynamic.primitive_topology;

      VkPolygonMode dynamic_raster_mode =
         genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
                                   primitive_topology);

      /* XY clip testing is only enabled for filled polygons. */
      xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
   }

#if GFX_VER <= 7
   const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
#endif
   struct GENX(3DSTATE_CLIP) clip = {
      GENX(3DSTATE_CLIP_header),
#if GFX_VER <= 7
      .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
      .CullMode     = genX(vk_to_intel_cullmode)[d->cull_mode],
#endif
      .ViewportXYClipTestEnable = xy_clip_test_enable,
   };
   uint32_t dwords[GENX(3DSTATE_CLIP_length)];

   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_prog_data *last =
         anv_pipeline_get_last_vue_prog_data(pipeline);
      if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         clip.MaximumVPIndex =
            cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
            cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
      }
   }

   GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
   anv_batch_emit_merge(&cmd_buffer->batch, dwords,
                        pipeline->gfx7.clip);
}

/* Emit 3DSTATE_STREAMOUT, merging the dynamic rasterizer-discard flag into
 * the pipeline's pre-packed streamout dwords (gfx7 vs gfx8+ variants).
 */
static void
cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
{
   const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;

#if GFX_VER == 7
#  define streamout_state_dw pipeline->gfx7.streamout_state
#else
#  define streamout_state_dw pipeline->gfx8.streamout_state
#endif

   uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];

   struct GENX(3DSTATE_STREAMOUT) so = {
      GENX(3DSTATE_STREAMOUT_header),
      .RenderingDisable = d->raster_discard,
   };
   GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
   anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
}

/* Flush all dirty 3D state (L3 config, pipeline select, vertex buffers, ...)
 * before a draw.  Graphics-only: compute stages are asserted out.
 */
void
genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   uint32_t *p;

   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
      vb_emit |= pipeline->vb_used;

   if (vb_emit) {
      const uint32_t num_buffers = __builtin_popcount(vb_emit);
      /* One header dword plus 4 dwords of VERTEX_BUFFER_STATE per buffer. */
      const uint32_t num_dwords = 1 + num_buffers * 4;

      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                          GENX(3DSTATE_VERTEX_BUFFERS));
      uint32_t i = 0;
      u_foreach_bit(vb, vb_emit) {
         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;

         /* If dynamic, use stride/size from vertex binding, otherwise use
          * stride/size that was setup in the pipeline object.
          */
         bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
         bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;

         struct GENX(VERTEX_BUFFER_STATE) state;
         if (buffer) {
            uint32_t stride = dynamic_stride ?
               cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
            /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
             *
             * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
             * the bound size of the vertex buffer starting from the corresponding
             * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
             */
            UNUSED uint32_t size = dynamic_size ?
               cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;

            state = (struct GENX(VERTEX_BUFFER_STATE)) {
               .VertexBufferIndex = vb,

               .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
#if GFX_VER <= 7
               .BufferAccessType = pipeline->vb[vb].instanced ?
INSTANCEDATA : VERTEXDATA, 3597 .InstanceDataStepRate = pipeline->vb[vb].instance_divisor, 3598#endif 3599 .AddressModifyEnable = true, 3600 .BufferPitch = stride, 3601 .BufferStartingAddress = anv_address_add(buffer->address, offset), 3602 .NullVertexBuffer = offset >= buffer->size, 3603#if GFX_VER >= 12 3604 .L3BypassDisable = true, 3605#endif 3606 3607#if GFX_VER >= 8 3608 .BufferSize = size, 3609#else 3610 /* XXX: to handle dynamic offset for older gens we might want 3611 * to modify Endaddress, but there are issues when doing so: 3612 * 3613 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439 3614 */ 3615 .EndAddress = anv_address_add(buffer->address, buffer->size - 1), 3616#endif 3617 }; 3618 } else { 3619 state = (struct GENX(VERTEX_BUFFER_STATE)) { 3620 .VertexBufferIndex = vb, 3621 .NullVertexBuffer = true, 3622 }; 3623 } 3624 3625#if GFX_VER >= 8 && GFX_VER <= 9 3626 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb, 3627 state.BufferStartingAddress, 3628 state.BufferSize); 3629#endif 3630 3631 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); 3632 i++; 3633 } 3634 } 3635 3636 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; 3637 3638 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty & 3639 pipeline->active_stages; 3640 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty && 3641 !cmd_buffer->state.push_constants_dirty) 3642 return; 3643 3644 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) || 3645 (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty & 3646 ANV_CMD_DIRTY_PIPELINE))) { 3647 /* We don't need any per-buffer dirty tracking because you're not 3648 * allowed to bind different XFB buffers while XFB is enabled. 
3649 */ 3650 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { 3651 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; 3652 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { 3653#if GFX_VER < 12 3654 sob.SOBufferIndex = idx; 3655#else 3656 sob._3DCommandOpcode = 0; 3657 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; 3658#endif 3659 3660 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { 3661 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0); 3662 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, 3663 xfb->offset); 3664#if GFX_VER >= 8 3665 sob.SOBufferEnable = true; 3666 sob.StreamOffsetWriteEnable = false; 3667 /* Size is in DWords - 1 */ 3668 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1; 3669#else 3670 /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so 3671 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the 3672 * default for an empty SO_BUFFER packet) to disable them. 3673 */ 3674 sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx]; 3675 sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address, 3676 xfb->offset + xfb->size); 3677#endif 3678 } 3679 } 3680 } 3681 3682 /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ 3683 if (GFX_VER >= 10) { 3684 anv_add_pending_pipe_bits(cmd_buffer, 3685 ANV_PIPE_CS_STALL_BIT, 3686 "after 3DSTATE_SO_BUFFER call"); 3687 } 3688 } 3689 3690 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { 3691 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); 3692 3693 /* Remove from dynamic state emission all of stuff that is baked into 3694 * the pipeline. 3695 */ 3696 cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask; 3697 3698 /* If the pipeline changed, we may need to re-allocate push constant 3699 * space in the URB. 
3700 */ 3701 cmd_buffer_alloc_push_constants(cmd_buffer); 3702 } 3703 3704 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) 3705 cmd_buffer->state.gfx.primitive_topology = pipeline->topology; 3706 3707#if GFX_VER <= 7 3708 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || 3709 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { 3710 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: 3711 * 3712 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth 3713 * stall needs to be sent just prior to any 3DSTATE_VS, 3714 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, 3715 * 3DSTATE_BINDING_TABLE_POINTER_VS, 3716 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one 3717 * PIPE_CONTROL needs to be sent before any combination of VS 3718 * associated 3DSTATE." 3719 */ 3720 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 3721 pc.DepthStallEnable = true; 3722 pc.PostSyncOperation = WriteImmediateData; 3723 pc.Address = cmd_buffer->device->workaround_address; 3724 anv_debug_dump_pc(pc); 3725 } 3726 } 3727#endif 3728 3729 /* Render targets live in the same binding table as fragment descriptors */ 3730 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) 3731 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; 3732 3733 /* We emit the binding tables and sampler tables first, then emit push 3734 * constants and then finally emit binding table and sampler table 3735 * pointers. It has to happen in this order, since emitting the binding 3736 * tables may change the push constants (in case of storage images). After 3737 * emitting push constants, on SKL+ we have to emit the corresponding 3738 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 
3739 */ 3740 uint32_t dirty = 0; 3741 if (descriptors_dirty) { 3742 dirty = flush_descriptor_sets(cmd_buffer, 3743 &cmd_buffer->state.gfx.base, 3744 descriptors_dirty, 3745 pipeline->shaders, 3746 ARRAY_SIZE(pipeline->shaders)); 3747 cmd_buffer->state.descriptors_dirty &= ~dirty; 3748 } 3749 3750 if (dirty || cmd_buffer->state.push_constants_dirty) { 3751 /* Because we're pushing UBOs, we have to push whenever either 3752 * descriptors or push constants is dirty. 3753 */ 3754 dirty |= cmd_buffer->state.push_constants_dirty; 3755 dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS; 3756 cmd_buffer_flush_push_constants(cmd_buffer, dirty); 3757 } 3758 3759 if (dirty) 3760 cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty); 3761 3762 cmd_buffer_emit_clip(cmd_buffer); 3763 3764 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) 3765 cmd_buffer_emit_streamout(cmd_buffer); 3766 3767 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) 3768 gfx8_cmd_buffer_emit_viewport(cmd_buffer); 3769 3770 if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | 3771 ANV_CMD_DIRTY_PIPELINE)) { 3772 gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer, 3773 pipeline->depth_clamp_enable); 3774 } 3775 3776 if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR | 3777 ANV_CMD_DIRTY_RENDER_TARGETS)) 3778 gfx7_cmd_buffer_emit_scissor(cmd_buffer); 3779 3780 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); 3781} 3782 3783static void 3784emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, 3785 struct anv_address addr, 3786 uint32_t size, uint32_t index) 3787{ 3788 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, 3789 GENX(3DSTATE_VERTEX_BUFFERS)); 3790 3791 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, 3792 &(struct GENX(VERTEX_BUFFER_STATE)) { 3793 .VertexBufferIndex = index, 3794 .AddressModifyEnable = true, 3795 .BufferPitch = 0, 3796 .MOCS = addr.bo ? 
                           /* Null BO (addr.bo == NULL) gets MOCS 0. */
                           anv_mocs(cmd_buffer->device, addr.bo,
                                    ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0,
                       .NullVertexBuffer = size == 0,
#if GFX_VER >= 12
                       .L3BypassDisable = true,
#endif
#if (GFX_VER >= 8)
                       .BufferStartingAddress = addr,
                       .BufferSize = size
#else
                       /* Pre-gfx8 describes the buffer by its end address
                        * instead of a size field.
                        */
                       .BufferStartingAddress = addr,
                       .EndAddress = anv_address_add(addr, size),
#endif
                    });

   genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
                                                  index, addr, size);
}

/* Bind the buffer holding {base vertex, base instance} (two dwords, hence
 * size 8) at the internal SVGS vertex-buffer slot.  A null address binds a
 * zero-sized null vertex buffer instead.
 */
static void
emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_address addr)
{
   emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
}

/* Upload base_vertex/base_instance into dynamic state and bind that
 * allocation as the SVGS vertex buffer.  The common (0, 0) case skips the
 * allocation entirely and binds a null buffer.
 */
static void
emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
                          uint32_t base_vertex, uint32_t base_instance)
{
   if (base_vertex == 0 && base_instance == 0) {
      emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
   } else {
      /* 8 bytes, 4-byte aligned: one dword each for vertex and instance. */
      struct anv_state id_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);

      ((uint32_t *)id_state.map)[0] = base_vertex;
      ((uint32_t *)id_state.map)[1] = base_instance;

      struct anv_address addr = {
         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         .offset = id_state.offset,
      };

      emit_base_vertex_instance_bo(cmd_buffer, addr);
   }
}

/* Upload the draw index (one dword) into dynamic state and bind it at the
 * internal DRAWID vertex-buffer slot for shaders that read gl_DrawID.
 */
static void
emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
{
   struct anv_state state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);

   ((uint32_t *)state.map)[0] = draw_index;

   struct anv_address addr = {
      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
      .offset = state.offset,
   };

   emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
}

/* Mark every vertex buffer the pipeline reads (plus the internal SVGS /
 * DRAWID slots when the VS consumes them) as dirty for the gfx8 VF cache
 * flush workaround.  access_type is SEQUENTIAL or RANDOM (indexed draws).
 */
static void
update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t access_type)
{
   struct anv_graphics_pipeline *pipeline
      = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   uint64_t vb_used = pipeline->vb_used;
   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance)
      vb_used |= 1ull << ANV_SVGS_VB_INDEX;
   if (vs_prog_data->uses_drawid)
      vb_used |= 1ull << ANV_DRAWID_VB_INDEX;

   genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
                                                       access_type == RANDOM,
                                                       vb_used);
}

/* Emit the internal base-vertex/base-instance and draw-index vertex buffers
 * required by this VS, then apply any pipe flushes those uploads generated.
 * force_flush applies the pending flushes even when nothing was emitted
 * here (used by the first draw of a sequence).
 */
ALWAYS_INLINE static void
cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
                                           const struct brw_vs_prog_data *vs_prog_data,
                                           uint32_t base_vertex,
                                           uint32_t base_instance,
                                           uint32_t draw_id,
                                           bool force_flush)
{
   bool emitted = false;
   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance) {
      emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
      emitted = true;
   }
   if (vs_prog_data->uses_drawid) {
      emit_draw_index(cmd_buffer, draw_id);
      emitted = true;
   }
   /* Emitting draw index or vertex index BOs may result in needing
    * additional VF cache flushes.
    */
   if (emitted || force_flush)
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

/* vkCmdDraw: flush all pending render state and emit a single SEQUENTIAL
 * 3DPRIMITIVE.
 */
void genX(CmdDraw)(
    VkCommandBuffer commandBuffer,
    uint32_t vertexCount,
    uint32_t instanceCount,
    uint32_t firstVertex,
    uint32_t firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* Approximate workload size for the measurement snapshot; counts every
    * multiview instance unless primitive replication handles the views.
    */
   const uint32_t count = (vertexCount *
                           instanceCount *
                           (pipeline->use_primitive_replication ?
                            1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw", count);

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                              firstVertex, firstInstance, 0,
                                              true);

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  We need to multiply instanceCount by the view count.
    */
   if (!pipeline->use_primitive_replication)
      instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
      prim.VertexCountPerInstance = vertexCount;
      prim.StartVertexLocation = firstVertex;
      prim.InstanceCount = instanceCount;
      prim.StartInstanceLocation = firstInstance;
      prim.BaseVertexLocation = 0;
   }

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
}

/* vkCmdDrawMultiEXT: one non-indexed 3DPRIMITIVE per entry in pVertexInfo,
 * sharing a single state flush.
 */
void genX(CmdDrawMultiEXT)(
    VkCommandBuffer commandBuffer,
    uint32_t drawCount,
    const VkMultiDrawInfoEXT *pVertexInfo,
    uint32_t instanceCount,
    uint32_t firstInstance,
    uint32_t stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count = (drawCount *
                           instanceCount *
                           (pipeline->use_primitive_replication ?
                            1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw_multi", count);

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  We need to multiply instanceCount by the view count.
    */
   if (!pipeline->use_primitive_replication)
      instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);

   uint32_t i = 0;
   vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
      /* force_flush (!i) only on the first sub-draw; later iterations only
       * flush if they actually emitted vertex-constant BOs.
       */
      cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                                 draw->firstVertex,
                                                 firstInstance, i, !i);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = SEQUENTIAL;
         prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
         prim.VertexCountPerInstance = draw->vertexCount;
         prim.StartVertexLocation = draw->firstVertex;
         prim.InstanceCount = instanceCount;
         prim.StartInstanceLocation = firstInstance;
         prim.BaseVertexLocation = 0;
      }
   }

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
}

/* vkCmdDrawIndexed: flush all pending render state and emit a single RANDOM
 * (indexed) 3DPRIMITIVE.
 */
void genX(CmdDrawIndexed)(
    VkCommandBuffer commandBuffer,
    uint32_t indexCount,
    uint32_t instanceCount,
    uint32_t firstIndex,
    int32_t vertexOffset,
    uint32_t firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* Approximate workload size for the measurement snapshot. */
   const uint32_t count = (indexCount *
                           instanceCount *
                           (pipeline->use_primitive_replication ?
                            1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw indexed",
                        count);

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  We need to multiply instanceCount by the view count.
    */
   if (!pipeline->use_primitive_replication)
      instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      prim.VertexAccessType = RANDOM;
      prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
      prim.VertexCountPerInstance = indexCount;
      prim.StartVertexLocation = firstIndex;
      prim.InstanceCount = instanceCount;
      prim.StartInstanceLocation = firstInstance;
      prim.BaseVertexLocation = vertexOffset;
   }

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
}

/* vkCmdDrawMultiIndexedEXT: one indexed 3DPRIMITIVE per entry in pIndexInfo.
 * When pVertexOffset is non-NULL the same vertex offset applies to every
 * sub-draw, which allows hoisting the base-vertex upload out of the loop.
 */
void genX(CmdDrawMultiIndexedEXT)(
    VkCommandBuffer commandBuffer,
    uint32_t drawCount,
    const VkMultiDrawIndexedInfoEXT *pIndexInfo,
    uint32_t instanceCount,
    uint32_t firstInstance,
    uint32_t stride,
    const int32_t *pVertexOffset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count = (drawCount *
                           instanceCount *
                           (pipeline->use_primitive_replication ?
                            1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw indexed_multi",
                        count);

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  We need to multiply instanceCount by the view count.
    */
   if (!pipeline->use_primitive_replication)
      instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);

   uint32_t i = 0;
   if (pVertexOffset) {
      if (vs_prog_data->uses_drawid) {
         /* NOTE(review): 'emitted' starts out true, so the first loop
          * iteration always applies pending pipe flushes even when nothing
          * was emitted right here — apparently to cover the flush after the
          * hoisted base-vertex upload below (matches the force_flush
          * behavior of the other paths).  Confirm this is intentional.
          */
         bool emitted = true;
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            /* Constant vertex offset: upload it once, outside the loop. */
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            emitted = true;
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            /* NOTE(review): this inner uses_drawid test is always true in
             * this branch (same condition as the enclosing if).
             */
            if (vs_prog_data->uses_drawid) {
               emit_draw_index(cmd_buffer, i);
               emitted = true;
            }
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            if (emitted)
               genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType = RANDOM;
               prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
               prim.VertexCountPerInstance = draw->indexCount;
               prim.StartVertexLocation = draw->firstIndex;
               prim.InstanceCount = instanceCount;
               prim.StartInstanceLocation = firstInstance;
               prim.BaseVertexLocation = *pVertexOffset;
            }
            emitted = false;
         }
      } else {
         /* No gl_DrawID: at most one upload total, then a plain loop of
          * 3DPRIMITIVEs.
          */
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType = RANDOM;
               prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
               prim.VertexCountPerInstance = draw->indexCount;
               prim.StartVertexLocation = draw->firstIndex;
               prim.InstanceCount = instanceCount;
               prim.StartInstanceLocation = firstInstance;
               prim.BaseVertexLocation = *pVertexOffset;
            }
         }
      }
   } else {
      /* Per-draw vertex offsets: re-emit constants for each sub-draw. */
      vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
         cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                                    draw->vertexOffset,
                                                    firstInstance, i, i != 0);

         anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
            prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
            prim.VertexAccessType = RANDOM;
            prim.PrimitiveTopologyType =
cmd_buffer->state.gfx.primitive_topology; 4161 prim.VertexCountPerInstance = draw->indexCount; 4162 prim.StartVertexLocation = draw->firstIndex; 4163 prim.InstanceCount = instanceCount; 4164 prim.StartInstanceLocation = firstInstance; 4165 prim.BaseVertexLocation = draw->vertexOffset; 4166 } 4167 } 4168 } 4169 4170 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); 4171} 4172 4173/* Auto-Draw / Indirect Registers */ 4174#define GFX7_3DPRIM_END_OFFSET 0x2420 4175#define GFX7_3DPRIM_START_VERTEX 0x2430 4176#define GFX7_3DPRIM_VERTEX_COUNT 0x2434 4177#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438 4178#define GFX7_3DPRIM_START_INSTANCE 0x243C 4179#define GFX7_3DPRIM_BASE_VERTEX 0x2440 4180 4181void genX(CmdDrawIndirectByteCountEXT)( 4182 VkCommandBuffer commandBuffer, 4183 uint32_t instanceCount, 4184 uint32_t firstInstance, 4185 VkBuffer counterBuffer, 4186 VkDeviceSize counterBufferOffset, 4187 uint32_t counterOffset, 4188 uint32_t vertexStride) 4189{ 4190#if GFX_VERx10 >= 75 4191 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 4192 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer); 4193 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 4194 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 4195 4196 /* firstVertex is always zero for this draw function */ 4197 const uint32_t firstVertex = 0; 4198 4199 if (anv_batch_has_error(&cmd_buffer->batch)) 4200 return; 4201 4202 anv_measure_snapshot(cmd_buffer, 4203 INTEL_SNAPSHOT_DRAW, 4204 "draw indirect byte count", 4205 instanceCount); 4206 4207 genX(cmd_buffer_flush_state)(cmd_buffer); 4208 4209 if (vs_prog_data->uses_firstvertex || 4210 vs_prog_data->uses_baseinstance) 4211 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); 4212 if (vs_prog_data->uses_drawid) 4213 emit_draw_index(cmd_buffer, 0); 4214 4215 /* Emitting draw index or vertex index BOs may result in needing 4216 * additional VF cache flushes. 
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  We need to multiply instanceCount by the view count.
    */
   if (!pipeline->use_primitive_replication)
      instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);

   /* vertexCount = (counter - counterOffset) / vertexStride, computed on
    * the command streamer and stored to the 3DPRIM_VERTEX_COUNT register
    * for the indirect 3DPRIMITIVE below.
    */
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value count =
      mi_mem32(anv_address_add(counter_buffer->address,
                               counterBufferOffset));
   if (counterOffset)
      count = mi_isub(&b, count, mi_imm(counterOffset));
   count = mi_udiv32_imm(&b, count, vertexStride);
   mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);

   mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
      prim.IndirectParameterEnable  = true;
      prim.VertexAccessType         = SEQUENTIAL;
      prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
   }

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
#endif /* GFX_VERx10 >= 75 */
}

/* Load the draw parameters at addr into the 3DPRIM_* auto-draw registers.
 * The dword layout read here matches VkDrawIndirectCommand (indexed=false)
 * and VkDrawIndexedIndirectCommand (indexed=true); the two layouts differ
 * in where startInstance/baseVertex live, handled by the if below.
 */
static void
load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_address addr,
                         bool indexed)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
            mi_mem32(anv_address_add(addr, 0)));

   struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
   unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
   if (view_count > 1) {
#if GFX_VERx10 >= 75
      /* Multiview via instancing: scale the GPU-resident instance count. */
      instance_count = mi_imul_imm(&b, instance_count, view_count);
#else
      anv_finishme("Multiview + indirect draw requires MI_MATH; "
                   "MI_MATH is not supported on Ivy Bridge");
#endif
   }
   mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);

   mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
            mi_mem32(anv_address_add(addr, 8)));

   if (indexed) {
      mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
               mi_mem32(anv_address_add(addr, 12)));
      mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
               mi_mem32(anv_address_add(addr, 16)));
   } else {
      mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
               mi_mem32(anv_address_add(addr, 12)));
      mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
   }
}

/* vkCmdDrawIndirect: one indirect SEQUENTIAL 3DPRIMITIVE per draw record,
 * stepping through the parameter buffer by 'stride'.
 */
void genX(CmdDrawIndirect)(
    VkCommandBuffer commandBuffer,
    VkBuffer _buffer,
    VkDeviceSize offset,
    uint32_t drawCount,
    uint32_t stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   for (uint32_t i = 0; i < drawCount; i++) {
      struct anv_address draw = anv_address_add(buffer->address, offset);

      /* Bind the firstVertex/firstInstance pair straight from the indirect
       * record (dword offset 8 in VkDrawIndirectCommand).
       */
      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
      if (vs_prog_data->uses_drawid)
         emit_draw_index(cmd_buffer, i);

      /* Emitting draw index or vertex index BOs may result in needing
       * additional VF cache flushes.
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      load_indirect_parameters(cmd_buffer, draw, false);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable  = true;
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType         = SEQUENTIAL;
         prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
      }

      update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);

      offset += stride;
   }
}

/* vkCmdDrawIndexedIndirect: one indirect RANDOM (indexed) 3DPRIMITIVE per
 * draw record, stepping through the parameter buffer by 'stride'.
 */
void genX(CmdDrawIndexedIndirect)(
    VkCommandBuffer commandBuffer,
    VkBuffer _buffer,
    VkDeviceSize offset,
    uint32_t drawCount,
    uint32_t stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   for (uint32_t i = 0; i < drawCount; i++) {
      struct anv_address draw = anv_address_add(buffer->address, offset);

      /* TODO: We need to stomp base vertex to 0 somehow */
      /* vertexOffset/firstInstance live at dword offset 12 in
       * VkDrawIndexedIndirectCommand.
       */
      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
      if (vs_prog_data->uses_drawid)
         emit_draw_index(cmd_buffer, i);

      /* Emitting draw index or vertex index BOs may result in needing
       * additional VF cache flushes.
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      load_indirect_parameters(cmd_buffer, draw, true);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable  = true;
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType         = RANDOM;
         prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
      }

      update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);

      offset += stride;
   }
}

/* Set up per-command-buffer state for draw-count predication.  With
 * conditional rendering enabled (HSW+), the draw count is copied into a
 * fresh MI GPR and returned so each sub-draw can compare against it;
 * otherwise the count is loaded into MI_PREDICATE_SRC0 (and the returned
 * value, mi_imm(0), is unused).
 */
static struct mi_value
prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 struct mi_builder *b,
                                 struct anv_buffer *count_buffer,
                                 uint64_t countBufferOffset)
{
   struct anv_address count_address =
      anv_address_add(count_buffer->address, countBufferOffset);

   struct mi_value ret = mi_imm(0);

   if (cmd_buffer->state.conditional_render_enabled) {
#if GFX_VERx10 >= 75
      ret = mi_new_gpr(b);
      mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
#endif
   } else {
      /* Upload the current draw count from the draw parameters buffer to
       * MI_PREDICATE_SRC0.
       */
      mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
      mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
   }

   return ret;
}

/* Emit MI_PREDICATE so the following 3DPRIMITIVE only executes while
 * draw_index < draw count (draw count was loaded into MI_PREDICATE_SRC0 by
 * prepare_for_draw_count_predicate).
 */
static void
emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
                          struct mi_builder *b,
                          uint32_t draw_index)
{
   /* Upload the index of the current primitive to MI_PREDICATE_SRC1.
    */
   mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));

   if (draw_index == 0) {
      /* First draw: seed the predicate with !(index == count). */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      }
   } else {
      /* While draw_index < draw_count the predicate's result will be
       *  (draw_index == draw_count) ^ TRUE = TRUE
       * When draw_index == draw_count the result is
       *  (TRUE) ^ TRUE = FALSE
       * After this all results will be:
       *  (FALSE) ^ FALSE = FALSE
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_XOR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      }
   }
}

#if GFX_VERx10 >= 75
/* Combine the draw-count test (draw_index < max) with the conditional
 * rendering result register, and route it into the hardware predicate.
 */
static void
emit_draw_count_predicate_with_conditional_render(
                         struct anv_cmd_buffer *cmd_buffer,
                         struct mi_builder *b,
                         uint32_t draw_index,
                         struct mi_value max)
{
   struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
   pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));

#if GFX_VER >= 8
   mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
#else
   /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
    * so we emit MI_PREDICATE to set it.
    */

   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
#endif
}
#endif

/* Dispatch to the right per-draw predicate emission:
 *  - gfx7.5+ with conditional rendering: combine (draw_index < max) with the
 *    app's conditional-render predicate (consumes a ref on `max`).
 *  - otherwise: plain draw-count predication via MI_PREDICATE.
 */
static void
emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
                               struct mi_builder *b,
                               uint32_t draw_index,
                               struct mi_value max)
{
#if GFX_VERx10 >= 75
   if (cmd_buffer->state.conditional_render_enabled) {
      emit_draw_count_predicate_with_conditional_render(
            cmd_buffer, b, draw_index, mi_value_ref(b, max));
   } else {
      emit_draw_count_predicate(cmd_buffer, b, draw_index);
   }
#else
   emit_draw_count_predicate(cmd_buffer, b, draw_index);
#endif
}

/* vkCmdDrawIndirectCount: emit maxDrawCount predicated 3DPRIMITIVEs; the
 * GPU-side predicate (set up per draw by emit_draw_count_predicate_cond())
 * disables the draws whose index is >= the count read from the count
 * buffer, so the CPU never needs to know the actual draw count.
 */
void genX(CmdDrawIndirectCount)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    VkBuffer                                    _countBuffer,
    VkDeviceSize                                countBufferOffset,
    uint32_t                                    maxDrawCount,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value max =
      prepare_for_draw_count_predicate(cmd_buffer, &b,
                                       count_buffer, countBufferOffset);

   for (uint32_t i = 0; i < maxDrawCount; i++) {
      struct anv_address draw = anv_address_add(buffer->address, offset);

      emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);

      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         /* Byte offset 8 into VkDrawIndirectCommand is the firstVertex
          * field (firstInstance follows at 12).
          */
         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
      if (vs_prog_data->uses_drawid)
         emit_draw_index(cmd_buffer, i);

      /* Emitting draw index or vertex index BOs may result in needing
       * additional VF cache flushes.
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      load_indirect_parameters(cmd_buffer, draw, false);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable  = true;
         prim.PredicateEnable          = true;
         prim.VertexAccessType         = SEQUENTIAL;
         prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
      }

      update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);

      offset += stride;
   }

   /* Release the GPR (if any) that prepare_for_draw_count_predicate()
    * allocated to hold the draw count.
    */
   mi_value_unref(&b, max);
}

/* vkCmdDrawIndexedIndirectCount: indexed variant of the above — same
 * GPU-side draw-count predication, RANDOM vertex access, and the indexed
 * indirect command layout (vertexOffset at byte 12).
 */
void genX(CmdDrawIndexedIndirectCount)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    VkBuffer                                    _countBuffer,
    VkDeviceSize                                countBufferOffset,
    uint32_t                                    maxDrawCount,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value max =
      prepare_for_draw_count_predicate(cmd_buffer, &b,
                                       count_buffer, countBufferOffset);

   for (uint32_t i = 0; i < maxDrawCount; i++)
   {
      struct anv_address draw = anv_address_add(buffer->address, offset);

      emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);

      /* TODO: We need to stomp base vertex to 0 somehow */
      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
      if (vs_prog_data->uses_drawid)
         emit_draw_index(cmd_buffer, i);

      /* Emitting draw index or vertex index BOs may result in needing
       * additional VF cache flushes.
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      load_indirect_parameters(cmd_buffer, draw, true);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.IndirectParameterEnable  = true;
         prim.PredicateEnable          = true;
         prim.VertexAccessType         = RANDOM;
         prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
      }

      update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);

      offset += stride;
   }

   /* Release the GPR (if any) holding the draw count. */
   mi_value_unref(&b, max);
}

/* vkCmdBeginTransformFeedbackEXT: program the SO_WRITE_OFFSETn registers for
 * all MAX_XFB_BUFFERS slots — from the counter buffer when one was provided
 * (resume), or to zero otherwise (fresh begin) — then mark XFB enabled.
 */
void genX(CmdBeginTransformFeedbackEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstCounterBuffer,
    uint32_t                                    counterBufferCount,
    const VkBuffer*                             pCounterBuffers,
    const VkDeviceSize*                         pCounterBufferOffsets)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
   assert(counterBufferCount <= MAX_XFB_BUFFERS);
   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);

   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
    *
    *    "Ssoftware must ensure that no HW stream output operations can be in
    *    process or otherwise pending at the point that the MI_LOAD/STORE
    *    commands are processed. This will likely require a pipeline flush."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "begin transform feedback");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
      /* If we have a counter buffer, this is a resume so we need to load the
       * value into the streamout offset register.  Otherwise, this is a begin
       * and we need to reset it to zero.
       */
      if (pCounterBuffers &&
          idx >= firstCounterBuffer &&
          idx - firstCounterBuffer < counterBufferCount &&
          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
         uint32_t cb_idx = idx - firstCounterBuffer;
         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
         uint64_t offset = pCounterBufferOffsets ?
                           pCounterBufferOffsets[cb_idx] : 0;

         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
            lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
                                                   offset);
         }
      } else {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
            lri.DataDWord        = 0;
         }
      }
   }

   cmd_buffer->state.xfb_enabled = true;
   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}

/* vkCmdEndTransformFeedbackEXT: for every slot that has a counter buffer,
 * store the current SO_WRITE_OFFSETn register into it so a later
 * vkCmdBeginTransformFeedbackEXT can resume from that offset; then mark XFB
 * disabled.
 */
void genX(CmdEndTransformFeedbackEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstCounterBuffer,
    uint32_t                                    counterBufferCount,
    const VkBuffer*                             pCounterBuffers,
    const VkDeviceSize*                         pCounterBufferOffsets)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
   assert(counterBufferCount <= MAX_XFB_BUFFERS);
   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);

   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
    *
    *    "Ssoftware must ensure that no HW stream output operations can be in
    *    process or otherwise pending at the point that the MI_LOAD/STORE
    *    commands are processed. This will likely require a pipeline flush."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "end transform feedback");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
      unsigned idx = firstCounterBuffer + cb_idx;

      /* If we have a counter buffer, this is a pause so we store the current
       * write offset into it so it can be reloaded on resume.  If no counter
       * buffer was provided for this slot, the offset is simply discarded.
       * (NOTE(review): the cb_idx < counterBufferCount check is always true
       * given the loop bound; it appears to be carried over from the Begin
       * variant.)
       */
      if (pCounterBuffers &&
          cb_idx < counterBufferCount &&
          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
         uint64_t offset = pCounterBufferOffsets ?
                           pCounterBufferOffsets[cb_idx] : 0;

         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
            srm.MemoryAddress    = anv_address_add(counter_buffer->address,
                                                   offset);
            srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
         }
      }
   }

   cmd_buffer->state.xfb_enabled = false;
   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}

/* Flush all dirty compute state (L3 config, pipeline select, pipeline batch,
 * descriptors, push constants) to the command stream before a dispatch.
 */
void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline = comp_state->pipeline;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.
 We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      /* Replay the pipeline's pre-baked state batch. */
      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      flush_descriptor_sets(cmd_buffer,
                            &cmd_buffer->state.compute.base,
                            VK_SHADER_STAGE_COMPUTE_BIT,
                            &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      /* Pre-gfx12.5: merge the new binding-table/sampler pointers into the
       * pipeline's INTERFACE_DESCRIPTOR_DATA and re-load it.  On gfx12.5+
       * the descriptor is emitted inline in COMPUTE_WALKER instead.
       */
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength        = size;
         mid.InterfaceDescriptorDataStartAddress = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
      comp_state->push_data =
         anv_cmd_buffer_cs_push_constants(cmd_buffer);

#if GFX_VERx10 < 125
      /* Pre-gfx12.5: upload push constants with MEDIA_CURBE_LOAD; on
       * gfx12.5+ they ride along as COMPUTE_WALKER indirect data.
       */
      if (comp_state->push_data.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
            curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

#if GFX_VER == 7

/* Check that the i915 command parser is at least `required_version`;
 * returns VK_ERROR_FEATURE_NOT_PRESENT (via vk_errorf) otherwise.
 * Only relevant on gfx7, where the kernel validates batch contents.
 */
static VkResult
verify_cmd_parser(const struct anv_device *device,
                  int required_version,
                  const char *function)
{
   if (device->physical->cmd_parser_version < required_version) {
      return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
                       "cmd parser version %d is required for %s",
                       required_version, function);
   } else {
      return VK_SUCCESS;
   }
}

#endif

/* Record the dispatch base (vkCmdDispatchBase offsets) in the compute push
 * constants, flagging them dirty only when the value actually changes.
 */
static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }
}

/* vkCmdDispatch is just vkCmdDispatchBase with a zero base. */
void genX(CmdDispatch)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    x,
    uint32_t                                    y,
    uint32_t                                    z)
{
   genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
}

#if GFX_VERx10 >= 125

/* gfx12.5+: emit a COMPUTE_WALKER for the given thread-group counts, with
 * the interface descriptor inlined and push constants passed as indirect
 * data.
 */
static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const struct anv_shader_bin *cs_bin = pipeline->cs;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct
 intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable        = indirect;
      cw.PredicateEnable                = predicate;
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress       = comp_state->push_data.offset;
      cw.IndirectDataLength             = comp_state->push_data.alloc_size;
      cw.LocalXMaximum                  = prog_data->local_size[0] - 1;
      cw.LocalYMaximum                  = prog_data->local_size[1] - 1;
      cw.LocalZMaximum                  = prog_data->local_size[2] - 1;
      cw.ThreadGroupIDXDimension        = groupCountX;
      cw.ThreadGroupIDYDimension        = groupCountY;
      cw.ThreadGroupIDZDimension        = groupCountZ;
      cw.ExecutionMask                  = dispatch.right_mask;

      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = cs_bin->kernel.offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .BindingTableEntryCount =
            1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                  prog_data->base.total_shared),
         .NumberOfBarriers = prog_data->uses_barrier,
      };
   }
}

#else /* #if GFX_VERx10 >= 125 */

/* Pre-gfx12.5: emit a GPGPU_WALKER (plus the required MEDIA_STATE_FLUSH)
 * for the given thread-group counts.  On gfx7, indirect dispatches are
 * always predicated because the zero-workgroup check is done with
 * MI_PREDICATE (see CmdDispatchIndirect).
 */
static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   bool predicate = (GFX_VER <= 7 && indirect) ||
      cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable      = indirect;
      ggw.PredicateEnable              = predicate;
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension      = groupCountX;
      ggw.ThreadGroupIDYDimension      = groupCountY;
      ggw.ThreadGroupIDZDimension      = groupCountZ;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

/* Generation-independent wrapper: COMPUTE_WALKER on gfx12.5+, GPGPU_WALKER
 * otherwise.
 */
static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline, bool indirect,
               const struct brw_cs_prog_data *prog_data,
               uint32_t groupCountX, uint32_t groupCountY,
               uint32_t groupCountZ)
{
#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
                       groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
                     groupCountY, groupCountZ);
#endif
}

/* vkCmdDispatchBase: direct compute dispatch with a base workgroup offset
 * (passed to the shader via push constants).
 */
void genX(CmdDispatchBase)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    baseGroupX,
    uint32_t                                    baseGroupY,
    uint32_t                                    baseGroupZ,
    uint32_t                                    groupCountX,
    uint32_t                                    groupCountY,
    uint32_t                                    groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
                                     baseGroupY, baseGroupZ);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   if (prog_data->uses_num_work_groups) {
      /* The shader reads gl_NumWorkGroups from memory; allocate a small
       * dynamic-state buffer holding the three counts.
       */
      struct anv_state state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = groupCountX;
      sizes[1] = groupCountY;
      sizes[2] = groupCountZ;
      cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         .offset = state.offset,
      };

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
                  groupCountY, groupCountZ);
}

/* MMIO registers the GPGPU/COMPUTE walker reads its indirect dispatch
 * dimensions from.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* vkCmdDispatchIndirect: load the dispatch dimensions from the indirect
 * buffer into the GPGPU_DISPATCHDIM registers and emit an indirect walker.
 * On gfx7 and earlier, an MI_PREDICATE chain skips the walker entirely when
 * any dimension is zero (the hardware does not handle that itself there).
 */
void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

#if GFX_VER == 7
   /* Linux 4.4 added command parser version 5 which allows the GPGPU
    * indirect dispatch registers to be written.
    */
   if (verify_cmd_parser(cmd_buffer->device, 5,
                         "vkCmdDispatchIndirect") != VK_SUCCESS)
      return;
#endif

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);

   if (prog_data->uses_num_work_groups) {
      /* gl_NumWorkGroups can be read straight from the indirect buffer. */
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);

#if GFX_VER <= 7
   /* predicate = (compute_dispatch_indirect_x_size == 0); */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate = !predicate; */
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_FALSE;
   }

#if GFX_VERx10 == 75
   if (cmd_buffer->state.conditional_render_enabled) {
      /* predicate &= !(conditional_rendering_predicate == 0); */
      mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
               mi_reg32(ANV_PREDICATE_RESULT_REG));
      anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_AND;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      }
   }
#endif

#else /* GFX_VER > 7 */
   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
#endif

   emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
}

#if GFX_VERx10 >= 125
/* Pick a power-of-two local workgroup size (as per-axis log2 shifts) with a
 * total of exactly 2^3 = 8 invocations, distributing the shifts across the
 * axes roughly in proportion to the global launch size.  Any shift budget
 * that cannot be usefully spent (tiny launches) is assigned to x.
 */
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while(progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

/* Convert a Vulkan strided-address region into the hardware shader-table
 * descriptor.
 */
static struct GFX_RT_SHADER_TABLE
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GFX_RT_SHADER_TABLE) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

/* Common implementation for vkCmdTraceRaysKHR (direct: launch_* give the
 * dimensions) and vkCmdTraceRaysIndirectKHR (indirect: the dimensions are
 * read from launch_size_addr on the GPU).  Sets up RT_DISPATCH_GLOBALS plus
 * push constants in dynamic state, then launches the raygen trampoline via
 * COMPUTE_WALKER.
 */
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      const VkStridedDeviceAddressRegionKHR *raygen_sbt,
                      const
VkStridedDeviceAddressRegionKHR *miss_sbt, 5157 const VkStridedDeviceAddressRegionKHR *hit_sbt, 5158 const VkStridedDeviceAddressRegionKHR *callable_sbt, 5159 bool is_indirect, 5160 uint32_t launch_width, 5161 uint32_t launch_height, 5162 uint32_t launch_depth, 5163 uint64_t launch_size_addr) 5164{ 5165 struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt; 5166 struct anv_ray_tracing_pipeline *pipeline = rt->pipeline; 5167 5168 if (anv_batch_has_error(&cmd_buffer->batch)) 5169 return; 5170 5171 /* If we have a known degenerate launch size, just bail */ 5172 if (!is_indirect && 5173 (launch_width == 0 || launch_height == 0 || launch_depth == 0)) 5174 return; 5175 5176 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); 5177 genX(flush_pipeline_select_gpgpu)(cmd_buffer); 5178 5179 cmd_buffer->state.rt.pipeline_dirty = false; 5180 5181 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 5182 5183 /* Add these to the reloc list as they're internal buffers that don't 5184 * actually have relocs to pick them up manually. 
5185 * 5186 * TODO(RT): This is a bit of a hack 5187 */ 5188 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, 5189 cmd_buffer->batch.alloc, 5190 rt->scratch.bo); 5191 5192 /* Allocate and set up our RT_DISPATCH_GLOBALS */ 5193 struct anv_state rtdg_state = 5194 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 5195 BRW_RT_PUSH_CONST_OFFSET + 5196 sizeof(struct anv_push_constants), 5197 64); 5198 5199 struct GFX_RT_DISPATCH_GLOBALS rtdg = { 5200 .MemBaseAddress = (struct anv_address) { 5201 .bo = rt->scratch.bo, 5202 .offset = rt->scratch.layout.ray_stack_start, 5203 }, 5204 .CallStackHandler = 5205 anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0), 5206 .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64, 5207 .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss, 5208 .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, 5209 .Flags = RT_DEPTH_TEST_LESS_EQUAL, 5210 .HitGroupTable = vk_sdar_to_shader_table(hit_sbt), 5211 .MissGroupTable = vk_sdar_to_shader_table(miss_sbt), 5212 .SWStackSize = rt->scratch.layout.sw_stack_size / 64, 5213 .LaunchWidth = launch_width, 5214 .LaunchHeight = launch_height, 5215 .LaunchDepth = launch_depth, 5216 .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt), 5217 }; 5218 GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg); 5219 5220 /* Push constants go after the RT_DISPATCH_GLOBALS */ 5221 assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET); 5222 memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET, 5223 &cmd_buffer->state.rt.base.push_constants, 5224 sizeof(struct anv_push_constants)); 5225 5226 struct anv_address rtdg_addr = { 5227 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, 5228 .offset = rtdg_state.offset, 5229 }; 5230 5231 uint8_t local_size_log2[3]; 5232 uint32_t global_size[3] = {}; 5233 if (is_indirect) { 5234 /* Pick a local size that's probably ok. We assume most TraceRays calls 5235 * will use a two-dimensional dispatch size. 
Worst case, our initial 5236 * dispatch will be a little slower than it has to be. 5237 */ 5238 local_size_log2[0] = 2; 5239 local_size_log2[1] = 1; 5240 local_size_log2[2] = 0; 5241 5242 struct mi_builder b; 5243 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 5244 5245 struct mi_value launch_size[3] = { 5246 mi_mem32(anv_address_from_u64(launch_size_addr + 0)), 5247 mi_mem32(anv_address_from_u64(launch_size_addr + 4)), 5248 mi_mem32(anv_address_from_u64(launch_size_addr + 8)), 5249 }; 5250 5251 /* Store the original launch size into RT_DISPATCH_GLOBALS 5252 * 5253 * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets 5254 * moved into a genX version. 5255 */ 5256 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)), 5257 mi_value_ref(&b, launch_size[0])); 5258 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)), 5259 mi_value_ref(&b, launch_size[1])); 5260 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)), 5261 mi_value_ref(&b, launch_size[2])); 5262 5263 /* Compute the global dispatch size */ 5264 for (unsigned i = 0; i < 3; i++) { 5265 if (local_size_log2[i] == 0) 5266 continue; 5267 5268 /* global_size = DIV_ROUND_UP(launch_size, local_size) 5269 * 5270 * Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm 5271 * has the semantics of shifting the enture 64-bit value and taking 5272 * the bottom 32 so we don't have to worry about roll-over. 
5273 */ 5274 uint32_t local_size = 1 << local_size_log2[i]; 5275 launch_size[i] = mi_iadd(&b, launch_size[i], 5276 mi_imm(local_size - 1)); 5277 launch_size[i] = mi_ushr32_imm(&b, launch_size[i], 5278 local_size_log2[i]); 5279 } 5280 5281 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]); 5282 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]); 5283 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]); 5284 } else { 5285 uint32_t launch_size[3] = { launch_width, launch_height, launch_depth }; 5286 calc_local_trace_size(local_size_log2, launch_size); 5287 5288 for (unsigned i = 0; i < 3; i++) { 5289 /* We have to be a bit careful here because DIV_ROUND_UP adds to the 5290 * numerator value may overflow. Cast to uint64_t to avoid this. 5291 */ 5292 uint32_t local_size = 1 << local_size_log2[i]; 5293 global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size); 5294 } 5295 } 5296 5297 anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { 5298 cw.IndirectParameterEnable = is_indirect; 5299 cw.PredicateEnable = false; 5300 cw.SIMDSize = SIMD8; 5301 cw.LocalXMaximum = (1 << local_size_log2[0]) - 1; 5302 cw.LocalYMaximum = (1 << local_size_log2[1]) - 1; 5303 cw.LocalZMaximum = (1 << local_size_log2[2]) - 1; 5304 cw.ThreadGroupIDXDimension = global_size[0]; 5305 cw.ThreadGroupIDYDimension = global_size[1]; 5306 cw.ThreadGroupIDZDimension = global_size[2]; 5307 cw.ExecutionMask = 0xff; 5308 cw.EmitInlineParameter = true; 5309 5310 const gl_shader_stage s = MESA_SHADER_RAYGEN; 5311 struct anv_device *device = cmd_buffer->device; 5312 struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; 5313 struct anv_state *samplers = &cmd_buffer->state.samplers[s]; 5314 cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { 5315 .KernelStartPointer = device->rt_trampoline->kernel.offset, 5316 .SamplerStatePointer = samplers->offset, 5317 /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */ 5318 
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
      };

      /* Parameters consumed by the raygen trampoline shader: where to find
       * RT_DISPATCH_GLOBALS, the raygen shader record, and the local group
       * size chosen above.
       */
      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr = raygen_sbt->deviceAddress,
         .is_indirect = is_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      /* The params are passed to the trampoline through the 32 bytes of
       * COMPUTE_WALKER inline data, so the struct must be exactly that size.
       */
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }
}

/* vkCmdTraceRaysKHR: direct trace — the launch size is known on the CPU so
 * it is passed by value and no indirect launch-size address is needed.
 */
void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    uint32_t                                    width,
    uint32_t                                    height,
    uint32_t                                    depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer_trace_rays(cmd_buffer,
                         pRaygenShaderBindingTable,
                         pMissShaderBindingTable,
                         pHitShaderBindingTable,
                         pCallableShaderBindingTable,
                         false /* is_indirect */,
                         width, height, depth,
                         0 /* launch_size_addr */);
}

/* vkCmdTraceRaysIndirectKHR: the launch size lives in a GPU buffer at
 * indirectDeviceAddress and is read on the command streamer, so the CPU-side
 * width/height/depth are passed as zero.
 */
void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer_trace_rays(cmd_buffer,
                         pRaygenShaderBindingTable,
                         pMissShaderBindingTable,
                         pHitShaderBindingTable,
                         pCallableShaderBindingTable,
                         true /* is_indirect */,
                         0, 0, 0, /* width, height, depth, */
                         indirectDeviceAddress);
}
#endif /* GFX_VERx10 >= 125 */

/* Switch the hardware between the 3D and GPGPU pipelines via
 * PIPELINE_SELECT, applying the per-generation workarounds and the
 * flush + invalidate sequence the PRMs require around the mode change.
 * No-op if the requested mode is already current.
 */
static void
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                            uint32_t pipeline)
{
   UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;

   if (cmd_buffer->state.current_pipeline == pipeline)
      return;

#if GFX_VER >= 8 && GFX_VER < 10
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER == 9
   if (pipeline == _3D) {
      /* There is a mid-object preemption workaround which requires you to
       * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
       * even without preemption, we have issues with geometry flickering when
       * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
       * really know why.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.NumberofURBEntries = 2;
         vfe.URBEntryAllocationSize = 2;
      }

      /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
       * invalid.  Set the compute pipeline to dirty to force a re-emit of the
       * pipeline in case we get back-to-back dispatch calls with the same
       * pipeline and a PIPELINE_SELECT in between.
       */
      cmd_buffer->state.compute.pipeline_dirty = true;
   }
#endif

   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: DEVSNB+
    *
    *   Software must ensure all the write caches are flushed through a
    *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *   command to invalidate read only caches prior to programming
    *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.RenderTargetCacheFlushEnable = true;
      pc.DepthCacheFlushEnable = true;
#if GFX_VER >= 12
      pc.HDCPipelineFlushEnable = true;
#else
      pc.DCFlushEnable = true;
#endif
      pc.PostSyncOperation = NoWrite;
      pc.CommandStreamerStallEnable = true;
#if GFX_VER >= 12
      /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be
       * set with any PIPE_CONTROL with Depth Flush Enable bit set."
       */
      pc.DepthStallEnable = true;
#endif
      anv_debug_dump_pc(pc);
   }

   /* Second PIPE_CONTROL: invalidate the read-only caches (see PRM quote
    * above — this must follow the stalling write-cache flush).
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      pc.InstructionCacheInvalidateEnable = true;
      pc.PostSyncOperation = NoWrite;
      anv_debug_dump_pc(pc);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
#if GFX_VER >= 9
      ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
      ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
#endif
      ps.PipelineSelection = pipeline;
   }

#if GFX_VER == 9
   if (devinfo->is_geminilake) {
      /* Project: DevGLK
       *
       * "This chicken bit works around a hardware issue with barrier logic
       *  encountered when switching between GPGPU and 3D pipelines.  To
       *  workaround the issue, this mode bit should be set after a pipeline
       *  is selected."
       */
      anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
         scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
                                                  : GLK_BARRIER_MODE_3D_HULL;
         scec1.GLKBarrierModeMask = 1;
      }
   }
#endif

   cmd_buffer->state.current_pipeline = pipeline;
}

/* Convenience wrapper: select the 3D pipeline. */
void
genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
{
   genX(flush_pipeline_select)(cmd_buffer, _3D);
}

/* Convenience wrapper: select the GPGPU (compute) pipeline. */
void
genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
{
   genX(flush_pipeline_select)(cmd_buffer, GPGPU);
}

/* Emit the depth-stall / depth-flush / depth-stall PIPE_CONTROL sequence
 * required before changing depth/stencil buffer state on gfx7 hardware.
 * No-op on gfx8+.
 */
void
genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
{
   if (GFX_VER >= 8)
      return;

   /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
    *
    *   "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
    *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
    *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
    *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
    *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
    *    Depth Flush Bit set, followed by another pipelined depth stall
    *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
    *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
    *    via a preceding MI_FLUSH)."
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
      anv_debug_dump_pc(pipe);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthCacheFlushEnable = true;
#if GFX_VER >= 12
      pipe.TileCacheFlushEnable = true;
#endif
      anv_debug_dump_pc(pipe);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
      anv_debug_dump_pc(pipe);
   }
}

/* Gfx12.0 (Wa_14010455700 / Wa_1806527549): reprogram depth-related CHICKEN
 * registers whenever the bound depth surface switches to or from D16_UNORM.
 * Compiled out on every other GFX_VERx10.
 */
void
genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
                                     const struct isl_surf *surf)
{
#if GFX_VERx10 == 120
   const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;

   /* Early out when the registers already match the format category; only
    * fall through on a D16 <-> non-D16 transition or when the mode is
    * unknown (fresh command buffer).
    */
   switch (cmd_buffer->state.depth_reg_mode) {
   case ANV_DEPTH_REG_MODE_HW_DEFAULT:
      if (!fmt_is_d16)
         return;
      break;
   case ANV_DEPTH_REG_MODE_D16:
      if (fmt_is_d16)
         return;
      break;
   case ANV_DEPTH_REG_MODE_UNKNOWN:
      break;
   }

   /* We'll change some CHICKEN registers depending on the depth surface
    * format.  Do a depth flush and stall so the pipeline is not using these
    * settings while we change the registers.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
                             ANV_PIPE_DEPTH_STALL_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "Workaround: Stop pipeline for 14010455700");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Wa_14010455700
    *
    * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
    * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
    */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
      reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
      reg.HIZPlaneOptimizationdisablebitMask = true;
   }

   /* Wa_1806527549
    *
    * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
    */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
      reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
      reg.HZDepthTestLEGEOptimizationDisableMask = true;
   }

   /* Remember what we programmed so the next call can early-out. */
   cmd_buffer->state.depth_reg_mode =
      fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
#endif
}

/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
 *
 *    "The VF cache needs to be invalidated before binding and then using
 *    Vertex Buffers that overlap with any previously bound Vertex Buffer
 *    (at a 64B granularity) since the last invalidation.  A VF cache
 *    invalidate is performed by setting the "VF Cache Invalidation Enable"
 *    bit in PIPE_CONTROL."
 *
 * This is implemented by carefully tracking all vertex and index buffer
 * bindings and flushing if the cache ever ends up with a range in the cache
 * that would exceed 4 GiB.  This is implemented in three parts:
 *
 *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
 *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
 *       tracking code of the new binding.  If this new binding would cause
 *       the cache to have a too-large range on the next draw call, a pipeline
 *       stall and VF cache invalidate are added to pending_pipeline_bits.
 *
 *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
 *       empty whenever we emit a VF invalidate.
 *
 *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
 *       after every 3DPRIMITIVE and copies the bound range into the dirty
 *       range for each used buffer.  This has to be a separate step because
 *       we don't always re-bind all buffers and so 1. can't know which
 *       buffers are actually bound.
 */
void
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
                                               int vb_index,
                                               struct anv_address vb_address,
                                               uint32_t vb_size)
{
   /* Tracking is only needed on gfx8/9 with softpin (see the comment block
    * above this function).
    */
   if (GFX_VER < 8 || GFX_VER > 9 ||
       !anv_use_softpin(cmd_buffer->device->physical))
      return;

   struct anv_vb_cache_range *bound, *dirty;
   /* vb_index == -1 denotes the index buffer binding. */
   if (vb_index == -1) {
      bound = &cmd_buffer->state.gfx.ib_bound_range;
      dirty = &cmd_buffer->state.gfx.ib_dirty_range;
   } else {
      assert(vb_index >= 0);
      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
      bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
   }

   /* A zero-sized binding contributes nothing to the cached range. */
   if (vb_size == 0) {
      bound->start = 0;
      bound->end = 0;
      return;
   }

   assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
   bound->start = intel_48b_address(anv_address_physical(vb_address));
   bound->end = bound->start + vb_size;
   assert(bound->end > bound->start); /* No overflow */

   /* Align everything to a cache line */
   bound->start &= ~(64ull - 1ull);
   bound->end = align_u64(bound->end, 64);

   /* Compute the dirty range */
   dirty->start = MIN2(dirty->start, bound->start);
   dirty->end = MAX2(dirty->end, bound->end);

   /* If our range is larger than 32 bits, we have to flush */
   assert(bound->end - bound->start <= (1ull << 32));
   if (dirty->end - dirty->start > (1ull << 32)) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                                "vb > 32b range");
   }
}

/* Step 3 of the VF-cache tracking described above: after a 3DPRIMITIVE,
 * merge the bound range of the index buffer (RANDOM access) and of every
 * vertex buffer used by the draw (vb_used bitmask) into the dirty ranges.
 */
void
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
                                                    uint32_t access_type,
                                                    uint64_t vb_used)
{
   if (GFX_VER < 8 || GFX_VER > 9 ||
       !anv_use_softpin(cmd_buffer->device->physical))
      return;

   if (access_type == RANDOM) {
      /* We have an index buffer */
      struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
      struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;

      if (bound->end > bound->start) {
         dirty->start = MIN2(dirty->start, bound->start);
         dirty->end = MAX2(dirty->end, bound->end);
      }
   }

   /* Walk the set bits of vb_used; each bit index is a vertex buffer slot. */
   uint64_t mask = vb_used;
   while (mask) {
      int i = u_bit_scan64(&mask);
      assert(i >= 0);
      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));

      struct anv_vb_cache_range *bound, *dirty;
      bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];

      if (bound->end > bound->start) {
         dirty->start = MIN2(dirty->start, bound->start);
         dirty->end = MAX2(dirty->end, bound->end);
      }
   }
}

/**
 * Update the pixel hashing modes that determine the balancing of PS threads
 * across subslices and slices.
 *
 * \param width Width bound of the rendering area (already scaled down if \p
 *              scale is greater than 1).
 * \param height Height bound of the rendering area (already scaled down if \p
 *               scale is greater than 1).
 * \param scale The number of framebuffer samples that could potentially be
 *              affected by an individual channel of the PS thread.  This is
 *              typically one for single-sampled rendering, but for operations
 *              like CCS resolves and fast clears a single PS invocation may
 *              update a huge number of pixels, in which case a finer
 *              balancing is desirable in order to maximally utilize the
 *              bandwidth available.  UINT_MAX can be used as shorthand for
 *              "finest hashing mode available".
 */
void
genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
                                   unsigned width, unsigned height,
                                   unsigned scale)
{
#if GFX_VER == 9
   const struct intel_device_info *devinfo = &cmd_buffer->device->info;
   /* Index 0 is the "coarse" mode (scale <= 1), index 1 the "fine" mode. */
   const unsigned slice_hashing[] = {
      /* Because all Gfx9 platforms with more than one slice require
       * three-way subslice hashing, a single "normal" 16x16 slice hashing
       * block is guaranteed to suffer from substantial imbalance, with one
       * subslice receiving twice as much work as the other two in the
       * slice.
       *
       * The performance impact of that would be particularly severe when
       * three-way hashing is also in use for slice balancing (which is the
       * case for all Gfx9 GT4 platforms), because one of the slices
       * receives one every three 16x16 blocks in either direction, which
       * is roughly the periodicity of the underlying subslice imbalance
       * pattern ("roughly" because in reality the hardware's
       * implementation of three-way hashing doesn't do exact modulo 3
       * arithmetic, which somewhat decreases the magnitude of this effect
       * in practice).  This leads to a systematic subslice imbalance
       * within that slice regardless of the size of the primitive.  The
       * 32x32 hashing mode guarantees that the subslice imbalance within a
       * single slice hashing block is minimal, largely eliminating this
       * effect.
       */
      _32x32,
      /* Finest slice hashing mode available. */
      NORMAL
   };
   const unsigned subslice_hashing[] = {
      /* 16x16 would provide a slight cache locality benefit especially
       * visible in the sampler L1 cache efficiency of low-bandwidth
       * non-LLC platforms, but it comes at the cost of greater subslice
       * imbalance for primitives of dimensions approximately intermediate
       * between 16x4 and 16x16.
       */
      _16x4,
      /* Finest subslice hashing mode available. */
      _8x4
   };
   /* Dimensions of the smallest hashing block of a given hashing mode.  If
    * the rendering area is smaller than this there can't possibly be any
    * benefit from switching to this mode, so we optimize out the
    * transition.
    */
   const unsigned min_size[][2] = {
      { 16, 4 },
      { 8, 4 }
   };
   const unsigned idx = scale > 1;

   if (cmd_buffer->state.current_hash_scale != scale &&
       (width > min_size[idx][0] || height > min_size[idx][1])) {
      /* GT_MODE is not pipelined: stall before reprogramming the hashing
       * mode so in-flight work is unaffected.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                                "change pixel hash mode");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
         gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
         gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
         gt.SubsliceHashing = subslice_hashing[idx];
         gt.SubsliceHashingMask = -1;
      }

      cmd_buffer->state.current_hash_scale = scale;
   }
#endif
}

/* Emit the depth/stencil/HiZ buffer state for the current subpass'
 * depth-stencil attachment (packed via isl_emit_depth_stencil_hiz_s), or
 * null state when there is no attachment.
 */
static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   const struct anv_image *image = iview ?
      iview->image : NULL;

   /* FIXME: Width and Height are wrong */

   genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);

   /* Reserve space for the full depth/stencil/HiZ packet group up front;
    * relocations below patch addresses directly into these dwords.
    */
   uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
                                        device->isl_dev.ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   if (iview)
      info.view = &iview->planes[0].isl;

   if (image && (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      const uint32_t depth_plane =
         anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_surface *depth_surface =
         &image->planes[depth_plane].primary_surface;
      const struct anv_address depth_address =
         anv_image_address(image, &depth_surface->memory_range);

      info.depth_surf = &depth_surface->isl;

      info.depth_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.depth_offset / 4,
                              depth_address.bo, depth_address.offset);
      info.mocs =
         anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);

      const uint32_t ds =
         cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
      info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
      if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
         assert(isl_aux_usage_has_hiz(info.hiz_usage));

         const struct anv_surface *hiz_surface =
            &image->planes[depth_plane].aux_surface;
         const struct anv_address hiz_address =
            anv_image_address(image, &hiz_surface->memory_range);

         info.hiz_surf = &hiz_surface->isl;

         info.hiz_address =
            anv_batch_emit_reloc(&cmd_buffer->batch,
                                 dw + device->isl_dev.ds.hiz_offset / 4,
                                 hiz_address.bo, hiz_address.offset);

         info.depth_clear_value = ANV_HZ_FC_VAL;
      }
   }

   if (image && (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
      const uint32_t stencil_plane =
         anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_surface *stencil_surface =
         &image->planes[stencil_plane].primary_surface;
      const struct anv_address stencil_address =
         anv_image_address(image, &stencil_surface->memory_range);

      info.stencil_surf = &stencil_surface->isl;

      info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
      info.stencil_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.stencil_offset / 4,
                              stencil_address.bo, stencil_address.offset);
      /* NOTE(review): this overwrites the depth MOCS set above when both
       * aspects are present — appears intentional since both use the same
       * device MOCS policy; confirm if depth/stencil ever diverge.
       */
      info.mocs =
         anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
   }

   isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);

   if (info.depth_surf)
      genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);

   if (GFX_VER >= 12) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      /* Wa_1408224581
       *
       * Workaround: Gfx12LP Astep only An additional pipe control with
       * post-sync = store dword operation would be required.( w/a is to
       * have an additional pipe control after the stencil state whenever
       * the surface state bits of this state is changing).
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = cmd_buffer->device->workaround_address;
      }
   }
   cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
}

/**
 * This ANDs the view mask of the current subpass with the pending clear
 * views in the attachment to get the mask of views active in the subpass
 * that still need to be cleared.
 */
static inline uint32_t
get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,
                                 const struct anv_attachment_state *att_state)
{
   return cmd_state->subpass->view_mask & att_state->pending_clear_views;
}

/* Returns true when the first layer (view 0 under multiview) of the
 * attachment still has a pending clear in the current subpass.
 */
static inline bool
do_first_layer_clear(const struct anv_cmd_state *cmd_state,
                     const struct anv_attachment_state *att_state)
{
   /* Without multiview, the first layer is always the one being cleared. */
   if (!cmd_state->subpass->view_mask)
      return true;

   uint32_t pending_clear_mask =
      get_multiview_subpass_clear_mask(cmd_state, att_state);

   /* Bit 0 corresponds to view 0, i.e. the first layer. */
   return pending_clear_mask & 1;
}

/* Returns true when the current subpass is the last subpass in the render
 * pass that uses the given attachment.
 */
static inline bool
current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
                                       uint32_t att_idx)
{
   const uint32_t last_subpass_idx =
      cmd_state->pass->attachments[att_idx].last_subpass_idx;
   const struct anv_subpass *last_subpass =
      &cmd_state->pass->subpasses[last_subpass_idx];
   return last_subpass == cmd_state->subpass;
}

/* Transition to the given subpass: performs attachment layout transitions,
 * pending clears, and fills the render-target surface states.
 */
static void
cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
                         uint32_t subpass_id)
{
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_render_pass *pass = cmd_state->pass;
   struct anv_subpass *subpass = &pass->subpasses[subpass_id];
   cmd_state->subpass = subpass;

   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  If the client asks for instancing, we need to use the
    * Instance Data Step Rate to ensure that we repeat the client's
    * per-instance data once for each view.  Since this bit is in
    * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
    * of each subpass.
    */
   if (GFX_VER == 7)
      cmd_buffer->state.gfx.vb_dirty |= ~0;

   /* It is possible to start a render pass with an old pipeline.
Because the 5968 * render pass and subpass index are both baked into the pipeline, this is 5969 * highly unlikely. In order to do so, it requires that you have a render 5970 * pass with a single subpass and that you use that render pass twice 5971 * back-to-back and use the same pipeline at the start of the second render 5972 * pass as at the end of the first. In order to avoid unpredictable issues 5973 * with this edge case, we just dirty the pipeline at the start of every 5974 * subpass. 5975 */ 5976 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; 5977 5978 /* Accumulate any subpass flushes that need to happen before the subpass */ 5979 anv_add_pending_pipe_bits(cmd_buffer, 5980 cmd_buffer->state.pass->subpass_flushes[subpass_id], 5981 "begin subpass deps/attachments"); 5982 5983 VkRect2D render_area = cmd_buffer->state.render_area; 5984 struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; 5985 5986 bool is_multiview = subpass->view_mask != 0; 5987 5988 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 5989 const uint32_t a = subpass->attachments[i].attachment; 5990 if (a == VK_ATTACHMENT_UNUSED) 5991 continue; 5992 5993 assert(a < cmd_state->pass->attachment_count); 5994 struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 5995 5996 struct anv_image_view *iview = cmd_state->attachments[a].image_view; 5997 const struct anv_image *image = iview->image; 5998 5999 VkImageLayout target_layout = subpass->attachments[i].layout; 6000 VkImageLayout target_stencil_layout = 6001 subpass->attachments[i].stencil_layout; 6002 6003 uint32_t level = iview->planes[0].isl.base_level; 6004 uint32_t width = anv_minify(iview->image->vk.extent.width, level); 6005 uint32_t height = anv_minify(iview->image->vk.extent.height, level); 6006 bool full_surface_draw = 6007 render_area.offset.x == 0 && render_area.offset.y == 0 && 6008 render_area.extent.width == width && 6009 render_area.extent.height == height; 6010 6011 uint32_t base_layer, 
layer_count; 6012 if (image->vk.image_type == VK_IMAGE_TYPE_3D) { 6013 base_layer = 0; 6014 layer_count = anv_minify(iview->image->vk.extent.depth, level); 6015 } else { 6016 base_layer = iview->planes[0].isl.base_array_layer; 6017 layer_count = fb->layers; 6018 } 6019 6020 if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 6021 bool will_full_fast_clear = 6022 (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) && 6023 att_state->fast_clear && full_surface_draw; 6024 6025 assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); 6026 transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 6027 level, 1, base_layer, layer_count, 6028 att_state->current_layout, target_layout, 6029 VK_QUEUE_FAMILY_IGNORED, 6030 VK_QUEUE_FAMILY_IGNORED, 6031 will_full_fast_clear); 6032 att_state->aux_usage = 6033 anv_layout_to_aux_usage(&cmd_buffer->device->info, image, 6034 VK_IMAGE_ASPECT_COLOR_BIT, 6035 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, 6036 target_layout); 6037 } 6038 6039 if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 6040 bool will_full_fast_clear = 6041 (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 6042 att_state->fast_clear && full_surface_draw; 6043 6044 transition_depth_buffer(cmd_buffer, image, 6045 base_layer, layer_count, 6046 att_state->current_layout, target_layout, 6047 will_full_fast_clear); 6048 att_state->aux_usage = 6049 anv_layout_to_aux_usage(&cmd_buffer->device->info, image, 6050 VK_IMAGE_ASPECT_DEPTH_BIT, 6051 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, 6052 target_layout); 6053 } 6054 6055 if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 6056 bool will_full_fast_clear = 6057 (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 6058 att_state->fast_clear && full_surface_draw; 6059 6060 transition_stencil_buffer(cmd_buffer, image, 6061 level, 1, base_layer, layer_count, 6062 att_state->current_stencil_layout, 6063 target_stencil_layout, 6064 will_full_fast_clear); 6065 } 6066 
att_state->current_layout = target_layout; 6067 att_state->current_stencil_layout = target_stencil_layout; 6068 6069 if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { 6070 assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); 6071 6072 /* Multi-planar images are not supported as attachments */ 6073 assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); 6074 assert(image->n_planes == 1); 6075 6076 uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer; 6077 uint32_t clear_layer_count = fb->layers; 6078 6079 if (att_state->fast_clear && 6080 do_first_layer_clear(cmd_state, att_state)) { 6081 /* We only support fast-clears on the first layer */ 6082 assert(level == 0 && base_layer == 0); 6083 6084 union isl_color_value clear_color = {}; 6085 anv_clear_color_from_att_state(&clear_color, att_state, iview); 6086 if (iview->image->vk.samples == 1) { 6087 anv_image_ccs_op(cmd_buffer, image, 6088 iview->planes[0].isl.format, 6089 iview->planes[0].isl.swizzle, 6090 VK_IMAGE_ASPECT_COLOR_BIT, 6091 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, 6092 &clear_color, 6093 false); 6094 } else { 6095 anv_image_mcs_op(cmd_buffer, image, 6096 iview->planes[0].isl.format, 6097 iview->planes[0].isl.swizzle, 6098 VK_IMAGE_ASPECT_COLOR_BIT, 6099 0, 1, ISL_AUX_OP_FAST_CLEAR, 6100 &clear_color, 6101 false); 6102 } 6103 base_clear_layer++; 6104 clear_layer_count--; 6105 if (is_multiview) 6106 att_state->pending_clear_views &= ~1; 6107 6108 if (isl_color_value_is_zero(clear_color, 6109 iview->planes[0].isl.format)) { 6110 /* This image has the auxiliary buffer enabled. We can mark the 6111 * subresource as not needing a resolve because the clear color 6112 * will match what's in every RENDER_SURFACE_STATE object when 6113 * it's being used for sampling. 
6114 */ 6115 set_image_fast_clear_state(cmd_buffer, iview->image, 6116 VK_IMAGE_ASPECT_COLOR_BIT, 6117 ANV_FAST_CLEAR_DEFAULT_VALUE); 6118 } else { 6119 set_image_fast_clear_state(cmd_buffer, iview->image, 6120 VK_IMAGE_ASPECT_COLOR_BIT, 6121 ANV_FAST_CLEAR_ANY); 6122 } 6123 } 6124 6125 /* From the VkFramebufferCreateInfo spec: 6126 * 6127 * "If the render pass uses multiview, then layers must be one and each 6128 * attachment requires a number of layers that is greater than the 6129 * maximum bit index set in the view mask in the subpasses in which it 6130 * is used." 6131 * 6132 * So if multiview is active we ignore the number of layers in the 6133 * framebuffer and instead we honor the view mask from the subpass. 6134 */ 6135 if (is_multiview) { 6136 assert(image->n_planes == 1); 6137 uint32_t pending_clear_mask = 6138 get_multiview_subpass_clear_mask(cmd_state, att_state); 6139 6140 u_foreach_bit(layer_idx, pending_clear_mask) { 6141 uint32_t layer = 6142 iview->planes[0].isl.base_array_layer + layer_idx; 6143 6144 anv_image_clear_color(cmd_buffer, image, 6145 VK_IMAGE_ASPECT_COLOR_BIT, 6146 att_state->aux_usage, 6147 iview->planes[0].isl.format, 6148 iview->planes[0].isl.swizzle, 6149 level, layer, 1, 6150 render_area, 6151 vk_to_isl_color(att_state->clear_value.color)); 6152 } 6153 6154 att_state->pending_clear_views &= ~pending_clear_mask; 6155 } else if (clear_layer_count > 0) { 6156 assert(image->n_planes == 1); 6157 anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 6158 att_state->aux_usage, 6159 iview->planes[0].isl.format, 6160 iview->planes[0].isl.swizzle, 6161 level, base_clear_layer, clear_layer_count, 6162 render_area, 6163 vk_to_isl_color(att_state->clear_value.color)); 6164 } 6165 } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | 6166 VK_IMAGE_ASPECT_STENCIL_BIT)) { 6167 if (att_state->fast_clear && 6168 (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { 6169 /* We currently only support 
HiZ for single-LOD images */ 6170 assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage)); 6171 assert(iview->planes[0].isl.base_level == 0); 6172 assert(iview->planes[0].isl.levels == 1); 6173 } 6174 6175 if (is_multiview) { 6176 uint32_t pending_clear_mask = 6177 get_multiview_subpass_clear_mask(cmd_state, att_state); 6178 6179 u_foreach_bit(layer_idx, pending_clear_mask) { 6180 uint32_t layer = 6181 iview->planes[0].isl.base_array_layer + layer_idx; 6182 6183 if (att_state->fast_clear) { 6184 anv_image_hiz_clear(cmd_buffer, image, 6185 att_state->pending_clear_aspects, 6186 level, layer, 1, render_area, 6187 att_state->clear_value.depthStencil.stencil); 6188 } else { 6189 anv_image_clear_depth_stencil(cmd_buffer, image, 6190 att_state->pending_clear_aspects, 6191 att_state->aux_usage, 6192 level, layer, 1, render_area, 6193 att_state->clear_value.depthStencil.depth, 6194 att_state->clear_value.depthStencil.stencil); 6195 } 6196 } 6197 6198 att_state->pending_clear_views &= ~pending_clear_mask; 6199 } else { 6200 if (att_state->fast_clear) { 6201 anv_image_hiz_clear(cmd_buffer, image, 6202 att_state->pending_clear_aspects, 6203 level, base_layer, layer_count, 6204 render_area, 6205 att_state->clear_value.depthStencil.stencil); 6206 } else { 6207 anv_image_clear_depth_stencil(cmd_buffer, image, 6208 att_state->pending_clear_aspects, 6209 att_state->aux_usage, 6210 level, base_layer, layer_count, 6211 render_area, 6212 att_state->clear_value.depthStencil.depth, 6213 att_state->clear_value.depthStencil.stencil); 6214 } 6215 } 6216 } else { 6217 assert(att_state->pending_clear_aspects == 0); 6218 } 6219 6220 /* If multiview is enabled, then we are only done clearing when we no 6221 * longer have pending layers to clear, or when we have processed the 6222 * last subpass that uses this attachment. 
    */
   if (!is_multiview ||
       att_state->pending_clear_views == 0 ||
       current_subpass_is_last_for_attachment(cmd_state, a)) {
      att_state->pending_clear_aspects = 0;
   }

   att_state->pending_load_aspects = 0;
}

/* We've transitioned all our images possibly fast clearing them.  Now we
 * can fill out the surface states that we will use as render targets
 * during actual subpass rendering.
 */
VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,
                                                         pass, subpass);
if (result != VK_SUCCESS)
   return;

/* Null surface state used for any unused RT binding slots. */
isl_null_fill_state(&cmd_buffer->device->isl_dev,
                    cmd_state->null_surface_state.map,
                    .size = isl_extent3d(fb->width, fb->height, fb->layers));

for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
   const uint32_t att = subpass->attachments[i].attachment;
   if (att == VK_ATTACHMENT_UNUSED)
      continue;

   assert(att < cmd_state->pass->attachment_count);
   struct anv_render_pass_attachment *pass_att = &pass->attachments[att];
   struct anv_attachment_state *att_state = &cmd_state->attachments[att];
   struct anv_image_view *iview = att_state->image_view;

   /* Only color formats get surface states here; depth/stencil is handled
    * by cmd_buffer_emit_depth_stencil() below.
    */
   if (!vk_format_is_color(pass_att->format))
      continue;

   const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
   assert(util_bitcount(att_usage) == 1);

   struct anv_surface_state *surface_state;
   isl_surf_usage_flags_t isl_surf_usage;
   enum isl_aux_usage isl_aux_usage;
   if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
      surface_state = &att_state->color;
      isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
      isl_aux_usage = att_state->aux_usage;
   } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
      /* Input attachments are sampled, so aux usage is derived from the
       * current layout rather than taken from the attachment state.
       */
      surface_state = &att_state->input;
      isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;
      isl_aux_usage =
         anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
                                 VK_IMAGE_ASPECT_COLOR_BIT,
                                 VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
                                 att_state->current_layout);
   } else {
      /* Resolve-only or other usages need no surface state here. */
      continue;
   }

   /* We had better have a surface state when we get here */
   assert(surface_state->state.map);

   union isl_color_value clear_color = { .u32 = { 0, } };
   if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&
       att_state->fast_clear)
      anv_clear_color_from_att_state(&clear_color, att_state, iview);

   anv_image_fill_surface_state(cmd_buffer->device,
                                iview->image,
                                VK_IMAGE_ASPECT_COLOR_BIT,
                                &iview->planes[0].isl,
                                isl_surf_usage,
                                isl_aux_usage,
                                &clear_color,
                                0,
                                surface_state,
                                NULL);

   add_surface_state_relocs(cmd_buffer, *surface_state);

   /* Prior to gfx10 the clear color lives in the surface state, so for a
    * LOAD op on a compressed surface we must copy the clear color the
    * hardware last used back into the new surface state.
    */
   if (GFX_VER < 10 &&
       pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&
       iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
       iview->planes[0].isl.base_level == 0 &&
       iview->planes[0].isl.base_array_layer == 0) {
      genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,
                                   iview->image,
                                   VK_IMAGE_ASPECT_COLOR_BIT,
                                   false /* copy to ss */);
   }
}

#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
 *
 *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
 *    points to a different RENDER_SURFACE_STATE, SW must issue a Render
 *    Target Cache Flush by enabling this bit. When render target flush
 *    is set due to new association of BTI, PS Scoreboard Stall bit must
 *    be set in this packet."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                             "change RT");
#endif

   cmd_buffer_emit_depth_stencil(cmd_buffer);
}

/* Map a Vulkan resolve mode to the corresponding blorp filter.  Unknown
 * modes map to BLORP_FILTER_NONE.
 */
static enum blorp_filter
vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
{
   switch (vk_mode) {
   case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
      return BLORP_FILTER_SAMPLE_0;
   case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
      return BLORP_FILTER_AVERAGE;
   case VK_RESOLVE_MODE_MIN_BIT_KHR:
      return BLORP_FILTER_MIN_SAMPLE;
   case VK_RESOLVE_MODE_MAX_BIT_KHR:
      return BLORP_FILTER_MAX_SAMPLE;
   default:
      return BLORP_FILTER_NONE;
   }
}

/* Finish the current subpass: mark written images, perform MSAA color and
 * depth/stencil resolves, copy the gfx7 stencil shadow if needed, and
 * transition attachments into their final layouts.
 */
static void
cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_subpass *subpass = cmd_state->subpass;
   uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;

   /* We are done with the previous subpass and all rendering directly to that
    * subpass is now complete.  Zero out all the surface states so we don't
    * accidentally use them between now and the next subpass.
    */
   for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) {
      memset(&cmd_state->attachments[i].color, 0,
             sizeof(cmd_state->attachments[i].color));
      memset(&cmd_state->attachments[i].input, 0,
             sizeof(cmd_state->attachments[i].input));
   }
   cmd_state->null_surface_state = ANV_STATE_NULL;
   cmd_state->attachment_states = ANV_STATE_NULL;

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      assert(a < cmd_state->pass->attachment_count);
      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
      struct anv_image_view *iview = att_state->image_view;

      assert(util_bitcount(subpass->attachments[i].usage) == 1);
      if (subpass->attachments[i].usage ==
          VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
         /* We assume that if we're ending a subpass, we did do some rendering
          * so we may end up with compressed data.
          */
         genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
                                             VK_IMAGE_ASPECT_COLOR_BIT,
                                             att_state->aux_usage,
                                             iview->planes[0].isl.base_level,
                                             iview->planes[0].isl.base_array_layer,
                                             fb->layers);
      } else if (subpass->attachments[i].usage ==
                 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
         /* We may be writing depth or stencil so we need to mark the surface.
          * Unfortunately, there's no way to know at this point whether the
          * depth or stencil tests used will actually write to the surface.
          *
          * Even though stencil may be plane 1, it always shares a base_level
          * with depth.
          */
         const struct isl_view *ds_view = &iview->planes[0].isl;
         if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
            genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
                                                VK_IMAGE_ASPECT_DEPTH_BIT,
                                                att_state->aux_usage,
                                                ds_view->base_level,
                                                ds_view->base_array_layer,
                                                fb->layers);
         }
         if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
            /* Even though stencil may be plane 1, it always shares a
             * base_level with depth.
             */
            genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
                                                VK_IMAGE_ASPECT_STENCIL_BIT,
                                                ISL_AUX_USAGE_NONE,
                                                ds_view->base_level,
                                                ds_view->base_array_layer,
                                                fb->layers);
         }
      }
   }

   if (subpass->has_color_resolve) {
      /* We are about to do some MSAA resolves.  We need to flush so that the
       * result of writes to the MSAA color attachments show up in the sampler
       * when we blit to the single-sampled resolve target.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "MSAA resolve");

      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t src_att = subpass->color_attachments[i].attachment;
         uint32_t dst_att = subpass->resolve_attachments[i].attachment;

         if (dst_att == VK_ATTACHMENT_UNUSED)
            continue;

         assert(src_att < cmd_buffer->state.pass->attachment_count);
         assert(dst_att < cmd_buffer->state.pass->attachment_count);

         if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
            /* From the Vulkan 1.0 spec:
             *
             *    If the first use of an attachment in a render pass is as a
             *    resolve attachment, then the loadOp is effectively ignored
             *    as the resolve is guaranteed to overwrite all pixels in the
             *    render area.
             */
            cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
         }

         struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
         struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;

         const VkRect2D render_area = cmd_buffer->state.render_area;

         enum isl_aux_usage src_aux_usage =
            cmd_buffer->state.attachments[src_att].aux_usage;
         enum isl_aux_usage dst_aux_usage =
            cmd_buffer->state.attachments[dst_att].aux_usage;

         assert(src_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT &&
                dst_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);

         /* Source and destination offsets are identical: color resolves
          * always happen in place over the render area.
          */
         anv_image_msaa_resolve(cmd_buffer,
                                src_iview->image, src_aux_usage,
                                src_iview->planes[0].isl.base_level,
                                src_iview->planes[0].isl.base_array_layer,
                                dst_iview->image, dst_aux_usage,
                                dst_iview->planes[0].isl.base_level,
                                dst_iview->planes[0].isl.base_array_layer,
                                VK_IMAGE_ASPECT_COLOR_BIT,
                                render_area.offset.x, render_area.offset.y,
                                render_area.offset.x, render_area.offset.y,
                                render_area.extent.width,
                                render_area.extent.height,
                                fb->layers, BLORP_FILTER_NONE);
      }
   }

   if (subpass->ds_resolve_attachment) {
      /* We are about to do some MSAA resolves.  We need to flush so that the
       * result of writes to the MSAA depth attachments show up in the sampler
       * when we blit to the single-sampled resolve target.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
                                "MSAA resolve");

      uint32_t src_att = subpass->depth_stencil_attachment->attachment;
      uint32_t dst_att = subpass->ds_resolve_attachment->attachment;

      assert(src_att < cmd_buffer->state.pass->attachment_count);
      assert(dst_att < cmd_buffer->state.pass->attachment_count);

      if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
         /* From the Vulkan 1.0 spec:
          *
          *    If the first use of an attachment in a render pass is as a
          *    resolve attachment, then the loadOp is effectively ignored
          *    as the resolve is guaranteed to overwrite all pixels in the
          *    render area.
          */
         cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
      }

      struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
      struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;

      const VkRect2D render_area = cmd_buffer->state.render_area;

      struct anv_attachment_state *src_state =
         &cmd_state->attachments[src_att];
      struct anv_attachment_state *dst_state =
         &cmd_state->attachments[dst_att];

      if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
          subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {

         /* MSAA resolves sample from the source attachment.  Transition the
          * depth attachment first to get rid of any HiZ that we may not be
          * able to handle.
          */
         transition_depth_buffer(cmd_buffer, src_iview->image,
                                 src_iview->planes[0].isl.base_array_layer,
                                 fb->layers,
                                 src_state->current_layout,
                                 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                 false /* will_full_fast_clear */);
         src_state->aux_usage =
            anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
                                    VK_IMAGE_ASPECT_DEPTH_BIT,
                                    VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
                                    VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
         src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;

         /* MSAA resolves write to the resolve attachment as if it were any
          * other transfer op.  Transition the resolve attachment accordingly.
          */
         VkImageLayout dst_initial_layout = dst_state->current_layout;

         /* If our render area is the entire size of the image, we're going to
          * blow it all away so we can claim the initial layout is UNDEFINED
          * and we'll get a HiZ ambiguate instead of a resolve.
          */
         if (dst_iview->image->vk.image_type != VK_IMAGE_TYPE_3D &&
             render_area.offset.x == 0 && render_area.offset.y == 0 &&
             render_area.extent.width == dst_iview->vk.extent.width &&
             render_area.extent.height == dst_iview->vk.extent.height)
            dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;

         transition_depth_buffer(cmd_buffer, dst_iview->image,
                                 dst_iview->planes[0].isl.base_array_layer,
                                 fb->layers,
                                 dst_initial_layout,
                                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                                 false /* will_full_fast_clear */);
         dst_state->aux_usage =
            anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
                                    VK_IMAGE_ASPECT_DEPTH_BIT,
                                    VK_IMAGE_USAGE_TRANSFER_DST_BIT,
                                    VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
         dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;

         enum blorp_filter filter =
            vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);

         anv_image_msaa_resolve(cmd_buffer,
                                src_iview->image, src_state->aux_usage,
                                src_iview->planes[0].isl.base_level,
                                src_iview->planes[0].isl.base_array_layer,
                                dst_iview->image, dst_state->aux_usage,
                                dst_iview->planes[0].isl.base_level,
                                dst_iview->planes[0].isl.base_array_layer,
                                VK_IMAGE_ASPECT_DEPTH_BIT,
                                render_area.offset.x, render_area.offset.y,
                                render_area.offset.x, render_area.offset.y,
                                render_area.extent.width,
                                render_area.extent.height,
                                fb->layers, filter);
      }

      if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
          subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {

         /* Stencil has no layout-dependent aux transitions here; just track
          * the transfer layouts for later shadow-copy decisions.
          */
         src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
         dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;

         /* Stencil source is never compressed on the read side. */
         enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
         const uint32_t plane =
            anv_image_aspect_to_plane(dst_iview->image, VK_IMAGE_ASPECT_STENCIL_BIT);
         enum isl_aux_usage dst_aux_usage =
            dst_iview->image->planes[plane].aux_usage;

         enum blorp_filter filter =
            vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);

         anv_image_msaa_resolve(cmd_buffer,
                                src_iview->image, src_aux_usage,
                                src_iview->planes[0].isl.base_level,
                                src_iview->planes[0].isl.base_array_layer,
                                dst_iview->image, dst_aux_usage,
                                dst_iview->planes[0].isl.base_level,
                                dst_iview->planes[0].isl.base_array_layer,
                                VK_IMAGE_ASPECT_STENCIL_BIT,
                                render_area.offset.x, render_area.offset.y,
                                render_area.offset.x, render_area.offset.y,
                                render_area.extent.width,
                                render_area.extent.height,
                                fb->layers, filter);
      }
   }

#if GFX_VER == 7
   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points.
Stencil writes are only allowed in following 6621 * layouts: 6622 * 6623 * - VK_IMAGE_LAYOUT_GENERAL 6624 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL 6625 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL 6626 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL 6627 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR 6628 * 6629 * For general, we have no nice opportunity to transition so we do the copy 6630 * to the shadow unconditionally at the end of the subpass. For transfer 6631 * destinations, we can update it as part of the transfer op. For the other 6632 * layouts, we delay the copy until a transition into some other layout. 6633 */ 6634 if (subpass->depth_stencil_attachment) { 6635 uint32_t a = subpass->depth_stencil_attachment->attachment; 6636 assert(a != VK_ATTACHMENT_UNUSED); 6637 6638 struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 6639 struct anv_image_view *iview = cmd_state->attachments[a].image_view;; 6640 const struct anv_image *image = iview->image; 6641 6642 if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 6643 const uint32_t plane = 6644 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); 6645 6646 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && 6647 att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) { 6648 assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT); 6649 anv_image_copy_to_shadow(cmd_buffer, image, 6650 VK_IMAGE_ASPECT_STENCIL_BIT, 6651 iview->planes[plane].isl.base_level, 1, 6652 iview->planes[plane].isl.base_array_layer, 6653 fb->layers); 6654 } 6655 } 6656 } 6657#endif /* GFX_VER == 7 */ 6658 6659 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 6660 const uint32_t a = subpass->attachments[i].attachment; 6661 if (a == VK_ATTACHMENT_UNUSED) 6662 continue; 6663 6664 if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id) 6665 continue; 6666 6667 assert(a < cmd_state->pass->attachment_count); 6668 struct anv_attachment_state 
         *att_state = &cmd_state->attachments[a];
      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;

      /* Transition the image into the final layout for this render pass */
      VkImageLayout target_layout =
         cmd_state->pass->attachments[a].final_layout;
      VkImageLayout target_stencil_layout =
         cmd_state->pass->attachments[a].stencil_final_layout;

      uint32_t base_layer, layer_count;
      if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
         /* 3D images are layered by depth slices of the rendered LOD. */
         base_layer = 0;
         layer_count = anv_minify(iview->image->vk.extent.depth,
                                  iview->planes[0].isl.base_level);
      } else {
         base_layer = iview->planes[0].isl.base_array_layer;
         layer_count = fb->layers;
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
         assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
         transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                 iview->planes[0].isl.base_level, 1,
                                 base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 false /* will_full_fast_clear */);
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
         transition_depth_buffer(cmd_buffer, image,
                                 base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 false /* will_full_fast_clear */);
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
         /* Stencil tracks its own layout separately per
          * VK_KHR_separate_depth_stencil_layouts.
          */
         transition_stencil_buffer(cmd_buffer, image,
                                   iview->planes[0].isl.base_level, 1,
                                   base_layer, layer_count,
                                   att_state->current_stencil_layout,
                                   target_stencil_layout,
                                   false /* will_full_fast_clear */);
      }
   }

   /* Accumulate any subpass flushes that need to happen after the subpass.
    * Yes, they do get accumulated twice in the NextSubpass case but since
    * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up
    * ORing the bits in twice so it's harmless.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
                             "end subpass deps/attachments");
}

/* vkCmdBeginRenderPass2: record render pass state, set up attachment
 * state, and begin the first subpass.  Errors from attachment setup are
 * recorded on the batch rather than returned.
 */
void genX(CmdBeginRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
   VkResult result;

   if (!is_render_queue_cmd_buffer(cmd_buffer)) {
      assert(!"Trying to start a render pass on non-render queue!");
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
      return;
   }

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;

   anv_measure_beginrenderpass(cmd_buffer);

   result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                               framebuffer,
                                               pRenderPassBeginInfo);
   if (result != VK_SUCCESS) {
      assert(anv_batch_has_error(&cmd_buffer->batch));
      return;
   }

   genX(flush_pipeline_select_3d)(cmd_buffer);

   cmd_buffer_begin_subpass(cmd_buffer, 0);
}

/* vkCmdNextSubpass2: end the current subpass and begin the next one. */
void genX(CmdNextSubpass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   uint32_t prev_subpass =
      anv_get_subpass_id(&cmd_buffer->state);
   cmd_buffer_end_subpass(cmd_buffer);
   cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

/* vkCmdEndRenderPass2: finish the last subpass and clear all render pass
 * specific state from the command buffer.
 */
void genX(CmdEndRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   cmd_buffer_end_subpass(cmd_buffer);

   cmd_buffer->state.hiz_enabled = false;

   /* Remove references to render pass specific state. This enables us to
    * detect whether or not we're in a renderpass.
    */
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
}

/* Load MI_PREDICATE from the precomputed conditional rendering result
 * (ANV_PREDICATE_RESULT_REG).  LOADINV is used because the register holds
 * "should render" while MI_PREDICATE wants "should skip".
 */
void
genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 75
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
                mi_reg32(ANV_PREDICATE_RESULT_REG));
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
#endif
}

#if GFX_VERx10 >= 75
void genX(CmdBeginConditionalRenderingEXT)(
    VkCommandBuffer                             commandBuffer,
    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_address value_address =
      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);

   const bool isInverted = pConditionalRenderingBegin->flags &
      VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   cmd_state->conditional_render_enabled = true;

   /* Make sure any prior writes to the predicate buffer are visible to the
    * command streamer before we read it.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Section 19.4 of the Vulkan 1.1.85 spec says:
    *
    *    If the value of the predicate in buffer memory changes
    *    while conditional rendering is active, the rendering commands
    *    may be discarded in an implementation-dependent way.
    *    Some implementations may latch the value of the predicate
    *    upon beginning conditional rendering while others
    *    may read it before every rendering command.
    *
    * So it's perfectly fine to read a value from the buffer once.
    */
   struct mi_value value = mi_mem32(value_address);

   /* Precompute predicate result, it is necessary to support secondary
    * command buffers since it is unknown if conditional rendering is
    * inverted when populating them.
    *
    * Non-inverted: render when value != 0 (0 < value).
    * Inverted: render when value == 0 (0 >= value, unsigned).
    */
   mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
                isInverted ? mi_uge(&b, mi_imm(0), value) :
                             mi_ult(&b, mi_imm(0), value));
}

void genX(CmdEndConditionalRenderingEXT)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;

   cmd_state->conditional_render_enabled = false;
}
#endif

/* Set of stage bits which are pipelined, i.e. they get queued
 * by the command streamer for later execution.
6873 */ 6874#define ANV_PIPELINE_STAGE_PIPELINED_BITS \ 6875 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | \ 6876 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR | \ 6877 VK_PIPELINE_STAGE_2_HOST_BIT_KHR | \ 6878 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) 6879 6880void genX(CmdSetEvent2KHR)( 6881 VkCommandBuffer commandBuffer, 6882 VkEvent _event, 6883 const VkDependencyInfoKHR* pDependencyInfo) 6884{ 6885 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 6886 ANV_FROM_HANDLE(anv_event, event, _event); 6887 6888 VkPipelineStageFlags2KHR src_stages = 0; 6889 6890 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) 6891 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; 6892 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) 6893 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; 6894 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) 6895 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; 6896 6897 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 6898 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 6899 6900 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 6901 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) { 6902 pc.StallAtPixelScoreboard = true; 6903 pc.CommandStreamerStallEnable = true; 6904 } 6905 6906 pc.DestinationAddressType = DAT_PPGTT, 6907 pc.PostSyncOperation = WriteImmediateData, 6908 pc.Address = (struct anv_address) { 6909 cmd_buffer->device->dynamic_state_pool.block_pool.bo, 6910 event->state.offset 6911 }; 6912 pc.ImmediateData = VK_EVENT_SET; 6913 anv_debug_dump_pc(pc); 6914 } 6915} 6916 6917void genX(CmdResetEvent2KHR)( 6918 VkCommandBuffer commandBuffer, 6919 VkEvent _event, 6920 VkPipelineStageFlags2KHR stageMask) 6921{ 6922 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 6923 ANV_FROM_HANDLE(anv_event, event, _event); 6924 6925 cmd_buffer->state.pending_pipe_bits |= 
ANV_PIPE_POST_SYNC_BIT; 6926 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 6927 6928 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 6929 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { 6930 pc.StallAtPixelScoreboard = true; 6931 pc.CommandStreamerStallEnable = true; 6932 } 6933 6934 pc.DestinationAddressType = DAT_PPGTT; 6935 pc.PostSyncOperation = WriteImmediateData; 6936 pc.Address = (struct anv_address) { 6937 cmd_buffer->device->dynamic_state_pool.block_pool.bo, 6938 event->state.offset 6939 }; 6940 pc.ImmediateData = VK_EVENT_RESET; 6941 anv_debug_dump_pc(pc); 6942 } 6943} 6944 6945void genX(CmdWaitEvents2KHR)( 6946 VkCommandBuffer commandBuffer, 6947 uint32_t eventCount, 6948 const VkEvent* pEvents, 6949 const VkDependencyInfoKHR* pDependencyInfos) 6950{ 6951 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 6952 6953#if GFX_VER >= 8 6954 for (uint32_t i = 0; i < eventCount; i++) { 6955 ANV_FROM_HANDLE(anv_event, event, pEvents[i]); 6956 6957 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { 6958 sem.WaitMode = PollingMode, 6959 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD, 6960 sem.SemaphoreDataDword = VK_EVENT_SET, 6961 sem.SemaphoreAddress = (struct anv_address) { 6962 cmd_buffer->device->dynamic_state_pool.block_pool.bo, 6963 event->state.offset 6964 }; 6965 } 6966 } 6967#else 6968 anv_finishme("Implement events on gfx7"); 6969#endif 6970 6971 cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event"); 6972} 6973 6974VkResult genX(CmdSetPerformanceOverrideINTEL)( 6975 VkCommandBuffer commandBuffer, 6976 const VkPerformanceOverrideInfoINTEL* pOverrideInfo) 6977{ 6978 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 6979 6980 switch (pOverrideInfo->type) { 6981 case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { 6982#if GFX_VER >= 9 6983 anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) { 6984 csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable; 6985 
         csdm2.MediaInstructionDisable = pOverrideInfo->enable;
         /* Mask bits must be set for the corresponding value bits to take
          * effect on masked registers.
          */
         csdm2._3DRenderingInstructionDisableMask = true;
         csdm2.MediaInstructionDisableMask = true;
      }
#else
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

/* VK_INTEL_performance_query stream marker: currently a no-op. */
VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */

   return VK_SUCCESS;
}

/* Emit a CS-stalled PIPE_CONTROL that writes the GPU timestamp to
 * bo+offset once prior commands have drained the command streamer.
 */
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_bo *bo,
                              uint32_t offset) {
   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.PostSyncOperation = WriteTimestamp;
      pc.Address = (struct anv_address) {bo, offset};
      anv_debug_dump_pc(pc);
   }
}