genX_cmd_buffer.c revision 01e04c3f
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "vk_format_info.h"
#include "vk_util.h"

#include "common/gen_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

static void
emit_lrm(struct anv_batch *batch, uint32_t reg, struct anv_address addr)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = addr;
   }
}

static void
emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

#if GEN_IS_HASWELL || GEN_GEN >= 8
static void
emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
#endif

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM. However, it seems to be
    * necessary prior to changing the surface state base address. Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DCFlushEnable = true;
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { &device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { &device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMemoryObjectControlState = GENX(MOCS);
      sba.InstructionBaseAddressModifyEnable = true;

# if (GEN_GEN >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields. However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSize = 0xfffff;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.InstructionBufferSize = 0xfffff;
      sba.InstructionBuffersizeModifyEnable = true;
# endif
# if (GEN_GEN >= 9)
      sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSurfaceStateMemoryObjectControlState = GENX(MOCS);
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      sba.BindlessSurfaceStateSize = 0;
# endif
# if (GEN_GEN >= 10)
      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMemoryObjectControlState = GENX(MOCS);
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
# endif
   }

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever with
    * regard to surface state and binding tables. Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient. The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache. However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
   }
}

static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   VkResult result =
      anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
                         state.offset + isl_dev->ss.addr_offset,
                         addr.bo, addr.offset);
   if (result != VK_SUCCESS)
      anv_batch_set_error(&cmd_buffer->batch, result);
}

static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo, state.aux_address.offset);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo, state.clear_address.offset);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

static void
color_attachment_compute_aux_usage(struct anv_device * device,
                                   struct anv_cmd_state * cmd_state,
                                   uint32_t att, VkRect2D render_area,
                                   union isl_color_value *fast_clear_color)
{
   struct anv_attachment_state *att_state = &cmd_state->attachments[att];
   struct anv_image_view *iview = cmd_state->framebuffer->attachments[att];

   assert(iview->n_planes == 1);

   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level)) {
      /* There is no aux buffer which corresponds to the level and layer(s)
       * being accessed.
       */
      att_state->aux_usage = ISL_AUX_USAGE_NONE;
      att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
      att_state->fast_clear = false;
      return;
   }

   att_state->aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_COLOR_BIT,
                              VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);

   /* If we don't have aux, then we should have returned early in the layer
    * check above. If we got here, we must have something.
    */
   assert(att_state->aux_usage != ISL_AUX_USAGE_NONE);

   if (att_state->aux_usage == ISL_AUX_USAGE_CCS_E ||
       att_state->aux_usage == ISL_AUX_USAGE_MCS) {
      att_state->input_aux_usage = att_state->aux_usage;
   } else {
      /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
       *
       *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
       *    setting is only allowed if Surface Format supported for Fast
       *    Clear. In addition, if the surface is bound to the sampling
       *    engine, Surface Format must be supported for Render Target
       *    Compression for surfaces bound to the sampling engine."
       *
       * In other words, we can only sample from a fast-cleared image if it
       * also supports color compression.
       */
      if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format)) {
         att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;

         /* While fast-clear resolves and partial resolves are fairly cheap in the
          * case where you render to most of the pixels, full resolves are not
          * because they potentially involve reading and writing the entire
          * framebuffer. If we can't texture with CCS_E, we should leave it off and
          * limit ourselves to fast clears.
          */
         if (cmd_state->pass->attachments[att].first_subpass_layout ==
             VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
            anv_perf_warn(device->instance, iview->image,
                          "Not temporarily enabling CCS_E.");
         }
      } else {
         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
      }
   }

   assert(iview->image->planes[0].aux_surface.isl.usage &
          (ISL_SURF_USAGE_CCS_BIT | ISL_SURF_USAGE_MCS_BIT));

   union isl_color_value clear_color = {};
   anv_clear_color_from_att_state(&clear_color, att_state, iview);

   att_state->clear_color_is_zero_one =
      isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format);
   att_state->clear_color_is_zero =
      isl_color_value_is_zero(clear_color, iview->planes[0].isl.format);

   if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
      /* Start by getting the fast clear type. We use the first subpass
       * layout here because we don't want to fast-clear if the first subpass
       * to use the attachment can't handle fast-clears.
       */
      enum anv_fast_clear_type fast_clear_type =
         anv_layout_to_fast_clear_type(&device->info, iview->image,
                                       VK_IMAGE_ASPECT_COLOR_BIT,
                                       cmd_state->pass->attachments[att].first_subpass_layout);
      switch (fast_clear_type) {
      case ANV_FAST_CLEAR_NONE:
         att_state->fast_clear = false;
         break;
      case ANV_FAST_CLEAR_DEFAULT_VALUE:
         att_state->fast_clear = att_state->clear_color_is_zero;
         break;
      case ANV_FAST_CLEAR_ANY:
         att_state->fast_clear = true;
         break;
      }

      /* Potentially, we could do partial fast-clears but doing so has crazy
       * alignment restrictions. It's easier to just restrict to full size
       * fast clears for now.
       */
      if (render_area.offset.x != 0 ||
          render_area.offset.y != 0 ||
          render_area.extent.width != iview->extent.width ||
          render_area.extent.height != iview->extent.height)
         att_state->fast_clear = false;

      /* On Broadwell and earlier, we can only handle 0/1 clear colors */
      if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one)
         att_state->fast_clear = false;

      /* We only allow fast clears to the first slice of an image (level 0,
       * layer 0) and only for the entire slice. This guarantees that, at
       * any given time, there is only one clear color on any given image.
       * At the time of our testing (Jan 17, 2018), there were no known
       * applications which would benefit from fast-clearing more than just
       * the first slice.
       */
      if (att_state->fast_clear &&
          (iview->planes[0].isl.base_level > 0 ||
           iview->planes[0].isl.base_array_layer > 0)) {
         anv_perf_warn(device->instance, iview->image,
                       "Rendering with multi-lod or multi-layer framebuffer "
                       "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                       "baseArrayLayer > 0. Not fast clearing.");
         att_state->fast_clear = false;
      } else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) {
         anv_perf_warn(device->instance, iview->image,
                       "Rendering to a multi-layer framebuffer with "
                       "LOAD_OP_CLEAR. Only fast-clearing the first slice");
      }

      if (att_state->fast_clear)
         *fast_clear_color = clear_color;
   } else {
      att_state->fast_clear = false;
   }
}

static void
depth_stencil_attachment_compute_aux_usage(struct anv_device *device,
                                           struct anv_cmd_state *cmd_state,
                                           uint32_t att, VkRect2D render_area)
{
   struct anv_render_pass_attachment *pass_att =
      &cmd_state->pass->attachments[att];
   struct anv_attachment_state *att_state = &cmd_state->attachments[att];
   struct anv_image_view *iview = cmd_state->framebuffer->attachments[att];

   /* These will be initialized after the first subpass transition. */
   att_state->aux_usage = ISL_AUX_USAGE_NONE;
   att_state->input_aux_usage = ISL_AUX_USAGE_NONE;

   if (GEN_GEN == 7) {
      /* We don't do any HiZ or depth fast-clears on gen7 yet */
      att_state->fast_clear = false;
      return;
   }

   if (!(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      /* If we're just clearing stencil, we can always HiZ clear */
      att_state->fast_clear = true;
      return;
   }

   /* Default to false for now */
   att_state->fast_clear = false;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return;

   const enum isl_aux_usage first_subpass_aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              pass_att->first_subpass_layout);
   if (first_subpass_aux_usage != ISL_AUX_USAGE_HIZ)
      return;

   if (!blorp_can_hiz_clear_depth(GEN_GEN,
                                  iview->planes[0].isl.format,
                                  iview->image->samples,
                                  render_area.offset.x,
                                  render_area.offset.y,
                                  render_area.offset.x +
                                  render_area.extent.width,
                                  render_area.offset.y +
                                  render_area.extent.height))
      return;

   if (att_state->clear_value.depthStencil.depth != ANV_HZ_FC_VAL)
      return;

   if (GEN_GEN == 8 && anv_can_sample_with_hiz(&device->info, iview->image)) {
      /* Only gen9+ supports returning ANV_HZ_FC_VAL when sampling a
       * fast-cleared portion of a HiZ buffer. Testing has revealed that Gen8
       * only supports returning 0.0f. Gens prior to gen8 do not support this
       * feature at all.
       */
      return;
   }

   /* If we got here, then we can fast clear */
   att_state->fast_clear = true;
}

static bool
need_input_attachment_state(const struct anv_render_pass_attachment *att)
{
   if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
      return false;

   /* We only allocate input attachment states for color surfaces. Compression
    * is not yet enabled for depth textures and stencil doesn't allow
    * compression so we can just use the texture surface state from the view.
    */
   return vk_format_is_color(att->format);
}

/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout)
{
   const bool hiz_enabled = ISL_AUX_USAGE_HIZ ==
      anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout);
   const bool enable_hiz = ISL_AUX_USAGE_HIZ ==
      anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT, final_layout);

   enum isl_aux_op hiz_op;
   if (hiz_enabled && !enable_hiz) {
      hiz_op = ISL_AUX_OP_FULL_RESOLVE;
   } else if (!hiz_enabled && enable_hiz) {
      hiz_op = ISL_AUX_OP_AMBIGUATE;
   } else {
      assert(hiz_enabled == enable_hiz);
      /* If the same buffer will be used, no resolves are necessary. */
      hiz_op = ISL_AUX_OP_NONE;
   }

   if (hiz_op != ISL_AUX_OP_NONE)
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, 0, 1, hiz_op);
}

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408

static void
set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
                         VkImageAspectFlagBits aspect,
                         uint32_t level,
                         uint32_t base_layer, uint32_t layer_count,
                         bool compressed)
{
   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

   /* We only have compression tracking for CCS_E */
   if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
      return;

   for (uint32_t a = 0; a < layer_count; a++) {
      uint32_t layer = base_layer + a;
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
                                                            image, aspect,
                                                            level, layer);
         sdi.ImmediateData = compressed ? UINT32_MAX : 0;
      }
   }
}

static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
                           const struct anv_image *image,
                           VkImageAspectFlagBits aspect,
                           enum anv_fast_clear_type fast_clear)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                       image, aspect);
      sdi.ImmediateData = fast_clear;
   }

   /* Whenever we have fast-clear, we consider that slice to be compressed.
    * This makes building predicates much easier.
    */
   if (fast_clear != ANV_FAST_CLEAR_NONE)
      set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
}

#if GEN_IS_HASWELL || GEN_GEN >= 8
static inline uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}
#endif

#define CS_GPR(n) (0x2600 + (n) * 8)

/* This is only really practical on haswell and above because it requires
 * MI math in order to get it correct.
 */
#if GEN_GEN >= 8 || GEN_IS_HASWELL
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                  const struct anv_image *image,
                                  VkImageAspectFlagBits aspect,
                                  uint32_t level, uint32_t array_layer,
                                  enum isl_aux_op resolve_op,
                                  enum anv_fast_clear_type fast_clear_supported)
{
   struct anv_address fast_clear_type_addr =
      anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect);

   /* Name some registers */
   const int image_fc_reg = MI_ALU_REG0;
   const int fc_imm_reg = MI_ALU_REG1;
   const int pred_reg = MI_ALU_REG2;

   uint32_t *dw;

   if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
      /* In this case, we're doing a full resolve which means we want the
       * resolve to happen if any compression (including fast-clears) is
       * present.
       *
       * In order to simplify the logic a bit, we make the assumption that,
       * if the first slice has been fast-cleared, it is also marked as
       * compressed. See also set_image_fast_clear_state.
       */
      struct anv_address compression_state_addr =
         anv_image_get_compression_state_addr(cmd_buffer->device, image,
                                              aspect, level, array_layer);
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = MI_PREDICATE_SRC0;
         lrm.MemoryAddress = compression_state_addr;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = compression_state_addr;
         sdi.ImmediateData = 0;
      }

      if (level == 0 && array_layer == 0) {
         /* If the predicate is true, we want to write 0 to the fast clear type
          * and, if it's false, leave it alone. We can do this by writing
          *
          * clear_type = clear_type & ~predicate;
          */
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = CS_GPR(image_fc_reg);
            lrm.MemoryAddress = fast_clear_type_addr;
         }
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
            lrr.DestinationRegisterAddress = CS_GPR(pred_reg);
            lrr.SourceRegisterAddress = MI_PREDICATE_SRC0;
         }

         dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg);
         dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg);
         dw[3] = mi_alu(MI_ALU_AND, 0, 0);
         dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU);

         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
            srm.MemoryAddress = fast_clear_type_addr;
            srm.RegisterAddress = CS_GPR(image_fc_reg);
         }
      }
   } else if (level == 0 && array_layer == 0) {
      /* In this case, we are doing a partial resolve to get rid of fast-clear
       * colors. We don't care about the compression state but we do care
       * about how much fast clear is allowed by the final layout.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = CS_GPR(image_fc_reg);
         lrm.MemoryAddress = fast_clear_type_addr;
      }
      emit_lri(&cmd_buffer->batch, CS_GPR(image_fc_reg) + 4, 0);

      emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg), fast_clear_supported);
      emit_lri(&cmd_buffer->batch, CS_GPR(fc_imm_reg) + 4, 0);

      /* We need to compute (fast_clear_supported < image->fast_clear).
       * We do this by subtracting and storing the carry bit.
       */
      dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
      dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, fc_imm_reg);
      dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, image_fc_reg);
      dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
      dw[4] = mi_alu(MI_ALU_STORE, pred_reg, MI_ALU_CF);

      /* Store the predicate */
      emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(pred_reg));

      /* If the predicate is true, we want to write 0 to the fast clear type
       * and, if it's false, leave it alone. We can do this by writing
       *
       * clear_type = clear_type & ~predicate;
       */
      dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
      dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, image_fc_reg);
      dw[2] = mi_alu(MI_ALU_LOADINV, MI_ALU_SRCB, pred_reg);
      dw[3] = mi_alu(MI_ALU_AND, 0, 0);
      dw[4] = mi_alu(MI_ALU_STORE, image_fc_reg, MI_ALU_ACCU);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = CS_GPR(image_fc_reg);
         srm.MemoryAddress = fast_clear_type_addr;
      }
   } else {
      /* In this case, we're trying to do a partial resolve on a slice that
       * doesn't have clear color. There's nothing to do.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      return;
   }

   /* We use the first half of src0 for the actual predicate. Set the second
    * half of src0 and all of src1 to 0 as the predicate operation will be
    * doing an implicit src0 != src1.
    */
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1    , 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */

#if GEN_GEN <= 8
static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 const struct anv_image *image,
                                 VkImageAspectFlagBits aspect,
                                 uint32_t level, uint32_t array_layer,
                                 enum isl_aux_op resolve_op,
                                 enum anv_fast_clear_type fast_clear_supported)
{
   struct anv_address fast_clear_type_addr =
      anv_image_get_fast_clear_type_addr(cmd_buffer->device, image, aspect);

   /* This only works for partial resolves and only when the clear color is
    * all or nothing. On the upside, this emits less command streamer code
    * and works on Ivybridge and Bay Trail.
    */
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);

   /* We don't support fast clears on anything other than the first slice. */
   if (level > 0 || array_layer > 0)
      return;

   /* On gen8, we don't have a concept of default clear colors because we
    * can't sample from CCS surfaces. It's enough to just load the fast clear
    * state into the predicate register.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = MI_PREDICATE_SRC0;
      lrm.MemoryAddress = fast_clear_type_addr;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = fast_clear_type_addr;
      sdi.ImmediateData = 0;
   }

   /* We use the first half of src0 for the actual predicate. Set the second
    * half of src0 and all of src1 to 0 as the predicate operation will be
    * doing an implicit src0 != src1.
    */
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1    , 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GEN_GEN <= 8 */

static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               VkImageAspectFlagBits aspect,
                               uint32_t level, uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   const uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

#if GEN_GEN >= 9
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, level, array_layer,
                                     resolve_op, fast_clear_supported);
#else /* GEN_GEN <= 8 */
   anv_cmd_simple_resolve_predicate(cmd_buffer, image,
                                    aspect, level, array_layer,
                                    resolve_op, fast_clear_supported);
#endif

   /* CCS_D only supports full resolves and BLORP will assert on us if we try
    * to do a partial resolve on a CCS_D surface.
    */
   if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
       image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   anv_image_ccs_op(cmd_buffer, image, format, aspect, level,
                    array_layer, 1, resolve_op, NULL, true);
}

static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               VkImageAspectFlagBits aspect,
                               uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);

#if GEN_GEN >= 8 || GEN_IS_HASWELL
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, 0, array_layer,
                                     resolve_op, fast_clear_supported);

   anv_image_mcs_op(cmd_buffer, image, format, aspect,
                    array_layer, 1, resolve_op, NULL, true);
#else
   unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
#endif
}

void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
                                    const struct anv_image *image,
                                    VkImageAspectFlagBits aspect,
                                    enum isl_aux_usage aux_usage,
                                    uint32_t level,
                                    uint32_t base_layer,
                                    uint32_t layer_count)
{
   /* The aspect must be exactly one of the image aspects. */
   assert(util_bitcount(aspect) == 1 && (aspect & image->aspects));

   /* The only compression types with more than just fast-clears are MCS,
    * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually
    * track the current fast-clear and compression state. This leaves us
    * with just MCS and CCS_E.
    */
   if (aux_usage != ISL_AUX_USAGE_CCS_E &&
       aux_usage != ISL_AUX_USAGE_MCS)
      return;

   set_image_compressed_bit(cmd_buffer, image, aspect,
                            level, base_layer, layer_count, true);
}

static void
init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect)
{
   assert(cmd_buffer && image);
   assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   set_image_fast_clear_state(cmd_buffer, image, aspect,
                              ANV_FAST_CLEAR_NONE);

   /* The fast clear value dword(s) will be copied into a surface state object.
    * Ensure that the restrictions of the fields in the dword(s) are followed.
    *
    * CCS buffers on SKL+ can have any value set for the clear colors.
    */
   if (image->samples == 1 && GEN_GEN >= 9)
      return;

   /* Other combinations of auxiliary buffers and platforms require specific
    * values in the clear value dword(s).
    */
   struct anv_address addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);

   if (GEN_GEN >= 9) {
      for (unsigned i = 0; i < 4; i++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = addr;
            sdi.Address.offset += i * 4;
            /* MCS buffers on SKL+ can only have 1/0 clear colors. */
            assert(image->samples > 1);
            sdi.ImmediateData = 0;
         }
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = addr;
         if (GEN_GEN >= 8 || GEN_IS_HASWELL) {
            /* Pre-SKL, the dword containing the clear values also contains
             * other fields, so we need to initialize those fields to match the
             * values that would be in a color attachment.
             */
            sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                                ISL_CHANNEL_SELECT_GREEN << 22 |
                                ISL_CHANNEL_SELECT_BLUE  << 19 |
                                ISL_CHANNEL_SELECT_ALPHA << 16;
         } else if (GEN_GEN == 7) {
            /* On IVB, the dword containing the clear values also contains
             * other fields that must be zero or can be zero.
             */
            sdi.ImmediateData = 0;
         }
      }
   }
}

/* Copy the fast-clear value dword(s) between a surface state object and an
 * image's fast clear state buffer.
 */
static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_state surface_state,
                             const struct anv_image *image,
                             VkImageAspectFlagBits aspect,
                             bool copy_from_surface_state)
{
   assert(cmd_buffer && image);
   assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   struct anv_address ss_clear_addr = {
      .bo = &cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = surface_state.offset +
                cmd_buffer->device->isl_dev.ss.clear_value_offset,
   };
   const struct anv_address entry_addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;

   if (copy_from_surface_state) {
      genX(cmd_buffer_mi_memcpy)(cmd_buffer, entry_addr,
                                 ss_clear_addr, copy_size);
   } else {
      genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_clear_addr,
                                 entry_addr, copy_size);

      /* Updating a surface state object may require that the state cache be
       * invalidated. From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       *
       * In testing, SKL doesn't actually seem to need this, but HSW does.
       */
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
   }
}

/**
 * @brief Transitions a color buffer from one layout to another.
 *
 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
 * more information.
 *
 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
 *                    this represents the maximum layers to transition at each
 *                    specified miplevel.
 */
static void
transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        VkImageAspectFlagBits aspect,
                        const uint32_t base_level, uint32_t level_count,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout)
{
   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
   /* Validate the inputs. */
   assert(cmd_buffer);
   assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
   /* These values aren't supported for simplicity's sake. */
   assert(level_count != VK_REMAINING_MIP_LEVELS &&
          layer_count != VK_REMAINING_ARRAY_LAYERS);
   /* Ensure the subresource range is valid. */
   uint64_t last_level_num = base_level + level_count;
   const uint32_t max_depth = anv_minify(image->extent.depth, base_level);
   UNUSED const uint32_t image_layers = MAX2(image->array_size, max_depth);
   assert((uint64_t)base_layer + layer_count <= image_layers);
   assert(last_level_num <= image->levels);
   /* The spec disallows these final layouts. */
   assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
          final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED);

   /* No work is necessary if the layout stays the same or if this subresource
    * range lacks auxiliary data.
    */
   if (initial_layout == final_layout)
      return;

   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

   if (image->planes[plane].shadow_surface.isl.size_B > 0 &&
       final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
      /* This surface is a linear compressed image with a tiled shadow surface
       * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
       * we need to ensure the shadow copy is up-to-date.
       */
      assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
      assert(image->planes[plane].surface.isl.tiling == ISL_TILING_LINEAR);
      assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
      assert(isl_format_is_compressed(image->planes[plane].surface.isl.format));
      assert(plane == 0);
      anv_image_copy_to_shadow(cmd_buffer, image,
                               base_level, level_count,
                               base_layer, layer_count);
   }

   if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
      return;

   assert(image->tiling == VK_IMAGE_TILING_OPTIMAL);

   if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
      /* A subresource in the undefined layout may have been aliased and
       * populated with any arrangement of bits. Therefore, we must initialize
       * the related aux buffer and clear buffer entry with desirable values.
       * An initial layout of PREINITIALIZED is the same as UNDEFINED for
       * images with VK_IMAGE_TILING_OPTIMAL.
       *
       * Initialize the relevant clear buffer entries.
       */
      if (base_level == 0 && base_layer == 0)
         init_fast_clear_color(cmd_buffer, image, aspect);

      /* Initialize the aux buffers to enable correct rendering. In order to
       * ensure that things such as storage images work correctly, aux buffers
       * need to be initialized to valid data.
       *
       * Having an aux buffer with invalid data is a problem for two reasons:
       *
       *  1) Having an invalid value in the buffer can confuse the hardware.
       *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
       *     invalid and leads to the hardware doing strange things. It
       *     doesn't hang as far as we can tell but rendering corruption can
       *     occur.
       *
       *  2) If this transition is into the GENERAL layout and we then use the
       *     image as a storage image, then we must have the aux buffer in the
       *     pass-through state so that, if we then go to texture from the
       *     image, we get the results of our storage image writes and not the
       *     fast clear color or other random data.
       *
       * For CCS both of the problems above are real demonstrable issues. In
       * that case, the only thing we can do is to perform an ambiguate to
       * transition the aux surface into the pass-through state.
       *
       * For MCS, (2) is never an issue because we don't support multisampled
       * storage images. In theory, issue (1) is a problem with MCS but we've
       * never seen it in the wild. For 4x and 16x, all bit patterns could, in
       * theory, be interpreted as something but we don't know that all bit
       * patterns are actually valid. For 2x and 8x, you could easily end up
       * with the MCS referring to an invalid plane because not all bits of
       * the MCS value are actually used. Even though we've never seen issues
       * in the wild, it's best to play it safe and initialize the MCS. We
       * can use a fast-clear for MCS because we only ever touch from render
       * and texture (no image load store).
       */
      if (image->samples == 1) {
         for (uint32_t l = 0; l < level_count; l++) {
            const uint32_t level = base_level + l;

            uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
            if (base_layer >= aux_layers)
               break; /* We will only get fewer layers as level increases */
            uint32_t level_layer_count =
               MIN2(layer_count, aux_layers - base_layer);

            anv_image_ccs_op(cmd_buffer, image,
                             image->planes[plane].surface.isl.format,
                             aspect, level, base_layer, level_layer_count,
                             ISL_AUX_OP_AMBIGUATE, NULL, false);

            if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
               set_image_compressed_bit(cmd_buffer, image, aspect,
                                        level, base_layer, level_layer_count,
                                        false);
            }
         }
      } else {
         if (image->samples == 4 || image->samples == 16) {
            anv_perf_warn(cmd_buffer->device->instance, image,
                          "Doing a potentially unnecessary fast-clear to "
                          "define an MCS buffer.");
         }

         assert(base_level == 0 && level_count == 1);
         anv_image_mcs_op(cmd_buffer, image,
                          image->planes[plane].surface.isl.format,
                          aspect, base_layer, layer_count,
                          ISL_AUX_OP_FAST_CLEAR, NULL, false);
      }
      return;
   }

   const enum isl_aux_usage initial_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout);
   const enum isl_aux_usage final_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, final_layout);

   /* The current code assumes that there is no mixing of CCS_E and CCS_D.
    * We can handle transitions between CCS_D/E to and from NONE. What we
    * don't yet handle is switching between CCS_E and CCS_D within a given
    * image. Doing so in a performant way requires more detailed aux state
    * tracking such as what is done in i965. For now, just assume that we
    * only have one type of compression.
    */
   assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
          final_aux_usage == ISL_AUX_USAGE_NONE ||
          initial_aux_usage == final_aux_usage);

   /* If initial aux usage is NONE, there is nothing to resolve */
   if (initial_aux_usage == ISL_AUX_USAGE_NONE)
      return;

   enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;

   /* If the initial layout supports more fast clear than the final layout
    * then we need at least a partial resolve.
    */
   const enum anv_fast_clear_type initial_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
   const enum anv_fast_clear_type final_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
   if (final_fast_clear < initial_fast_clear)
      resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;

   if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
       final_aux_usage != ISL_AUX_USAGE_CCS_E)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   if (resolve_op == ISL_AUX_OP_NONE)
      return;

   /* Perform a resolve to synchronize data between the main and aux buffer.
    * Before we begin, we must satisfy the cache flushing requirement specified
    * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
    *
    *    Any transition from any value in {Clear, Render, Resolve} to a
    *    different value in {Clear, Render, Resolve} requires end of pipe
    *    synchronization.
    *
    * We perform a flush of the write cache before and after the clear and
    * resolve operations to meet this requirement.
    *
    * Unlike other drawing, fast clear operations are not properly
    * synchronized. The first PIPE_CONTROL here likely ensures that the
    * contents of the previous render or clear hit the render target before we
    * resolve and the second likely ensures that the resolve is complete before
    * we do any more rendering or clearing.
    */
   cmd_buffer->state.pending_pipe_bits |=
      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;

   for (uint32_t l = 0; l < level_count; l++) {
      uint32_t level = base_level + l;

      uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
      if (base_layer >= aux_layers)
         break; /* We will only get fewer layers as level increases */
      uint32_t level_layer_count =
         MIN2(layer_count, aux_layers - base_layer);

      for (uint32_t a = 0; a < level_layer_count; a++) {
         uint32_t array_layer = base_layer + a;
         if (image->samples == 1) {
            anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
                                           image->planes[plane].surface.isl.format,
                                           aspect, level, array_layer, resolve_op,
                                           final_fast_clear);
         } else {
            /* We only support fast-clear on the first layer so partial
             * resolves should not be used on other layers as they will use
             * the clear color stored in memory that is only valid for layer0.
             */
            if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
                array_layer != 0)
               continue;

            anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
                                           image->planes[plane].surface.isl.format,
                                           aspect, array_layer, resolve_op,
                                           final_fast_clear);
         }
      }
   }

   cmd_buffer->state.pending_pipe_bits |=
      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
}

/**
 * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
 */
static VkResult
genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                                   struct anv_render_pass *pass,
                                   const VkRenderPassBeginInfo *begin)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
   struct anv_cmd_state *state = &cmd_buffer->state;

   vk_free(&cmd_buffer->pool->alloc, state->attachments);

   if (pass->attachment_count > 0) {
      state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
                                    pass->attachment_count *
                                         sizeof(state->attachments[0]),
                                    8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (state->attachments == NULL) {
         /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
         return anv_batch_set_error(&cmd_buffer->batch,
                                    VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   } else {
      state->attachments = NULL;
   }

   /* Reserve one for the NULL state. */
   unsigned num_states = 1;
   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
      if (vk_format_is_color(pass->attachments[i].format))
         num_states++;

      if (need_input_attachment_state(&pass->attachments[i]))
         num_states++;
   }

   const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
   state->render_pass_states =
      anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
                             num_states * ss_stride, isl_dev->ss.align);

   struct anv_state next_state = state->render_pass_states;
   next_state.alloc_size = isl_dev->ss.size;

   state->null_surface_state = next_state;
   next_state.offset += ss_stride;
   next_state.map += ss_stride;

   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
      if (vk_format_is_color(pass->attachments[i].format)) {
         state->attachments[i].color.state = next_state;
         next_state.offset += ss_stride;
         next_state.map += ss_stride;
      }

      if (need_input_attachment_state(&pass->attachments[i])) {
         state->attachments[i].input.state = next_state;
         next_state.offset += ss_stride;
         next_state.map += ss_stride;
      }
   }
   assert(next_state.offset == state->render_pass_states.offset +
                               state->render_pass_states.alloc_size);

   if (begin) {
      ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer);
      assert(pass->attachment_count == framebuffer->attachment_count);

      isl_null_fill_state(isl_dev, state->null_surface_state.map,
                          isl_extent3d(framebuffer->width,
                                       framebuffer->height,
                                       framebuffer->layers));

      for (uint32_t i = 0; i < pass->attachment_count; ++i) {
         struct anv_render_pass_attachment *att = &pass->attachments[i];
         VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
         VkImageAspectFlags clear_aspects = 0;
         VkImageAspectFlags load_aspects = 0;

         if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            /* color attachment */
            if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
               clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
               load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            }
         } else {
            /* depthstencil attachment */
            if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
               if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               }
            }
            if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
               if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               } else if (att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               }
            }
         }

         state->attachments[i].current_layout = att->initial_layout;
         state->attachments[i].pending_clear_aspects = clear_aspects;
         state->attachments[i].pending_load_aspects = load_aspects;
         if (clear_aspects)
            state->attachments[i].clear_value = begin->pClearValues[i];

         struct anv_image_view *iview = framebuffer->attachments[i];
         anv_assert(iview->vk_format == att->format);

         const uint32_t num_layers = iview->planes[0].isl.array_len;
         state->attachments[i].pending_clear_views = (1 << num_layers) - 1;

         union isl_color_value clear_color = { .u32 = { 0, } };
         if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            anv_assert(iview->n_planes == 1);
            assert(att_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
            color_attachment_compute_aux_usage(cmd_buffer->device,
                                               state, i, begin->renderArea,
                                               &clear_color);

            anv_image_fill_surface_state(cmd_buffer->device,
                                         iview->image,
                                         VK_IMAGE_ASPECT_COLOR_BIT,
                                         &iview->planes[0].isl,
                                         ISL_SURF_USAGE_RENDER_TARGET_BIT,
                                         state->attachments[i].aux_usage,
                                         &clear_color,
                                         0,
                                         &state->attachments[i].color,
                                         NULL);

            add_surface_state_relocs(cmd_buffer, state->attachments[i].color);
         } else {
            depth_stencil_attachment_compute_aux_usage(cmd_buffer->device,
                                                       state, i,
                                                       begin->renderArea);
         }

         if (need_input_attachment_state(&pass->attachments[i])) {
            anv_image_fill_surface_state(cmd_buffer->device,
                                         iview->image,
                                         VK_IMAGE_ASPECT_COLOR_BIT,
                                         &iview->planes[0].isl,
                                         ISL_SURF_USAGE_TEXTURE_BIT,
                                         state->attachments[i].input_aux_usage,
                                         &clear_color,
                                         0,
                                         &state->attachments[i].input,
                                         NULL);

            add_surface_state_relocs(cmd_buffer, state->attachments[i].input);
         }
      }
   }

   return VK_SUCCESS;
}

VkResult
genX(BeginCommandBuffer)(
    VkCommandBuffer                             commandBuffer,
    const VkCommandBufferBeginInfo*             pBeginInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must *initialize* the
    * command buffer's state. Otherwise, we must *reset* its state. In both
    * cases we reset it.
    *
    * From the Vulkan 1.0 spec:
    *
    *    If a command buffer is in the executable state and the command buffer
    *    was allocated from a command pool with the
    *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
    *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
    *    as if vkResetCommandBuffer had been called with
    *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
    *    the command buffer in the recording state.
    */
   anv_cmd_buffer_reset(cmd_buffer);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));

   genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

   /* We sometimes store vertex data in the dynamic state buffer for blorp
    * operations and our dynamic state stream may re-use data from previous
    * command buffers. In order to prevent stale cache data, we flush the VF
    * cache. We could do this on every blorp call but that's not really
    * needed as all of the data will get written by the CPU prior to the GPU
    * executing anything. The chances are fairly high that they will use
    * blorp at least once per primary command buffer so it shouldn't be
    * wasted.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;

   /* We send an "Indirect State Pointers Disable" packet at
    * EndCommandBuffer, so all push constant packets are ignored during a
    * context restore. Documentation says after that command, we need to
    * emit push constants again before any rendering operation. So we
    * flag them dirty here to make sure they get emitted.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;

   VkResult result = VK_SUCCESS;
   if (cmd_buffer->usage_flags &
       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
      assert(pBeginInfo->pInheritanceInfo);
      cmd_buffer->state.pass =
         anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
      cmd_buffer->state.subpass =
         &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];

      /* This is optional in the inheritance info. */
      cmd_buffer->state.framebuffer =
         anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);

      result = genX(cmd_buffer_setup_attachments)(cmd_buffer,
                                                  cmd_buffer->state.pass, NULL);

      /* Record that HiZ is enabled if we can. */
      if (cmd_buffer->state.framebuffer) {
         const struct anv_image_view * const iview =
            anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);

         if (iview) {
            VkImageLayout layout =
               cmd_buffer->state.subpass->depth_stencil_attachment->layout;

            enum isl_aux_usage aux_usage =
               anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
                                       VK_IMAGE_ASPECT_DEPTH_BIT, layout);

            cmd_buffer->state.hiz_enabled = aux_usage == ISL_AUX_USAGE_HIZ;
         }
      }

      cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
   }

   return result;
}

/* From the PRM, Volume 2a:
 *
 *    "Indirect State Pointers Disable
 *
 *    At the completion of the post-sync operation associated with this pipe
 *    control packet, the indirect state pointers in the hardware are
 *    considered invalid; the indirect pointers are not saved in the context.
 *    If any new indirect state commands are executed in the command stream
 *    while the pipe control is pending, the new indirect state commands are
 *    preserved.
 *
 *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
 *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
 *    commands are only considered as Indirect State Pointers. Once ISP is
 *    issued in a context, SW must initialize by programming push constant
 *    commands for all the shaders (at least to zero length) before attempting
 *    any rendering operation for the same context."
 *
 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
 * even though they point to a BO that has been already unreferenced at
 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
 * a BO should be pointing to the scratch page). But on CNL, it is
 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
 * instruction.
 *
 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
 * context restore, so the mentioned hang doesn't happen. However,
 * software must program push constant commands for all stages prior to
 * rendering anything. So we flag them dirty in BeginCommandBuffer.
 *
 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push
 * constants so that it doesn't hang a previous 3DPRIMITIVE.
 */
static void
emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.StallAtPixelScoreboard = true;
      pc.CommandStreamerStallEnable = true;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.IndirectStatePointersDisable = true;
      pc.CommandStreamerStallEnable = true;
   }
}

VkResult
genX(EndCommandBuffer)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return cmd_buffer->batch.status;

   /* We want every command buffer to start with the PMA fix in a known state,
    * so we disable it at the end of the command buffer.
    */
   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   emit_isp_disable(cmd_buffer);

   anv_cmd_buffer_end_batch_buffer(cmd_buffer);

   return VK_SUCCESS;
}

void
genX(CmdExecuteCommands)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    commandBufferCount,
    const VkCommandBuffer*                      pCmdBuffers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);

   assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   if (anv_batch_has_error(&primary->batch))
      return;

   /* The secondary command buffers will assume that the PMA fix is disabled
    * when they begin executing. Make sure this is true.
    */
   genX(cmd_buffer_enable_pma_fix)(primary, false);

   /* The secondary command buffer doesn't know which textures etc. have been
    * flushed prior to their execution. Apply those flushes now.
    */
   genX(cmd_buffer_apply_pipe_flushes)(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);

      assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      assert(!anv_batch_has_error(&secondary->batch));

      if (secondary->usage_flags &
          VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         /* If we're continuing a render pass from the primary, we need to
          * copy the surface states for the current subpass into the storage
          * we allocated for them in BeginCommandBuffer.
          */
         struct anv_bo *ss_bo =
            &primary->device->surface_state_pool.block_pool.bo;
         struct anv_state src_state = primary->state.render_pass_states;
         struct anv_state dst_state = secondary->state.render_pass_states;
         assert(src_state.alloc_size == dst_state.alloc_size);

         genX(cmd_buffer_so_memcpy)(primary,
                                    (struct anv_address) {
                                       .bo = ss_bo,
                                       .offset = dst_state.offset,
                                    },
                                    (struct anv_address) {
                                       .bo = ss_bo,
                                       .offset = src_state.offset,
                                    },
                                    src_state.alloc_size);
      }

      anv_cmd_buffer_add_secondary(primary, secondary);
   }

   /* The secondary may have selected a different pipeline (3D or compute) and
    * may have changed the current L3$ configuration. Reset our tracking
    * variables to invalid values to ensure that we re-emit these in the case
    * where we do any draws or compute dispatches from the primary after the
    * secondary has returned.
    */
   primary->state.current_pipeline = UINT32_MAX;
   primary->state.current_l3_config = NULL;

   /* Each of the secondary command buffers will use its own state base
    * address. We need to re-emit state base address for the primary after
We need to re-emit state base address for the primary after 1556 * all of the secondaries are done. 1557 * 1558 * TODO: Maybe we want to make this a dirty bit to avoid extra state base 1559 * address calls? 1560 */ 1561 genX(cmd_buffer_emit_state_base_address)(primary); 1562} 1563 1564#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000 1565#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000 1566#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000 1567 1568/** 1569 * Program the hardware to use the specified L3 configuration. 1570 */ 1571void 1572genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, 1573 const struct gen_l3_config *cfg) 1574{ 1575 assert(cfg); 1576 if (cfg == cmd_buffer->state.current_l3_config) 1577 return; 1578 1579 if (unlikely(INTEL_DEBUG & DEBUG_L3)) { 1580 intel_logd("L3 config transition: "); 1581 gen_dump_l3_config(cfg, stderr); 1582 } 1583 1584 const bool has_slm = cfg->n[GEN_L3P_SLM]; 1585 1586 /* According to the hardware docs, the L3 partitioning can only be changed 1587 * while the pipeline is completely drained and the caches are flushed, 1588 * which involves a first PIPE_CONTROL flush which stalls the pipeline... 1589 */ 1590 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1591 pc.DCFlushEnable = true; 1592 pc.PostSyncOperation = NoWrite; 1593 pc.CommandStreamerStallEnable = true; 1594 } 1595 1596 /* ...followed by a second pipelined PIPE_CONTROL that initiates 1597 * invalidation of the relevant caches. Note that because RO invalidation 1598 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL 1599 * command is processed by the CS) we cannot combine it with the previous 1600 * stalling flush as the hardware documentation suggests, because that 1601 * would cause the CS to stall on previous rendering *after* RO 1602 * invalidation and wouldn't prevent the RO caches from being polluted by 1603 * concurrent rendering before the stall completes. This intentionally 1604 * doesn't implement the SKL+ hardware workaround suggesting to enable CS 1605 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for 1606 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs 1607 * already guarantee that there is no concurrent GPGPU kernel execution 1608 * (see SKL HSD 2132585). 1609 */ 1610 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1611 pc.TextureCacheInvalidationEnable = true; 1612 pc.ConstantCacheInvalidationEnable = true; 1613 pc.InstructionCacheInvalidateEnable = true; 1614 pc.StateCacheInvalidationEnable = true; 1615 pc.PostSyncOperation = NoWrite; 1616 } 1617 1618 /* Now send a third stalling flush to make sure that invalidation is 1619 * complete when the L3 configuration registers are modified. 1620 */ 1621 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1622 pc.DCFlushEnable = true; 1623 pc.PostSyncOperation = NoWrite; 1624 pc.CommandStreamerStallEnable = true; 1625 } 1626 1627#if GEN_GEN >= 8 1628 1629 assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); 1630 1631 uint32_t l3cr; 1632 anv_pack_struct(&l3cr, GENX(L3CNTLREG), 1633 .SLMEnable = has_slm, 1634 .URBAllocation = cfg->n[GEN_L3P_URB], 1635 .ROAllocation = cfg->n[GEN_L3P_RO], 1636 .DCAllocation = cfg->n[GEN_L3P_DC], 1637 .AllAllocation = cfg->n[GEN_L3P_ALL]); 1638 1639 /* Set up the L3 partitioning. 
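    * The packed L3CNTLREG value is written with a single MI_LOAD_REGISTER_IMM
    * below; the URB/RO/DC/ALL way counts come straight from the gen_l3_config
    * table handed to us by the caller, so nothing is recomputed here.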
*/ 1640 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr); 1641 1642#else 1643 1644 const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL]; 1645 const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] || 1646 cfg->n[GEN_L3P_ALL]; 1647 const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] || 1648 cfg->n[GEN_L3P_ALL]; 1649 const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] || 1650 cfg->n[GEN_L3P_ALL]; 1651 1652 assert(!cfg->n[GEN_L3P_ALL]); 1653 1654 /* When enabled SLM only uses a portion of the L3 on half of the banks, 1655 * the matching space on the remaining banks has to be allocated to a 1656 * client (URB for all validated configurations) set to the 1657 * lower-bandwidth 2-bank address hashing mode. 1658 */ 1659 const struct gen_device_info *devinfo = &cmd_buffer->device->info; 1660 const bool urb_low_bw = has_slm && !devinfo->is_baytrail; 1661 assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); 1662 1663 /* Minimum number of ways that can be allocated to the URB. */ 1664 MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; 1665 assert(cfg->n[GEN_L3P_URB] >= n0_urb); 1666 1667 uint32_t l3sqcr1, l3cr2, l3cr3; 1668 anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1), 1669 .ConvertDC_UC = !has_dc, 1670 .ConvertIS_UC = !has_is, 1671 .ConvertC_UC = !has_c, 1672 .ConvertT_UC = !has_t); 1673 l3sqcr1 |= 1674 GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT : 1675 devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT : 1676 IVB_L3SQCREG1_SQGHPCI_DEFAULT; 1677 1678 anv_pack_struct(&l3cr2, GENX(L3CNTLREG2), 1679 .SLMEnable = has_slm, 1680 .URBLowBandwidth = urb_low_bw, 1681 .URBAllocation = cfg->n[GEN_L3P_URB] - n0_urb, 1682#if !GEN_IS_HASWELL 1683 .ALLAllocation = cfg->n[GEN_L3P_ALL], 1684#endif 1685 .ROAllocation = cfg->n[GEN_L3P_RO], 1686 .DCAllocation = cfg->n[GEN_L3P_DC]); 1687 1688 anv_pack_struct(&l3cr3, GENX(L3CNTLREG3), 1689 .ISAllocation = cfg->n[GEN_L3P_IS], 1690 .ISLowBandwidth = 0, 1691 .CAllocation = cfg->n[GEN_L3P_C], 1692 .CLowBandwidth = 0, 1693 .TAllocation = cfg->n[GEN_L3P_T], 1694 .TLowBandwidth = 0); 1695 1696 /* Set up the L3 partitioning. */ 1697 emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1); 1698 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2); 1699 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3); 1700 1701#if GEN_IS_HASWELL 1702 if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) { 1703 /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep 1704 * them disabled to avoid crashing the system hard. 1705 */ 1706 uint32_t scratch1, chicken3; 1707 anv_pack_struct(&scratch1, GENX(SCRATCH1), 1708 .L3AtomicDisable = !has_dc); 1709 anv_pack_struct(&chicken3, GENX(CHICKEN3), 1710 .L3AtomicDisableMask = true, 1711 .L3AtomicDisable = !has_dc); 1712 emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1); 1713 emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3); 1714 } 1715#endif 1716 1717#endif 1718 1719 cmd_buffer->state.current_l3_config = cfg; 1720} 1721 1722void 1723genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) 1724{ 1725 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; 1726 1727 /* Flushes are pipelined while invalidations are handled immediately. 1728 * Therefore, if we're flushing anything then we need to schedule a stall 1729 * before any invalidations can happen. 
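    *
    * In practice that means the flushing PIPE_CONTROL below goes out first
    * (absorbing the pending CS stall when an invalidate is also queued), and
    * the invalidation PIPE_CONTROL is emitted as a separate, later packet.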
    */
   if (bits & ANV_PIPE_FLUSH_BITS)
      bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;

   /* If we're going to do an invalidate and we have a pending CS stall that
    * has yet to be resolved, we do the CS stall now.
    */
   if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
       (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
      bits |= ANV_PIPE_CS_STALL_BIT;
      bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
   }

   if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
         pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
         pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
         pipe.RenderTargetCacheFlushEnable =
            bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;

         pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
         pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
         pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;

         /*
          * According to the Broadwell documentation, any PIPE_CONTROL with
          * the "Command Streamer Stall" bit set must also have at least one
          * of the following bits set:
          *
          *  - Render Target Cache Flush
          *  - Depth Cache Flush
          *  - Stall at Pixel Scoreboard
          *  - Post-Sync Operation
          *  - Depth Stall
          *  - DC Flush Enable
          *
          * I chose "Stall at Pixel Scoreboard" since that's what we use in
          * mesa and it seems to work fine.  The choice is fairly arbitrary.
          */
         if ((bits & ANV_PIPE_CS_STALL_BIT) &&
             !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
                       ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
            pipe.StallAtPixelScoreboard = true;
      }

      /* If a render target flush was emitted, then we can toggle off the bit
       * saying that render target writes are ongoing.
       */
      if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
         bits &= ~(ANV_PIPE_RENDER_TARGET_WRITES);

      bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
   }

   if (bits & ANV_PIPE_INVALIDATE_BITS) {
      /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
       *
       *    "If the VF Cache Invalidation Enable is set to a 1 in a
       *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
       *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
       *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
       *    a 1."
       *
       * This appears to hang Broadwell, so we restrict it to just gen9.
       */
      if (GEN_GEN == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
         anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
         pipe.StateCacheInvalidationEnable =
            bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
         pipe.ConstantCacheInvalidationEnable =
            bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
         pipe.VFCacheInvalidationEnable =
            bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
         pipe.TextureCacheInvalidationEnable =
            bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
         pipe.InstructionCacheInvalidateEnable =
            bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;

         /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
          *
          *    "When VF Cache Invalidate is set “Post Sync Operation” must be
          *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
          *    “Write Timestamp”."
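          *
          * We satisfy that with a Write Immediate Data post-sync operation
          * targeting the device's workaround BO; the value written there is
          * never read back and only exists to give this PIPE_CONTROL a
          * post-sync operation.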
          */
         if (GEN_GEN == 9 && pipe.VFCacheInvalidationEnable) {
            pipe.PostSyncOperation = WriteImmediateData;
            pipe.Address =
               (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
         }
      }

      bits &= ~ANV_PIPE_INVALIDATE_BITS;
   }

   cmd_buffer->state.pending_pipe_bits = bits;
}

void genX(CmdPipelineBarrier)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags                        srcStageMask,
    VkPipelineStageFlags                        destStageMask,
    VkBool32                                    byRegion,
    uint32_t                                    memoryBarrierCount,
    const VkMemoryBarrier*                      pMemoryBarriers,
    uint32_t                                    bufferMemoryBarrierCount,
    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
    uint32_t                                    imageMemoryBarrierCount,
    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* XXX: Right now, we're really dumb and just flush whatever categories
    * the app asks for.  One of these days we may make this a bit better
    * but right now that's all the hardware allows for in most areas.
    */
   VkAccessFlags src_flags = 0;
   VkAccessFlags dst_flags = 0;

   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      src_flags |= pMemoryBarriers[i].srcAccessMask;
      dst_flags |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
      dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      src_flags |= pImageMemoryBarriers[i].srcAccessMask;
      dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
      ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
      const VkImageSubresourceRange *range =
         &pImageMemoryBarriers[i].subresourceRange;

      if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         transition_depth_buffer(cmd_buffer, image,
                                 pImageMemoryBarriers[i].oldLayout,
                                 pImageMemoryBarriers[i].newLayout);
      } else if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
         VkImageAspectFlags color_aspects =
            anv_image_expand_aspects(image, range->aspectMask);
         uint32_t aspect_bit;

         uint32_t base_layer, layer_count;
         if (image->type == VK_IMAGE_TYPE_3D) {
            base_layer = 0;
            layer_count = anv_minify(image->extent.depth, range->baseMipLevel);
         } else {
            base_layer = range->baseArrayLayer;
            layer_count = anv_get_layerCount(image, range);
         }

         anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
            transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
                                    range->baseMipLevel,
                                    anv_get_levelCount(image, range),
                                    base_layer, layer_count,
                                    pImageMemoryBarriers[i].oldLayout,
                                    pImageMemoryBarriers[i].newLayout);
         }
      }
   }

   cmd_buffer->state.pending_pipe_bits |=
      anv_pipe_flush_bits_for_access_flags(src_flags) |
      anv_pipe_invalidate_bits_for_access_flags(dst_flags);
}

static void
cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
   VkShaderStageFlags stages =
      cmd_buffer->state.gfx.base.pipeline->active_stages;

   /* In order to avoid thrash, we assume that vertex and fragment stages
    * always exist.  In the rare case where one is missing *and* the other
    * uses push constants, this may be suboptimal.  However, avoiding stalls
    * seems more important.
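    *
    * As a worked example of the partitioning below: on a 16KB part with only
    * VS and FS active, each stage gets 16 / 2 = 8KB, VS at offset 0 and FS in
    * the remaining 8KB.  On a 32KB part with all five graphics stages active,
    * VS/HS/DS/GS each get 32 / 5 = 6KB (already a multiple of 2KB) and the FS
    * is handed the 8KB that is left over.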
1911 */ 1912 stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; 1913 1914 if (stages == cmd_buffer->state.push_constant_stages) 1915 return; 1916 1917#if GEN_GEN >= 8 1918 const unsigned push_constant_kb = 32; 1919#elif GEN_IS_HASWELL 1920 const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16; 1921#else 1922 const unsigned push_constant_kb = 16; 1923#endif 1924 1925 const unsigned num_stages = 1926 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); 1927 unsigned size_per_stage = push_constant_kb / num_stages; 1928 1929 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in 1930 * units of 2KB. Incidentally, these are the same platforms that have 1931 * 32KB worth of push constant space. 1932 */ 1933 if (push_constant_kb == 32) 1934 size_per_stage &= ~1u; 1935 1936 uint32_t kb_used = 0; 1937 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) { 1938 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0; 1939 anv_batch_emit(&cmd_buffer->batch, 1940 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { 1941 alloc._3DCommandSubOpcode = 18 + i; 1942 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0; 1943 alloc.ConstantBufferSize = push_size; 1944 } 1945 kb_used += push_size; 1946 } 1947 1948 anv_batch_emit(&cmd_buffer->batch, 1949 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { 1950 alloc.ConstantBufferOffset = kb_used; 1951 alloc.ConstantBufferSize = push_constant_kb - kb_used; 1952 } 1953 1954 cmd_buffer->state.push_constant_stages = stages; 1955 1956 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS: 1957 * 1958 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to 1959 * the next 3DPRIMITIVE command after programming the 1960 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS" 1961 * 1962 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of 1963 * pipeline setup, we need to dirty push constants. 
1964 */ 1965 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; 1966} 1967 1968static const struct anv_descriptor * 1969anv_descriptor_for_binding(const struct anv_cmd_pipeline_state *pipe_state, 1970 const struct anv_pipeline_binding *binding) 1971{ 1972 assert(binding->set < MAX_SETS); 1973 const struct anv_descriptor_set *set = 1974 pipe_state->descriptors[binding->set]; 1975 const uint32_t offset = 1976 set->layout->binding[binding->binding].descriptor_index; 1977 return &set->descriptors[offset + binding->index]; 1978} 1979 1980static uint32_t 1981dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, 1982 const struct anv_pipeline_binding *binding) 1983{ 1984 assert(binding->set < MAX_SETS); 1985 const struct anv_descriptor_set *set = 1986 pipe_state->descriptors[binding->set]; 1987 1988 uint32_t dynamic_offset_idx = 1989 pipe_state->layout->set[binding->set].dynamic_offset_start + 1990 set->layout->binding[binding->binding].dynamic_offset_index + 1991 binding->index; 1992 1993 return pipe_state->dynamic_offsets[dynamic_offset_idx]; 1994} 1995 1996static VkResult 1997emit_binding_table(struct anv_cmd_buffer *cmd_buffer, 1998 gl_shader_stage stage, 1999 struct anv_state *bt_state) 2000{ 2001 const struct gen_device_info *devinfo = &cmd_buffer->device->info; 2002 struct anv_subpass *subpass = cmd_buffer->state.subpass; 2003 struct anv_cmd_pipeline_state *pipe_state; 2004 struct anv_pipeline *pipeline; 2005 uint32_t bias, state_offset; 2006 2007 switch (stage) { 2008 case MESA_SHADER_COMPUTE: 2009 pipe_state = &cmd_buffer->state.compute.base; 2010 bias = 1; 2011 break; 2012 default: 2013 pipe_state = &cmd_buffer->state.gfx.base; 2014 bias = 0; 2015 break; 2016 } 2017 pipeline = pipe_state->pipeline; 2018 2019 if (!anv_pipeline_has_stage(pipeline, stage)) { 2020 *bt_state = (struct anv_state) { 0, }; 2021 return VK_SUCCESS; 2022 } 2023 2024 struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; 2025 if (bias + map->surface_count == 0) { 2026 *bt_state = (struct anv_state) { 0, }; 2027 return VK_SUCCESS; 2028 } 2029 2030 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, 2031 bias + map->surface_count, 2032 &state_offset); 2033 uint32_t *bt_map = bt_state->map; 2034 2035 if (bt_state->map == NULL) 2036 return VK_ERROR_OUT_OF_DEVICE_MEMORY; 2037 2038 if (stage == MESA_SHADER_COMPUTE && 2039 get_cs_prog_data(pipeline)->uses_num_work_groups) { 2040 struct anv_state surface_state; 2041 surface_state = 2042 anv_cmd_buffer_alloc_surface_state(cmd_buffer); 2043 2044 const enum isl_format format = 2045 anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); 2046 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, 2047 format, 2048 cmd_buffer->state.compute.num_workgroups, 2049 12, 1); 2050 2051 bt_map[0] = surface_state.offset + state_offset; 2052 add_surface_reloc(cmd_buffer, surface_state, 2053 cmd_buffer->state.compute.num_workgroups); 2054 } 2055 2056 if (map->surface_count == 0) 2057 goto out; 2058 2059 /* We only use push constant space for images before gen9 */ 2060 if (map->image_count > 0 && devinfo->gen < 9) { 2061 VkResult result = 2062 anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images); 2063 if (result != VK_SUCCESS) 2064 return result; 2065 2066 cmd_buffer->state.push_constants_dirty |= 1 << stage; 2067 } 2068 2069 uint32_t image = 0; 2070 for (uint32_t s = 0; s < map->surface_count; s++) { 2071 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; 2072 
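      /* A binding either refers to one of the driver-internal "sets" (color
       * attachments or inline shader constants), which get special-cased
       * below, or to a real descriptor whose surface state we either reuse
       * or build on the fly.
       */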
2073 struct anv_state surface_state; 2074 2075 if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { 2076 /* Color attachment binding */ 2077 assert(stage == MESA_SHADER_FRAGMENT); 2078 assert(binding->binding == 0); 2079 if (binding->index < subpass->color_count) { 2080 const unsigned att = 2081 subpass->color_attachments[binding->index].attachment; 2082 2083 /* From the Vulkan 1.0.46 spec: 2084 * 2085 * "If any color or depth/stencil attachments are 2086 * VK_ATTACHMENT_UNUSED, then no writes occur for those 2087 * attachments." 2088 */ 2089 if (att == VK_ATTACHMENT_UNUSED) { 2090 surface_state = cmd_buffer->state.null_surface_state; 2091 } else { 2092 surface_state = cmd_buffer->state.attachments[att].color.state; 2093 } 2094 } else { 2095 surface_state = cmd_buffer->state.null_surface_state; 2096 } 2097 2098 bt_map[bias + s] = surface_state.offset + state_offset; 2099 continue; 2100 } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { 2101 struct anv_state surface_state = 2102 anv_cmd_buffer_alloc_surface_state(cmd_buffer); 2103 2104 struct anv_address constant_data = { 2105 .bo = &pipeline->device->dynamic_state_pool.block_pool.bo, 2106 .offset = pipeline->shaders[stage]->constant_data.offset, 2107 }; 2108 unsigned constant_data_size = 2109 pipeline->shaders[stage]->constant_data_size; 2110 2111 const enum isl_format format = 2112 anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); 2113 anv_fill_buffer_surface_state(cmd_buffer->device, 2114 surface_state, format, 2115 constant_data, constant_data_size, 1); 2116 2117 bt_map[bias + s] = surface_state.offset + state_offset; 2118 add_surface_reloc(cmd_buffer, surface_state, constant_data); 2119 continue; 2120 } 2121 2122 const struct anv_descriptor *desc = 2123 anv_descriptor_for_binding(pipe_state, binding); 2124 2125 switch (desc->type) { 2126 case VK_DESCRIPTOR_TYPE_SAMPLER: 2127 /* Nothing for us to do here */ 2128 continue; 2129 2130 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: 2131 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { 2132 struct anv_surface_state sstate = 2133 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 2134 desc->image_view->planes[binding->plane].general_sampler_surface_state : 2135 desc->image_view->planes[binding->plane].optimal_sampler_surface_state; 2136 surface_state = sstate.state; 2137 assert(surface_state.alloc_size); 2138 add_surface_state_relocs(cmd_buffer, sstate); 2139 break; 2140 } 2141 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 2142 assert(stage == MESA_SHADER_FRAGMENT); 2143 if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { 2144 /* For depth and stencil input attachments, we treat it like any 2145 * old texture that a user may have bound. 2146 */ 2147 struct anv_surface_state sstate = 2148 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? 2149 desc->image_view->planes[binding->plane].general_sampler_surface_state : 2150 desc->image_view->planes[binding->plane].optimal_sampler_surface_state; 2151 surface_state = sstate.state; 2152 assert(surface_state.alloc_size); 2153 add_surface_state_relocs(cmd_buffer, sstate); 2154 } else { 2155 /* For color input attachments, we create the surface state at 2156 * vkBeginRenderPass time so that we can include aux and clear 2157 * color information. 
2158 */ 2159 assert(binding->input_attachment_index < subpass->input_count); 2160 const unsigned subpass_att = binding->input_attachment_index; 2161 const unsigned att = subpass->input_attachments[subpass_att].attachment; 2162 surface_state = cmd_buffer->state.attachments[att].input.state; 2163 } 2164 break; 2165 2166 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { 2167 struct anv_surface_state sstate = (binding->write_only) 2168 ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state 2169 : desc->image_view->planes[binding->plane].storage_surface_state; 2170 surface_state = sstate.state; 2171 assert(surface_state.alloc_size); 2172 add_surface_state_relocs(cmd_buffer, sstate); 2173 if (devinfo->gen < 9) { 2174 assert(image < MAX_GEN8_IMAGES); 2175 struct brw_image_param *image_param = 2176 &cmd_buffer->state.push_constants[stage]->images[image]; 2177 2178 *image_param = 2179 desc->image_view->planes[binding->plane].storage_image_param; 2180 } 2181 image++; 2182 break; 2183 } 2184 2185 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 2186 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 2187 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 2188 surface_state = desc->buffer_view->surface_state; 2189 assert(surface_state.alloc_size); 2190 add_surface_reloc(cmd_buffer, surface_state, 2191 desc->buffer_view->address); 2192 break; 2193 2194 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 2195 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { 2196 /* Compute the offset within the buffer */ 2197 uint32_t dynamic_offset = 2198 dynamic_offset_for_binding(pipe_state, binding); 2199 uint64_t offset = desc->offset + dynamic_offset; 2200 /* Clamp to the buffer size */ 2201 offset = MIN2(offset, desc->buffer->size); 2202 /* Clamp the range to the buffer size */ 2203 uint32_t range = MIN2(desc->range, desc->buffer->size - offset); 2204 2205 struct anv_address address = 2206 anv_address_add(desc->buffer->address, offset); 2207 2208 surface_state = 2209 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); 2210 enum isl_format format = 2211 anv_isl_format_for_descriptor_type(desc->type); 2212 2213 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, 2214 format, address, range, 1); 2215 add_surface_reloc(cmd_buffer, surface_state, address); 2216 break; 2217 } 2218 2219 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 2220 surface_state = (binding->write_only) 2221 ? desc->buffer_view->writeonly_storage_surface_state 2222 : desc->buffer_view->storage_surface_state; 2223 assert(surface_state.alloc_size); 2224 add_surface_reloc(cmd_buffer, surface_state, 2225 desc->buffer_view->address); 2226 if (devinfo->gen < 9) { 2227 assert(image < MAX_GEN8_IMAGES); 2228 struct brw_image_param *image_param = 2229 &cmd_buffer->state.push_constants[stage]->images[image]; 2230 2231 *image_param = desc->buffer_view->storage_image_param; 2232 } 2233 image++; 2234 break; 2235 2236 default: 2237 assert(!"Invalid descriptor type"); 2238 continue; 2239 } 2240 2241 bt_map[bias + s] = surface_state.offset + state_offset; 2242 } 2243 assert(image == map->image_count); 2244 2245 out: 2246 anv_state_flush(cmd_buffer->device, *bt_state); 2247 2248#if GEN_GEN >= 11 2249 /* The PIPE_CONTROL command description says: 2250 * 2251 * "Whenever a Binding Table Index (BTI) used by a Render Taget Message 2252 * points to a different RENDER_SURFACE_STATE, SW must issue a Render 2253 * Target Cache Flush by enabling this bit. 
When render target flush 2254 * is set due to new association of BTI, PS Scoreboard Stall bit must 2255 * be set in this packet." 2256 * 2257 * FINISHME: Currently we shuffle around the surface states in the binding 2258 * table based on if they are getting used or not. So, we've to do below 2259 * pipe control flush for every binding table upload. Make changes so 2260 * that we do it only when we modify render target surface states. 2261 */ 2262 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2263 pc.RenderTargetCacheFlushEnable = true; 2264 pc.StallAtPixelScoreboard = true; 2265 } 2266#endif 2267 2268 return VK_SUCCESS; 2269} 2270 2271static VkResult 2272emit_samplers(struct anv_cmd_buffer *cmd_buffer, 2273 gl_shader_stage stage, 2274 struct anv_state *state) 2275{ 2276 struct anv_cmd_pipeline_state *pipe_state = 2277 stage == MESA_SHADER_COMPUTE ? &cmd_buffer->state.compute.base : 2278 &cmd_buffer->state.gfx.base; 2279 struct anv_pipeline *pipeline = pipe_state->pipeline; 2280 2281 if (!anv_pipeline_has_stage(pipeline, stage)) { 2282 *state = (struct anv_state) { 0, }; 2283 return VK_SUCCESS; 2284 } 2285 2286 struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; 2287 if (map->sampler_count == 0) { 2288 *state = (struct anv_state) { 0, }; 2289 return VK_SUCCESS; 2290 } 2291 2292 uint32_t size = map->sampler_count * 16; 2293 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32); 2294 2295 if (state->map == NULL) 2296 return VK_ERROR_OUT_OF_DEVICE_MEMORY; 2297 2298 for (uint32_t s = 0; s < map->sampler_count; s++) { 2299 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; 2300 const struct anv_descriptor *desc = 2301 anv_descriptor_for_binding(pipe_state, binding); 2302 2303 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && 2304 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) 2305 continue; 2306 2307 struct anv_sampler *sampler = desc->sampler; 2308 2309 /* This can happen if we have an unfilled slot since TYPE_SAMPLER 2310 * happens to be zero. 2311 */ 2312 if (sampler == NULL) 2313 continue; 2314 2315 memcpy(state->map + (s * 16), 2316 sampler->state[binding->plane], sizeof(sampler->state[0])); 2317 } 2318 2319 anv_state_flush(cmd_buffer->device, *state); 2320 2321 return VK_SUCCESS; 2322} 2323 2324static uint32_t 2325flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) 2326{ 2327 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2328 2329 VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty & 2330 pipeline->active_stages; 2331 2332 VkResult result = VK_SUCCESS; 2333 anv_foreach_stage(s, dirty) { 2334 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); 2335 if (result != VK_SUCCESS) 2336 break; 2337 result = emit_binding_table(cmd_buffer, s, 2338 &cmd_buffer->state.binding_tables[s]); 2339 if (result != VK_SUCCESS) 2340 break; 2341 } 2342 2343 if (result != VK_SUCCESS) { 2344 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); 2345 2346 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); 2347 if (result != VK_SUCCESS) 2348 return 0; 2349 2350 /* Re-emit state base addresses so we get the new surface state base 2351 * address before we start emitting binding tables etc. 
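       *
       * Any binding tables emitted before this point are offsets from the old
       * surface state base, which is why every active stage's tables are
       * re-emitted below as well.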
2352 */ 2353 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 2354 2355 /* Re-emit all active binding tables */ 2356 dirty |= pipeline->active_stages; 2357 anv_foreach_stage(s, dirty) { 2358 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); 2359 if (result != VK_SUCCESS) { 2360 anv_batch_set_error(&cmd_buffer->batch, result); 2361 return 0; 2362 } 2363 result = emit_binding_table(cmd_buffer, s, 2364 &cmd_buffer->state.binding_tables[s]); 2365 if (result != VK_SUCCESS) { 2366 anv_batch_set_error(&cmd_buffer->batch, result); 2367 return 0; 2368 } 2369 } 2370 } 2371 2372 cmd_buffer->state.descriptors_dirty &= ~dirty; 2373 2374 return dirty; 2375} 2376 2377static void 2378cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, 2379 uint32_t stages) 2380{ 2381 static const uint32_t sampler_state_opcodes[] = { 2382 [MESA_SHADER_VERTEX] = 43, 2383 [MESA_SHADER_TESS_CTRL] = 44, /* HS */ 2384 [MESA_SHADER_TESS_EVAL] = 45, /* DS */ 2385 [MESA_SHADER_GEOMETRY] = 46, 2386 [MESA_SHADER_FRAGMENT] = 47, 2387 [MESA_SHADER_COMPUTE] = 0, 2388 }; 2389 2390 static const uint32_t binding_table_opcodes[] = { 2391 [MESA_SHADER_VERTEX] = 38, 2392 [MESA_SHADER_TESS_CTRL] = 39, 2393 [MESA_SHADER_TESS_EVAL] = 40, 2394 [MESA_SHADER_GEOMETRY] = 41, 2395 [MESA_SHADER_FRAGMENT] = 42, 2396 [MESA_SHADER_COMPUTE] = 0, 2397 }; 2398 2399 anv_foreach_stage(s, stages) { 2400 assert(s < ARRAY_SIZE(binding_table_opcodes)); 2401 assert(binding_table_opcodes[s] > 0); 2402 2403 if (cmd_buffer->state.samplers[s].alloc_size > 0) { 2404 anv_batch_emit(&cmd_buffer->batch, 2405 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) { 2406 ssp._3DCommandSubOpcode = sampler_state_opcodes[s]; 2407 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset; 2408 } 2409 } 2410 2411 /* Always emit binding table pointers if we're asked to, since on SKL 2412 * this is what flushes push constants. 
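       * (On those platforms the 3DSTATE_CONSTANT_* packets only take effect
       * once the matching 3DSTATE_BINDING_TABLE_POINTERS_* is emitted.)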
*/ 2413 anv_batch_emit(&cmd_buffer->batch, 2414 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) { 2415 btp._3DCommandSubOpcode = binding_table_opcodes[s]; 2416 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset; 2417 } 2418 } 2419} 2420 2421static void 2422cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, 2423 VkShaderStageFlags dirty_stages) 2424{ 2425 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; 2426 const struct anv_pipeline *pipeline = gfx_state->base.pipeline; 2427 2428 static const uint32_t push_constant_opcodes[] = { 2429 [MESA_SHADER_VERTEX] = 21, 2430 [MESA_SHADER_TESS_CTRL] = 25, /* HS */ 2431 [MESA_SHADER_TESS_EVAL] = 26, /* DS */ 2432 [MESA_SHADER_GEOMETRY] = 22, 2433 [MESA_SHADER_FRAGMENT] = 23, 2434 [MESA_SHADER_COMPUTE] = 0, 2435 }; 2436 2437 VkShaderStageFlags flushed = 0; 2438 2439 anv_foreach_stage(stage, dirty_stages) { 2440 assert(stage < ARRAY_SIZE(push_constant_opcodes)); 2441 assert(push_constant_opcodes[stage] > 0); 2442 2443 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { 2444 c._3DCommandSubOpcode = push_constant_opcodes[stage]; 2445 2446 if (anv_pipeline_has_stage(pipeline, stage)) { 2447#if GEN_GEN >= 8 || GEN_IS_HASWELL 2448 const struct brw_stage_prog_data *prog_data = 2449 pipeline->shaders[stage]->prog_data; 2450 const struct anv_pipeline_bind_map *bind_map = 2451 &pipeline->shaders[stage]->bind_map; 2452 2453 /* The Skylake PRM contains the following restriction: 2454 * 2455 * "The driver must ensure The following case does not occur 2456 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with 2457 * buffer 3 read length equal to zero committed followed by a 2458 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to 2459 * zero committed." 2460 * 2461 * To avoid this, we program the buffers in the highest slots. 2462 * This way, slot 0 is only used if slot 3 is also used. 
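             *
             * For example, if exactly two UBO ranges end up being pushed,
             * they occupy Buffer[3] and Buffer[2], the driver's own push
             * constant block (if any) then lands in Buffer[1], and Buffer[0]
             * stays unused.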
2463 */ 2464 int n = 3; 2465 2466 for (int i = 3; i >= 0; i--) { 2467 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 2468 if (range->length == 0) 2469 continue; 2470 2471 const unsigned surface = 2472 prog_data->binding_table.ubo_start + range->block; 2473 2474 assert(surface <= bind_map->surface_count); 2475 const struct anv_pipeline_binding *binding = 2476 &bind_map->surface_to_descriptor[surface]; 2477 2478 struct anv_address read_addr; 2479 uint32_t read_len; 2480 if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { 2481 struct anv_address constant_data = { 2482 .bo = &pipeline->device->dynamic_state_pool.block_pool.bo, 2483 .offset = pipeline->shaders[stage]->constant_data.offset, 2484 }; 2485 unsigned constant_data_size = 2486 pipeline->shaders[stage]->constant_data_size; 2487 2488 read_len = MIN2(range->length, 2489 DIV_ROUND_UP(constant_data_size, 32) - range->start); 2490 read_addr = anv_address_add(constant_data, 2491 range->start * 32); 2492 } else { 2493 const struct anv_descriptor *desc = 2494 anv_descriptor_for_binding(&gfx_state->base, binding); 2495 2496 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { 2497 read_len = MIN2(range->length, 2498 DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start); 2499 read_addr = anv_address_add(desc->buffer_view->address, 2500 range->start * 32); 2501 } else { 2502 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); 2503 2504 uint32_t dynamic_offset = 2505 dynamic_offset_for_binding(&gfx_state->base, binding); 2506 uint32_t buf_offset = 2507 MIN2(desc->offset + dynamic_offset, desc->buffer->size); 2508 uint32_t buf_range = 2509 MIN2(desc->range, desc->buffer->size - buf_offset); 2510 2511 read_len = MIN2(range->length, 2512 DIV_ROUND_UP(buf_range, 32) - range->start); 2513 read_addr = anv_address_add(desc->buffer->address, 2514 buf_offset + range->start * 32); 2515 } 2516 } 2517 2518 if (read_len > 0) { 2519 c.ConstantBody.Buffer[n] = read_addr; 2520 c.ConstantBody.ReadLength[n] = read_len; 2521 n--; 2522 } 2523 } 2524 2525 struct anv_state state = 2526 anv_cmd_buffer_push_constants(cmd_buffer, stage); 2527 2528 if (state.alloc_size > 0) { 2529 c.ConstantBody.Buffer[n] = (struct anv_address) { 2530 .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo, 2531 .offset = state.offset, 2532 }; 2533 c.ConstantBody.ReadLength[n] = 2534 DIV_ROUND_UP(state.alloc_size, 32); 2535 } 2536#else 2537 /* For Ivy Bridge, the push constants packets have a different 2538 * rule that would require us to iterate in the other direction 2539 * and possibly mess around with dynamic state base address. 2540 * Don't bother; just emit regular push constants at n = 0. 
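             * (Only an offset is programmed for Buffer[0] here; on these gens
             * the address appears to be interpreted relative to Dynamic State
             * Base Address, which is what the note above about dynamic state
             * base address refers to.)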
2541 */ 2542 struct anv_state state = 2543 anv_cmd_buffer_push_constants(cmd_buffer, stage); 2544 2545 if (state.alloc_size > 0) { 2546 c.ConstantBody.Buffer[0].offset = state.offset, 2547 c.ConstantBody.ReadLength[0] = 2548 DIV_ROUND_UP(state.alloc_size, 32); 2549 } 2550#endif 2551 } 2552 } 2553 2554 flushed |= mesa_to_vk_shader_stage(stage); 2555 } 2556 2557 cmd_buffer->state.push_constants_dirty &= ~flushed; 2558} 2559 2560void 2561genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) 2562{ 2563 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2564 uint32_t *p; 2565 2566 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; 2567 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) 2568 vb_emit |= pipeline->vb_used; 2569 2570 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); 2571 2572 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); 2573 2574 genX(flush_pipeline_select_3d)(cmd_buffer); 2575 2576 if (vb_emit) { 2577 const uint32_t num_buffers = __builtin_popcount(vb_emit); 2578 const uint32_t num_dwords = 1 + num_buffers * 4; 2579 2580 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, 2581 GENX(3DSTATE_VERTEX_BUFFERS)); 2582 uint32_t vb, i = 0; 2583 for_each_bit(vb, vb_emit) { 2584 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; 2585 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset; 2586 2587 struct GENX(VERTEX_BUFFER_STATE) state = { 2588 .VertexBufferIndex = vb, 2589 2590 .VertexBufferMOCS = anv_mocs_for_bo(cmd_buffer->device, 2591 buffer->address.bo), 2592#if GEN_GEN <= 7 2593 .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA, 2594 .InstanceDataStepRate = pipeline->vb[vb].instance_divisor, 2595#endif 2596 2597 .AddressModifyEnable = true, 2598 .BufferPitch = pipeline->vb[vb].stride, 2599 .BufferStartingAddress = anv_address_add(buffer->address, offset), 2600 2601#if GEN_GEN >= 8 2602 .BufferSize = buffer->size - offset 2603#else 2604 .EndAddress = anv_address_add(buffer->address, buffer->size - 1), 2605#endif 2606 }; 2607 2608 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); 2609 i++; 2610 } 2611 } 2612 2613 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; 2614 2615 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { 2616 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); 2617 2618 /* The exact descriptor layout is pulled from the pipeline, so we need 2619 * to re-emit binding tables on every pipeline change. 2620 */ 2621 cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; 2622 2623 /* If the pipeline changed, we may need to re-allocate push constant 2624 * space in the URB. 2625 */ 2626 cmd_buffer_alloc_push_constants(cmd_buffer); 2627 } 2628 2629#if GEN_GEN <= 7 2630 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || 2631 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { 2632 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: 2633 * 2634 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth 2635 * stall needs to be sent just prior to any 3DSTATE_VS, 2636 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, 2637 * 3DSTATE_BINDING_TABLE_POINTER_VS, 2638 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one 2639 * PIPE_CONTROL needs to be sent before any combination of VS 2640 * associated 3DSTATE." 
2641 */ 2642 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2643 pc.DepthStallEnable = true; 2644 pc.PostSyncOperation = WriteImmediateData; 2645 pc.Address = 2646 (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; 2647 } 2648 } 2649#endif 2650 2651 /* Render targets live in the same binding table as fragment descriptors */ 2652 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) 2653 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; 2654 2655 /* We emit the binding tables and sampler tables first, then emit push 2656 * constants and then finally emit binding table and sampler table 2657 * pointers. It has to happen in this order, since emitting the binding 2658 * tables may change the push constants (in case of storage images). After 2659 * emitting push constants, on SKL+ we have to emit the corresponding 2660 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 2661 */ 2662 uint32_t dirty = 0; 2663 if (cmd_buffer->state.descriptors_dirty) 2664 dirty = flush_descriptor_sets(cmd_buffer); 2665 2666 if (dirty || cmd_buffer->state.push_constants_dirty) { 2667 /* Because we're pushing UBOs, we have to push whenever either 2668 * descriptors or push constants is dirty. 2669 */ 2670 dirty |= cmd_buffer->state.push_constants_dirty; 2671 dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS; 2672 cmd_buffer_flush_push_constants(cmd_buffer, dirty); 2673 } 2674 2675 if (dirty) 2676 cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty); 2677 2678 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) 2679 gen8_cmd_buffer_emit_viewport(cmd_buffer); 2680 2681 if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | 2682 ANV_CMD_DIRTY_PIPELINE)) { 2683 gen8_cmd_buffer_emit_depth_viewport(cmd_buffer, 2684 pipeline->depth_clamp_enable); 2685 } 2686 2687 if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR | 2688 ANV_CMD_DIRTY_RENDER_TARGETS)) 2689 gen7_cmd_buffer_emit_scissor(cmd_buffer); 2690 2691 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); 2692 2693 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 2694} 2695 2696static void 2697emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, 2698 struct anv_address addr, 2699 uint32_t size, uint32_t index) 2700{ 2701 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, 2702 GENX(3DSTATE_VERTEX_BUFFERS)); 2703 2704 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, 2705 &(struct GENX(VERTEX_BUFFER_STATE)) { 2706 .VertexBufferIndex = index, 2707 .AddressModifyEnable = true, 2708 .BufferPitch = 0, 2709 .VertexBufferMOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo), 2710#if (GEN_GEN >= 8) 2711 .BufferStartingAddress = addr, 2712 .BufferSize = size 2713#else 2714 .BufferStartingAddress = addr, 2715 .EndAddress = anv_address_add(addr, size), 2716#endif 2717 }); 2718} 2719 2720static void 2721emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, 2722 struct anv_address addr) 2723{ 2724 emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX); 2725} 2726 2727static void 2728emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, 2729 uint32_t base_vertex, uint32_t base_instance) 2730{ 2731 struct anv_state id_state = 2732 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); 2733 2734 ((uint32_t *)id_state.map)[0] = base_vertex; 2735 ((uint32_t *)id_state.map)[1] = base_instance; 2736 2737 anv_state_flush(cmd_buffer->device, id_state); 2738 2739 struct anv_address addr = { 2740 .bo = 
&cmd_buffer->device->dynamic_state_pool.block_pool.bo, 2741 .offset = id_state.offset, 2742 }; 2743 2744 emit_base_vertex_instance_bo(cmd_buffer, addr); 2745} 2746 2747static void 2748emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) 2749{ 2750 struct anv_state state = 2751 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); 2752 2753 ((uint32_t *)state.map)[0] = draw_index; 2754 2755 anv_state_flush(cmd_buffer->device, state); 2756 2757 struct anv_address addr = { 2758 .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo, 2759 .offset = state.offset, 2760 }; 2761 2762 emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); 2763} 2764 2765void genX(CmdDraw)( 2766 VkCommandBuffer commandBuffer, 2767 uint32_t vertexCount, 2768 uint32_t instanceCount, 2769 uint32_t firstVertex, 2770 uint32_t firstInstance) 2771{ 2772 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2773 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2774 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 2775 2776 if (anv_batch_has_error(&cmd_buffer->batch)) 2777 return; 2778 2779 genX(cmd_buffer_flush_state)(cmd_buffer); 2780 2781 if (vs_prog_data->uses_firstvertex || 2782 vs_prog_data->uses_baseinstance) 2783 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); 2784 if (vs_prog_data->uses_drawid) 2785 emit_draw_index(cmd_buffer, 0); 2786 2787 /* Our implementation of VK_KHR_multiview uses instancing to draw the 2788 * different views. We need to multiply instanceCount by the view count. 2789 */ 2790 instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 2791 2792 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 2793 prim.VertexAccessType = SEQUENTIAL; 2794 prim.PrimitiveTopologyType = pipeline->topology; 2795 prim.VertexCountPerInstance = vertexCount; 2796 prim.StartVertexLocation = firstVertex; 2797 prim.InstanceCount = instanceCount; 2798 prim.StartInstanceLocation = firstInstance; 2799 prim.BaseVertexLocation = 0; 2800 } 2801 2802 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; 2803} 2804 2805void genX(CmdDrawIndexed)( 2806 VkCommandBuffer commandBuffer, 2807 uint32_t indexCount, 2808 uint32_t instanceCount, 2809 uint32_t firstIndex, 2810 int32_t vertexOffset, 2811 uint32_t firstInstance) 2812{ 2813 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2814 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2815 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 2816 2817 if (anv_batch_has_error(&cmd_buffer->batch)) 2818 return; 2819 2820 genX(cmd_buffer_flush_state)(cmd_buffer); 2821 2822 if (vs_prog_data->uses_firstvertex || 2823 vs_prog_data->uses_baseinstance) 2824 emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance); 2825 if (vs_prog_data->uses_drawid) 2826 emit_draw_index(cmd_buffer, 0); 2827 2828 /* Our implementation of VK_KHR_multiview uses instancing to draw the 2829 * different views. We need to multiply instanceCount by the view count. 
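    * For example, a subpass with two views and an application instanceCount
    * of 3 results in the hardware seeing InstanceCount == 6.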
2830 */ 2831 instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 2832 2833 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 2834 prim.VertexAccessType = RANDOM; 2835 prim.PrimitiveTopologyType = pipeline->topology; 2836 prim.VertexCountPerInstance = indexCount; 2837 prim.StartVertexLocation = firstIndex; 2838 prim.InstanceCount = instanceCount; 2839 prim.StartInstanceLocation = firstInstance; 2840 prim.BaseVertexLocation = vertexOffset; 2841 } 2842 2843 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; 2844} 2845 2846/* Auto-Draw / Indirect Registers */ 2847#define GEN7_3DPRIM_END_OFFSET 0x2420 2848#define GEN7_3DPRIM_START_VERTEX 0x2430 2849#define GEN7_3DPRIM_VERTEX_COUNT 0x2434 2850#define GEN7_3DPRIM_INSTANCE_COUNT 0x2438 2851#define GEN7_3DPRIM_START_INSTANCE 0x243C 2852#define GEN7_3DPRIM_BASE_VERTEX 0x2440 2853 2854/* MI_MATH only exists on Haswell+ */ 2855#if GEN_IS_HASWELL || GEN_GEN >= 8 2856 2857/* Emit dwords to multiply GPR0 by N */ 2858static void 2859build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N) 2860{ 2861 VK_OUTARRAY_MAKE(out, dw, dw_count); 2862 2863#define append_alu(opcode, operand1, operand2) \ 2864 vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2) 2865 2866 assert(N > 0); 2867 unsigned top_bit = 31 - __builtin_clz(N); 2868 for (int i = top_bit - 1; i >= 0; i--) { 2869 /* We get our initial data in GPR0 and we write the final data out to 2870 * GPR0 but we use GPR1 as our scratch register. 2871 */ 2872 unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1; 2873 unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1; 2874 2875 /* Shift the current value left by 1 */ 2876 append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg); 2877 append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg); 2878 append_alu(MI_ALU_ADD, 0, 0); 2879 2880 if (N & (1 << i)) { 2881 /* Store ACCU to R1 and add R0 to R1 */ 2882 append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU); 2883 append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0); 2884 append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1); 2885 append_alu(MI_ALU_ADD, 0, 0); 2886 } 2887 2888 append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU); 2889 } 2890 2891#undef append_alu 2892} 2893 2894static void 2895emit_mul_gpr0(struct anv_batch *batch, uint32_t N) 2896{ 2897 uint32_t num_dwords; 2898 build_alu_multiply_gpr0(NULL, &num_dwords, N); 2899 2900 uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH)); 2901 build_alu_multiply_gpr0(dw + 1, &num_dwords, N); 2902} 2903 2904#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ 2905 2906static void 2907load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, 2908 struct anv_address addr, 2909 bool indexed) 2910{ 2911 struct anv_batch *batch = &cmd_buffer->batch; 2912 2913 emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, anv_address_add(addr, 0)); 2914 2915 unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass); 2916 if (view_count > 1) { 2917#if GEN_IS_HASWELL || GEN_GEN >= 8 2918 emit_lrm(batch, CS_GPR(0), anv_address_add(addr, 4)); 2919 emit_mul_gpr0(batch, view_count); 2920 emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0)); 2921#else 2922 anv_finishme("Multiview + indirect draw requires MI_MATH; " 2923 "MI_MATH is not supported on Ivy Bridge"); 2924 emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, anv_address_add(addr, 4)); 2925#endif 2926 } else { 2927 emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, anv_address_add(addr, 4)); 2928 } 2929 2930 emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, 
anv_address_add(addr, 8)); 2931 2932 if (indexed) { 2933 emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, anv_address_add(addr, 12)); 2934 emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, anv_address_add(addr, 16)); 2935 } else { 2936 emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, anv_address_add(addr, 12)); 2937 emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0); 2938 } 2939} 2940 2941void genX(CmdDrawIndirect)( 2942 VkCommandBuffer commandBuffer, 2943 VkBuffer _buffer, 2944 VkDeviceSize offset, 2945 uint32_t drawCount, 2946 uint32_t stride) 2947{ 2948 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2949 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 2950 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2951 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 2952 2953 if (anv_batch_has_error(&cmd_buffer->batch)) 2954 return; 2955 2956 genX(cmd_buffer_flush_state)(cmd_buffer); 2957 2958 for (uint32_t i = 0; i < drawCount; i++) { 2959 struct anv_address draw = anv_address_add(buffer->address, offset); 2960 2961 if (vs_prog_data->uses_firstvertex || 2962 vs_prog_data->uses_baseinstance) 2963 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); 2964 if (vs_prog_data->uses_drawid) 2965 emit_draw_index(cmd_buffer, i); 2966 2967 load_indirect_parameters(cmd_buffer, draw, false); 2968 2969 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 2970 prim.IndirectParameterEnable = true; 2971 prim.VertexAccessType = SEQUENTIAL; 2972 prim.PrimitiveTopologyType = pipeline->topology; 2973 } 2974 2975 offset += stride; 2976 } 2977 2978 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; 2979} 2980 2981void genX(CmdDrawIndexedIndirect)( 2982 VkCommandBuffer commandBuffer, 2983 VkBuffer _buffer, 2984 VkDeviceSize offset, 2985 uint32_t drawCount, 2986 uint32_t stride) 2987{ 2988 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2989 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 2990 struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; 2991 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 2992 2993 if (anv_batch_has_error(&cmd_buffer->batch)) 2994 return; 2995 2996 genX(cmd_buffer_flush_state)(cmd_buffer); 2997 2998 for (uint32_t i = 0; i < drawCount; i++) { 2999 struct anv_address draw = anv_address_add(buffer->address, offset); 3000 3001 /* TODO: We need to stomp base vertex to 0 somehow */ 3002 if (vs_prog_data->uses_firstvertex || 3003 vs_prog_data->uses_baseinstance) 3004 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); 3005 if (vs_prog_data->uses_drawid) 3006 emit_draw_index(cmd_buffer, i); 3007 3008 load_indirect_parameters(cmd_buffer, draw, true); 3009 3010 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 3011 prim.IndirectParameterEnable = true; 3012 prim.VertexAccessType = RANDOM; 3013 prim.PrimitiveTopologyType = pipeline->topology; 3014 } 3015 3016 offset += stride; 3017 } 3018 3019 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; 3020} 3021 3022static VkResult 3023flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) 3024{ 3025 struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; 3026 struct anv_state surfaces = { 0, }, samplers = { 0, }; 3027 VkResult result; 3028 3029 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); 3030 if (result != VK_SUCCESS) { 3031 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); 3032 3033 result = 
anv_cmd_buffer_new_binding_table_block(cmd_buffer); 3034 if (result != VK_SUCCESS) 3035 return result; 3036 3037 /* Re-emit state base addresses so we get the new surface state base 3038 * address before we start emitting binding tables etc. 3039 */ 3040 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 3041 3042 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); 3043 if (result != VK_SUCCESS) { 3044 anv_batch_set_error(&cmd_buffer->batch, result); 3045 return result; 3046 } 3047 } 3048 3049 result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); 3050 if (result != VK_SUCCESS) { 3051 anv_batch_set_error(&cmd_buffer->batch, result); 3052 return result; 3053 } 3054 3055 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; 3056 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { 3057 .BindingTablePointer = surfaces.offset, 3058 .SamplerStatePointer = samplers.offset, 3059 }; 3060 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); 3061 3062 struct anv_state state = 3063 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, 3064 pipeline->interface_descriptor_data, 3065 GENX(INTERFACE_DESCRIPTOR_DATA_length), 3066 64); 3067 3068 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); 3069 anv_batch_emit(&cmd_buffer->batch, 3070 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { 3071 mid.InterfaceDescriptorTotalLength = size; 3072 mid.InterfaceDescriptorDataStartAddress = state.offset; 3073 } 3074 3075 return VK_SUCCESS; 3076} 3077 3078void 3079genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) 3080{ 3081 struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; 3082 MAYBE_UNUSED VkResult result; 3083 3084 assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); 3085 3086 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); 3087 3088 genX(flush_pipeline_select_gpgpu)(cmd_buffer); 3089 3090 if (cmd_buffer->state.compute.pipeline_dirty) { 3091 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: 3092 * 3093 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless 3094 * the only bits that are changed are scoreboard related: Scoreboard 3095 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For 3096 * these scoreboard related states, a MEDIA_STATE_FLUSH is 3097 * sufficient." 
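       *
       * The pipeline batch we are about to replay contains MEDIA_VFE_STATE,
       * so queue a CS stall here and flush it before that batch is emitted.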
3098 */ 3099 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; 3100 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 3101 3102 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); 3103 } 3104 3105 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || 3106 cmd_buffer->state.compute.pipeline_dirty) { 3107 /* FIXME: figure out descriptors for gen7 */ 3108 result = flush_compute_descriptor_set(cmd_buffer); 3109 if (result != VK_SUCCESS) 3110 return; 3111 3112 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3113 } 3114 3115 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { 3116 struct anv_state push_state = 3117 anv_cmd_buffer_cs_push_constants(cmd_buffer); 3118 3119 if (push_state.alloc_size) { 3120 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { 3121 curbe.CURBETotalDataLength = push_state.alloc_size; 3122 curbe.CURBEDataStartAddress = push_state.offset; 3123 } 3124 } 3125 3126 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3127 } 3128 3129 cmd_buffer->state.compute.pipeline_dirty = false; 3130 3131 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 3132} 3133 3134#if GEN_GEN == 7 3135 3136static VkResult 3137verify_cmd_parser(const struct anv_device *device, 3138 int required_version, 3139 const char *function) 3140{ 3141 if (device->instance->physicalDevice.cmd_parser_version < required_version) { 3142 return vk_errorf(device->instance, device->instance, 3143 VK_ERROR_FEATURE_NOT_PRESENT, 3144 "cmd parser version %d is required for %s", 3145 required_version, function); 3146 } else { 3147 return VK_SUCCESS; 3148 } 3149} 3150 3151#endif 3152 3153static void 3154anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, 3155 uint32_t baseGroupX, 3156 uint32_t baseGroupY, 3157 uint32_t baseGroupZ) 3158{ 3159 if (anv_batch_has_error(&cmd_buffer->batch)) 3160 return; 3161 3162 VkResult result = 3163 anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, MESA_SHADER_COMPUTE, 3164 base_work_group_id); 3165 if (result != VK_SUCCESS) { 3166 cmd_buffer->batch.status = result; 3167 return; 3168 } 3169 3170 struct anv_push_constants *push = 3171 cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; 3172 if (push->base_work_group_id[0] != baseGroupX || 3173 push->base_work_group_id[1] != baseGroupY || 3174 push->base_work_group_id[2] != baseGroupZ) { 3175 push->base_work_group_id[0] = baseGroupX; 3176 push->base_work_group_id[1] = baseGroupY; 3177 push->base_work_group_id[2] = baseGroupZ; 3178 3179 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; 3180 } 3181} 3182 3183void genX(CmdDispatch)( 3184 VkCommandBuffer commandBuffer, 3185 uint32_t x, 3186 uint32_t y, 3187 uint32_t z) 3188{ 3189 genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z); 3190} 3191 3192void genX(CmdDispatchBase)( 3193 VkCommandBuffer commandBuffer, 3194 uint32_t baseGroupX, 3195 uint32_t baseGroupY, 3196 uint32_t baseGroupZ, 3197 uint32_t groupCountX, 3198 uint32_t groupCountY, 3199 uint32_t groupCountZ) 3200{ 3201 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 3202 struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; 3203 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); 3204 3205 anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX, 3206 baseGroupY, baseGroupZ); 3207 3208 if (anv_batch_has_error(&cmd_buffer->batch)) 3209 return; 3210 3211 if (prog_data->uses_num_work_groups) { 3212 struct anv_state state = 3213 
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4); 3214 uint32_t *sizes = state.map; 3215 sizes[0] = groupCountX; 3216 sizes[1] = groupCountY; 3217 sizes[2] = groupCountZ; 3218 anv_state_flush(cmd_buffer->device, state); 3219 cmd_buffer->state.compute.num_workgroups = (struct anv_address) { 3220 .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo, 3221 .offset = state.offset, 3222 }; 3223 } 3224 3225 genX(cmd_buffer_flush_compute_state)(cmd_buffer); 3226 3227 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { 3228 ggw.SIMDSize = prog_data->simd_size / 16; 3229 ggw.ThreadDepthCounterMaximum = 0; 3230 ggw.ThreadHeightCounterMaximum = 0; 3231 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; 3232 ggw.ThreadGroupIDXDimension = groupCountX; 3233 ggw.ThreadGroupIDYDimension = groupCountY; 3234 ggw.ThreadGroupIDZDimension = groupCountZ; 3235 ggw.RightExecutionMask = pipeline->cs_right_mask; 3236 ggw.BottomExecutionMask = 0xffffffff; 3237 } 3238 3239 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf); 3240} 3241 3242#define GPGPU_DISPATCHDIMX 0x2500 3243#define GPGPU_DISPATCHDIMY 0x2504 3244#define GPGPU_DISPATCHDIMZ 0x2508 3245 3246void genX(CmdDispatchIndirect)( 3247 VkCommandBuffer commandBuffer, 3248 VkBuffer _buffer, 3249 VkDeviceSize offset) 3250{ 3251 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 3252 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 3253 struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; 3254 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); 3255 struct anv_address addr = anv_address_add(buffer->address, offset); 3256 struct anv_batch *batch = &cmd_buffer->batch; 3257 3258 anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0); 3259 3260#if GEN_GEN == 7 3261 /* Linux 4.4 added command parser version 5 which allows the GPGPU 3262 * indirect dispatch registers to be written. 
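 *
 * verify_cmd_parser() below checks for this and simply drops the
 * dispatch on kernels whose command parser is too old.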
3263 */ 3264 if (verify_cmd_parser(cmd_buffer->device, 5, 3265 "vkCmdDispatchIndirect") != VK_SUCCESS) 3266 return; 3267#endif 3268 3269 if (prog_data->uses_num_work_groups) 3270 cmd_buffer->state.compute.num_workgroups = addr; 3271 3272 genX(cmd_buffer_flush_compute_state)(cmd_buffer); 3273 3274 emit_lrm(batch, GPGPU_DISPATCHDIMX, anv_address_add(addr, 0)); 3275 emit_lrm(batch, GPGPU_DISPATCHDIMY, anv_address_add(addr, 4)); 3276 emit_lrm(batch, GPGPU_DISPATCHDIMZ, anv_address_add(addr, 8)); 3277 3278#if GEN_GEN <= 7 3279 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ 3280 emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0); 3281 emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0); 3282 emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0); 3283 3284 /* Load compute_dispatch_indirect_x_size into SRC0 */ 3285 emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 0)); 3286 3287 /* predicate = (compute_dispatch_indirect_x_size == 0); */ 3288 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 3289 mip.LoadOperation = LOAD_LOAD; 3290 mip.CombineOperation = COMBINE_SET; 3291 mip.CompareOperation = COMPARE_SRCS_EQUAL; 3292 } 3293 3294 /* Load compute_dispatch_indirect_y_size into SRC0 */ 3295 emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 4)); 3296 3297 /* predicate |= (compute_dispatch_indirect_y_size == 0); */ 3298 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 3299 mip.LoadOperation = LOAD_LOAD; 3300 mip.CombineOperation = COMBINE_OR; 3301 mip.CompareOperation = COMPARE_SRCS_EQUAL; 3302 } 3303 3304 /* Load compute_dispatch_indirect_z_size into SRC0 */ 3305 emit_lrm(batch, MI_PREDICATE_SRC0, anv_address_add(addr, 8)); 3306 3307 /* predicate |= (compute_dispatch_indirect_z_size == 0); */ 3308 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 3309 mip.LoadOperation = LOAD_LOAD; 3310 mip.CombineOperation = COMBINE_OR; 3311 mip.CompareOperation = COMPARE_SRCS_EQUAL; 3312 } 3313 3314 /* predicate = !predicate; */ 3315#define COMPARE_FALSE 1 3316 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 3317 mip.LoadOperation = LOAD_LOADINV; 3318 mip.CombineOperation = COMBINE_OR; 3319 mip.CompareOperation = COMPARE_FALSE; 3320 } 3321#endif 3322 3323 anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) { 3324 ggw.IndirectParameterEnable = true; 3325 ggw.PredicateEnable = GEN_GEN <= 7; 3326 ggw.SIMDSize = prog_data->simd_size / 16; 3327 ggw.ThreadDepthCounterMaximum = 0; 3328 ggw.ThreadHeightCounterMaximum = 0; 3329 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; 3330 ggw.RightExecutionMask = pipeline->cs_right_mask; 3331 ggw.BottomExecutionMask = 0xffffffff; 3332 } 3333 3334 anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf); 3335} 3336 3337static void 3338genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, 3339 uint32_t pipeline) 3340{ 3341 UNUSED const struct gen_device_info *devinfo = &cmd_buffer->device->info; 3342 3343 if (cmd_buffer->state.current_pipeline == pipeline) 3344 return; 3345 3346#if GEN_GEN >= 8 && GEN_GEN < 10 3347 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: 3348 * 3349 * Software must clear the COLOR_CALC_STATE Valid field in 3350 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT 3351 * with Pipeline Select set to GPGPU. 3352 * 3353 * The internal hardware docs recommend the same workaround for Gen9 3354 * hardware too. 
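 *
 * Emitting an otherwise empty 3DSTATE_CC_STATE_POINTERS below is enough
 * to clear that Valid bit before we switch to GPGPU.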
3355 */ 3356 if (pipeline == GPGPU) 3357 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); 3358#endif 3359 3360 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] 3361 * PIPELINE_SELECT [DevBWR+]": 3362 * 3363 * Project: DEVSNB+ 3364 * 3365 * Software must ensure all the write caches are flushed through a 3366 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL 3367 * command to invalidate read only caches prior to programming 3368 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. 3369 */ 3370 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 3371 pc.RenderTargetCacheFlushEnable = true; 3372 pc.DepthCacheFlushEnable = true; 3373 pc.DCFlushEnable = true; 3374 pc.PostSyncOperation = NoWrite; 3375 pc.CommandStreamerStallEnable = true; 3376 } 3377 3378 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 3379 pc.TextureCacheInvalidationEnable = true; 3380 pc.ConstantCacheInvalidationEnable = true; 3381 pc.StateCacheInvalidationEnable = true; 3382 pc.InstructionCacheInvalidateEnable = true; 3383 pc.PostSyncOperation = NoWrite; 3384 } 3385 3386 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { 3387#if GEN_GEN >= 9 3388 ps.MaskBits = 3; 3389#endif 3390 ps.PipelineSelection = pipeline; 3391 } 3392 3393#if GEN_GEN == 9 3394 if (devinfo->is_geminilake) { 3395 /* Project: DevGLK 3396 * 3397 * "This chicken bit works around a hardware issue with barrier logic 3398 * encountered when switching between GPGPU and 3D pipelines. To 3399 * workaround the issue, this mode bit should be set after a pipeline 3400 * is selected." 3401 */ 3402 uint32_t scec; 3403 anv_pack_struct(&scec, GENX(SLICE_COMMON_ECO_CHICKEN1), 3404 .GLKBarrierMode = 3405 pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU 3406 : GLK_BARRIER_MODE_3D_HULL, 3407 .GLKBarrierModeMask = 1); 3408 emit_lri(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1_num), scec); 3409 } 3410#endif 3411 3412 cmd_buffer->state.current_pipeline = pipeline; 3413} 3414 3415void 3416genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer) 3417{ 3418 genX(flush_pipeline_select)(cmd_buffer, _3D); 3419} 3420 3421void 3422genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) 3423{ 3424 genX(flush_pipeline_select)(cmd_buffer, GPGPU); 3425} 3426 3427void 3428genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) 3429{ 3430 if (GEN_GEN >= 8) 3431 return; 3432 3433 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: 3434 * 3435 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any 3436 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3437 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first 3438 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit 3439 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with 3440 * Depth Flush Bit set, followed by another pipelined depth stall 3441 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise 3442 * guarantee that the pipeline from WM onwards is already flushed (e.g., 3443 * via a preceding MI_FLUSH)." 
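 *
 * The three PIPE_CONTROLs emitted below implement exactly that
 * stall / flush / stall sequence.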
3444 */ 3445 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 3446 pipe.DepthStallEnable = true; 3447 } 3448 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 3449 pipe.DepthCacheFlushEnable = true; 3450 } 3451 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 3452 pipe.DepthStallEnable = true; 3453 } 3454} 3455 3456static void 3457cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) 3458{ 3459 struct anv_device *device = cmd_buffer->device; 3460 const struct anv_image_view *iview = 3461 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); 3462 const struct anv_image *image = iview ? iview->image : NULL; 3463 3464 /* FIXME: Width and Height are wrong */ 3465 3466 genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); 3467 3468 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, 3469 device->isl_dev.ds.size / 4); 3470 if (dw == NULL) 3471 return; 3472 3473 struct isl_depth_stencil_hiz_emit_info info = { }; 3474 3475 if (iview) 3476 info.view = &iview->planes[0].isl; 3477 3478 if (image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { 3479 uint32_t depth_plane = 3480 anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); 3481 const struct anv_surface *surface = &image->planes[depth_plane].surface; 3482 3483 info.depth_surf = &surface->isl; 3484 3485 info.depth_address = 3486 anv_batch_emit_reloc(&cmd_buffer->batch, 3487 dw + device->isl_dev.ds.depth_offset / 4, 3488 image->planes[depth_plane].address.bo, 3489 image->planes[depth_plane].address.offset + 3490 surface->offset); 3491 info.mocs = 3492 anv_mocs_for_bo(device, image->planes[depth_plane].address.bo); 3493 3494 const uint32_t ds = 3495 cmd_buffer->state.subpass->depth_stencil_attachment->attachment; 3496 info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage; 3497 if (info.hiz_usage == ISL_AUX_USAGE_HIZ) { 3498 info.hiz_surf = &image->planes[depth_plane].aux_surface.isl; 3499 3500 info.hiz_address = 3501 anv_batch_emit_reloc(&cmd_buffer->batch, 3502 dw + device->isl_dev.ds.hiz_offset / 4, 3503 image->planes[depth_plane].address.bo, 3504 image->planes[depth_plane].address.offset + 3505 image->planes[depth_plane].aux_surface.offset); 3506 3507 info.depth_clear_value = ANV_HZ_FC_VAL; 3508 } 3509 } 3510 3511 if (image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) { 3512 uint32_t stencil_plane = 3513 anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_STENCIL_BIT); 3514 const struct anv_surface *surface = &image->planes[stencil_plane].surface; 3515 3516 info.stencil_surf = &surface->isl; 3517 3518 info.stencil_address = 3519 anv_batch_emit_reloc(&cmd_buffer->batch, 3520 dw + device->isl_dev.ds.stencil_offset / 4, 3521 image->planes[stencil_plane].address.bo, 3522 image->planes[stencil_plane].address.offset + 3523 surface->offset); 3524 info.mocs = 3525 anv_mocs_for_bo(device, image->planes[stencil_plane].address.bo); 3526 } 3527 3528 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); 3529 3530 cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ; 3531} 3532 3533/** 3534 * This ANDs the view mask of the current subpass with the pending clear 3535 * views in the attachment to get the mask of views active in the subpass 3536 * that still need to be cleared. 
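 *
 * For example, with a subpass view_mask of 0b0110 and pending_clear_views
 * of 0b0100, the result is 0b0100: only view 2 still needs its clear.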
3537 */ 3538static inline uint32_t 3539get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state, 3540 const struct anv_attachment_state *att_state) 3541{ 3542 return cmd_state->subpass->view_mask & att_state->pending_clear_views; 3543} 3544 3545static inline bool 3546do_first_layer_clear(const struct anv_cmd_state *cmd_state, 3547 const struct anv_attachment_state *att_state) 3548{ 3549 if (!cmd_state->subpass->view_mask) 3550 return true; 3551 3552 uint32_t pending_clear_mask = 3553 get_multiview_subpass_clear_mask(cmd_state, att_state); 3554 3555 return pending_clear_mask & 1; 3556} 3557 3558static inline bool 3559current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state, 3560 uint32_t att_idx) 3561{ 3562 const uint32_t last_subpass_idx = 3563 cmd_state->pass->attachments[att_idx].last_subpass_idx; 3564 const struct anv_subpass *last_subpass = 3565 &cmd_state->pass->subpasses[last_subpass_idx]; 3566 return last_subpass == cmd_state->subpass; 3567} 3568 3569static void 3570cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, 3571 uint32_t subpass_id) 3572{ 3573 struct anv_cmd_state *cmd_state = &cmd_buffer->state; 3574 struct anv_subpass *subpass = &cmd_state->pass->subpasses[subpass_id]; 3575 cmd_state->subpass = subpass; 3576 3577 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; 3578 3579 /* Our implementation of VK_KHR_multiview uses instancing to draw the 3580 * different views. If the client asks for instancing, we need to use the 3581 * Instance Data Step Rate to ensure that we repeat the client's 3582 * per-instance data once for each view. Since this bit is in 3583 * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top 3584 * of each subpass. 3585 */ 3586 if (GEN_GEN == 7) 3587 cmd_buffer->state.gfx.vb_dirty |= ~0; 3588 3589 /* It is possible to start a render pass with an old pipeline. Because the 3590 * render pass and subpass index are both baked into the pipeline, this is 3591 * highly unlikely. In order to do so, it requires that you have a render 3592 * pass with a single subpass and that you use that render pass twice 3593 * back-to-back and use the same pipeline at the start of the second render 3594 * pass as at the end of the first. In order to avoid unpredictable issues 3595 * with this edge case, we just dirty the pipeline at the start of every 3596 * subpass. 3597 */ 3598 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; 3599 3600 /* Accumulate any subpass flushes that need to happen before the subpass */ 3601 cmd_buffer->state.pending_pipe_bits |= 3602 cmd_buffer->state.pass->subpass_flushes[subpass_id]; 3603 3604 VkRect2D render_area = cmd_buffer->state.render_area; 3605 struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; 3606 3607 bool is_multiview = subpass->view_mask != 0; 3608 3609 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 3610 const uint32_t a = subpass->attachments[i].attachment; 3611 if (a == VK_ATTACHMENT_UNUSED) 3612 continue; 3613 3614 assert(a < cmd_state->pass->attachment_count); 3615 struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 3616 3617 struct anv_image_view *iview = fb->attachments[a]; 3618 const struct anv_image *image = iview->image; 3619 3620 /* A resolve is necessary before use as an input attachment if the clear 3621 * color or auxiliary buffer usage isn't supported by the sampler. 
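 *
 * When such a resolve is needed, the code below keeps the layout
 * requested by the subpass instead of forcing
 * VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL.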
3622 */ 3623 const bool input_needs_resolve = 3624 (att_state->fast_clear && !att_state->clear_color_is_zero_one) || 3625 att_state->input_aux_usage != att_state->aux_usage; 3626 3627 VkImageLayout target_layout; 3628 if (iview->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV && 3629 !input_needs_resolve) { 3630 /* Layout transitions before the final only help to enable sampling 3631 * as an input attachment. If the input attachment supports sampling 3632 * using the auxiliary surface, we can skip such transitions by 3633 * making the target layout one that is CCS-aware. 3634 */ 3635 target_layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; 3636 } else { 3637 target_layout = subpass->attachments[i].layout; 3638 } 3639 3640 if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 3641 assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); 3642 3643 uint32_t base_layer, layer_count; 3644 if (image->type == VK_IMAGE_TYPE_3D) { 3645 base_layer = 0; 3646 layer_count = anv_minify(iview->image->extent.depth, 3647 iview->planes[0].isl.base_level); 3648 } else { 3649 base_layer = iview->planes[0].isl.base_array_layer; 3650 layer_count = fb->layers; 3651 } 3652 3653 transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 3654 iview->planes[0].isl.base_level, 1, 3655 base_layer, layer_count, 3656 att_state->current_layout, target_layout); 3657 } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 3658 transition_depth_buffer(cmd_buffer, image, 3659 att_state->current_layout, target_layout); 3660 att_state->aux_usage = 3661 anv_layout_to_aux_usage(&cmd_buffer->device->info, image, 3662 VK_IMAGE_ASPECT_DEPTH_BIT, target_layout); 3663 } 3664 att_state->current_layout = target_layout; 3665 3666 if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { 3667 assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); 3668 3669 /* Multi-planar images are not supported as attachments */ 3670 assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); 3671 assert(image->n_planes == 1); 3672 3673 uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer; 3674 uint32_t clear_layer_count = fb->layers; 3675 3676 if (att_state->fast_clear && 3677 do_first_layer_clear(cmd_state, att_state)) { 3678 /* We only support fast-clears on the first layer */ 3679 assert(iview->planes[0].isl.base_level == 0); 3680 assert(iview->planes[0].isl.base_array_layer == 0); 3681 3682 union isl_color_value clear_color = {}; 3683 anv_clear_color_from_att_state(&clear_color, att_state, iview); 3684 if (iview->image->samples == 1) { 3685 anv_image_ccs_op(cmd_buffer, image, 3686 iview->planes[0].isl.format, 3687 VK_IMAGE_ASPECT_COLOR_BIT, 3688 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, 3689 &clear_color, 3690 false); 3691 } else { 3692 anv_image_mcs_op(cmd_buffer, image, 3693 iview->planes[0].isl.format, 3694 VK_IMAGE_ASPECT_COLOR_BIT, 3695 0, 1, ISL_AUX_OP_FAST_CLEAR, 3696 &clear_color, 3697 false); 3698 } 3699 base_clear_layer++; 3700 clear_layer_count--; 3701 if (is_multiview) 3702 att_state->pending_clear_views &= ~1; 3703 3704 if (att_state->clear_color_is_zero) { 3705 /* This image has the auxiliary buffer enabled. We can mark the 3706 * subresource as not needing a resolve because the clear color 3707 * will match what's in every RENDER_SURFACE_STATE object when 3708 * it's being used for sampling. 
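 *
 * When the clear color is anything other than zero we record
 * ANV_FAST_CLEAR_ANY instead, so later code knows an arbitrary clear
 * color may be sitting in the auxiliary surface.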
3709 */ 3710 set_image_fast_clear_state(cmd_buffer, iview->image, 3711 VK_IMAGE_ASPECT_COLOR_BIT, 3712 ANV_FAST_CLEAR_DEFAULT_VALUE); 3713 } else { 3714 set_image_fast_clear_state(cmd_buffer, iview->image, 3715 VK_IMAGE_ASPECT_COLOR_BIT, 3716 ANV_FAST_CLEAR_ANY); 3717 } 3718 } 3719 3720 /* From the VkFramebufferCreateInfo spec: 3721 * 3722 * "If the render pass uses multiview, then layers must be one and each 3723 * attachment requires a number of layers that is greater than the 3724 * maximum bit index set in the view mask in the subpasses in which it 3725 * is used." 3726 * 3727 * So if multiview is active we ignore the number of layers in the 3728 * framebuffer and instead we honor the view mask from the subpass. 3729 */ 3730 if (is_multiview) { 3731 assert(image->n_planes == 1); 3732 uint32_t pending_clear_mask = 3733 get_multiview_subpass_clear_mask(cmd_state, att_state); 3734 3735 uint32_t layer_idx; 3736 for_each_bit(layer_idx, pending_clear_mask) { 3737 uint32_t layer = 3738 iview->planes[0].isl.base_array_layer + layer_idx; 3739 3740 anv_image_clear_color(cmd_buffer, image, 3741 VK_IMAGE_ASPECT_COLOR_BIT, 3742 att_state->aux_usage, 3743 iview->planes[0].isl.format, 3744 iview->planes[0].isl.swizzle, 3745 iview->planes[0].isl.base_level, 3746 layer, 1, 3747 render_area, 3748 vk_to_isl_color(att_state->clear_value.color)); 3749 } 3750 3751 att_state->pending_clear_views &= ~pending_clear_mask; 3752 } else if (clear_layer_count > 0) { 3753 assert(image->n_planes == 1); 3754 anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 3755 att_state->aux_usage, 3756 iview->planes[0].isl.format, 3757 iview->planes[0].isl.swizzle, 3758 iview->planes[0].isl.base_level, 3759 base_clear_layer, clear_layer_count, 3760 render_area, 3761 vk_to_isl_color(att_state->clear_value.color)); 3762 } 3763 } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | 3764 VK_IMAGE_ASPECT_STENCIL_BIT)) { 3765 if (att_state->fast_clear && !is_multiview) { 3766 /* We currently only support HiZ for single-layer images */ 3767 if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 3768 assert(iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); 3769 assert(iview->planes[0].isl.base_level == 0); 3770 assert(iview->planes[0].isl.base_array_layer == 0); 3771 assert(fb->layers == 1); 3772 } 3773 3774 anv_image_hiz_clear(cmd_buffer, image, 3775 att_state->pending_clear_aspects, 3776 iview->planes[0].isl.base_level, 3777 iview->planes[0].isl.base_array_layer, 3778 fb->layers, render_area, 3779 att_state->clear_value.depthStencil.stencil); 3780 } else if (is_multiview) { 3781 uint32_t pending_clear_mask = 3782 get_multiview_subpass_clear_mask(cmd_state, att_state); 3783 3784 uint32_t layer_idx; 3785 for_each_bit(layer_idx, pending_clear_mask) { 3786 uint32_t layer = 3787 iview->planes[0].isl.base_array_layer + layer_idx; 3788 3789 anv_image_clear_depth_stencil(cmd_buffer, image, 3790 att_state->pending_clear_aspects, 3791 att_state->aux_usage, 3792 iview->planes[0].isl.base_level, 3793 layer, 1, 3794 render_area, 3795 att_state->clear_value.depthStencil.depth, 3796 att_state->clear_value.depthStencil.stencil); 3797 } 3798 3799 att_state->pending_clear_views &= ~pending_clear_mask; 3800 } else { 3801 anv_image_clear_depth_stencil(cmd_buffer, image, 3802 att_state->pending_clear_aspects, 3803 att_state->aux_usage, 3804 iview->planes[0].isl.base_level, 3805 iview->planes[0].isl.base_array_layer, 3806 fb->layers, render_area, 3807 att_state->clear_value.depthStencil.depth, 3808 
att_state->clear_value.depthStencil.stencil); 3809 } 3810 } else { 3811 assert(att_state->pending_clear_aspects == 0); 3812 } 3813 3814 if (GEN_GEN < 10 && 3815 (att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && 3816 image->planes[0].aux_surface.isl.size_B > 0 && 3817 iview->planes[0].isl.base_level == 0 && 3818 iview->planes[0].isl.base_array_layer == 0) { 3819 if (att_state->aux_usage != ISL_AUX_USAGE_NONE) { 3820 genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color.state, 3821 image, VK_IMAGE_ASPECT_COLOR_BIT, 3822 false /* copy to ss */); 3823 } 3824 3825 if (need_input_attachment_state(&cmd_state->pass->attachments[a]) && 3826 att_state->input_aux_usage != ISL_AUX_USAGE_NONE) { 3827 genX(copy_fast_clear_dwords)(cmd_buffer, att_state->input.state, 3828 image, VK_IMAGE_ASPECT_COLOR_BIT, 3829 false /* copy to ss */); 3830 } 3831 } 3832 3833 if (subpass->attachments[i].usage == 3834 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { 3835 /* We assume that if we're starting a subpass, we're going to do some 3836 * rendering so we may end up with compressed data. 3837 */ 3838 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, 3839 VK_IMAGE_ASPECT_COLOR_BIT, 3840 att_state->aux_usage, 3841 iview->planes[0].isl.base_level, 3842 iview->planes[0].isl.base_array_layer, 3843 fb->layers); 3844 } else if (subpass->attachments[i].usage == 3845 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { 3846 /* We may be writing depth or stencil so we need to mark the surface. 3847 * Unfortunately, there's no way to know at this point whether the 3848 * depth or stencil tests used will actually write to the surface. 3849 * 3850 * Even though stencil may be plane 1, it always shares a base_level 3851 * with depth. 3852 */ 3853 const struct isl_view *ds_view = &iview->planes[0].isl; 3854 if (iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { 3855 genX(cmd_buffer_mark_image_written)(cmd_buffer, image, 3856 VK_IMAGE_ASPECT_DEPTH_BIT, 3857 att_state->aux_usage, 3858 ds_view->base_level, 3859 ds_view->base_array_layer, 3860 fb->layers); 3861 } 3862 if (iview->aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { 3863 /* Even though stencil may be plane 1, it always shares a 3864 * base_level with depth. 3865 */ 3866 genX(cmd_buffer_mark_image_written)(cmd_buffer, image, 3867 VK_IMAGE_ASPECT_STENCIL_BIT, 3868 ISL_AUX_USAGE_NONE, 3869 ds_view->base_level, 3870 ds_view->base_array_layer, 3871 fb->layers); 3872 } 3873 } 3874 3875 /* If multiview is enabled, then we are only done clearing when we no 3876 * longer have pending layers to clear, or when we have processed the 3877 * last subpass that uses this attachment. 
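 *
 * Otherwise we leave pending_clear_aspects set so a later subpass can
 * clear the views it covers.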
3878 */ 3879 if (!is_multiview || 3880 att_state->pending_clear_views == 0 || 3881 current_subpass_is_last_for_attachment(cmd_state, a)) { 3882 att_state->pending_clear_aspects = 0; 3883 } 3884 3885 att_state->pending_load_aspects = 0; 3886 } 3887 3888 cmd_buffer_emit_depth_stencil(cmd_buffer); 3889} 3890 3891static void 3892cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) 3893{ 3894 struct anv_cmd_state *cmd_state = &cmd_buffer->state; 3895 struct anv_subpass *subpass = cmd_state->subpass; 3896 uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state); 3897 3898 anv_cmd_buffer_resolve_subpass(cmd_buffer); 3899 3900 struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; 3901 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 3902 const uint32_t a = subpass->attachments[i].attachment; 3903 if (a == VK_ATTACHMENT_UNUSED) 3904 continue; 3905 3906 if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id) 3907 continue; 3908 3909 assert(a < cmd_state->pass->attachment_count); 3910 struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 3911 struct anv_image_view *iview = fb->attachments[a]; 3912 const struct anv_image *image = iview->image; 3913 3914 if ((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && 3915 image->vk_format != iview->vk_format) { 3916 enum anv_fast_clear_type fast_clear_type = 3917 anv_layout_to_fast_clear_type(&cmd_buffer->device->info, 3918 image, VK_IMAGE_ASPECT_COLOR_BIT, 3919 att_state->current_layout); 3920 3921 /* If any clear color was used, flush it down the aux surfaces. If we 3922 * don't do it now using the view's format we might use the clear 3923 * color incorrectly in the following resolves (for example with an 3924 * SRGB view & a UNORM image). 3925 */ 3926 if (fast_clear_type != ANV_FAST_CLEAR_NONE) { 3927 anv_perf_warn(cmd_buffer->device->instance, fb, 3928 "Doing a partial resolve to get rid of clear color at the " 3929 "end of a renderpass due to an image/view format mismatch"); 3930 3931 uint32_t base_layer, layer_count; 3932 if (image->type == VK_IMAGE_TYPE_3D) { 3933 base_layer = 0; 3934 layer_count = anv_minify(iview->image->extent.depth, 3935 iview->planes[0].isl.base_level); 3936 } else { 3937 base_layer = iview->planes[0].isl.base_array_layer; 3938 layer_count = fb->layers; 3939 } 3940 3941 for (uint32_t a = 0; a < layer_count; a++) { 3942 uint32_t array_layer = base_layer + a; 3943 if (image->samples == 1) { 3944 anv_cmd_predicated_ccs_resolve(cmd_buffer, image, 3945 iview->planes[0].isl.format, 3946 VK_IMAGE_ASPECT_COLOR_BIT, 3947 iview->planes[0].isl.base_level, 3948 array_layer, 3949 ISL_AUX_OP_PARTIAL_RESOLVE, 3950 ANV_FAST_CLEAR_NONE); 3951 } else { 3952 anv_cmd_predicated_mcs_resolve(cmd_buffer, image, 3953 iview->planes[0].isl.format, 3954 VK_IMAGE_ASPECT_COLOR_BIT, 3955 base_layer, 3956 ISL_AUX_OP_PARTIAL_RESOLVE, 3957 ANV_FAST_CLEAR_NONE); 3958 } 3959 } 3960 } 3961 } 3962 3963 /* Transition the image into the final layout for this render pass */ 3964 VkImageLayout target_layout = 3965 cmd_state->pass->attachments[a].final_layout; 3966 3967 if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 3968 assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); 3969 3970 uint32_t base_layer, layer_count; 3971 if (image->type == VK_IMAGE_TYPE_3D) { 3972 base_layer = 0; 3973 layer_count = anv_minify(iview->image->extent.depth, 3974 iview->planes[0].isl.base_level); 3975 } else { 3976 base_layer = iview->planes[0].isl.base_array_layer; 3977 layer_count = fb->layers; 3978 } 3979 3980 
transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 3981 iview->planes[0].isl.base_level, 1, 3982 base_layer, layer_count, 3983 att_state->current_layout, target_layout); 3984 } else if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 3985 transition_depth_buffer(cmd_buffer, image, 3986 att_state->current_layout, target_layout); 3987 } 3988 } 3989 3990 /* Accumulate any subpass flushes that need to happen after the subpass. 3991 * Yes, they do get accumulated twice in the NextSubpass case but since 3992 * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up 3993 * ORing the bits in twice so it's harmless. 3994 */ 3995 cmd_buffer->state.pending_pipe_bits |= 3996 cmd_buffer->state.pass->subpass_flushes[subpass_id + 1]; 3997} 3998 3999void genX(CmdBeginRenderPass)( 4000 VkCommandBuffer commandBuffer, 4001 const VkRenderPassBeginInfo* pRenderPassBegin, 4002 VkSubpassContents contents) 4003{ 4004 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 4005 ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass); 4006 ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); 4007 4008 cmd_buffer->state.framebuffer = framebuffer; 4009 cmd_buffer->state.pass = pass; 4010 cmd_buffer->state.render_area = pRenderPassBegin->renderArea; 4011 VkResult result = 4012 genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin); 4013 4014 /* If we failed to setup the attachments we should not try to go further */ 4015 if (result != VK_SUCCESS) { 4016 assert(anv_batch_has_error(&cmd_buffer->batch)); 4017 return; 4018 } 4019 4020 genX(flush_pipeline_select_3d)(cmd_buffer); 4021 4022 cmd_buffer_begin_subpass(cmd_buffer, 0); 4023} 4024 4025void genX(CmdBeginRenderPass2KHR)( 4026 VkCommandBuffer commandBuffer, 4027 const VkRenderPassBeginInfo* pRenderPassBeginInfo, 4028 const VkSubpassBeginInfoKHR* pSubpassBeginInfo) 4029{ 4030 genX(CmdBeginRenderPass)(commandBuffer, pRenderPassBeginInfo, 4031 pSubpassBeginInfo->contents); 4032} 4033 4034void genX(CmdNextSubpass)( 4035 VkCommandBuffer commandBuffer, 4036 VkSubpassContents contents) 4037{ 4038 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 4039 4040 if (anv_batch_has_error(&cmd_buffer->batch)) 4041 return; 4042 4043 assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 4044 4045 uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state); 4046 cmd_buffer_end_subpass(cmd_buffer); 4047 cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); 4048} 4049 4050void genX(CmdNextSubpass2KHR)( 4051 VkCommandBuffer commandBuffer, 4052 const VkSubpassBeginInfoKHR* pSubpassBeginInfo, 4053 const VkSubpassEndInfoKHR* pSubpassEndInfo) 4054{ 4055 genX(CmdNextSubpass)(commandBuffer, pSubpassBeginInfo->contents); 4056} 4057 4058void genX(CmdEndRenderPass)( 4059 VkCommandBuffer commandBuffer) 4060{ 4061 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 4062 4063 if (anv_batch_has_error(&cmd_buffer->batch)) 4064 return; 4065 4066 cmd_buffer_end_subpass(cmd_buffer); 4067 4068 cmd_buffer->state.hiz_enabled = false; 4069 4070#ifndef NDEBUG 4071 anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); 4072#endif 4073 4074 /* Remove references to render pass specific state. This enables us to 4075 * detect whether or not we're in a renderpass. 
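 *
 * Once these pointers are NULL, other code can tell that no render pass
 * instance is currently active.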
4076 */ 4077 cmd_buffer->state.framebuffer = NULL; 4078 cmd_buffer->state.pass = NULL; 4079 cmd_buffer->state.subpass = NULL; 4080} 4081 4082void genX(CmdEndRenderPass2KHR)( 4083 VkCommandBuffer commandBuffer, 4084 const VkSubpassEndInfoKHR* pSubpassEndInfo) 4085{ 4086 genX(CmdEndRenderPass)(commandBuffer); 4087} 4088