101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2015 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_format.h"
#include "vk_util.h"
#include "util/fast_idiv_by_const.h"

#include "common/intel_aux_map.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "nir/nir_xfb_info.h"

/* We reserve :
 *    - GPR 14 for secondary command buffer returns
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t pipeline);

/* Translate the flush/invalidate fields of an unpacked PIPE_CONTROL into
 * the driver's anv_pipe_bits mask.  Only used by anv_debug_dump_pc below.
 */
static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
   enum anv_pipe_bits bits = 0;
   bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
   bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
#if GFX_VER >= 12
   bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
   bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
   bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
   bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
   bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
   bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
   return bits;
}

/* Dump the flush/stall bits of an emitted PIPE_CONTROL to stderr when the
 * pipe-control debug flag is set, tagged with the emitting function.
 */
#define anv_debug_dump_pc(pc) \
   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
      fputs("pc: emit PC=( ", stderr); \
      anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
      fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
   }

/* True if this command buffer was allocated from a pool whose queue family
 * has graphics capability.
 */
static bool
is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_queue_family *queue_family = cmd_buffer->pool->queue_family;
   return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
}

/* Emit STATE_BASE_ADDRESS for this command buffer, bracketed by the
 * PIPE_CONTROL flushes/invalidations and gfx12 workarounds the hardware
 * requires.  Also dirties all descriptors since binding-table pointers are
 * relative to the surface-state base address being reprogrammed here.
 */
void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   UNUSED const struct intel_device_info *devinfo = &device->info;
   uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 12
      pc.HDCPipelineFlushEnable = true;
#else
      pc.DCFlushEnable = true;
#endif
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
#if GFX_VER == 12
      /* Wa_1606662791:
       *
       *   Software must program PIPE_CONTROL command with "HDC Pipeline
       *   Flush" prior to programming of the below two non-pipeline state :
       *      * STATE_BASE_ADDRESS
       *      * 3DSTATE_BINDING_TABLE_POOL_ALLOC
       */
      if (devinfo->revision == 0 /* A0 */)
         pc.HDCPipelineFlushEnable = true;
#endif
      anv_debug_dump_pc(pc);
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
    * mode by putting the pipeline temporarily in 3D mode.
    */
   uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
   genX(flush_pipeline_select_3d)(cmd_buffer);
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;

#  if (GFX_VER >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields.  However, since we will be growing the BO's live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      if (anv_use_softpin(device->physical)) {
         /* With softpin, we use fixed addresses so we actually know how big
          * our base addresses are.
          */
         sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
         sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
      } else {
         sba.DynamicStateBufferSize = 0xfffff;
         sba.InstructionBufferSize = 0xfffff;
      }
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#  else
      /* On gfx7, we have upper bounds instead.  According to the docs,
       * setting an upper bound of zero means that no bounds checking is
       * performed so, in theory, we should be able to leave them zero.
       * However, border color is broken and the GPU bounds-checks anyway.
       * To avoid this and other potential problems, we may as well set it
       * for everything.
       */
      sba.GeneralStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.InstructionAccessUpperBoundModifyEnable = true;
#  endif
#  if (GFX_VER >= 9)
      if (anv_use_softpin(device->physical)) {
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .bo = device->surface_state_pool.block_pool.bo,
            .offset = 0,
         };
         sba.BindlessSurfaceStateSize = (1 << 20) - 1;
      } else {
         sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
         sba.BindlessSurfaceStateSize = 0;
      }
      sba.BindlessSurfaceStateMOCS = mocs;
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#  endif
#  if (GFX_VER >= 10)
      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
#  endif
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Put the pipeline back into its current mode.
    */
   if (gfx12_wa_pipeline != UINT32_MAX)
      genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      anv_debug_dump_pc(pc);
   }
}

/* Record a relocation for the surface-state entry `state` referencing
 * `addr`.  With softpin the BO is simply added to the execbuf BO list;
 * otherwise a classic relocation is recorded at the surface state's
 * address field.  Any failure is recorded on the batch.
 */
static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   VkResult result;

   if (anv_use_softpin(cmd_buffer->device->physical)) {
      result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
                                     &cmd_buffer->pool->alloc,
                                     addr.bo);
   } else {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
                                  &cmd_buffer->pool->alloc,
                                  state.offset + isl_dev->ss.addr_offset,
                                  addr.bo, addr.offset, NULL);
   }

   if (unlikely(result != VK_SUCCESS))
      anv_batch_set_error(&cmd_buffer->batch, result);
}

/* Record relocations for a surface state's main address and, when present,
 * its auxiliary-surface and clear-color addresses.
 */
static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const
struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo,
                            state.aux_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo,
                            state.clear_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

/* Returns true if packing `color` through the surface format differs from
 * packing it through the (inverse-swizzled) view format, i.e. a resolve
 * from view to surface would require a non-trivial format conversion of
 * the clear color.
 */
static bool
isl_color_value_requires_conversion(union isl_color_value color,
                                    const struct isl_surf *surf,
                                    const struct isl_view *view)
{
   if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
      return false;

   uint32_t surf_pack[4] = { 0, 0, 0, 0 };
   isl_color_value_pack(&color, surf->format, surf_pack);

   uint32_t view_pack[4] = { 0, 0, 0, 0 };
   union isl_color_value swiz_color =
      isl_color_value_swizzle_inv(color, view->swizzle);
   isl_color_value_pack(&swiz_color, view->format, view_pack);

   return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
}

/* Decide whether a color-attachment clear of `iview` in `layout` with
 * `clear_color` over `render_area` can be performed as a fast clear.
 * Emits perf warnings for the cases we decline.
 */
static bool
anv_can_fast_clear_color_view(struct anv_device * device,
                              struct anv_image_view *iview,
                              VkImageLayout layout,
                              union isl_color_value clear_color,
                              uint32_t num_layers,
                              VkRect2D render_area)
{
   /* The view must start within the range of layers that have aux. */
   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level))
      return false;

   /* Start by getting the fast clear type.  We use the first subpass
    * layout here because we don't want to fast-clear if the first subpass
    * to use the attachment can't handle fast-clears.
    */
   enum anv_fast_clear_type fast_clear_type =
      anv_layout_to_fast_clear_type(&device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    layout);
   switch (fast_clear_type) {
   case ANV_FAST_CLEAR_NONE:
      return false;
   case ANV_FAST_CLEAR_DEFAULT_VALUE:
      if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
         return false;
      break;
   case ANV_FAST_CLEAR_ANY:
      break;
   }

   /* Potentially, we could do partial fast-clears but doing so has crazy
    * alignment restrictions.  It's easier to just restrict to full size
    * fast clears for now.
    */
   if (render_area.offset.x != 0 ||
       render_area.offset.y != 0 ||
       render_area.extent.width != iview->vk.extent.width ||
       render_area.extent.height != iview->vk.extent.height)
      return false;

   /* On Broadwell and earlier, we can only handle 0/1 clear colors */
   if (GFX_VER <= 8 &&
       !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
      return false;

   /* If the clear color is one that would require non-trivial format
    * conversion on resolve, we don't bother with the fast clear.  This
    * shouldn't be common as most clear colors are 0/1 and the most common
    * format re-interpretation is for sRGB.
    */
   if (isl_color_value_requires_conversion(clear_color,
                                           &iview->image->planes[0].primary_surface.isl,
                                           &iview->planes[0].isl)) {
      anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
                    "Cannot fast-clear to colors which would require "
                    "format conversion on resolve");
      return false;
   }

   /* We only allow fast clears to the first slice of an image (level 0,
    * layer 0) and only for the entire slice.  This guarantees us that, at
    * any given time, there is only one clear color on any given image at
    * any given time.  At the time of our testing (Jan 17, 2018), there
    * were no known applications which would benefit from fast-clearing
    * more than just the first slice.
    */
   if (iview->planes[0].isl.base_level > 0 ||
       iview->planes[0].isl.base_array_layer > 0) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering with multi-lod or multi-layer framebuffer "
                    "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                    "baseArrayLayer > 0. Not fast clearing.");
      return false;
   }

   /* Multi-layer clears still fast-clear, but only slice 0 gets the fast
    * path; warn so the app author can see the cost.
    */
   if (num_layers > 1) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering to a multi-layer framebuffer with "
                    "LOAD_OP_CLEAR. Only fast-clearing the first slice");
   }

   return true;
}

/* Decide whether a depth/stencil clear of `iview` in `layout` over
 * `render_area` can be performed as a HiZ fast clear.
 */
static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
                          struct anv_image_view *iview,
                          VkImageLayout layout,
                          VkImageAspectFlags clear_aspects,
                          float depth_clear_value,
                          VkRect2D render_area)
{
   /* We don't do any HiZ or depth fast-clears on gfx7 yet */
   if (GFX_VER == 7)
      return false;

   /* If we're just clearing stencil, we can always HiZ clear */
   if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return true;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

   const enum isl_aux_usage clear_aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                              layout);
   if (!blorp_can_hiz_clear_depth(&device->info,
                                  &iview->image->planes[0].primary_surface.isl,
                                  clear_aux_usage,
                                  iview->planes[0].isl.base_level,
                                  iview->planes[0].isl.base_array_layer,
                                  render_area.offset.x,
                                  render_area.offset.y,
                                  render_area.offset.x +
                                  render_area.extent.width,
                                  render_area.offset.y +
                                  render_area.extent.height))
      return false;

   if (depth_clear_value != ANV_HZ_FC_VAL)
      return false;

   /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
    * portion of a HiZ buffer.  Testing has revealed that Gfx8 only supports
    * returning 0.0f.  Gens prior to gfx8 do not support this feature at all.
    */
   if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
      return false;

   /* If we got here, then we can fast clear */
   return true;
}

/* Force a single, non-cached read of a location that may be concurrently
 * modified (used below when reading live AUX-TT entries).
 */
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#if GFX_VER == 12
/* Live-update the AUX translation-table entries covering the given
 * level/layer range of `image` so they carry the surface's aux-map format
 * bits (plus the valid bit when the plane uses CCS).  The GPU is fully
 * stalled before the update and an AUX-TT invalidate is queued afterwards.
 */
static void
anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect,
                      uint32_t base_level, uint32_t level_count,
                      uint32_t base_layer, uint32_t layer_count)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   const struct anv_surface *surface = &image->planes[plane].primary_surface;
   uint64_t base_address =
      anv_address_physical(anv_image_address(image, &surface->memory_range));

   const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
   uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);

   /* We're about to live-update the AUX-TT.  We really don't want anyone else
    * trying to read it while we're doing this.  We could probably get away
    * with not having this stall in some cases if we were really careful but
    * it's better to play it safe.  Full stall the GPU.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "before update AUX-TT");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   for (uint32_t a = 0; a < layer_count; a++) {
      const uint32_t layer = base_layer + a;

      /* Compute the [start, end) byte range this layer occupies across all
       * requested miplevels.
       */
      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;

         uint32_t logical_array_layer, logical_z_offset_px;
         if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
            logical_array_layer = 0;

            /* If the given miplevel does not have this layer, then any higher
             * miplevels won't either because miplevels only get smaller the
             * higher the LOD.
             */
            assert(layer < image->vk.extent.depth);
            if (layer >= anv_minify(image->vk.extent.depth, level))
               break;
            logical_z_offset_px = layer;
         } else {
            assert(layer < image->vk.array_layers);
            logical_array_layer = layer;
            logical_z_offset_px = 0;
         }

         uint64_t slice_start_offset_B, slice_end_offset_B;
         isl_surf_get_image_range_B_tile(isl_surf, level,
                                         logical_array_layer,
                                         logical_z_offset_px,
                                         &slice_start_offset_B,
                                         &slice_end_offset_B);

         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
      }

      /* Aux operates 64K at a time */
      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
      end_offset_B = align_u64(end_offset_B, 64 * 1024);

      for (uint64_t offset = start_offset_B;
           offset < end_offset_B; offset += 64 * 1024) {
         uint64_t address = base_address + offset;

         uint64_t aux_entry_addr64, *aux_entry_map;
         aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
                                                 address, &aux_entry_addr64);

         /* aux_entry_addr64 is a GPU address; with softpin we can reference
          * it directly from the batch without a BO/relocation.
          */
         assert(anv_use_softpin(cmd_buffer->device->physical));
         struct anv_address aux_entry_address = {
            .bo = NULL,
            .offset = aux_entry_addr64,
         };

         /* Preserve the entry's address bits, replace its format bits. */
         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
         uint64_t new_aux_entry =
            (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;

         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
            new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;

         mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                             "after update AUX-TT");
}
#endif /* GFX_VER == 12 */

/* Transitions a HiZ-enabled depth buffer from one layout to another.  Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        bool will_full_fast_clear)
{
   const uint32_t depth_plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

#if GFX_VER == 12
   /* Coming out of UNDEFINED/PREINITIALIZED with implicit CCS, the AUX-TT
    * entries for this image must be (re)programmed first.
    */
   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                            0, 1, base_layer, layer_count);
   }
#endif

   /* If will_full_fast_clear is set, the caller promises to fast-clear the
    * largest portion of the specified range as it can.  For depth images,
    * that means the entire image because we don't support multi-LOD HiZ.
    */
   assert(image->planes[0].primary_surface.isl.levels == 1);
   if (will_full_fast_clear)
      return;

   const enum isl_aux_state initial_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              initial_layout);
   const enum isl_aux_state final_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              final_layout);

   const bool initial_depth_valid =
      isl_aux_state_has_valid_primary(initial_state);
   const bool initial_hiz_valid =
      isl_aux_state_has_valid_aux(initial_state);
   const bool final_needs_depth =
      isl_aux_state_has_valid_primary(final_state);
   const bool final_needs_hiz =
      isl_aux_state_has_valid_aux(final_state);

   /* Getting into the pass-through state for Depth is tricky and involves
    * both a resolve and an ambiguate.  We don't handle that state right now
    * as anv_layout_to_aux_state never returns it.
    */
   assert(final_state != ISL_AUX_STATE_PASS_THROUGH);

   /* Resolve when the destination needs valid depth data the source lacks;
    * ambiguate when it needs valid HiZ data the source lacks.
    */
   if (final_needs_depth && !initial_depth_valid) {
      assert(initial_hiz_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
   } else if (final_needs_hiz && !initial_hiz_valid) {
      assert(initial_depth_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
   }
}

/* True for the layouts in which stencil writes are permitted. */
static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
}

/* Transitions a stencil buffer from one layout to another.  (NOTE(review):
 * the original comment said "HiZ-enabled depth buffer", apparently
 * copy-pasted from transition_depth_buffer above; the function below
 * operates on the stencil aspect.)  Unless the initial layout is
 * undefined, the buffers involved will
 * represent the same data at the end of this operation.
6787ec681f3Smrg */ 6797ec681f3Smrgstatic void 6807ec681f3Smrgtransition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, 6817ec681f3Smrg const struct anv_image *image, 6827ec681f3Smrg uint32_t base_level, uint32_t level_count, 6837ec681f3Smrg uint32_t base_layer, uint32_t layer_count, 6847ec681f3Smrg VkImageLayout initial_layout, 6857ec681f3Smrg VkImageLayout final_layout, 6867ec681f3Smrg bool will_full_fast_clear) 6877ec681f3Smrg{ 6887ec681f3Smrg#if GFX_VER == 7 6897ec681f3Smrg const uint32_t plane = 6907ec681f3Smrg anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); 6917ec681f3Smrg 6927ec681f3Smrg /* On gfx7, we have to store a texturable version of the stencil buffer in 6937ec681f3Smrg * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and 6947ec681f3Smrg * forth at strategic points. Stencil writes are only allowed in following 6957ec681f3Smrg * layouts: 6967ec681f3Smrg * 6977ec681f3Smrg * - VK_IMAGE_LAYOUT_GENERAL 6987ec681f3Smrg * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL 6997ec681f3Smrg * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL 7007ec681f3Smrg * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL 7017ec681f3Smrg * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR 7027ec681f3Smrg * 7037ec681f3Smrg * For general, we have no nice opportunity to transition so we do the copy 7047ec681f3Smrg * to the shadow unconditionally at the end of the subpass. For transfer 7057ec681f3Smrg * destinations, we can update it as part of the transfer op. For the other 7067ec681f3Smrg * layouts, we delay the copy until a transition into some other layout. 
7077ec681f3Smrg */ 7087ec681f3Smrg if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && 7097ec681f3Smrg vk_image_layout_stencil_write_optimal(initial_layout) && 7107ec681f3Smrg !vk_image_layout_stencil_write_optimal(final_layout)) { 7117ec681f3Smrg anv_image_copy_to_shadow(cmd_buffer, image, 7127ec681f3Smrg VK_IMAGE_ASPECT_STENCIL_BIT, 7137ec681f3Smrg base_level, level_count, 7147ec681f3Smrg base_layer, layer_count); 7157ec681f3Smrg } 7167ec681f3Smrg#elif GFX_VER == 12 7177ec681f3Smrg const uint32_t plane = 7187ec681f3Smrg anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); 7197ec681f3Smrg if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) 7207ec681f3Smrg return; 7217ec681f3Smrg 7227ec681f3Smrg if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || 7237ec681f3Smrg initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && 7247ec681f3Smrg cmd_buffer->device->physical->has_implicit_ccs && 7257ec681f3Smrg cmd_buffer->device->info.has_aux_map) { 7267ec681f3Smrg anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, 7277ec681f3Smrg base_level, level_count, base_layer, layer_count); 7287ec681f3Smrg 7297ec681f3Smrg /* If will_full_fast_clear is set, the caller promises to fast-clear the 7307ec681f3Smrg * largest portion of the specified range as it can. 
7317ec681f3Smrg */ 7327ec681f3Smrg if (will_full_fast_clear) 7337ec681f3Smrg return; 7347ec681f3Smrg 7357ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 7367ec681f3Smrg const uint32_t level = base_level + l; 7377ec681f3Smrg const VkRect2D clear_rect = { 7387ec681f3Smrg .offset.x = 0, 7397ec681f3Smrg .offset.y = 0, 7407ec681f3Smrg .extent.width = anv_minify(image->vk.extent.width, level), 7417ec681f3Smrg .extent.height = anv_minify(image->vk.extent.height, level), 7427ec681f3Smrg }; 7437ec681f3Smrg 7447ec681f3Smrg uint32_t aux_layers = 7457ec681f3Smrg anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level); 7467ec681f3Smrg uint32_t level_layer_count = 7477ec681f3Smrg MIN2(layer_count, aux_layers - base_layer); 7487ec681f3Smrg 7497ec681f3Smrg /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression 7507ec681f3Smrg * Enable: 7517ec681f3Smrg * 7527ec681f3Smrg * "When enabled, Stencil Buffer needs to be initialized via 7537ec681f3Smrg * stencil clear (HZ_OP) before any renderpass." 
7547ec681f3Smrg */ 7557ec681f3Smrg anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, 7567ec681f3Smrg level, base_layer, level_layer_count, 7577ec681f3Smrg clear_rect, 0 /* Stencil clear value */); 7587ec681f3Smrg } 7597ec681f3Smrg } 7607ec681f3Smrg#endif 76101e04c3fSmrg} 76201e04c3fSmrg 7639f464c52Smaya#define MI_PREDICATE_SRC0 0x2400 7649f464c52Smaya#define MI_PREDICATE_SRC1 0x2408 7659f464c52Smaya#define MI_PREDICATE_RESULT 0x2418 76601e04c3fSmrg 76701e04c3fSmrgstatic void 76801e04c3fSmrgset_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, 76901e04c3fSmrg const struct anv_image *image, 77001e04c3fSmrg VkImageAspectFlagBits aspect, 77101e04c3fSmrg uint32_t level, 77201e04c3fSmrg uint32_t base_layer, uint32_t layer_count, 77301e04c3fSmrg bool compressed) 77401e04c3fSmrg{ 7757ec681f3Smrg const uint32_t plane = anv_image_aspect_to_plane(image, aspect); 77601e04c3fSmrg 77701e04c3fSmrg /* We only have compression tracking for CCS_E */ 77801e04c3fSmrg if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E) 77901e04c3fSmrg return; 78001e04c3fSmrg 78101e04c3fSmrg for (uint32_t a = 0; a < layer_count; a++) { 78201e04c3fSmrg uint32_t layer = base_layer + a; 78301e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { 78401e04c3fSmrg sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device, 78501e04c3fSmrg image, aspect, 78601e04c3fSmrg level, layer); 78701e04c3fSmrg sdi.ImmediateData = compressed ? 
UINT32_MAX : 0; 78801e04c3fSmrg } 78901e04c3fSmrg } 79001e04c3fSmrg} 79101e04c3fSmrg 79201e04c3fSmrgstatic void 79301e04c3fSmrgset_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, 79401e04c3fSmrg const struct anv_image *image, 79501e04c3fSmrg VkImageAspectFlagBits aspect, 79601e04c3fSmrg enum anv_fast_clear_type fast_clear) 79701e04c3fSmrg{ 79801e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { 79901e04c3fSmrg sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device, 80001e04c3fSmrg image, aspect); 80101e04c3fSmrg sdi.ImmediateData = fast_clear; 80201e04c3fSmrg } 80301e04c3fSmrg 80401e04c3fSmrg /* Whenever we have fast-clear, we consider that slice to be compressed. 80501e04c3fSmrg * This makes building predicates much easier. 80601e04c3fSmrg */ 80701e04c3fSmrg if (fast_clear != ANV_FAST_CLEAR_NONE) 80801e04c3fSmrg set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); 80901e04c3fSmrg} 81001e04c3fSmrg 81101e04c3fSmrg/* This is only really practical on haswell and above because it requires 81201e04c3fSmrg * MI math in order to get it correct. 
81301e04c3fSmrg */ 8147ec681f3Smrg#if GFX_VERx10 >= 75 81501e04c3fSmrgstatic void 81601e04c3fSmrganv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, 81701e04c3fSmrg const struct anv_image *image, 81801e04c3fSmrg VkImageAspectFlagBits aspect, 81901e04c3fSmrg uint32_t level, uint32_t array_layer, 82001e04c3fSmrg enum isl_aux_op resolve_op, 82101e04c3fSmrg enum anv_fast_clear_type fast_clear_supported) 82201e04c3fSmrg{ 8237ec681f3Smrg struct mi_builder b; 8247ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 82501e04c3fSmrg 8267ec681f3Smrg const struct mi_value fast_clear_type = 8277ec681f3Smrg mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, 8287ec681f3Smrg image, aspect)); 82901e04c3fSmrg 83001e04c3fSmrg if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { 83101e04c3fSmrg /* In this case, we're doing a full resolve which means we want the 83201e04c3fSmrg * resolve to happen if any compression (including fast-clears) is 83301e04c3fSmrg * present. 83401e04c3fSmrg * 83501e04c3fSmrg * In order to simplify the logic a bit, we make the assumption that, 83601e04c3fSmrg * if the first slice has been fast-cleared, it is also marked as 83701e04c3fSmrg * compressed. See also set_image_fast_clear_state. 83801e04c3fSmrg */ 8397ec681f3Smrg const struct mi_value compression_state = 8407ec681f3Smrg mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device, 8417ec681f3Smrg image, aspect, 8427ec681f3Smrg level, array_layer)); 8437ec681f3Smrg mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state); 8447ec681f3Smrg mi_store(&b, compression_state, mi_imm(0)); 84501e04c3fSmrg 84601e04c3fSmrg if (level == 0 && array_layer == 0) { 84701e04c3fSmrg /* If the predicate is true, we want to write 0 to the fast clear type 84801e04c3fSmrg * and, if it's false, leave it alone. 
We can do this by writing 84901e04c3fSmrg * 85001e04c3fSmrg * clear_type = clear_type & ~predicate; 85101e04c3fSmrg */ 8527ec681f3Smrg struct mi_value new_fast_clear_type = 8537ec681f3Smrg mi_iand(&b, fast_clear_type, 8547ec681f3Smrg mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0))); 8557ec681f3Smrg mi_store(&b, fast_clear_type, new_fast_clear_type); 85601e04c3fSmrg } 85701e04c3fSmrg } else if (level == 0 && array_layer == 0) { 85801e04c3fSmrg /* In this case, we are doing a partial resolve to get rid of fast-clear 85901e04c3fSmrg * colors. We don't care about the compression state but we do care 86001e04c3fSmrg * about how much fast clear is allowed by the final layout. 86101e04c3fSmrg */ 86201e04c3fSmrg assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); 86301e04c3fSmrg assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); 86401e04c3fSmrg 8659f464c52Smaya /* We need to compute (fast_clear_supported < image->fast_clear) */ 8667ec681f3Smrg struct mi_value pred = 8677ec681f3Smrg mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type); 8687ec681f3Smrg mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred)); 86901e04c3fSmrg 87001e04c3fSmrg /* If the predicate is true, we want to write 0 to the fast clear type 87101e04c3fSmrg * and, if it's false, leave it alone. We can do this by writing 87201e04c3fSmrg * 87301e04c3fSmrg * clear_type = clear_type & ~predicate; 87401e04c3fSmrg */ 8757ec681f3Smrg struct mi_value new_fast_clear_type = 8767ec681f3Smrg mi_iand(&b, fast_clear_type, mi_inot(&b, pred)); 8777ec681f3Smrg mi_store(&b, fast_clear_type, new_fast_clear_type); 87801e04c3fSmrg } else { 87901e04c3fSmrg /* In this case, we're trying to do a partial resolve on a slice that 88001e04c3fSmrg * doesn't have clear color. There's nothing to do. 
88101e04c3fSmrg */ 88201e04c3fSmrg assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); 88301e04c3fSmrg return; 88401e04c3fSmrg } 88501e04c3fSmrg 8869f464c52Smaya /* Set src1 to 0 and use a != condition */ 8877ec681f3Smrg mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); 88801e04c3fSmrg 88901e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 89001e04c3fSmrg mip.LoadOperation = LOAD_LOADINV; 89101e04c3fSmrg mip.CombineOperation = COMBINE_SET; 89201e04c3fSmrg mip.CompareOperation = COMPARE_SRCS_EQUAL; 89301e04c3fSmrg } 89401e04c3fSmrg} 8957ec681f3Smrg#endif /* GFX_VERx10 >= 75 */ 89601e04c3fSmrg 8977ec681f3Smrg#if GFX_VER <= 8 89801e04c3fSmrgstatic void 89901e04c3fSmrganv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, 90001e04c3fSmrg const struct anv_image *image, 90101e04c3fSmrg VkImageAspectFlagBits aspect, 90201e04c3fSmrg uint32_t level, uint32_t array_layer, 90301e04c3fSmrg enum isl_aux_op resolve_op, 90401e04c3fSmrg enum anv_fast_clear_type fast_clear_supported) 90501e04c3fSmrg{ 9067ec681f3Smrg struct mi_builder b; 9077ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 9089f464c52Smaya 9097ec681f3Smrg struct mi_value fast_clear_type_mem = 9107ec681f3Smrg mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, 9119f464c52Smaya image, aspect)); 91201e04c3fSmrg 91301e04c3fSmrg /* This only works for partial resolves and only when the clear color is 91401e04c3fSmrg * all or nothing. On the upside, this emits less command streamer code 91501e04c3fSmrg * and works on Ivybridge and Bay Trail. 91601e04c3fSmrg */ 91701e04c3fSmrg assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); 91801e04c3fSmrg assert(fast_clear_supported != ANV_FAST_CLEAR_ANY); 91901e04c3fSmrg 92001e04c3fSmrg /* We don't support fast clears on anything other than the first slice. 
*/ 92101e04c3fSmrg if (level > 0 || array_layer > 0) 92201e04c3fSmrg return; 92301e04c3fSmrg 9247ec681f3Smrg /* On gfx8, we don't have a concept of default clear colors because we 92501e04c3fSmrg * can't sample from CCS surfaces. It's enough to just load the fast clear 92601e04c3fSmrg * state into the predicate register. 92701e04c3fSmrg */ 9287ec681f3Smrg mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem); 9297ec681f3Smrg mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); 9307ec681f3Smrg mi_store(&b, fast_clear_type_mem, mi_imm(0)); 93101e04c3fSmrg 93201e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 93301e04c3fSmrg mip.LoadOperation = LOAD_LOADINV; 93401e04c3fSmrg mip.CombineOperation = COMBINE_SET; 93501e04c3fSmrg mip.CompareOperation = COMPARE_SRCS_EQUAL; 93601e04c3fSmrg } 93701e04c3fSmrg} 9387ec681f3Smrg#endif /* GFX_VER <= 8 */ 93901e04c3fSmrg 94001e04c3fSmrgstatic void 94101e04c3fSmrganv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, 94201e04c3fSmrg const struct anv_image *image, 94301e04c3fSmrg enum isl_format format, 9447ec681f3Smrg struct isl_swizzle swizzle, 94501e04c3fSmrg VkImageAspectFlagBits aspect, 94601e04c3fSmrg uint32_t level, uint32_t array_layer, 94701e04c3fSmrg enum isl_aux_op resolve_op, 94801e04c3fSmrg enum anv_fast_clear_type fast_clear_supported) 94901e04c3fSmrg{ 9507ec681f3Smrg const uint32_t plane = anv_image_aspect_to_plane(image, aspect); 95101e04c3fSmrg 9527ec681f3Smrg#if GFX_VER >= 9 95301e04c3fSmrg anv_cmd_compute_resolve_predicate(cmd_buffer, image, 95401e04c3fSmrg aspect, level, array_layer, 95501e04c3fSmrg resolve_op, fast_clear_supported); 9567ec681f3Smrg#else /* GFX_VER <= 8 */ 95701e04c3fSmrg anv_cmd_simple_resolve_predicate(cmd_buffer, image, 95801e04c3fSmrg aspect, level, array_layer, 95901e04c3fSmrg resolve_op, fast_clear_supported); 96001e04c3fSmrg#endif 96101e04c3fSmrg 96201e04c3fSmrg /* CCS_D only supports full resolves and BLORP will assert on us if we try 
96301e04c3fSmrg * to do a partial resolve on a CCS_D surface. 96401e04c3fSmrg */ 96501e04c3fSmrg if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && 9667ec681f3Smrg image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) 96701e04c3fSmrg resolve_op = ISL_AUX_OP_FULL_RESOLVE; 96801e04c3fSmrg 9697ec681f3Smrg anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect, 9707ec681f3Smrg level, array_layer, 1, resolve_op, NULL, true); 97101e04c3fSmrg} 97201e04c3fSmrg 97301e04c3fSmrgstatic void 97401e04c3fSmrganv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, 97501e04c3fSmrg const struct anv_image *image, 97601e04c3fSmrg enum isl_format format, 9777ec681f3Smrg struct isl_swizzle swizzle, 97801e04c3fSmrg VkImageAspectFlagBits aspect, 97901e04c3fSmrg uint32_t array_layer, 98001e04c3fSmrg enum isl_aux_op resolve_op, 98101e04c3fSmrg enum anv_fast_clear_type fast_clear_supported) 98201e04c3fSmrg{ 98301e04c3fSmrg assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); 98401e04c3fSmrg assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); 98501e04c3fSmrg 9867ec681f3Smrg#if GFX_VERx10 >= 75 98701e04c3fSmrg anv_cmd_compute_resolve_predicate(cmd_buffer, image, 98801e04c3fSmrg aspect, 0, array_layer, 98901e04c3fSmrg resolve_op, fast_clear_supported); 99001e04c3fSmrg 9917ec681f3Smrg anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect, 99201e04c3fSmrg array_layer, 1, resolve_op, NULL, true); 99301e04c3fSmrg#else 99401e04c3fSmrg unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); 99501e04c3fSmrg#endif 99601e04c3fSmrg} 99701e04c3fSmrg 99801e04c3fSmrgvoid 99901e04c3fSmrggenX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, 100001e04c3fSmrg const struct anv_image *image, 100101e04c3fSmrg VkImageAspectFlagBits aspect, 100201e04c3fSmrg enum isl_aux_usage aux_usage, 100301e04c3fSmrg uint32_t level, 100401e04c3fSmrg uint32_t base_layer, 100501e04c3fSmrg uint32_t layer_count) 100601e04c3fSmrg{ 100701e04c3fSmrg /* The aspect must be exactly one of the image 
aspects. */ 10087ec681f3Smrg assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); 100901e04c3fSmrg 101001e04c3fSmrg /* The only compression types with more than just fast-clears are MCS, 101101e04c3fSmrg * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually 101201e04c3fSmrg * track the current fast-clear and compression state. This leaves us 101301e04c3fSmrg * with just MCS and CCS_E. 101401e04c3fSmrg */ 101501e04c3fSmrg if (aux_usage != ISL_AUX_USAGE_CCS_E && 101601e04c3fSmrg aux_usage != ISL_AUX_USAGE_MCS) 101701e04c3fSmrg return; 101801e04c3fSmrg 101901e04c3fSmrg set_image_compressed_bit(cmd_buffer, image, aspect, 102001e04c3fSmrg level, base_layer, layer_count, true); 102101e04c3fSmrg} 102201e04c3fSmrg 102301e04c3fSmrgstatic void 102401e04c3fSmrginit_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, 102501e04c3fSmrg const struct anv_image *image, 102601e04c3fSmrg VkImageAspectFlagBits aspect) 102701e04c3fSmrg{ 102801e04c3fSmrg assert(cmd_buffer && image); 10297ec681f3Smrg assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); 103001e04c3fSmrg 103101e04c3fSmrg set_image_fast_clear_state(cmd_buffer, image, aspect, 103201e04c3fSmrg ANV_FAST_CLEAR_NONE); 103301e04c3fSmrg 10349f464c52Smaya /* Initialize the struct fields that are accessed for fast-clears so that 10359f464c52Smaya * the HW restrictions on the field values are satisfied. 103601e04c3fSmrg */ 103701e04c3fSmrg struct anv_address addr = 103801e04c3fSmrg anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); 103901e04c3fSmrg 10407ec681f3Smrg if (GFX_VER >= 9) { 10419f464c52Smaya const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; 10427ec681f3Smrg const unsigned num_dwords = GFX_VER >= 10 ? 
10439f464c52Smaya isl_dev->ss.clear_color_state_size / 4 : 10449f464c52Smaya isl_dev->ss.clear_value_size / 4; 10459f464c52Smaya for (unsigned i = 0; i < num_dwords; i++) { 104601e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { 104701e04c3fSmrg sdi.Address = addr; 104801e04c3fSmrg sdi.Address.offset += i * 4; 104901e04c3fSmrg sdi.ImmediateData = 0; 105001e04c3fSmrg } 105101e04c3fSmrg } 105201e04c3fSmrg } else { 105301e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { 105401e04c3fSmrg sdi.Address = addr; 10557ec681f3Smrg if (GFX_VERx10 >= 75) { 105601e04c3fSmrg /* Pre-SKL, the dword containing the clear values also contains 105701e04c3fSmrg * other fields, so we need to initialize those fields to match the 105801e04c3fSmrg * values that would be in a color attachment. 105901e04c3fSmrg */ 106001e04c3fSmrg sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 | 106101e04c3fSmrg ISL_CHANNEL_SELECT_GREEN << 22 | 106201e04c3fSmrg ISL_CHANNEL_SELECT_BLUE << 19 | 106301e04c3fSmrg ISL_CHANNEL_SELECT_ALPHA << 16; 10647ec681f3Smrg } else if (GFX_VER == 7) { 106501e04c3fSmrg /* On IVB, the dword containing the clear values also contains 106601e04c3fSmrg * other fields that must be zero or can be zero. 106701e04c3fSmrg */ 106801e04c3fSmrg sdi.ImmediateData = 0; 106901e04c3fSmrg } 107001e04c3fSmrg } 107101e04c3fSmrg } 107201e04c3fSmrg} 107301e04c3fSmrg 107401e04c3fSmrg/* Copy the fast-clear value dword(s) between a surface state object and an 107501e04c3fSmrg * image's fast clear state buffer. 
107601e04c3fSmrg */ 107701e04c3fSmrgstatic void 107801e04c3fSmrggenX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, 107901e04c3fSmrg struct anv_state surface_state, 108001e04c3fSmrg const struct anv_image *image, 108101e04c3fSmrg VkImageAspectFlagBits aspect, 108201e04c3fSmrg bool copy_from_surface_state) 108301e04c3fSmrg{ 108401e04c3fSmrg assert(cmd_buffer && image); 10857ec681f3Smrg assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); 108601e04c3fSmrg 108701e04c3fSmrg struct anv_address ss_clear_addr = { 10889f464c52Smaya .bo = cmd_buffer->device->surface_state_pool.block_pool.bo, 108901e04c3fSmrg .offset = surface_state.offset + 109001e04c3fSmrg cmd_buffer->device->isl_dev.ss.clear_value_offset, 109101e04c3fSmrg }; 109201e04c3fSmrg const struct anv_address entry_addr = 109301e04c3fSmrg anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); 109401e04c3fSmrg unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; 109501e04c3fSmrg 10967ec681f3Smrg#if GFX_VER == 7 10977ec681f3Smrg /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM 10989f464c52Smaya * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is 10999f464c52Smaya * in-flight when they are issued even if the memory touched is not 11009f464c52Smaya * currently active for rendering. The weird bit is that it is not the 11019f464c52Smaya * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight 11029f464c52Smaya * rendering hangs such that the next stalling command after the 11039f464c52Smaya * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. 11049f464c52Smaya * 11059f464c52Smaya * It is unclear exactly why this hang occurs. Both MI commands come with 11069f464c52Smaya * warnings about the 3D pipeline but that doesn't seem to fully explain 11079f464c52Smaya * it. 
My (Jason's) best theory is that it has something to do with the 11089f464c52Smaya * fact that we're using a GPU state register as our temporary and that 11099f464c52Smaya * something with reading/writing it is causing problems. 11109f464c52Smaya * 11119f464c52Smaya * In order to work around this issue, we emit a PIPE_CONTROL with the 11129f464c52Smaya * command streamer stall bit set. 11139f464c52Smaya */ 11147ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 11157ec681f3Smrg ANV_PIPE_CS_STALL_BIT, 11167ec681f3Smrg "after copy_fast_clear_dwords. Avoid potential hang"); 11179f464c52Smaya genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 11189f464c52Smaya#endif 11199f464c52Smaya 11207ec681f3Smrg struct mi_builder b; 11217ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 11229f464c52Smaya 112301e04c3fSmrg if (copy_from_surface_state) { 11247ec681f3Smrg mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); 112501e04c3fSmrg } else { 11267ec681f3Smrg mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); 112701e04c3fSmrg 112801e04c3fSmrg /* Updating a surface state object may require that the state cache be 112901e04c3fSmrg * invalidated. From the SKL PRM, Shared Functions -> State -> State 113001e04c3fSmrg * Caching: 113101e04c3fSmrg * 113201e04c3fSmrg * Whenever the RENDER_SURFACE_STATE object in memory pointed to by 113301e04c3fSmrg * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is 113401e04c3fSmrg * modified [...], the L1 state cache must be invalidated to ensure 113501e04c3fSmrg * the new surface or sampler state is fetched from system memory. 113601e04c3fSmrg * 113701e04c3fSmrg * In testing, SKL doesn't actually seem to need this, but HSW does. 
113801e04c3fSmrg */ 11397ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 11407ec681f3Smrg ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, 11417ec681f3Smrg "after copy_fast_clear_dwords surface state update"); 114201e04c3fSmrg } 114301e04c3fSmrg} 114401e04c3fSmrg 114501e04c3fSmrg/** 114601e04c3fSmrg * @brief Transitions a color buffer from one layout to another. 114701e04c3fSmrg * 114801e04c3fSmrg * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for 114901e04c3fSmrg * more information. 115001e04c3fSmrg * 115101e04c3fSmrg * @param level_count VK_REMAINING_MIP_LEVELS isn't supported. 115201e04c3fSmrg * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images, 115301e04c3fSmrg * this represents the maximum layers to transition at each 115401e04c3fSmrg * specified miplevel. 115501e04c3fSmrg */ 115601e04c3fSmrgstatic void 115701e04c3fSmrgtransition_color_buffer(struct anv_cmd_buffer *cmd_buffer, 115801e04c3fSmrg const struct anv_image *image, 115901e04c3fSmrg VkImageAspectFlagBits aspect, 116001e04c3fSmrg const uint32_t base_level, uint32_t level_count, 116101e04c3fSmrg uint32_t base_layer, uint32_t layer_count, 116201e04c3fSmrg VkImageLayout initial_layout, 11637ec681f3Smrg VkImageLayout final_layout, 11647ec681f3Smrg uint64_t src_queue_family, 11657ec681f3Smrg uint64_t dst_queue_family, 11667ec681f3Smrg bool will_full_fast_clear) 116701e04c3fSmrg{ 11687ec681f3Smrg struct anv_device *device = cmd_buffer->device; 11697ec681f3Smrg const struct intel_device_info *devinfo = &device->info; 117001e04c3fSmrg /* Validate the inputs. */ 117101e04c3fSmrg assert(cmd_buffer); 11727ec681f3Smrg assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); 117301e04c3fSmrg /* These values aren't supported for simplicity's sake. */ 117401e04c3fSmrg assert(level_count != VK_REMAINING_MIP_LEVELS && 117501e04c3fSmrg layer_count != VK_REMAINING_ARRAY_LAYERS); 117601e04c3fSmrg /* Ensure the subresource range is valid. 
*/ 11779f464c52Smaya UNUSED uint64_t last_level_num = base_level + level_count; 11787ec681f3Smrg const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level); 11797ec681f3Smrg UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth); 118001e04c3fSmrg assert((uint64_t)base_layer + layer_count <= image_layers); 11817ec681f3Smrg assert(last_level_num <= image->vk.mip_levels); 11827ec681f3Smrg /* If there is a layout transfer, the final layout cannot be undefined or 11837ec681f3Smrg * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198). 118401e04c3fSmrg */ 11857ec681f3Smrg assert(initial_layout == final_layout || 11867ec681f3Smrg (final_layout != VK_IMAGE_LAYOUT_UNDEFINED && 11877ec681f3Smrg final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED)); 11887ec681f3Smrg const struct isl_drm_modifier_info *isl_mod_info = 11897ec681f3Smrg image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT 11907ec681f3Smrg ? isl_drm_modifier_get_info(image->vk.drm_format_mod) 11917ec681f3Smrg : NULL; 11927ec681f3Smrg 11937ec681f3Smrg const bool src_queue_external = 11947ec681f3Smrg src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || 11957ec681f3Smrg src_queue_family == VK_QUEUE_FAMILY_EXTERNAL; 11967ec681f3Smrg 11977ec681f3Smrg const bool dst_queue_external = 11987ec681f3Smrg dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || 11997ec681f3Smrg dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL; 12007ec681f3Smrg 12017ec681f3Smrg /* Simultaneous acquire and release on external queues is illegal. */ 12027ec681f3Smrg assert(!src_queue_external || !dst_queue_external); 12037ec681f3Smrg 12047ec681f3Smrg /* Ownership transition on an external queue requires special action if the 12057ec681f3Smrg * image has a DRM format modifier because we store image data in 12067ec681f3Smrg * a driver-private bo which is inaccessible to the external queue. 
12077ec681f3Smrg */ 12087ec681f3Smrg const bool mod_acquire = 12097ec681f3Smrg src_queue_external && 12107ec681f3Smrg image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; 12117ec681f3Smrg 12127ec681f3Smrg const bool mod_release = 12137ec681f3Smrg dst_queue_external && 12147ec681f3Smrg image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; 12157ec681f3Smrg 12167ec681f3Smrg if (initial_layout == final_layout && 12177ec681f3Smrg !mod_acquire && !mod_release) { 12187ec681f3Smrg /* No work is needed. */ 12197ec681f3Smrg return; 12207ec681f3Smrg } 122101e04c3fSmrg 12227ec681f3Smrg const uint32_t plane = anv_image_aspect_to_plane(image, aspect); 122301e04c3fSmrg 12247ec681f3Smrg if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && 122501e04c3fSmrg final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { 122601e04c3fSmrg /* This surface is a linear compressed image with a tiled shadow surface 122701e04c3fSmrg * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so 122801e04c3fSmrg * we need to ensure the shadow copy is up-to-date. 
122901e04c3fSmrg */ 12307ec681f3Smrg assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT); 12317ec681f3Smrg assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); 12327ec681f3Smrg assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR); 123301e04c3fSmrg assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR); 12347ec681f3Smrg assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format)); 123501e04c3fSmrg assert(plane == 0); 123601e04c3fSmrg anv_image_copy_to_shadow(cmd_buffer, image, 12377ec681f3Smrg VK_IMAGE_ASPECT_COLOR_BIT, 123801e04c3fSmrg base_level, level_count, 123901e04c3fSmrg base_layer, layer_count); 124001e04c3fSmrg } 124101e04c3fSmrg 124201e04c3fSmrg if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) 124301e04c3fSmrg return; 124401e04c3fSmrg 12457ec681f3Smrg assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR); 124601e04c3fSmrg 12477ec681f3Smrg /* The following layouts are equivalent for non-linear images. */ 12487ec681f3Smrg const bool initial_layout_undefined = 12497ec681f3Smrg initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || 12507ec681f3Smrg initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED; 12517ec681f3Smrg 12527ec681f3Smrg bool must_init_fast_clear_state = false; 12537ec681f3Smrg bool must_init_aux_surface = false; 12547ec681f3Smrg 12557ec681f3Smrg if (initial_layout_undefined) { 12567ec681f3Smrg /* The subresource may have been aliased and populated with arbitrary 12577ec681f3Smrg * data. 12587ec681f3Smrg */ 12597ec681f3Smrg must_init_fast_clear_state = true; 12607ec681f3Smrg must_init_aux_surface = true; 12617ec681f3Smrg } else if (mod_acquire) { 12627ec681f3Smrg /* The fast clear state lives in a driver-private bo, and therefore the 12637ec681f3Smrg * external/foreign queue is unaware of it. 
12647ec681f3Smrg * 12657ec681f3Smrg * If this is the first time we are accessing the image, then the fast 12667ec681f3Smrg * clear state is uninitialized. 126701e04c3fSmrg * 12687ec681f3Smrg * If this is NOT the first time we are accessing the image, then the fast 12697ec681f3Smrg * clear state may still be valid and correct due to the resolve during 12707ec681f3Smrg * our most recent ownership release. However, we do not track the aux 12717ec681f3Smrg * state with MI stores, and therefore must assume the worst-case: that 12727ec681f3Smrg * this is the first time we are accessing the image. 127301e04c3fSmrg */ 12747ec681f3Smrg assert(image->planes[plane].fast_clear_memory_range.binding == 12757ec681f3Smrg ANV_IMAGE_MEMORY_BINDING_PRIVATE); 12767ec681f3Smrg must_init_fast_clear_state = true; 12777ec681f3Smrg 12787ec681f3Smrg if (image->planes[plane].aux_surface.memory_range.binding == 12797ec681f3Smrg ANV_IMAGE_MEMORY_BINDING_PRIVATE) { 12807ec681f3Smrg assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE); 12817ec681f3Smrg 12827ec681f3Smrg /* The aux surface, like the fast clear state, lives in 12837ec681f3Smrg * a driver-private bo. We must initialize the aux surface for the 12847ec681f3Smrg * same reasons we must initialize the fast clear state. 12857ec681f3Smrg */ 12867ec681f3Smrg must_init_aux_surface = true; 12877ec681f3Smrg } else { 12887ec681f3Smrg assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE); 12897ec681f3Smrg 12907ec681f3Smrg /* The aux surface, unlike the fast clear state, lives in 12917ec681f3Smrg * application-visible VkDeviceMemory and is shared with the 12927ec681f3Smrg * external/foreign queue. Therefore, when we acquire ownership of the 12937ec681f3Smrg * image with a defined VkImageLayout, the aux surface is valid and has 12947ec681f3Smrg * the aux state required by the modifier. 
12957ec681f3Smrg */ 12967ec681f3Smrg must_init_aux_surface = false; 12977ec681f3Smrg } 12987ec681f3Smrg } 12997ec681f3Smrg 13007ec681f3Smrg#if GFX_VER == 12 13017ec681f3Smrg /* We do not yet support modifiers with aux on gen12. */ 13027ec681f3Smrg assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT); 13037ec681f3Smrg 13047ec681f3Smrg if (initial_layout_undefined) { 13057ec681f3Smrg if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { 13067ec681f3Smrg anv_image_init_aux_tt(cmd_buffer, image, aspect, 13077ec681f3Smrg base_level, level_count, 13087ec681f3Smrg base_layer, layer_count); 13097ec681f3Smrg } 13107ec681f3Smrg } 13117ec681f3Smrg#else 13127ec681f3Smrg assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); 13137ec681f3Smrg#endif 13147ec681f3Smrg 13157ec681f3Smrg if (must_init_fast_clear_state) { 131601e04c3fSmrg if (base_level == 0 && base_layer == 0) 131701e04c3fSmrg init_fast_clear_color(cmd_buffer, image, aspect); 13187ec681f3Smrg } 13197ec681f3Smrg 13207ec681f3Smrg if (must_init_aux_surface) { 13217ec681f3Smrg assert(must_init_fast_clear_state); 132201e04c3fSmrg 132301e04c3fSmrg /* Initialize the aux buffers to enable correct rendering. In order to 132401e04c3fSmrg * ensure that things such as storage images work correctly, aux buffers 132501e04c3fSmrg * need to be initialized to valid data. 132601e04c3fSmrg * 132701e04c3fSmrg * Having an aux buffer with invalid data is a problem for two reasons: 132801e04c3fSmrg * 132901e04c3fSmrg * 1) Having an invalid value in the buffer can confuse the hardware. 133001e04c3fSmrg * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is 133101e04c3fSmrg * invalid and leads to the hardware doing strange things. It 133201e04c3fSmrg * doesn't hang as far as we can tell but rendering corruption can 133301e04c3fSmrg * occur. 
133401e04c3fSmrg * 133501e04c3fSmrg * 2) If this transition is into the GENERAL layout and we then use the 133601e04c3fSmrg * image as a storage image, then we must have the aux buffer in the 133701e04c3fSmrg * pass-through state so that, if we then go to texture from the 133801e04c3fSmrg * image, we get the results of our storage image writes and not the 133901e04c3fSmrg * fast clear color or other random data. 134001e04c3fSmrg * 134101e04c3fSmrg * For CCS both of the problems above are real demonstrable issues. In 134201e04c3fSmrg * that case, the only thing we can do is to perform an ambiguate to 134301e04c3fSmrg * transition the aux surface into the pass-through state. 134401e04c3fSmrg * 134501e04c3fSmrg * For MCS, (2) is never an issue because we don't support multisampled 134601e04c3fSmrg * storage images. In theory, issue (1) is a problem with MCS but we've 134701e04c3fSmrg * never seen it in the wild. For 4x and 16x, all bit patters could, in 134801e04c3fSmrg * theory, be interpreted as something but we don't know that all bit 134901e04c3fSmrg * patterns are actually valid. For 2x and 8x, you could easily end up 135001e04c3fSmrg * with the MCS referring to an invalid plane because not all bits of 135101e04c3fSmrg * the MCS value are actually used. Even though we've never seen issues 135201e04c3fSmrg * in the wild, it's best to play it safe and initialize the MCS. We 135301e04c3fSmrg * can use a fast-clear for MCS because we only ever touch from render 135401e04c3fSmrg * and texture (no image load store). 
135501e04c3fSmrg */ 13567ec681f3Smrg if (image->vk.samples == 1) { 135701e04c3fSmrg for (uint32_t l = 0; l < level_count; l++) { 135801e04c3fSmrg const uint32_t level = base_level + l; 135901e04c3fSmrg 136001e04c3fSmrg uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); 136101e04c3fSmrg if (base_layer >= aux_layers) 136201e04c3fSmrg break; /* We will only get fewer layers as level increases */ 136301e04c3fSmrg uint32_t level_layer_count = 136401e04c3fSmrg MIN2(layer_count, aux_layers - base_layer); 136501e04c3fSmrg 13667ec681f3Smrg /* If will_full_fast_clear is set, the caller promises to 13677ec681f3Smrg * fast-clear the largest portion of the specified range as it can. 13687ec681f3Smrg * For color images, that means only the first LOD and array slice. 13697ec681f3Smrg */ 13707ec681f3Smrg if (level == 0 && base_layer == 0 && will_full_fast_clear) { 13717ec681f3Smrg base_layer++; 13727ec681f3Smrg level_layer_count--; 13737ec681f3Smrg if (level_layer_count == 0) 13747ec681f3Smrg continue; 13757ec681f3Smrg } 13767ec681f3Smrg 137701e04c3fSmrg anv_image_ccs_op(cmd_buffer, image, 13787ec681f3Smrg image->planes[plane].primary_surface.isl.format, 13797ec681f3Smrg ISL_SWIZZLE_IDENTITY, 138001e04c3fSmrg aspect, level, base_layer, level_layer_count, 138101e04c3fSmrg ISL_AUX_OP_AMBIGUATE, NULL, false); 138201e04c3fSmrg 138301e04c3fSmrg if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { 138401e04c3fSmrg set_image_compressed_bit(cmd_buffer, image, aspect, 138501e04c3fSmrg level, base_layer, level_layer_count, 138601e04c3fSmrg false); 138701e04c3fSmrg } 138801e04c3fSmrg } 138901e04c3fSmrg } else { 13907ec681f3Smrg if (image->vk.samples == 4 || image->vk.samples == 16) { 13917ec681f3Smrg anv_perf_warn(VK_LOG_OBJS(&image->vk.base), 139201e04c3fSmrg "Doing a potentially unnecessary fast-clear to " 139301e04c3fSmrg "define an MCS buffer."); 139401e04c3fSmrg } 139501e04c3fSmrg 13967ec681f3Smrg /* If will_full_fast_clear is set, the caller promises to 
fast-clear 13977ec681f3Smrg * the largest portion of the specified range as it can. 13987ec681f3Smrg */ 13997ec681f3Smrg if (will_full_fast_clear) 14007ec681f3Smrg return; 14017ec681f3Smrg 140201e04c3fSmrg assert(base_level == 0 && level_count == 1); 140301e04c3fSmrg anv_image_mcs_op(cmd_buffer, image, 14047ec681f3Smrg image->planes[plane].primary_surface.isl.format, 14057ec681f3Smrg ISL_SWIZZLE_IDENTITY, 140601e04c3fSmrg aspect, base_layer, layer_count, 140701e04c3fSmrg ISL_AUX_OP_FAST_CLEAR, NULL, false); 140801e04c3fSmrg } 140901e04c3fSmrg return; 141001e04c3fSmrg } 141101e04c3fSmrg 14127ec681f3Smrg enum isl_aux_usage initial_aux_usage = 14137ec681f3Smrg anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); 14147ec681f3Smrg enum isl_aux_usage final_aux_usage = 14157ec681f3Smrg anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); 14167ec681f3Smrg 14177ec681f3Smrg /* We must override the anv_layout_to_* functions because they are unaware of 14187ec681f3Smrg * acquire/release direction. 14197ec681f3Smrg */ 14207ec681f3Smrg if (mod_acquire) { 14217ec681f3Smrg initial_aux_usage = isl_mod_info->aux_usage; 14227ec681f3Smrg } else if (mod_release) { 14237ec681f3Smrg final_aux_usage = isl_mod_info->aux_usage; 14247ec681f3Smrg } 142501e04c3fSmrg 142601e04c3fSmrg /* The current code assumes that there is no mixing of CCS_E and CCS_D. 142701e04c3fSmrg * We can handle transitions between CCS_D/E to and from NONE. What we 142801e04c3fSmrg * don't yet handle is switching between CCS_E and CCS_D within a given 142901e04c3fSmrg * image. Doing so in a performant way requires more detailed aux state 143001e04c3fSmrg * tracking such as what is done in i965. For now, just assume that we 143101e04c3fSmrg * only have one type of compression. 
143201e04c3fSmrg */ 143301e04c3fSmrg assert(initial_aux_usage == ISL_AUX_USAGE_NONE || 143401e04c3fSmrg final_aux_usage == ISL_AUX_USAGE_NONE || 143501e04c3fSmrg initial_aux_usage == final_aux_usage); 143601e04c3fSmrg 143701e04c3fSmrg /* If initial aux usage is NONE, there is nothing to resolve */ 143801e04c3fSmrg if (initial_aux_usage == ISL_AUX_USAGE_NONE) 143901e04c3fSmrg return; 144001e04c3fSmrg 144101e04c3fSmrg enum isl_aux_op resolve_op = ISL_AUX_OP_NONE; 144201e04c3fSmrg 144301e04c3fSmrg /* If the initial layout supports more fast clear than the final layout 144401e04c3fSmrg * then we need at least a partial resolve. 144501e04c3fSmrg */ 144601e04c3fSmrg const enum anv_fast_clear_type initial_fast_clear = 144701e04c3fSmrg anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout); 144801e04c3fSmrg const enum anv_fast_clear_type final_fast_clear = 144901e04c3fSmrg anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout); 145001e04c3fSmrg if (final_fast_clear < initial_fast_clear) 145101e04c3fSmrg resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE; 145201e04c3fSmrg 145301e04c3fSmrg if (initial_aux_usage == ISL_AUX_USAGE_CCS_E && 145401e04c3fSmrg final_aux_usage != ISL_AUX_USAGE_CCS_E) 145501e04c3fSmrg resolve_op = ISL_AUX_OP_FULL_RESOLVE; 145601e04c3fSmrg 145701e04c3fSmrg if (resolve_op == ISL_AUX_OP_NONE) 145801e04c3fSmrg return; 145901e04c3fSmrg 146001e04c3fSmrg /* Perform a resolve to synchronize data between the main and aux buffer. 146101e04c3fSmrg * Before we begin, we must satisfy the cache flushing requirement specified 146201e04c3fSmrg * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)": 146301e04c3fSmrg * 146401e04c3fSmrg * Any transition from any value in {Clear, Render, Resolve} to a 146501e04c3fSmrg * different value in {Clear, Render, Resolve} requires end of pipe 146601e04c3fSmrg * synchronization. 
146701e04c3fSmrg * 146801e04c3fSmrg * We perform a flush of the write cache before and after the clear and 146901e04c3fSmrg * resolve operations to meet this requirement. 147001e04c3fSmrg * 147101e04c3fSmrg * Unlike other drawing, fast clear operations are not properly 147201e04c3fSmrg * synchronized. The first PIPE_CONTROL here likely ensures that the 147301e04c3fSmrg * contents of the previous render or clear hit the render target before we 147401e04c3fSmrg * resolve and the second likely ensures that the resolve is complete before 147501e04c3fSmrg * we do any more rendering or clearing. 147601e04c3fSmrg */ 14777ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 14787ec681f3Smrg ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | 14797ec681f3Smrg ANV_PIPE_END_OF_PIPE_SYNC_BIT, 14807ec681f3Smrg "after transition RT"); 148101e04c3fSmrg 148201e04c3fSmrg for (uint32_t l = 0; l < level_count; l++) { 148301e04c3fSmrg uint32_t level = base_level + l; 148401e04c3fSmrg 148501e04c3fSmrg uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); 148601e04c3fSmrg if (base_layer >= aux_layers) 148701e04c3fSmrg break; /* We will only get fewer layers as level increases */ 148801e04c3fSmrg uint32_t level_layer_count = 148901e04c3fSmrg MIN2(layer_count, aux_layers - base_layer); 149001e04c3fSmrg 149101e04c3fSmrg for (uint32_t a = 0; a < level_layer_count; a++) { 149201e04c3fSmrg uint32_t array_layer = base_layer + a; 14937ec681f3Smrg 14947ec681f3Smrg /* If will_full_fast_clear is set, the caller promises to fast-clear 14957ec681f3Smrg * the largest portion of the specified range as it can. For color 14967ec681f3Smrg * images, that means only the first LOD and array slice. 
          */
         if (level == 0 && array_layer == 0 && will_full_fast_clear)
            continue;

         if (image->vk.samples == 1) {
            anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, level, array_layer, resolve_op,
                                           final_fast_clear);
         } else {
            /* We only support fast-clear on the first layer so partial
             * resolves should not be used on other layers as they will use
             * the clear color stored in memory that is only valid for layer0.
             */
            if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
                array_layer != 0)
               continue;

            anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, array_layer, resolve_op,
                                           final_fast_clear);
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");
}

/**
 * Set up anv_cmd_state::attachments for vkCmdBeginRenderPass.
 *
 * Allocates the per-attachment state array, resolves each attachment's
 * image view (from VkRenderPassAttachmentBeginInfoKHR when provided,
 * otherwise from the framebuffer), and, when a VkRenderPassBeginInfo is
 * given, records which aspects need a clear or a load and whether the
 * clear can be performed as a fast clear.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY (also recorded on
 * the batch via anv_batch_set_error) if the state array allocation fails.
 */
static VkResult
genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                                   const struct anv_render_pass *pass,
                                   const struct anv_framebuffer *framebuffer,
                                   const VkRenderPassBeginInfo *begin)
{
   struct anv_cmd_state *state = &cmd_buffer->state;

   /* Any array left over from a previous render pass is released first. */
   vk_free(&cmd_buffer->pool->alloc, state->attachments);

   if (pass->attachment_count > 0) {
      state->attachments = vk_zalloc(&cmd_buffer->pool->alloc,
                                     pass->attachment_count *
                                     sizeof(state->attachments[0]),
                                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (state->attachments == NULL) {
         /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
         return anv_batch_set_error(&cmd_buffer->batch,
                                    VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   } else {
      state->attachments = NULL;
   }

   /* With VK_KHR_imageless_framebuffer the views come from the
    * VkRenderPassAttachmentBeginInfoKHR chained to the begin info rather
    * than from the framebuffer object.
    */
   const VkRenderPassAttachmentBeginInfoKHR *attach_begin =
      vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
   if (begin && !attach_begin)
      assert(pass->attachment_count == framebuffer->attachment_count);

   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
      if (attach_begin && attach_begin->attachmentCount != 0) {
         assert(attach_begin->attachmentCount == pass->attachment_count);
         ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]);
         state->attachments[i].image_view = iview;
      } else if (framebuffer && i < framebuffer->attachment_count) {
         state->attachments[i].image_view = framebuffer->attachments[i];
      } else {
         state->attachments[i].image_view = NULL;
      }
   }

   if (begin) {
      for (uint32_t i = 0; i < pass->attachment_count; ++i) {
         const struct anv_render_pass_attachment *pass_att = &pass->attachments[i];
         struct anv_attachment_state *att_state = &state->attachments[i];
         VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format);
         VkImageAspectFlags clear_aspects = 0;
         VkImageAspectFlags load_aspects = 0;

         if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            /* color attachment */
            if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
               clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
               load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
            }
         } else {
            /* depthstencil attachment */
            if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
               if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
               }
            }
            if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
               if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
                  clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
                  load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
               }
            }
         }

         att_state->current_layout = pass_att->initial_layout;
         att_state->current_stencil_layout = pass_att->stencil_initial_layout;
         att_state->pending_clear_aspects = clear_aspects;
         att_state->pending_load_aspects = load_aspects;
         if (clear_aspects)
            att_state->clear_value = begin->pClearValues[i];

         /* NOTE(review): this path assumes every attachment has a non-NULL
          * image view when begin != NULL; if neither attach_begin nor the
          * framebuffer supplied one, iview is NULL and the dereference
          * below would fault — confirm callers guarantee this.
          */
         struct anv_image_view *iview = state->attachments[i].image_view;

         const uint32_t num_layers = iview->planes[0].isl.array_len;
         att_state->pending_clear_views = (1 << num_layers) - 1;

         /* This will be initialized after the first subpass transition. */
         att_state->aux_usage = ISL_AUX_USAGE_NONE;

         att_state->fast_clear = false;
         if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
            assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
            att_state->fast_clear =
               anv_can_fast_clear_color_view(cmd_buffer->device, iview,
                                             pass_att->first_subpass_layout,
                                             vk_to_isl_color(att_state->clear_value.color),
                                             framebuffer->layers,
                                             begin->renderArea);
         } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                     VK_IMAGE_ASPECT_STENCIL_BIT)) {
            att_state->fast_clear =
               anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,
                                         pass_att->first_subpass_layout,
                                         clear_aspects,
                                         att_state->clear_value.depthStencil.depth,
                                         begin->renderArea);
         }
      }
   }

   return VK_SUCCESS;
}

/**
 * Allocate surface states for the subpass' color and input attachments,
 * plus one extra for the NULL surface state, out of the command buffer's
 * surface state stream.
 */
static VkResult
genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,
                                       const struct anv_render_pass *pass,
                                       const struct anv_subpass *subpass)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
   struct anv_cmd_state *state = &cmd_buffer->state;

   /* Reserve one for the NULL state.
    */
   unsigned num_states = 1;
   /* Count one surface state per color attachment used as a color or input
    * attachment in this subpass; non-color and unused attachments get none.
    */
   for (uint32_t i = 0; i < subpass->attachment_count; i++) {
      uint32_t att = subpass->attachments[i].attachment;
      if (att == VK_ATTACHMENT_UNUSED)
         continue;

      assert(att < pass->attachment_count);
      if (!vk_format_is_color(pass->attachments[att].format))
         continue;

      const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
      assert(util_bitcount(att_usage) == 1);

      if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||
          att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
         num_states++;
   }

   /* One aligned stride per state so every state in the block starts on an
    * isl-required boundary.
    */
   const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
   state->attachment_states =
      anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
                             num_states * ss_stride, isl_dev->ss.align);
   if (state->attachment_states.map == NULL) {
      /* Propagate the error to vkEndCommandBuffer via the batch. */
      return anv_batch_set_error(&cmd_buffer->batch,
                                 VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   struct anv_state next_state = state->attachment_states;
   next_state.alloc_size = isl_dev->ss.size;

   /* The first slot is the NULL surface state. */
   state->null_surface_state = next_state;
   next_state.offset += ss_stride;
   next_state.map += ss_stride;

   /* Hand out the remaining slots in the same order the counting loop
    * above visited the attachments.
    */
   for (uint32_t i = 0; i < subpass->attachment_count; i++) {
      uint32_t att = subpass->attachments[i].attachment;
      if (att == VK_ATTACHMENT_UNUSED)
         continue;

      assert(att < pass->attachment_count);
      if (!vk_format_is_color(pass->attachments[att].format))
         continue;

      const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
      assert(util_bitcount(att_usage) == 1);

      if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
         state->attachments[att].color.state = next_state;
      else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
         state->attachments[att].input.state = next_state;
      else
         continue;

      next_state.offset += ss_stride;
      next_state.map += ss_stride;
   }

   /* Every allocated slot must have been consumed exactly. */
   assert(next_state.offset == state->attachment_states.offset +
                               state->attachment_states.alloc_size);

   return VK_SUCCESS;
}

VkResult
genX(BeginCommandBuffer)(
    VkCommandBuffer                             commandBuffer,
    const VkCommandBufferBeginInfo*             pBeginInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must *initialize* the
    * command buffer's state. Otherwise, we must *reset* its state. In both
    * cases we reset it.
    *
    * From the Vulkan 1.0 spec:
    *
    *    If a command buffer is in the executable state and the command buffer
    *    was allocated from a command pool with the
    *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
    *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
    *    as if vkResetCommandBuffer had been called with
    *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set.
    *    It then puts
    *    the command buffer in the recording state.
    */
   anv_cmd_buffer_reset(cmd_buffer);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
    * primary level command buffers.
    *
    * From the Vulkan 1.0 spec:
    *
    *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
    *    secondary command buffer is considered to be entirely inside a render
    *    pass. If this is a primary command buffer, then this bit is ignored.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;

   genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

   /* We sometimes store vertex data in the dynamic state buffer for blorp
    * operations and our dynamic state stream may re-use data from previous
    * command buffers. In order to prevent stale cache data, we flush the VF
    * cache. We could do this on every blorp call but that's not really
    * needed as all of the data will get written by the CPU prior to the GPU
    * executing anything. The chances are fairly high that they will use
    * blorp at least once per primary command buffer so it shouldn't be
    * wasted.
    *
    * There is also a workaround on gfx8 which requires us to invalidate the
    * VF cache occasionally. It's easier if we can assume we start with a
    * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                             "new cmd buffer");

   /* Re-emit the aux table register in every command buffer. This way we're
    * ensured that we have the table even if this command buffer doesn't
    * initialize any images.
    */
   if (cmd_buffer->device->info.has_aux_map) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                                "new cmd buffer with aux-tt");
   }

   /* We send an "Indirect State Pointers Disable" packet at
    * EndCommandBuffer, so all push contant packets are ignored during a
    * context restore. Documentation says after that command, we need to
    * emit push constants again before any rendering operation. So we
    * flag them dirty here to make sure they get emitted.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;

   VkResult result = VK_SUCCESS;
   if (cmd_buffer->usage_flags &
       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
      /* A secondary command buffer recorded entirely inside a render pass:
       * recover the pass/subpass/framebuffer from the inheritance info and
       * rebuild the attachment state. begin == NULL below means no
       * clear/load bookkeeping is done for this path.
       */
      assert(pBeginInfo->pInheritanceInfo);
      ANV_FROM_HANDLE(anv_render_pass, pass,
                      pBeginInfo->pInheritanceInfo->renderPass);
      struct anv_subpass *subpass =
         &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
                      pBeginInfo->pInheritanceInfo->framebuffer);

      cmd_buffer->state.pass = pass;
      cmd_buffer->state.subpass = subpass;

      /* This is optional in the inheritance info. */
      cmd_buffer->state.framebuffer = framebuffer;

      result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                                  framebuffer, NULL);
      if (result != VK_SUCCESS)
         return result;

      result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
                                                      subpass);
      if (result != VK_SUCCESS)
         return result;

      /* Record that HiZ is enabled if we can. */
      if (cmd_buffer->state.framebuffer) {
         const struct anv_image_view * const iview =
            anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);

         if (iview) {
            VkImageLayout layout =
                cmd_buffer->state.subpass->depth_stencil_attachment->layout;

            enum isl_aux_usage aux_usage =
               anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
                                       VK_IMAGE_ASPECT_DEPTH_BIT,
                                       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                                       layout);

            cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);
         }
      }

      cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
   }

#if GFX_VERx10 >= 75
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
         vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);

      /* If secondary buffer supports conditional rendering
       * we should emit commands as if conditional rendering is enabled.
       */
      cmd_buffer->state.conditional_render_enabled =
         conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
   }
#endif

   return result;
}

/* From the PRM, Volume 2a:
 *
 *    "Indirect State Pointers Disable
 *
 *    At the completion of the post-sync operation associated with this pipe
 *    control packet, the indirect state pointers in the hardware are
 *    considered invalid; the indirect pointers are not saved in the context.
 *    If any new indirect state commands are executed in the command stream
 *    while the pipe control is pending, the new indirect state commands are
 *    preserved.
 *
 *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
 *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
 *    commands are only considered as Indirect State Pointers. Once ISP is
 *    issued in a context, SW must initialize by programming push constant
 *    commands for all the shaders (at least to zero length) before attempting
 *    any rendering operation for the same context."
 *
 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
 * even though they point to a BO that has been already unreferenced at
 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by these scratch page (every address not covered by
 * a BO should be pointing to the scratch page).
 * But on CNL, it is
 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
 * instruction.
 *
 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
 * context restore, so the mentioned hang doesn't happen. However,
 * software must program push constant commands for all stages prior to
 * rendering anything. So we flag them dirty in BeginCommandBuffer.
 *
 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disable the push constants
 * so that it doesn't hang a previous 3DPRIMITIVE.
 */
static void
emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
{
   /* First PIPE_CONTROL: stall so outstanding work has consumed the push
    * constants before they are invalidated.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.StallAtPixelScoreboard = true;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }
   /* Second PIPE_CONTROL: the actual Indirect State Pointers Disable. */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.IndirectStatePointersDisable = true;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }
}

VkResult
genX(EndCommandBuffer)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If recording already failed, just report the stored error. */
   if (anv_batch_has_error(&cmd_buffer->batch))
      return cmd_buffer->batch.status;

   anv_measure_endcommandbuffer(cmd_buffer);

   /* We want every command buffer to start with the PMA fix in a known state,
    * so we disable it at the end of the command buffer.
    */
   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);

   /* Apply any still-pending pipe flushes, then emit the ISP disable
    * before closing out the batch.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   emit_isp_disable(cmd_buffer);

   anv_cmd_buffer_end_batch_buffer(cmd_buffer);

   return VK_SUCCESS;
}

void
genX(CmdExecuteCommands)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    commandBufferCount,
    const VkCommandBuffer*                      pCmdBuffers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);

   assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   if (anv_batch_has_error(&primary->batch))
      return;

   /* The secondary command buffers will assume that the PMA fix is disabled
    * when they begin executing. Make sure this is true.
    */
   genX(cmd_buffer_enable_pma_fix)(primary, false);

   /* The secondary command buffer doesn't know which textures etc. have been
    * flushed prior to their execution. Apply those flushes now.
195401e04c3fSmrg */ 195501e04c3fSmrg genX(cmd_buffer_apply_pipe_flushes)(primary); 195601e04c3fSmrg 195701e04c3fSmrg for (uint32_t i = 0; i < commandBufferCount; i++) { 195801e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); 195901e04c3fSmrg 196001e04c3fSmrg assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); 196101e04c3fSmrg assert(!anv_batch_has_error(&secondary->batch)); 196201e04c3fSmrg 19637ec681f3Smrg#if GFX_VERx10 >= 75 19649f464c52Smaya if (secondary->state.conditional_render_enabled) { 19659f464c52Smaya if (!primary->state.conditional_render_enabled) { 19669f464c52Smaya /* Secondary buffer is constructed as if it will be executed 19679f464c52Smaya * with conditional rendering, we should satisfy this dependency 19689f464c52Smaya * regardless of conditional rendering being enabled in primary. 19699f464c52Smaya */ 19707ec681f3Smrg struct mi_builder b; 19717ec681f3Smrg mi_builder_init(&b, &primary->device->info, &primary->batch); 19727ec681f3Smrg mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), 19737ec681f3Smrg mi_imm(UINT64_MAX)); 19749f464c52Smaya } 19759f464c52Smaya } 19769f464c52Smaya#endif 19779f464c52Smaya 197801e04c3fSmrg if (secondary->usage_flags & 197901e04c3fSmrg VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { 198001e04c3fSmrg /* If we're continuing a render pass from the primary, we need to 198101e04c3fSmrg * copy the surface states for the current subpass into the storage 198201e04c3fSmrg * we allocated for them in BeginCommandBuffer. 
198301e04c3fSmrg */ 198401e04c3fSmrg struct anv_bo *ss_bo = 19859f464c52Smaya primary->device->surface_state_pool.block_pool.bo; 19867ec681f3Smrg struct anv_state src_state = primary->state.attachment_states; 19877ec681f3Smrg struct anv_state dst_state = secondary->state.attachment_states; 198801e04c3fSmrg assert(src_state.alloc_size == dst_state.alloc_size); 198901e04c3fSmrg 199001e04c3fSmrg genX(cmd_buffer_so_memcpy)(primary, 199101e04c3fSmrg (struct anv_address) { 199201e04c3fSmrg .bo = ss_bo, 199301e04c3fSmrg .offset = dst_state.offset, 199401e04c3fSmrg }, 199501e04c3fSmrg (struct anv_address) { 199601e04c3fSmrg .bo = ss_bo, 199701e04c3fSmrg .offset = src_state.offset, 199801e04c3fSmrg }, 199901e04c3fSmrg src_state.alloc_size); 200001e04c3fSmrg } 200101e04c3fSmrg 200201e04c3fSmrg anv_cmd_buffer_add_secondary(primary, secondary); 20037ec681f3Smrg 20047ec681f3Smrg assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL || 20057ec681f3Smrg secondary->perf_query_pool == primary->perf_query_pool); 20067ec681f3Smrg if (secondary->perf_query_pool) 20077ec681f3Smrg primary->perf_query_pool = secondary->perf_query_pool; 20087ec681f3Smrg 20097ec681f3Smrg#if GFX_VERx10 == 120 20107ec681f3Smrg if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN) 20117ec681f3Smrg primary->state.depth_reg_mode = secondary->state.depth_reg_mode; 20127ec681f3Smrg#endif 20137ec681f3Smrg } 20147ec681f3Smrg 20157ec681f3Smrg /* The secondary isn't counted in our VF cache tracking so we need to 20167ec681f3Smrg * invalidate the whole thing. 
20177ec681f3Smrg */ 20187ec681f3Smrg if (GFX_VER >= 8 && GFX_VER <= 9) { 20197ec681f3Smrg anv_add_pending_pipe_bits(primary, 20207ec681f3Smrg ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, 20217ec681f3Smrg "Secondary cmd buffer not tracked in VF cache"); 202201e04c3fSmrg } 202301e04c3fSmrg 202401e04c3fSmrg /* The secondary may have selected a different pipeline (3D or compute) and 202501e04c3fSmrg * may have changed the current L3$ configuration. Reset our tracking 202601e04c3fSmrg * variables to invalid values to ensure that we re-emit these in the case 202701e04c3fSmrg * where we do any draws or compute dispatches from the primary after the 202801e04c3fSmrg * secondary has returned. 202901e04c3fSmrg */ 203001e04c3fSmrg primary->state.current_pipeline = UINT32_MAX; 203101e04c3fSmrg primary->state.current_l3_config = NULL; 20327ec681f3Smrg primary->state.current_hash_scale = 0; 203301e04c3fSmrg 203401e04c3fSmrg /* Each of the secondary command buffers will use its own state base 203501e04c3fSmrg * address. We need to re-emit state base address for the primary after 203601e04c3fSmrg * all of the secondaries are done. 203701e04c3fSmrg * 203801e04c3fSmrg * TODO: Maybe we want to make this a dirty bit to avoid extra state base 203901e04c3fSmrg * address calls? 204001e04c3fSmrg */ 204101e04c3fSmrg genX(cmd_buffer_emit_state_base_address)(primary); 204201e04c3fSmrg} 204301e04c3fSmrg 204401e04c3fSmrg/** 204501e04c3fSmrg * Program the hardware to use the specified L3 configuration. 
204601e04c3fSmrg */ 204701e04c3fSmrgvoid 204801e04c3fSmrggenX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, 20497ec681f3Smrg const struct intel_l3_config *cfg) 205001e04c3fSmrg{ 20517ec681f3Smrg assert(cfg || GFX_VER >= 12); 205201e04c3fSmrg if (cfg == cmd_buffer->state.current_l3_config) 205301e04c3fSmrg return; 205401e04c3fSmrg 20557ec681f3Smrg#if GFX_VER >= 11 20567ec681f3Smrg /* On Gfx11+ we use only one config, so verify it remains the same and skip 20577ec681f3Smrg * the stalling programming entirely. 20587ec681f3Smrg */ 20597ec681f3Smrg assert(cfg == cmd_buffer->device->l3_config); 20607ec681f3Smrg#else 20617ec681f3Smrg if (INTEL_DEBUG(DEBUG_L3)) { 20627ec681f3Smrg mesa_logd("L3 config transition: "); 20637ec681f3Smrg intel_dump_l3_config(cfg, stderr); 206401e04c3fSmrg } 206501e04c3fSmrg 206601e04c3fSmrg /* According to the hardware docs, the L3 partitioning can only be changed 206701e04c3fSmrg * while the pipeline is completely drained and the caches are flushed, 206801e04c3fSmrg * which involves a first PIPE_CONTROL flush which stalls the pipeline... 206901e04c3fSmrg */ 207001e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 207101e04c3fSmrg pc.DCFlushEnable = true; 207201e04c3fSmrg pc.PostSyncOperation = NoWrite; 207301e04c3fSmrg pc.CommandStreamerStallEnable = true; 20747ec681f3Smrg anv_debug_dump_pc(pc); 207501e04c3fSmrg } 207601e04c3fSmrg 207701e04c3fSmrg /* ...followed by a second pipelined PIPE_CONTROL that initiates 207801e04c3fSmrg * invalidation of the relevant caches. Note that because RO invalidation 207901e04c3fSmrg * happens at the top of the pipeline (i.e. 
right away as the PIPE_CONTROL 208001e04c3fSmrg * command is processed by the CS) we cannot combine it with the previous 208101e04c3fSmrg * stalling flush as the hardware documentation suggests, because that 208201e04c3fSmrg * would cause the CS to stall on previous rendering *after* RO 208301e04c3fSmrg * invalidation and wouldn't prevent the RO caches from being polluted by 208401e04c3fSmrg * concurrent rendering before the stall completes. This intentionally 208501e04c3fSmrg * doesn't implement the SKL+ hardware workaround suggesting to enable CS 208601e04c3fSmrg * stall on PIPE_CONTROLs with the texture cache invalidation bit set for 208701e04c3fSmrg * GPGPU workloads because the previous and subsequent PIPE_CONTROLs 208801e04c3fSmrg * already guarantee that there is no concurrent GPGPU kernel execution 208901e04c3fSmrg * (see SKL HSD 2132585). 209001e04c3fSmrg */ 209101e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 209201e04c3fSmrg pc.TextureCacheInvalidationEnable = true; 209301e04c3fSmrg pc.ConstantCacheInvalidationEnable = true; 209401e04c3fSmrg pc.InstructionCacheInvalidateEnable = true; 209501e04c3fSmrg pc.StateCacheInvalidationEnable = true; 209601e04c3fSmrg pc.PostSyncOperation = NoWrite; 20977ec681f3Smrg anv_debug_dump_pc(pc); 209801e04c3fSmrg } 209901e04c3fSmrg 210001e04c3fSmrg /* Now send a third stalling flush to make sure that invalidation is 210101e04c3fSmrg * complete when the L3 configuration registers are modified. 
210201e04c3fSmrg */ 210301e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 210401e04c3fSmrg pc.DCFlushEnable = true; 210501e04c3fSmrg pc.PostSyncOperation = NoWrite; 210601e04c3fSmrg pc.CommandStreamerStallEnable = true; 21077ec681f3Smrg anv_debug_dump_pc(pc); 210801e04c3fSmrg } 210901e04c3fSmrg 21107ec681f3Smrg genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg); 21117ec681f3Smrg#endif /* GFX_VER >= 11 */ 211201e04c3fSmrg cmd_buffer->state.current_l3_config = cfg; 211301e04c3fSmrg} 211401e04c3fSmrg 211501e04c3fSmrgvoid 211601e04c3fSmrggenX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) 211701e04c3fSmrg{ 21187ec681f3Smrg UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info; 211901e04c3fSmrg enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; 212001e04c3fSmrg 21217ec681f3Smrg if (unlikely(cmd_buffer->device->physical->always_flush_cache)) 21227ec681f3Smrg bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; 21237ec681f3Smrg else if (bits == 0) 21247ec681f3Smrg return; 21257ec681f3Smrg 21267ec681f3Smrg /* 21277ec681f3Smrg * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": 21287ec681f3Smrg * 21297ec681f3Smrg * Write synchronization is a special case of end-of-pipe 21307ec681f3Smrg * synchronization that requires that the render cache and/or depth 21317ec681f3Smrg * related caches are flushed to memory, where the data will become 21327ec681f3Smrg * globally visible. 
This type of synchronization is required prior to 21337ec681f3Smrg * SW (CPU) actually reading the result data from memory, or initiating 21347ec681f3Smrg * an operation that will use as a read surface (such as a texture 21357ec681f3Smrg * surface) a previous render target and/or depth/stencil buffer 21367ec681f3Smrg * 21377ec681f3Smrg * 21387ec681f3Smrg * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": 21397ec681f3Smrg * 21407ec681f3Smrg * Exercising the write cache flush bits (Render Target Cache Flush 21417ec681f3Smrg * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only 21427ec681f3Smrg * ensures the write caches are flushed and doesn't guarantee the data 21437ec681f3Smrg * is globally visible. 21447ec681f3Smrg * 21457ec681f3Smrg * SW can track the completion of the end-of-pipe-synchronization by 21467ec681f3Smrg * using "Notify Enable" and "PostSync Operation - Write Immediate 21477ec681f3Smrg * Data" in the PIPE_CONTROL command. 21487ec681f3Smrg * 21497ec681f3Smrg * In other words, flushes are pipelined while invalidations are handled 21507ec681f3Smrg * immediately. Therefore, if we're flushing anything then we need to 21517ec681f3Smrg * schedule an end-of-pipe sync before any invalidations can happen. 215201e04c3fSmrg */ 215301e04c3fSmrg if (bits & ANV_PIPE_FLUSH_BITS) 21547ec681f3Smrg bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; 21557ec681f3Smrg 21567ec681f3Smrg 21577ec681f3Smrg /* HSD 1209978178: docs say that before programming the aux table: 21587ec681f3Smrg * 21597ec681f3Smrg * "Driver must ensure that the engine is IDLE but ensure it doesn't 21607ec681f3Smrg * add extra flushes in the case it knows that the engine is already 21617ec681f3Smrg * IDLE." 
21627ec681f3Smrg */ 21637ec681f3Smrg if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) 21647ec681f3Smrg bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; 216501e04c3fSmrg 21667ec681f3Smrg /* If we're going to do an invalidate and we have a pending end-of-pipe 21677ec681f3Smrg * sync that has yet to be resolved, we do the end-of-pipe sync now. 216801e04c3fSmrg */ 216901e04c3fSmrg if ((bits & ANV_PIPE_INVALIDATE_BITS) && 21707ec681f3Smrg (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { 21717ec681f3Smrg bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; 21727ec681f3Smrg bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; 21737ec681f3Smrg } 21747ec681f3Smrg 21757ec681f3Smrg /* Wa_1409226450, Wait for EU to be idle before pipe control which 21767ec681f3Smrg * invalidates the instruction cache 21777ec681f3Smrg */ 21787ec681f3Smrg if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)) 21797ec681f3Smrg bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; 21807ec681f3Smrg 21817ec681f3Smrg if ((GFX_VER >= 8 && GFX_VER <= 9) && 21827ec681f3Smrg (bits & ANV_PIPE_CS_STALL_BIT) && 21837ec681f3Smrg (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { 21847ec681f3Smrg /* If we are doing a VF cache invalidate AND a CS stall (it must be 21857ec681f3Smrg * both) then we can reset our vertex cache tracking. 
21867ec681f3Smrg */ 21877ec681f3Smrg memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, 21887ec681f3Smrg sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); 21897ec681f3Smrg memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, 21907ec681f3Smrg sizeof(cmd_buffer->state.gfx.ib_dirty_range)); 21917ec681f3Smrg } 21927ec681f3Smrg 21937ec681f3Smrg /* Project: SKL / Argument: LRI Post Sync Operation [23] 21947ec681f3Smrg * 21957ec681f3Smrg * "PIPECONTROL command with “Command Streamer Stall Enable” must be 21967ec681f3Smrg * programmed prior to programming a PIPECONTROL command with "LRI 21977ec681f3Smrg * Post Sync Operation" in GPGPU mode of operation (i.e when 21987ec681f3Smrg * PIPELINE_SELECT command is set to GPGPU mode of operation)." 21997ec681f3Smrg * 22007ec681f3Smrg * The same text exists a few rows below for Post Sync Op. 22017ec681f3Smrg * 22027ec681f3Smrg * On Gfx12 this is Wa_1607156449. 22037ec681f3Smrg */ 22047ec681f3Smrg if (bits & ANV_PIPE_POST_SYNC_BIT) { 22057ec681f3Smrg if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) && 22067ec681f3Smrg cmd_buffer->state.current_pipeline == GPGPU) 22077ec681f3Smrg bits |= ANV_PIPE_CS_STALL_BIT; 22087ec681f3Smrg bits &= ~ANV_PIPE_POST_SYNC_BIT; 220901e04c3fSmrg } 221001e04c3fSmrg 22117ec681f3Smrg if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | 22127ec681f3Smrg ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { 221301e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 22147ec681f3Smrg#if GFX_VER >= 12 22157ec681f3Smrg pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; 22167ec681f3Smrg pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; 22177ec681f3Smrg#else 22187ec681f3Smrg /* Flushing HDC pipeline requires DC Flush on earlier HW. 
*/ 22197ec681f3Smrg pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; 22207ec681f3Smrg#endif 222101e04c3fSmrg pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; 22227ec681f3Smrg pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; 222301e04c3fSmrg pipe.RenderTargetCacheFlushEnable = 222401e04c3fSmrg bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; 222501e04c3fSmrg 22267ec681f3Smrg /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must 22277ec681f3Smrg * be set with any PIPE_CONTROL with Depth Flush Enable bit set. 22287ec681f3Smrg */ 22297ec681f3Smrg#if GFX_VER >= 12 22307ec681f3Smrg pipe.DepthStallEnable = 22317ec681f3Smrg pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); 22327ec681f3Smrg#else 223301e04c3fSmrg pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; 22347ec681f3Smrg#endif 22357ec681f3Smrg 223601e04c3fSmrg pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; 223701e04c3fSmrg pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; 223801e04c3fSmrg 22397ec681f3Smrg /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": 22407ec681f3Smrg * 22417ec681f3Smrg * "The most common action to perform upon reaching a 22427ec681f3Smrg * synchronization point is to write a value out to memory. An 22437ec681f3Smrg * immediate value (included with the synchronization command) may 22447ec681f3Smrg * be written." 22457ec681f3Smrg * 22467ec681f3Smrg * 22477ec681f3Smrg * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": 22487ec681f3Smrg * 22497ec681f3Smrg * "In case the data flushed out by the render engine is to be 22507ec681f3Smrg * read back in to the render engine in coherent manner, then the 22517ec681f3Smrg * render engine has to wait for the fence completion before 22527ec681f3Smrg * accessing the flushed data. 
This can be achieved by following 22537ec681f3Smrg * means on various products: PIPE_CONTROL command with CS Stall 22547ec681f3Smrg * and the required write caches flushed with Post-Sync-Operation 22557ec681f3Smrg * as Write Immediate Data. 22567ec681f3Smrg * 22577ec681f3Smrg * Example: 22587ec681f3Smrg * - Workload-1 (3D/GPGPU/MEDIA) 22597ec681f3Smrg * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write 22607ec681f3Smrg * Immediate Data, Required Write Cache Flush bits set) 22617ec681f3Smrg * - Workload-2 (Can use the data produce or output by 22627ec681f3Smrg * Workload-1) 22637ec681f3Smrg */ 22647ec681f3Smrg if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) { 22657ec681f3Smrg pipe.CommandStreamerStallEnable = true; 22667ec681f3Smrg pipe.PostSyncOperation = WriteImmediateData; 22677ec681f3Smrg pipe.Address = cmd_buffer->device->workaround_address; 22687ec681f3Smrg } 22697ec681f3Smrg 227001e04c3fSmrg /* 227101e04c3fSmrg * According to the Broadwell documentation, any PIPE_CONTROL with the 227201e04c3fSmrg * "Command Streamer Stall" bit set must also have another bit set, 227301e04c3fSmrg * with five different options: 227401e04c3fSmrg * 227501e04c3fSmrg * - Render Target Cache Flush 227601e04c3fSmrg * - Depth Cache Flush 227701e04c3fSmrg * - Stall at Pixel Scoreboard 227801e04c3fSmrg * - Post-Sync Operation 227901e04c3fSmrg * - Depth Stall 228001e04c3fSmrg * - DC Flush Enable 228101e04c3fSmrg * 228201e04c3fSmrg * I chose "Stall at Pixel Scoreboard" since that's what we use in 228301e04c3fSmrg * mesa and it seems to work fine. The choice is fairly arbitrary. 
228401e04c3fSmrg */ 22857ec681f3Smrg if (pipe.CommandStreamerStallEnable && 22867ec681f3Smrg !pipe.RenderTargetCacheFlushEnable && 22877ec681f3Smrg !pipe.DepthCacheFlushEnable && 22887ec681f3Smrg !pipe.StallAtPixelScoreboard && 22897ec681f3Smrg !pipe.PostSyncOperation && 22907ec681f3Smrg !pipe.DepthStallEnable && 22917ec681f3Smrg !pipe.DCFlushEnable) 229201e04c3fSmrg pipe.StallAtPixelScoreboard = true; 22937ec681f3Smrg anv_debug_dump_pc(pipe); 229401e04c3fSmrg } 229501e04c3fSmrg 229601e04c3fSmrg /* If a render target flush was emitted, then we can toggle off the bit 229701e04c3fSmrg * saying that render target writes are ongoing. 229801e04c3fSmrg */ 229901e04c3fSmrg if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) 23009f464c52Smaya bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); 230101e04c3fSmrg 23027ec681f3Smrg if (GFX_VERx10 == 75) { 23037ec681f3Smrg /* Haswell needs addition work-arounds: 23047ec681f3Smrg * 23057ec681f3Smrg * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": 23067ec681f3Smrg * 23077ec681f3Smrg * Option 1: 23087ec681f3Smrg * PIPE_CONTROL command with the CS Stall and the required write 23097ec681f3Smrg * caches flushed with Post-SyncOperation as Write Immediate Data 23107ec681f3Smrg * followed by eight dummy MI_STORE_DATA_IMM (write to scratch 23117ec681f3Smrg * spce) commands. 23127ec681f3Smrg * 23137ec681f3Smrg * Example: 23147ec681f3Smrg * - Workload-1 23157ec681f3Smrg * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write 23167ec681f3Smrg * Immediate Data, Required Write Cache Flush bits set) 23177ec681f3Smrg * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address) 23187ec681f3Smrg * - Workload-2 (Can use the data produce or output by 23197ec681f3Smrg * Workload-1) 23207ec681f3Smrg * 23217ec681f3Smrg * Unfortunately, both the PRMs and the internal docs are a bit 23227ec681f3Smrg * out-of-date in this regard. 
What the windows driver does (and 23237ec681f3Smrg * this appears to actually work) is to emit a register read from the 23247ec681f3Smrg * memory address written by the pipe control above. 23257ec681f3Smrg * 23267ec681f3Smrg * What register we load into doesn't matter. We choose an indirect 23277ec681f3Smrg * rendering register because we know it always exists and it's one 23287ec681f3Smrg * of the first registers the command parser allows us to write. If 23297ec681f3Smrg * you don't have command parser support in your kernel (pre-4.2), 23307ec681f3Smrg * this will get turned into MI_NOOP and you won't get the 23317ec681f3Smrg * workaround. Unfortunately, there's just not much we can do in 23327ec681f3Smrg * that case. This register is perfectly safe to write since we 23337ec681f3Smrg * always re-load all of the indirect draw registers right before 23347ec681f3Smrg * 3DPRIMITIVE when needed anyway. 23357ec681f3Smrg */ 23367ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { 23377ec681f3Smrg lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */ 23387ec681f3Smrg lrm.MemoryAddress = cmd_buffer->device->workaround_address; 23397ec681f3Smrg } 23407ec681f3Smrg } 23417ec681f3Smrg 23427ec681f3Smrg bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | 23437ec681f3Smrg ANV_PIPE_END_OF_PIPE_SYNC_BIT); 234401e04c3fSmrg } 234501e04c3fSmrg 234601e04c3fSmrg if (bits & ANV_PIPE_INVALIDATE_BITS) { 234701e04c3fSmrg /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", 234801e04c3fSmrg * 234901e04c3fSmrg * "If the VF Cache Invalidation Enable is set to a 1 in a 235001e04c3fSmrg * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to 235101e04c3fSmrg * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent 235201e04c3fSmrg * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to 235301e04c3fSmrg * a 1." 235401e04c3fSmrg * 23557ec681f3Smrg * This appears to hang Broadwell, so we restrict it to just gfx9. 
235601e04c3fSmrg */ 23577ec681f3Smrg if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) 235801e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe); 235901e04c3fSmrg 236001e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { 236101e04c3fSmrg pipe.StateCacheInvalidationEnable = 236201e04c3fSmrg bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; 236301e04c3fSmrg pipe.ConstantCacheInvalidationEnable = 236401e04c3fSmrg bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; 23657ec681f3Smrg#if GFX_VER >= 12 23667ec681f3Smrg /* Invalidates the L3 cache part in which index & vertex data is loaded 23677ec681f3Smrg * when VERTEX_BUFFER_STATE::L3BypassDisable is set. 23687ec681f3Smrg */ 23697ec681f3Smrg pipe.L3ReadOnlyCacheInvalidationEnable = 23707ec681f3Smrg bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; 23717ec681f3Smrg#endif 237201e04c3fSmrg pipe.VFCacheInvalidationEnable = 237301e04c3fSmrg bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; 237401e04c3fSmrg pipe.TextureCacheInvalidationEnable = 237501e04c3fSmrg bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; 237601e04c3fSmrg pipe.InstructionCacheInvalidateEnable = 237701e04c3fSmrg bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT; 237801e04c3fSmrg 237901e04c3fSmrg /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", 238001e04c3fSmrg * 238101e04c3fSmrg * "When VF Cache Invalidate is set “Post Sync Operation” must be 238201e04c3fSmrg * enabled to “Write Immediate Data” or “Write PS Depth Count” or 238301e04c3fSmrg * “Write Timestamp”. 
238401e04c3fSmrg */ 23857ec681f3Smrg if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) { 238601e04c3fSmrg pipe.PostSyncOperation = WriteImmediateData; 23877ec681f3Smrg pipe.Address = cmd_buffer->device->workaround_address; 23887ec681f3Smrg } 23897ec681f3Smrg anv_debug_dump_pc(pipe); 23907ec681f3Smrg } 23917ec681f3Smrg 23927ec681f3Smrg#if GFX_VER == 12 23937ec681f3Smrg if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && 23947ec681f3Smrg cmd_buffer->device->info.has_aux_map) { 23957ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { 23967ec681f3Smrg lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); 23977ec681f3Smrg lri.DataDWord = 1; 239801e04c3fSmrg } 239901e04c3fSmrg } 24007ec681f3Smrg#endif 240101e04c3fSmrg 240201e04c3fSmrg bits &= ~ANV_PIPE_INVALIDATE_BITS; 240301e04c3fSmrg } 240401e04c3fSmrg 240501e04c3fSmrg cmd_buffer->state.pending_pipe_bits = bits; 240601e04c3fSmrg} 240701e04c3fSmrg 24087ec681f3Smrgstatic void 24097ec681f3Smrgcmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer, 24107ec681f3Smrg const VkDependencyInfoKHR *dep_info, 24117ec681f3Smrg const char *reason) 241201e04c3fSmrg{ 241301e04c3fSmrg /* XXX: Right now, we're really dumb and just flush whatever categories 241401e04c3fSmrg * the app asks for. One of these days we may make this a bit better 241501e04c3fSmrg * but right now that's all the hardware allows for in most areas. 
241601e04c3fSmrg */ 24177ec681f3Smrg VkAccessFlags2KHR src_flags = 0; 24187ec681f3Smrg VkAccessFlags2KHR dst_flags = 0; 241901e04c3fSmrg 24207ec681f3Smrg for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { 24217ec681f3Smrg src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask; 24227ec681f3Smrg dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask; 242301e04c3fSmrg } 242401e04c3fSmrg 24257ec681f3Smrg for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { 24267ec681f3Smrg src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask; 24277ec681f3Smrg dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask; 242801e04c3fSmrg } 242901e04c3fSmrg 24307ec681f3Smrg for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { 24317ec681f3Smrg const VkImageMemoryBarrier2KHR *img_barrier = 24327ec681f3Smrg &dep_info->pImageMemoryBarriers[i]; 24337ec681f3Smrg 24347ec681f3Smrg src_flags |= img_barrier->srcAccessMask; 24357ec681f3Smrg dst_flags |= img_barrier->dstAccessMask; 24367ec681f3Smrg 24377ec681f3Smrg ANV_FROM_HANDLE(anv_image, image, img_barrier->image); 24387ec681f3Smrg const VkImageSubresourceRange *range = &img_barrier->subresourceRange; 24397ec681f3Smrg 24407ec681f3Smrg uint32_t base_layer, layer_count; 24417ec681f3Smrg if (image->vk.image_type == VK_IMAGE_TYPE_3D) { 24427ec681f3Smrg base_layer = 0; 24437ec681f3Smrg layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel); 24447ec681f3Smrg } else { 24457ec681f3Smrg base_layer = range->baseArrayLayer; 24467ec681f3Smrg layer_count = vk_image_subresource_layer_count(&image->vk, range); 24477ec681f3Smrg } 24487ec681f3Smrg const uint32_t level_count = 24497ec681f3Smrg vk_image_subresource_level_count(&image->vk, range); 245001e04c3fSmrg 245101e04c3fSmrg if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { 245201e04c3fSmrg transition_depth_buffer(cmd_buffer, image, 24537ec681f3Smrg base_layer, layer_count, 24547ec681f3Smrg img_barrier->oldLayout, 24557ec681f3Smrg 
img_barrier->newLayout, 24567ec681f3Smrg false /* will_full_fast_clear */); 24577ec681f3Smrg } 245801e04c3fSmrg 24597ec681f3Smrg if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { 24607ec681f3Smrg transition_stencil_buffer(cmd_buffer, image, 24617ec681f3Smrg range->baseMipLevel, level_count, 24627ec681f3Smrg base_layer, layer_count, 24637ec681f3Smrg img_barrier->oldLayout, 24647ec681f3Smrg img_barrier->newLayout, 24657ec681f3Smrg false /* will_full_fast_clear */); 24667ec681f3Smrg } 246701e04c3fSmrg 24687ec681f3Smrg if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 24697ec681f3Smrg VkImageAspectFlags color_aspects = 24707ec681f3Smrg vk_image_expand_aspect_mask(&image->vk, range->aspectMask); 247101e04c3fSmrg anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { 247201e04c3fSmrg transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, 24737ec681f3Smrg range->baseMipLevel, level_count, 247401e04c3fSmrg base_layer, layer_count, 24757ec681f3Smrg img_barrier->oldLayout, 24767ec681f3Smrg img_barrier->newLayout, 24777ec681f3Smrg img_barrier->srcQueueFamilyIndex, 24787ec681f3Smrg img_barrier->dstQueueFamilyIndex, 24797ec681f3Smrg false /* will_full_fast_clear */); 248001e04c3fSmrg } 248101e04c3fSmrg } 248201e04c3fSmrg } 248301e04c3fSmrg 24847ec681f3Smrg enum anv_pipe_bits bits = 24857ec681f3Smrg anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | 24867ec681f3Smrg anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); 24877ec681f3Smrg 24887ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, bits, reason); 24897ec681f3Smrg} 24907ec681f3Smrg 24917ec681f3Smrgvoid genX(CmdPipelineBarrier2KHR)( 24927ec681f3Smrg VkCommandBuffer commandBuffer, 24937ec681f3Smrg const VkDependencyInfoKHR* pDependencyInfo) 24947ec681f3Smrg{ 24957ec681f3Smrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 24967ec681f3Smrg 24977ec681f3Smrg cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier"); 249801e04c3fSmrg} 

/* Partition the device's push-constant URB space into per-stage
 * 3DSTATE_PUSH_CONSTANT_ALLOC_* ranges for the currently bound graphics
 * pipeline.  Re-emitting the allocation stalls the pipe, so this is a
 * no-op when the active stage set matches what was last programmed.
 */
static void
cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
   assert(anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline));

   VkShaderStageFlags stages =
      cmd_buffer->state.gfx.pipeline->active_stages;

   /* In order to avoid thrash, we assume that vertex and fragment stages
    * always exist.  In the rare case where one is missing *and* the other
    * uses push constants, this may be suboptimal.  However, avoiding stalls
    * seems more important.
    */
   stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;

   if (stages == cmd_buffer->state.gfx.push_constant_stages)
      return;

   const unsigned push_constant_kb =
      cmd_buffer->device->info.max_constant_urb_size_kb;

   /* Split the available space evenly across the active graphics stages. */
   const unsigned num_stages =
      util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
   unsigned size_per_stage = push_constant_kb / num_stages;

   /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
    * units of 2KB.  Incidentally, these are the same platforms that have
    * 32KB worth of push constant space.
    */
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;

   /* Program the VS..GS allocations.  The packet template is the VS one;
    * the sub-opcode is patched per stage (18 + stage index == the
    * 3DSTATE_PUSH_CONSTANT_ALLOC_* sub-opcode for that stage).
    */
   uint32_t kb_used = 0;
   for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
      unsigned push_size = (stages & (1 << i)) ?
         size_per_stage : 0;
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode  = 18 + i;
         alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
         alloc.ConstantBufferSize   = push_size;
      }
      kb_used += push_size;
   }

   /* The fragment stage takes whatever space remains. */
   anv_batch_emit(&cmd_buffer->batch,
                  GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
      alloc.ConstantBufferOffset = kb_used;
      alloc.ConstantBufferSize = push_constant_kb - kb_used;
   }

   cmd_buffer->state.gfx.push_constant_stages = stages;

   /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
    *
    *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
    *    the next 3DPRIMITIVE command after programming the
    *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
    *
    * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
    * pipeline setup, we need to dirty push constants.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
}

/* Build the binding (surface-state) table for one shader and store it in
 * *bt_state.  Returns VK_ERROR_OUT_OF_DEVICE_MEMORY when the current
 * binding-table block is exhausted; the caller (flush_descriptor_sets)
 * allocates a new block and retries.
 */
static VkResult
emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                   struct anv_cmd_pipeline_state *pipe_state,
                   struct anv_shader_bin *shader,
                   struct anv_state *bt_state)
{
   struct anv_subpass *subpass = cmd_buffer->state.subpass;
   uint32_t state_offset;

   struct anv_pipeline_bind_map *map = &shader->bind_map;
   if (map->surface_count == 0) {
      /* Nothing to bind: hand back an empty state so callers can tell. */
      *bt_state = (struct anv_state) { 0, };
      return VK_SUCCESS;
   }

   *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
                                                  map->surface_count,
                                                  &state_offset);
   uint32_t *bt_map = bt_state->map;

   if (bt_state->map == NULL)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* We only need to emit relocs if we're not using softpin.  If we are using
    * softpin then we always keep all user-allocated memory objects resident.
    */
   const bool need_client_mem_relocs =
      !anv_use_softpin(cmd_buffer->device->physical);
   struct anv_push_constants *push = &pipe_state->push_constants;

   /* One binding-table entry per surface in the shader's bind map.  Each
    * entry is the offset of a SURFACE_STATE, biased by state_offset.
    */
   for (uint32_t s = 0; s < map->surface_count; s++) {
      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];

      struct anv_state surface_state;

      switch (binding->set) {
      case ANV_DESCRIPTOR_SET_NULL:
         bt_map[s] = 0;
         break;

      case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
         /* Color attachment binding */
         assert(shader->stage == MESA_SHADER_FRAGMENT);
         if (binding->index < subpass->color_count) {
            const unsigned att =
               subpass->color_attachments[binding->index].attachment;

            /* From the Vulkan 1.0.46 spec:
             *
             *    "If any color or depth/stencil attachments are
             *    VK_ATTACHMENT_UNUSED, then no writes occur for those
             *    attachments."
             */
            if (att == VK_ATTACHMENT_UNUSED) {
               surface_state = cmd_buffer->state.null_surface_state;
            } else {
               surface_state = cmd_buffer->state.attachments[att].color.state;
            }
         } else {
            surface_state = cmd_buffer->state.null_surface_state;
         }

         assert(surface_state.map);
         bt_map[s] = surface_state.offset + state_offset;
         break;

      case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
         /* Constant data baked into the shader binary; it lives in the
          * instruction state pool right after the kernel, so we wrap it in
          * a fresh UBO surface state.
          */
         struct anv_state surface_state =
            anv_cmd_buffer_alloc_surface_state(cmd_buffer);

         struct anv_address constant_data = {
            .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
            .offset = shader->kernel.offset +
                      shader->prog_data->const_data_offset,
         };
         unsigned constant_data_size = shader->prog_data->const_data_size;

         const enum isl_format format =
            anv_isl_format_for_descriptor_type(cmd_buffer->device,
                                               VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
         anv_fill_buffer_surface_state(cmd_buffer->device,
                                       surface_state, format,
                                       ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
                                       constant_data, constant_data_size, 1);

         assert(surface_state.map);
         bt_map[s] = surface_state.offset + state_offset;
         add_surface_reloc(cmd_buffer, surface_state, constant_data);
         break;
      }

      case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
         /* This is always the first binding for compute shaders */
         assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);

         struct anv_state surface_state =
            anv_cmd_buffer_alloc_surface_state(cmd_buffer);

         /* Expose the 12-byte (x, y, z) workgroup-count buffer to the
          * shader as a storage-buffer-formatted surface.
          */
         const enum isl_format format =
            anv_isl_format_for_descriptor_type(cmd_buffer->device,
                                               VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
         anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
                                       format,
                                       ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
                                       cmd_buffer->state.compute.num_workgroups,
                                       12, 1);

         assert(surface_state.map);
         bt_map[s] = surface_state.offset + state_offset;
         if (need_client_mem_relocs) {
            add_surface_reloc(cmd_buffer, surface_state,
                              cmd_buffer->state.compute.num_workgroups);
         }
         break;
      }

      case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
         /* This is a descriptor set buffer: here binding->index holds the
          * descriptor-set index, not a binding within the set.  (Yes,
          * that's confusing.)
          */
         struct anv_descriptor_set *set =
            pipe_state->descriptors[binding->index];
         assert(set->desc_mem.alloc_size);
         assert(set->desc_surface_state.alloc_size);
         bt_map[s] = set->desc_surface_state.offset + state_offset;
         add_surface_reloc(cmd_buffer, set->desc_surface_state,
                           anv_descriptor_set_address(set));
         break;
      }

      default: {
         /* A regular descriptor within a user descriptor set. */
         assert(binding->set < MAX_SETS);
         const struct anv_descriptor_set *set =
            pipe_state->descriptors[binding->set];
         if (binding->index >= set->descriptor_count) {
            /* From the Vulkan spec section entitled "DescriptorSet and
             * Binding Assignment":
             *
             *    "If the array is runtime-sized, then array elements greater
             *    than or equal to the size of that binding in the bound
             *    descriptor set must not be used."
             *
             * Unfortunately, the compiler isn't smart enough to figure out
             * when a dynamic binding isn't used so it may grab the whole
             * array and stick it in the binding table.  In this case, it's
             * safe to just skip those bindings that are OOB.
             */
            assert(binding->index < set->layout->descriptor_count);
            continue;
         }
         const struct anv_descriptor *desc = &set->descriptors[binding->index];

         switch (desc->type) {
         case VK_DESCRIPTOR_TYPE_SAMPLER:
            /* Nothing for us to do here */
            continue;

         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
            if (desc->image_view) {
               struct anv_surface_state sstate =
                  (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
                  desc->image_view->planes[binding->plane].general_sampler_surface_state :
                  desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
               surface_state = sstate.state;
               assert(surface_state.alloc_size);
               if (need_client_mem_relocs)
                  add_surface_state_relocs(cmd_buffer, sstate);
            } else {
               surface_state = cmd_buffer->device->null_surface_state;
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
            assert(shader->stage == MESA_SHADER_FRAGMENT);
            assert(desc->image_view != NULL);
            if ((desc->image_view->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) {
               /* For depth and stencil input attachments, we treat it like any
                * old texture that a user may have bound.
                */
               assert(desc->image_view->n_planes == 1);
               struct anv_surface_state sstate =
                  (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
                  desc->image_view->planes[0].general_sampler_surface_state :
                  desc->image_view->planes[0].optimal_sampler_surface_state;
               surface_state = sstate.state;
               assert(surface_state.alloc_size);
               if (need_client_mem_relocs)
                  add_surface_state_relocs(cmd_buffer, sstate);
            } else {
               /* For color input attachments, we create the surface state at
                * vkBeginRenderPass time so that we can include aux and clear
                * color information.
                */
               assert(binding->input_attachment_index < subpass->input_count);
               const unsigned subpass_att = binding->input_attachment_index;
               const unsigned att = subpass->input_attachments[subpass_att].attachment;
               surface_state = cmd_buffer->state.attachments[att].input.state;
            }
            break;

         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
            if (desc->image_view) {
               struct anv_surface_state sstate =
                  binding->lowered_storage_surface
                  ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
                  : desc->image_view->planes[binding->plane].storage_surface_state;
               surface_state = sstate.state;
               assert(surface_state.alloc_size);
               /* An offset of 0 here means no surface state was created for
                * this image/usage combination; report it rather than binding
                * garbage.
                */
               if (surface_state.offset == 0) {
                  mesa_loge("Bound a image to a descriptor where the "
                            "descriptor does not have NonReadable "
                            "set and the image does not have a "
                            "corresponding SPIR-V format enum.");
                  vk_debug_report(&cmd_buffer->device->physical->instance->vk,
                                  VK_DEBUG_REPORT_ERROR_BIT_EXT,
                                  &desc->image_view->vk.base,
                                  __LINE__, 0, "anv",
                                  "Bound a image to a descriptor where the "
                                  "descriptor does not have NonReadable "
                                  "set and the image does not have a "
                                  "corresponding SPIR-V format enum.");
               }
               if (surface_state.offset && need_client_mem_relocs)
                  add_surface_state_relocs(cmd_buffer, sstate);
            } else {
               surface_state = cmd_buffer->device->null_surface_state;
            }
            break;
         }

         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
            if (desc->buffer_view) {
               surface_state = desc->buffer_view->surface_state;
               assert(surface_state.alloc_size);
               if (need_client_mem_relocs) {
                  add_surface_reloc(cmd_buffer, surface_state,
                                    desc->buffer_view->address);
               }
            } else {
               surface_state = cmd_buffer->device->null_surface_state;
            }
            break;

         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
            if (desc->buffer) {
               /* Compute the offset within the buffer */
               uint32_t dynamic_offset =
                  push->dynamic_offsets[binding->dynamic_offset_index];
               uint64_t offset = desc->offset + dynamic_offset;
               /* Clamp to the buffer size */
               offset = MIN2(offset, desc->buffer->size);
               /* Clamp the range to the buffer size */
               uint32_t range = MIN2(desc->range, desc->buffer->size - offset);

               /* Align the range for consistency */
               if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
                  range = align_u32(range, ANV_UBO_ALIGNMENT);

               struct anv_address address =
                  anv_address_add(desc->buffer->address, offset);

               /* Dynamic offsets are resolved per-draw, so the surface
                * state is built on the fly from the stream rather than
                * taken from the descriptor set.
                */
               surface_state =
                  anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
               enum isl_format format =
                  anv_isl_format_for_descriptor_type(cmd_buffer->device,
                                                     desc->type);

               isl_surf_usage_flags_t usage =
                  desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
                  ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
                  ISL_SURF_USAGE_STORAGE_BIT;

               anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
                                             format, usage, address, range, 1);
               if (need_client_mem_relocs)
                  add_surface_reloc(cmd_buffer, surface_state, address);
            } else {
               surface_state = cmd_buffer->device->null_surface_state;
            }
            break;
         }

         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
            if (desc->buffer_view) {
               surface_state = binding->lowered_storage_surface
                  ? desc->buffer_view->lowered_storage_surface_state
                  : desc->buffer_view->storage_surface_state;
               assert(surface_state.alloc_size);
               if (need_client_mem_relocs) {
                  add_surface_reloc(cmd_buffer, surface_state,
                                    desc->buffer_view->address);
               }
            } else {
               surface_state = cmd_buffer->device->null_surface_state;
            }
            break;

         default:
            assert(!"Invalid descriptor type");
            continue;
         }
         assert(surface_state.map);
         bt_map[s] = surface_state.offset + state_offset;
         break;
      }
      }
   }

   return VK_SUCCESS;
}

/* Fill *state with the packed SAMPLER_STATE array (16 bytes per sampler)
 * for one shader.  Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if dynamic state
 * allocation fails.
 */
static VkResult
emit_samplers(struct anv_cmd_buffer *cmd_buffer,
              struct anv_cmd_pipeline_state *pipe_state,
              struct anv_shader_bin *shader,
              struct anv_state *state)
{
   struct anv_pipeline_bind_map *map = &shader->bind_map;
   if
(map->sampler_count == 0) {
      *state = (struct anv_state) { 0, };
      return VK_SUCCESS;
   }

   /* 16 bytes of SAMPLER_STATE per sampler slot. */
   uint32_t size = map->sampler_count * 16;
   *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);

   if (state->map == NULL)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   for (uint32_t s = 0; s < map->sampler_count; s++) {
      struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
      const struct anv_descriptor *desc =
         &pipe_state->descriptors[binding->set]->descriptors[binding->index];

      if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
          desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
         continue;

      struct anv_sampler *sampler = desc->sampler;

      /* This can happen if we have an unfilled slot since TYPE_SAMPLER
       * happens to be zero.
       */
      if (sampler == NULL)
         continue;

      memcpy(state->map + (s * 16),
             sampler->state[binding->plane], sizeof(sampler->state[0]));
   }

   return VK_SUCCESS;
}

/* Emit sampler and binding tables for every shader whose stage is in
 * 'dirty'.  If the binding-table block runs out mid-way, a new block is
 * allocated, state base addresses are re-emitted, and ALL shaders are
 * re-emitted (not just the dirty ones, since the surface-state base
 * changed).  Returns the mask of stages actually flushed; returns 0 on
 * unrecoverable failure (the batch is marked in error on the retry path).
 */
static uint32_t
flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_cmd_pipeline_state *pipe_state,
                      const VkShaderStageFlags dirty,
                      struct anv_shader_bin **shaders,
                      uint32_t num_shaders)
{
   VkShaderStageFlags flushed = 0;

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < num_shaders; i++) {
      if (!shaders[i])
         continue;

      gl_shader_stage stage = shaders[i]->stage;
      VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
      if ((vk_stage & dirty) == 0)
         continue;

      assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
      result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                             &cmd_buffer->state.samplers[stage]);
      if (result != VK_SUCCESS)
         break;

      assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
      result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
                                  &cmd_buffer->state.binding_tables[stage]);
      if (result != VK_SUCCESS)
         break;

      flushed |= vk_stage;
   }

   if (result != VK_SUCCESS) {
      /* The only recoverable failure is running out of binding-table
       * space; grab a fresh block and retry everything once.
       */
      assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);

      result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
      if (result != VK_SUCCESS)
         return 0;

      /* Re-emit state base addresses so we get the new surface state base
       * address before we start emitting binding tables etc.
       */
      genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

      /* Re-emit all active binding tables */
      flushed = 0;

      for (uint32_t i = 0; i < num_shaders; i++) {
         if (!shaders[i])
            continue;

         gl_shader_stage stage = shaders[i]->stage;

         result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                                &cmd_buffer->state.samplers[stage]);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return 0;
         }
         result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
                                     &cmd_buffer->state.binding_tables[stage]);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return 0;
         }

         flushed |= mesa_to_vk_shader_stage(stage);
      }
   }

   return flushed;
}

/* Emit 3DSTATE_SAMPLER_STATE_POINTERS_* and 3DSTATE_BINDING_TABLE_POINTERS_*
 * for each graphics stage in 'stages', patching the per-stage sub-opcode
 * into the VS packet template.
 */
static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t stages)
{
   static const uint32_t sampler_state_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 43,
      [MESA_SHADER_TESS_CTRL] = 44, /* HS */
      [MESA_SHADER_TESS_EVAL] = 45, /* DS */
      [MESA_SHADER_GEOMETRY]  = 46,
      [MESA_SHADER_FRAGMENT]  = 47,
      [MESA_SHADER_COMPUTE]   = 0, /* compute uses a different mechanism */
   };

   static const uint32_t binding_table_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 38,
      [MESA_SHADER_TESS_CTRL] = 39,
      [MESA_SHADER_TESS_EVAL] = 40,
      [MESA_SHADER_GEOMETRY]  = 41,
      [MESA_SHADER_FRAGMENT]  = 42,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   anv_foreach_stage(s, stages) {
      assert(s < ARRAY_SIZE(binding_table_opcodes));
      assert(binding_table_opcodes[s] > 0);

      if (cmd_buffer->state.samplers[s].alloc_size > 0) {
         anv_batch_emit(&cmd_buffer->batch,
                        GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
            ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
            ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
         }
      }

      /* Always emit binding table pointers if we're asked to, since on SKL
       * this is what flushes push constants.
       */
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
         btp._3DCommandSubOpcode = binding_table_opcodes[s];
         btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
      }
   }
}

/* Resolve one push-constant range to the GPU address of its backing
 * storage, depending on what kind of data the range sources from.
 */
static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
                       const struct anv_shader_bin *shader,
                       const struct anv_push_range *range)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      /* This is a descriptor set buffer: range->index is the
       * descriptor-set index, not a binding within the set.  (Yes,
       * that's confusing.)
       */
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      return anv_descriptor_set_address(set);
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
      /* Lazily upload the client push-constant data on first use. */
      if (gfx_state->base.push_constants_state.alloc_size == 0) {
         gfx_state->base.push_constants_state =
            anv_cmd_buffer_gfx_push_constants(cmd_buffer);
      }
      return (struct anv_address) {
         .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         .offset = gfx_state->base.push_constants_state.offset,
      };
   }

   case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
      /* Constant data baked into the shader binary, stored right after
       * the kernel in the instruction state pool.
       */
      return (struct anv_address) {
         .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
         .offset = shader->kernel.offset +
                   shader->prog_data->const_data_offset,
      };

   default: {
      /* A UBO (possibly dynamic) pushed from a regular descriptor set. */
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (desc->buffer_view)
            return desc->buffer_view->address;
      } else {
         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         if (desc->buffer) {
            const struct anv_push_constants *push =
               &gfx_state->base.push_constants;
            uint32_t dynamic_offset =
               push->dynamic_offsets[range->dynamic_offset_index];
            return anv_address_add(desc->buffer->address,
                                   desc->offset + dynamic_offset);
         }
      }

      /* For NULL UBOs, we just return an address in the workaround BO.  We do
       * writes to it for workarounds but always at the bottom.  The higher
       * bytes should be all zeros.
       */
      assert(range->length * 32 <= 2048);
      return (struct anv_address) {
         .bo = cmd_buffer->device->workaround_bo,
         .offset = 1024,
      };
   }
   }
}


/** Returns the size in bytes of the bound buffer
 *
 * The range is relative to the start of the buffer, not the start of the
 * range.
 * The returned range may be smaller than
 *
 *    (range->start + range->length) * 32;
 */
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_shader_bin *shader,
                          const struct anv_push_range *range)
{
   assert(shader->stage != MESA_SHADER_COMPUTE);
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      assert(range->start * 32 < set->desc_mem.alloc_size);
      assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
      return set->desc_mem.alloc_size;
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
      /* The push-constant upload always covers the whole range. */
      return (range->start + range->length) * 32;

   case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
      return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);

   default: {
      /* UBO (plain or dynamic) from a regular descriptor set; returns 0
       * for unbound/NULL descriptors or ranges past the end of the view.
       */
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (!desc->buffer_view)
            return 0;

         if (range->start * 32 > desc->buffer_view->range)
            return 0;

         return desc->buffer_view->range;
      } else {
         if (!desc->buffer)
            return 0;

         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         /* Compute the offset within the buffer */
         const struct anv_push_constants *push =
            &gfx_state->base.push_constants;
         uint32_t dynamic_offset =
            push->dynamic_offsets[range->dynamic_offset_index];
         uint64_t offset = desc->offset + dynamic_offset;
         /* Clamp to the buffer size */
         offset = MIN2(offset, desc->buffer->size);
         /* Clamp the range to the buffer size */
         uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);

         /* Align the range for consistency */
         bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);

         return bound_range;
      }
   }
   }
}

/* Emit one 3DSTATE_CONSTANT_* packet for a single graphics stage, binding
 * up to four push ranges from 'buffers' (addresses previously resolved by
 * get_push_range_address).
 */
static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
                              gl_shader_stage stage,
                              struct anv_address *buffers,
                              unsigned buffer_count)
{
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;

   /* 3DSTATE_CONSTANT_* sub-opcodes, patched into the VS packet template. */
   static const uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 21,
      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
      [MESA_SHADER_GEOMETRY]  = 22,
      [MESA_SHADER_FRAGMENT]  = 23,
      [MESA_SHADER_COMPUTE]   = 0,
   };

   assert(stage < ARRAY_SIZE(push_constant_opcodes));
   assert(push_constant_opcodes[stage] > 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
      c._3DCommandSubOpcode = push_constant_opcodes[stage];

      /* An empty packet (stage not present) still gets emitted to clear
       * any previously-programmed constants for this stage.
       */
      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_pipeline_bind_map *bind_map =
            &pipeline->shaders[stage]->bind_map;

#if GFX_VER >= 9
         /* This field exists since Gfx8.  However, the Broadwell PRM says:
          *
          *    "Constant Buffer Object Control State must be always programmed
          *    to zero."
          *
          * This restriction does not exist on any newer platforms.
          *
          * We only have one MOCS field for the whole packet, not one per
          * buffer.  We could go out of our way here to walk over all of the
          * buffers and see if any of them are used externally and use the
          * external MOCS.  However, the notion that someone would use the
          * same bit of memory for both scanout and a UBO is nuts.  Let's not
          * bother and assume it's all internal.
          */
         c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
#endif

#if GFX_VERx10 >= 75
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         assert(buffer_count <= 4);
         const unsigned shift = 4 - buffer_count;
         for (unsigned i = 0; i < buffer_count; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];

            /* At this point we only have non-empty ranges */
            assert(range->length > 0);

            /* For Ivy Bridge, make sure we only set the first range (actual
             * push constants)
             */
            assert((GFX_VERx10 >= 75) || i == 0);

            c.ConstantBody.ReadLength[i + shift] = range->length;
            c.ConstantBody.Buffer[i + shift] =
               anv_address_add(buffers[i], range->start * 32);
         }
#else
         /* For Ivy Bridge, push constants are relative to dynamic state
          * base address and we only ever push actual push constants.
          */
         if (bind_map->push_ranges[0].length > 0) {
            assert(buffer_count == 1);
            assert(bind_map->push_ranges[0].set ==
                   ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
            assert(buffers[0].bo ==
                   cmd_buffer->device->dynamic_state_pool.block_pool.bo);
            c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
            /* NULL bo => the offset is interpreted relative to dynamic
             * state base address rather than as an absolute address.
             */
            c.ConstantBody.Buffer[0].bo = NULL;
            c.ConstantBody.Buffer[0].offset = buffers[0].offset;
         }
         assert(bind_map->push_ranges[1].length == 0);
         assert(bind_map->push_ranges[2].length == 0);
         assert(bind_map->push_ranges[3].length == 0);
#endif
      }
   }
}

#if GFX_VER >= 12
/* Gfx12+: emit a single 3DSTATE_CONSTANT_ALL packet covering every stage
 * in 'shader_mask'.  With no buffers, emits an empty packet that clears
 * the constants for those stages.
 *
 * NOTE(review): the bind map is looked up via one stage derived from
 * shader_mask; presumably all stages in the mask share identical push
 * ranges — confirm against the callers.
 */
static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t shader_mask,
                                  struct anv_address *buffers,
                                  uint32_t buffer_count)
{
   if (buffer_count == 0) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
         c.ShaderUpdateEnable = shader_mask;
         c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
      }
      return;
   }

   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;

   static const UNUSED uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 21,
      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
      [MESA_SHADER_GEOMETRY]  = 22,
      [MESA_SHADER_FRAGMENT]  = 23,
33017ec681f3Smrg [MESA_SHADER_COMPUTE] = 0, 33027ec681f3Smrg }; 330301e04c3fSmrg 33047ec681f3Smrg gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); 33057ec681f3Smrg assert(stage < ARRAY_SIZE(push_constant_opcodes)); 33067ec681f3Smrg assert(push_constant_opcodes[stage] > 0); 33077ec681f3Smrg 33087ec681f3Smrg const struct anv_pipeline_bind_map *bind_map = 33097ec681f3Smrg &pipeline->shaders[stage]->bind_map; 33107ec681f3Smrg 33117ec681f3Smrg uint32_t *dw; 33127ec681f3Smrg const uint32_t buffer_mask = (1 << buffer_count) - 1; 33137ec681f3Smrg const uint32_t num_dwords = 2 + 2 * buffer_count; 33147ec681f3Smrg 33157ec681f3Smrg dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, 33167ec681f3Smrg GENX(3DSTATE_CONSTANT_ALL), 33177ec681f3Smrg .ShaderUpdateEnable = shader_mask, 33187ec681f3Smrg .PointerBufferMask = buffer_mask, 33197ec681f3Smrg .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false)); 33207ec681f3Smrg 33217ec681f3Smrg for (int i = 0; i < buffer_count; i++) { 33227ec681f3Smrg const struct anv_push_range *range = &bind_map->push_ranges[i]; 33237ec681f3Smrg GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( 33247ec681f3Smrg &cmd_buffer->batch, dw + 2 + i * 2, 33257ec681f3Smrg &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { 33267ec681f3Smrg .PointerToConstantBuffer = 33277ec681f3Smrg anv_address_add(buffers[i], range->start * 32), 33287ec681f3Smrg .ConstantBufferReadLength = range->length, 33297ec681f3Smrg }); 33307ec681f3Smrg } 33317ec681f3Smrg} 33327ec681f3Smrg#endif 333301e04c3fSmrg 33347ec681f3Smrgstatic void 33357ec681f3Smrgcmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, 33367ec681f3Smrg VkShaderStageFlags dirty_stages) 33377ec681f3Smrg{ 33387ec681f3Smrg VkShaderStageFlags flushed = 0; 33397ec681f3Smrg struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; 33407ec681f3Smrg const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; 33417ec681f3Smrg 33427ec681f3Smrg#if GFX_VER >= 12 33437ec681f3Smrg uint32_t 
nobuffer_stages = 0; 334401e04c3fSmrg#endif 33457ec681f3Smrg 33467ec681f3Smrg /* Compute robust pushed register access mask for each stage. */ 33477ec681f3Smrg if (cmd_buffer->device->robust_buffer_access) { 33487ec681f3Smrg anv_foreach_stage(stage, dirty_stages) { 33497ec681f3Smrg if (!anv_pipeline_has_stage(pipeline, stage)) 33507ec681f3Smrg continue; 33517ec681f3Smrg 33527ec681f3Smrg const struct anv_shader_bin *shader = pipeline->shaders[stage]; 33537ec681f3Smrg const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; 33547ec681f3Smrg struct anv_push_constants *push = &gfx_state->base.push_constants; 33557ec681f3Smrg 33567ec681f3Smrg push->push_reg_mask[stage] = 0; 33577ec681f3Smrg /* Start of the current range in the shader, relative to the start of 33587ec681f3Smrg * push constants in the shader. 33597ec681f3Smrg */ 33607ec681f3Smrg unsigned range_start_reg = 0; 33617ec681f3Smrg for (unsigned i = 0; i < 4; i++) { 33627ec681f3Smrg const struct anv_push_range *range = &bind_map->push_ranges[i]; 33637ec681f3Smrg if (range->length == 0) 33647ec681f3Smrg continue; 33657ec681f3Smrg 33667ec681f3Smrg unsigned bound_size = 33677ec681f3Smrg get_push_range_bound_size(cmd_buffer, shader, range); 33687ec681f3Smrg if (bound_size >= range->start * 32) { 33697ec681f3Smrg unsigned bound_regs = 33707ec681f3Smrg MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, 33717ec681f3Smrg range->length); 33727ec681f3Smrg assert(range_start_reg + bound_regs <= 64); 33737ec681f3Smrg push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg, 33747ec681f3Smrg bound_regs); 33757ec681f3Smrg } 33767ec681f3Smrg 33777ec681f3Smrg cmd_buffer->state.push_constants_dirty |= 33787ec681f3Smrg mesa_to_vk_shader_stage(stage); 33797ec681f3Smrg 33807ec681f3Smrg range_start_reg += range->length; 338101e04c3fSmrg } 338201e04c3fSmrg } 33837ec681f3Smrg } 338401e04c3fSmrg 33857ec681f3Smrg /* Resets the push constant state so that we allocate a new one if 33867ec681f3Smrg * needed. 
33877ec681f3Smrg */ 33887ec681f3Smrg gfx_state->base.push_constants_state = ANV_STATE_NULL; 33897ec681f3Smrg 33907ec681f3Smrg anv_foreach_stage(stage, dirty_stages) { 33917ec681f3Smrg unsigned buffer_count = 0; 339201e04c3fSmrg flushed |= mesa_to_vk_shader_stage(stage); 33937ec681f3Smrg UNUSED uint32_t max_push_range = 0; 33947ec681f3Smrg 33957ec681f3Smrg struct anv_address buffers[4] = {}; 33967ec681f3Smrg if (anv_pipeline_has_stage(pipeline, stage)) { 33977ec681f3Smrg const struct anv_shader_bin *shader = pipeline->shaders[stage]; 33987ec681f3Smrg const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; 33997ec681f3Smrg 34007ec681f3Smrg /* We have to gather buffer addresses as a second step because the 34017ec681f3Smrg * loop above puts data into the push constant area and the call to 34027ec681f3Smrg * get_push_range_address is what locks our push constants and copies 34037ec681f3Smrg * them into the actual GPU buffer. If we did the two loops at the 34047ec681f3Smrg * same time, we'd risk only having some of the sizes in the push 34057ec681f3Smrg * constant buffer when we did the copy. 34067ec681f3Smrg */ 34077ec681f3Smrg for (unsigned i = 0; i < 4; i++) { 34087ec681f3Smrg const struct anv_push_range *range = &bind_map->push_ranges[i]; 34097ec681f3Smrg if (range->length == 0) 34107ec681f3Smrg break; 34117ec681f3Smrg 34127ec681f3Smrg buffers[i] = get_push_range_address(cmd_buffer, shader, range); 34137ec681f3Smrg max_push_range = MAX2(max_push_range, range->length); 34147ec681f3Smrg buffer_count++; 34157ec681f3Smrg } 34167ec681f3Smrg 34177ec681f3Smrg /* We have at most 4 buffers but they should be tightly packed */ 34187ec681f3Smrg for (unsigned i = buffer_count; i < 4; i++) 34197ec681f3Smrg assert(bind_map->push_ranges[i].length == 0); 34207ec681f3Smrg } 34217ec681f3Smrg 34227ec681f3Smrg#if GFX_VER >= 12 34237ec681f3Smrg /* If this stage doesn't have any push constants, emit it later in a 34247ec681f3Smrg * single CONSTANT_ALL packet. 
34257ec681f3Smrg */ 34267ec681f3Smrg if (buffer_count == 0) { 34277ec681f3Smrg nobuffer_stages |= 1 << stage; 34287ec681f3Smrg continue; 34297ec681f3Smrg } 34307ec681f3Smrg 34317ec681f3Smrg /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL 34327ec681f3Smrg * contains only 5 bits, so we can only use it for buffers smaller than 34337ec681f3Smrg * 32. 34347ec681f3Smrg */ 34357ec681f3Smrg if (max_push_range < 32) { 34367ec681f3Smrg cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, 34377ec681f3Smrg buffers, buffer_count); 34387ec681f3Smrg continue; 34397ec681f3Smrg } 34407ec681f3Smrg#endif 34417ec681f3Smrg 34427ec681f3Smrg cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count); 344301e04c3fSmrg } 344401e04c3fSmrg 34457ec681f3Smrg#if GFX_VER >= 12 34467ec681f3Smrg if (nobuffer_stages) 34477ec681f3Smrg cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0); 34487ec681f3Smrg#endif 34497ec681f3Smrg 345001e04c3fSmrg cmd_buffer->state.push_constants_dirty &= ~flushed; 345101e04c3fSmrg} 345201e04c3fSmrg 34537ec681f3Smrgstatic void 34547ec681f3Smrgcmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer) 34557ec681f3Smrg{ 34567ec681f3Smrg const uint32_t clip_states = 34577ec681f3Smrg#if GFX_VER <= 7 34587ec681f3Smrg ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 34597ec681f3Smrg ANV_CMD_DIRTY_DYNAMIC_CULL_MODE | 34607ec681f3Smrg#endif 34617ec681f3Smrg ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | 34627ec681f3Smrg ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | 34637ec681f3Smrg ANV_CMD_DIRTY_PIPELINE; 34647ec681f3Smrg 34657ec681f3Smrg if ((cmd_buffer->state.gfx.dirty & clip_states) == 0) 34667ec681f3Smrg return; 34677ec681f3Smrg 34687ec681f3Smrg /* Take dynamic primitive topology in to account with 34697ec681f3Smrg * 3DSTATE_CLIP::ViewportXYClipTestEnable 34707ec681f3Smrg */ 34717ec681f3Smrg bool xy_clip_test_enable = 0; 34727ec681f3Smrg 34737ec681f3Smrg if (cmd_buffer->state.gfx.pipeline->dynamic_states & 34747ec681f3Smrg 
ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) { 34757ec681f3Smrg VkPrimitiveTopology primitive_topology = 34767ec681f3Smrg cmd_buffer->state.gfx.dynamic.primitive_topology; 34777ec681f3Smrg 34787ec681f3Smrg VkPolygonMode dynamic_raster_mode = 34797ec681f3Smrg genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, 34807ec681f3Smrg primitive_topology); 34817ec681f3Smrg 34827ec681f3Smrg xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL); 34837ec681f3Smrg } 34847ec681f3Smrg 34857ec681f3Smrg#if GFX_VER <= 7 34867ec681f3Smrg const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic; 34877ec681f3Smrg#endif 34887ec681f3Smrg struct GENX(3DSTATE_CLIP) clip = { 34897ec681f3Smrg GENX(3DSTATE_CLIP_header), 34907ec681f3Smrg#if GFX_VER <= 7 34917ec681f3Smrg .FrontWinding = genX(vk_to_intel_front_face)[d->front_face], 34927ec681f3Smrg .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode], 34937ec681f3Smrg#endif 34947ec681f3Smrg .ViewportXYClipTestEnable = xy_clip_test_enable, 34957ec681f3Smrg }; 34967ec681f3Smrg uint32_t dwords[GENX(3DSTATE_CLIP_length)]; 34977ec681f3Smrg 34987ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 34997ec681f3Smrg if (anv_pipeline_is_primitive(pipeline)) { 35007ec681f3Smrg const struct brw_vue_prog_data *last = 35017ec681f3Smrg anv_pipeline_get_last_vue_prog_data(pipeline); 35027ec681f3Smrg if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { 35037ec681f3Smrg clip.MaximumVPIndex = 35047ec681f3Smrg cmd_buffer->state.gfx.dynamic.viewport.count > 0 ? 
35057ec681f3Smrg cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0; 35067ec681f3Smrg } 35077ec681f3Smrg } 35087ec681f3Smrg 35097ec681f3Smrg GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip); 35107ec681f3Smrg anv_batch_emit_merge(&cmd_buffer->batch, dwords, 35117ec681f3Smrg pipeline->gfx7.clip); 35127ec681f3Smrg} 35137ec681f3Smrg 35147ec681f3Smrgstatic void 35157ec681f3Smrgcmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer) 35167ec681f3Smrg{ 35177ec681f3Smrg const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic; 35187ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 35197ec681f3Smrg 35207ec681f3Smrg#if GFX_VER == 7 35217ec681f3Smrg# define streamout_state_dw pipeline->gfx7.streamout_state 35227ec681f3Smrg#else 35237ec681f3Smrg# define streamout_state_dw pipeline->gfx8.streamout_state 35247ec681f3Smrg#endif 35257ec681f3Smrg 35267ec681f3Smrg uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)]; 35277ec681f3Smrg 35287ec681f3Smrg struct GENX(3DSTATE_STREAMOUT) so = { 35297ec681f3Smrg GENX(3DSTATE_STREAMOUT_header), 35307ec681f3Smrg .RenderingDisable = d->raster_discard, 35317ec681f3Smrg }; 35327ec681f3Smrg GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so); 35337ec681f3Smrg anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw); 35347ec681f3Smrg} 35357ec681f3Smrg 353601e04c3fSmrgvoid 353701e04c3fSmrggenX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) 353801e04c3fSmrg{ 35397ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 354001e04c3fSmrg uint32_t *p; 354101e04c3fSmrg 354201e04c3fSmrg assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); 354301e04c3fSmrg 35447ec681f3Smrg genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); 35457ec681f3Smrg 35467ec681f3Smrg genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); 354701e04c3fSmrg 354801e04c3fSmrg genX(flush_pipeline_select_3d)(cmd_buffer); 354901e04c3fSmrg 35507ec681f3Smrg 
/* Apply any pending pipeline flushes we may have. We want to apply them 35517ec681f3Smrg * now because, if any of those flushes are for things like push constants, 35527ec681f3Smrg * the GPU will read the state at weird times. 35537ec681f3Smrg */ 35547ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 35557ec681f3Smrg 35567ec681f3Smrg uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; 35577ec681f3Smrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) 35587ec681f3Smrg vb_emit |= pipeline->vb_used; 35597ec681f3Smrg 356001e04c3fSmrg if (vb_emit) { 356101e04c3fSmrg const uint32_t num_buffers = __builtin_popcount(vb_emit); 356201e04c3fSmrg const uint32_t num_dwords = 1 + num_buffers * 4; 356301e04c3fSmrg 356401e04c3fSmrg p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, 356501e04c3fSmrg GENX(3DSTATE_VERTEX_BUFFERS)); 35667ec681f3Smrg uint32_t i = 0; 35677ec681f3Smrg u_foreach_bit(vb, vb_emit) { 356801e04c3fSmrg struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; 356901e04c3fSmrg uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset; 357001e04c3fSmrg 35717ec681f3Smrg /* If dynamic, use stride/size from vertex binding, otherwise use 35727ec681f3Smrg * stride/size that was setup in the pipeline object. 35737ec681f3Smrg */ 35747ec681f3Smrg bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride; 35757ec681f3Smrg bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size; 35767ec681f3Smrg 35777ec681f3Smrg struct GENX(VERTEX_BUFFER_STATE) state; 35787ec681f3Smrg if (buffer) { 35797ec681f3Smrg uint32_t stride = dynamic_stride ? 
35807ec681f3Smrg cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride; 35817ec681f3Smrg /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT): 35827ec681f3Smrg * 35837ec681f3Smrg * "If pname:pSizes is not NULL then pname:pSizes[i] specifies 35847ec681f3Smrg * the bound size of the vertex buffer starting from the corresponding 35857ec681f3Smrg * elements of pname:pBuffers[i] plus pname:pOffsets[i]." 35867ec681f3Smrg */ 35877ec681f3Smrg UNUSED uint32_t size = dynamic_size ? 35887ec681f3Smrg cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset; 35897ec681f3Smrg 35907ec681f3Smrg state = (struct GENX(VERTEX_BUFFER_STATE)) { 35917ec681f3Smrg .VertexBufferIndex = vb, 359201e04c3fSmrg 35937ec681f3Smrg .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo, 35947ec681f3Smrg ISL_SURF_USAGE_VERTEX_BUFFER_BIT), 35957ec681f3Smrg#if GFX_VER <= 7 35967ec681f3Smrg .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA, 35977ec681f3Smrg .InstanceDataStepRate = pipeline->vb[vb].instance_divisor, 35987ec681f3Smrg#endif 35997ec681f3Smrg .AddressModifyEnable = true, 36007ec681f3Smrg .BufferPitch = stride, 36017ec681f3Smrg .BufferStartingAddress = anv_address_add(buffer->address, offset), 36027ec681f3Smrg .NullVertexBuffer = offset >= buffer->size, 36037ec681f3Smrg#if GFX_VER >= 12 36047ec681f3Smrg .L3BypassDisable = true, 360501e04c3fSmrg#endif 360601e04c3fSmrg 36077ec681f3Smrg#if GFX_VER >= 8 36087ec681f3Smrg .BufferSize = size, 360901e04c3fSmrg#else 36107ec681f3Smrg /* XXX: to handle dynamic offset for older gens we might want 36117ec681f3Smrg * to modify Endaddress, but there are issues when doing so: 36127ec681f3Smrg * 36137ec681f3Smrg * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439 36147ec681f3Smrg */ 36157ec681f3Smrg .EndAddress = anv_address_add(buffer->address, buffer->size - 1), 36167ec681f3Smrg#endif 36177ec681f3Smrg }; 36187ec681f3Smrg } else { 36197ec681f3Smrg state = (struct GENX(VERTEX_BUFFER_STATE)) { 
36207ec681f3Smrg .VertexBufferIndex = vb, 36217ec681f3Smrg .NullVertexBuffer = true, 36227ec681f3Smrg }; 36237ec681f3Smrg } 36247ec681f3Smrg 36257ec681f3Smrg#if GFX_VER >= 8 && GFX_VER <= 9 36267ec681f3Smrg genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb, 36277ec681f3Smrg state.BufferStartingAddress, 36287ec681f3Smrg state.BufferSize); 362901e04c3fSmrg#endif 363001e04c3fSmrg 363101e04c3fSmrg GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); 363201e04c3fSmrg i++; 363301e04c3fSmrg } 363401e04c3fSmrg } 363501e04c3fSmrg 363601e04c3fSmrg cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; 363701e04c3fSmrg 36387ec681f3Smrg uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty & 36397ec681f3Smrg pipeline->active_stages; 36407ec681f3Smrg if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty && 36417ec681f3Smrg !cmd_buffer->state.push_constants_dirty) 36427ec681f3Smrg return; 36437ec681f3Smrg 36447ec681f3Smrg if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) || 36457ec681f3Smrg (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty & 36467ec681f3Smrg ANV_CMD_DIRTY_PIPELINE))) { 36479f464c52Smaya /* We don't need any per-buffer dirty tracking because you're not 36489f464c52Smaya * allowed to bind different XFB buffers while XFB is enabled. 
36499f464c52Smaya */ 36509f464c52Smaya for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { 36519f464c52Smaya struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; 36529f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { 36537ec681f3Smrg#if GFX_VER < 12 36549f464c52Smaya sob.SOBufferIndex = idx; 36557ec681f3Smrg#else 36567ec681f3Smrg sob._3DCommandOpcode = 0; 36577ec681f3Smrg sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; 36587ec681f3Smrg#endif 36599f464c52Smaya 36609f464c52Smaya if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { 36617ec681f3Smrg sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0); 36629f464c52Smaya sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, 36639f464c52Smaya xfb->offset); 36647ec681f3Smrg#if GFX_VER >= 8 36657ec681f3Smrg sob.SOBufferEnable = true; 36667ec681f3Smrg sob.StreamOffsetWriteEnable = false; 36679f464c52Smaya /* Size is in DWords - 1 */ 36687ec681f3Smrg sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1; 36697ec681f3Smrg#else 36707ec681f3Smrg /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so 36717ec681f3Smrg * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the 36727ec681f3Smrg * default for an empty SO_BUFFER packet) to disable them. 
36737ec681f3Smrg */ 36747ec681f3Smrg sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx]; 36757ec681f3Smrg sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address, 36767ec681f3Smrg xfb->offset + xfb->size); 36777ec681f3Smrg#endif 36789f464c52Smaya } 36799f464c52Smaya } 36809f464c52Smaya } 36819f464c52Smaya 36829f464c52Smaya /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ 36837ec681f3Smrg if (GFX_VER >= 10) { 36847ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 36857ec681f3Smrg ANV_PIPE_CS_STALL_BIT, 36867ec681f3Smrg "after 3DSTATE_SO_BUFFER call"); 36877ec681f3Smrg } 36889f464c52Smaya } 36899f464c52Smaya 369001e04c3fSmrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { 36917ec681f3Smrg anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); 369201e04c3fSmrg 36937ec681f3Smrg /* Remove from dynamic state emission all of stuff that is baked into 36947ec681f3Smrg * the pipeline. 369501e04c3fSmrg */ 36967ec681f3Smrg cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask; 369701e04c3fSmrg 369801e04c3fSmrg /* If the pipeline changed, we may need to re-allocate push constant 369901e04c3fSmrg * space in the URB. 370001e04c3fSmrg */ 370101e04c3fSmrg cmd_buffer_alloc_push_constants(cmd_buffer); 370201e04c3fSmrg } 370301e04c3fSmrg 37047ec681f3Smrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) 37057ec681f3Smrg cmd_buffer->state.gfx.primitive_topology = pipeline->topology; 37067ec681f3Smrg 37077ec681f3Smrg#if GFX_VER <= 7 370801e04c3fSmrg if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || 370901e04c3fSmrg cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { 371001e04c3fSmrg /* From the IVB PRM Vol. 
2, Part 1, Section 3.2.1: 371101e04c3fSmrg * 371201e04c3fSmrg * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth 371301e04c3fSmrg * stall needs to be sent just prior to any 3DSTATE_VS, 371401e04c3fSmrg * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, 371501e04c3fSmrg * 3DSTATE_BINDING_TABLE_POINTER_VS, 371601e04c3fSmrg * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one 371701e04c3fSmrg * PIPE_CONTROL needs to be sent before any combination of VS 371801e04c3fSmrg * associated 3DSTATE." 371901e04c3fSmrg */ 372001e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 372101e04c3fSmrg pc.DepthStallEnable = true; 372201e04c3fSmrg pc.PostSyncOperation = WriteImmediateData; 37237ec681f3Smrg pc.Address = cmd_buffer->device->workaround_address; 37247ec681f3Smrg anv_debug_dump_pc(pc); 372501e04c3fSmrg } 372601e04c3fSmrg } 372701e04c3fSmrg#endif 372801e04c3fSmrg 372901e04c3fSmrg /* Render targets live in the same binding table as fragment descriptors */ 373001e04c3fSmrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) 37317ec681f3Smrg descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; 373201e04c3fSmrg 373301e04c3fSmrg /* We emit the binding tables and sampler tables first, then emit push 373401e04c3fSmrg * constants and then finally emit binding table and sampler table 373501e04c3fSmrg * pointers. It has to happen in this order, since emitting the binding 373601e04c3fSmrg * tables may change the push constants (in case of storage images). After 373701e04c3fSmrg * emitting push constants, on SKL+ we have to emit the corresponding 373801e04c3fSmrg * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 
373901e04c3fSmrg */ 374001e04c3fSmrg uint32_t dirty = 0; 37417ec681f3Smrg if (descriptors_dirty) { 37427ec681f3Smrg dirty = flush_descriptor_sets(cmd_buffer, 37437ec681f3Smrg &cmd_buffer->state.gfx.base, 37447ec681f3Smrg descriptors_dirty, 37457ec681f3Smrg pipeline->shaders, 37467ec681f3Smrg ARRAY_SIZE(pipeline->shaders)); 37477ec681f3Smrg cmd_buffer->state.descriptors_dirty &= ~dirty; 37487ec681f3Smrg } 374901e04c3fSmrg 375001e04c3fSmrg if (dirty || cmd_buffer->state.push_constants_dirty) { 375101e04c3fSmrg /* Because we're pushing UBOs, we have to push whenever either 375201e04c3fSmrg * descriptors or push constants is dirty. 375301e04c3fSmrg */ 375401e04c3fSmrg dirty |= cmd_buffer->state.push_constants_dirty; 375501e04c3fSmrg dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS; 375601e04c3fSmrg cmd_buffer_flush_push_constants(cmd_buffer, dirty); 375701e04c3fSmrg } 375801e04c3fSmrg 375901e04c3fSmrg if (dirty) 376001e04c3fSmrg cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty); 376101e04c3fSmrg 37627ec681f3Smrg cmd_buffer_emit_clip(cmd_buffer); 37637ec681f3Smrg 37647ec681f3Smrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) 37657ec681f3Smrg cmd_buffer_emit_streamout(cmd_buffer); 37667ec681f3Smrg 376701e04c3fSmrg if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) 37687ec681f3Smrg gfx8_cmd_buffer_emit_viewport(cmd_buffer); 376901e04c3fSmrg 377001e04c3fSmrg if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | 377101e04c3fSmrg ANV_CMD_DIRTY_PIPELINE)) { 37727ec681f3Smrg gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer, 377301e04c3fSmrg pipeline->depth_clamp_enable); 377401e04c3fSmrg } 377501e04c3fSmrg 377601e04c3fSmrg if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR | 377701e04c3fSmrg ANV_CMD_DIRTY_RENDER_TARGETS)) 37787ec681f3Smrg gfx7_cmd_buffer_emit_scissor(cmd_buffer); 377901e04c3fSmrg 378001e04c3fSmrg genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); 378101e04c3fSmrg} 
378201e04c3fSmrg 378301e04c3fSmrgstatic void 378401e04c3fSmrgemit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, 378501e04c3fSmrg struct anv_address addr, 378601e04c3fSmrg uint32_t size, uint32_t index) 378701e04c3fSmrg{ 378801e04c3fSmrg uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, 378901e04c3fSmrg GENX(3DSTATE_VERTEX_BUFFERS)); 379001e04c3fSmrg 379101e04c3fSmrg GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, 379201e04c3fSmrg &(struct GENX(VERTEX_BUFFER_STATE)) { 379301e04c3fSmrg .VertexBufferIndex = index, 379401e04c3fSmrg .AddressModifyEnable = true, 379501e04c3fSmrg .BufferPitch = 0, 37967ec681f3Smrg .MOCS = addr.bo ? anv_mocs(cmd_buffer->device, addr.bo, 37977ec681f3Smrg ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0, 37987ec681f3Smrg .NullVertexBuffer = size == 0, 37997ec681f3Smrg#if GFX_VER >= 12 38007ec681f3Smrg .L3BypassDisable = true, 38017ec681f3Smrg#endif 38027ec681f3Smrg#if (GFX_VER >= 8) 380301e04c3fSmrg .BufferStartingAddress = addr, 380401e04c3fSmrg .BufferSize = size 380501e04c3fSmrg#else 380601e04c3fSmrg .BufferStartingAddress = addr, 380701e04c3fSmrg .EndAddress = anv_address_add(addr, size), 380801e04c3fSmrg#endif 380901e04c3fSmrg }); 38107ec681f3Smrg 38117ec681f3Smrg genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 38127ec681f3Smrg index, addr, size); 381301e04c3fSmrg} 381401e04c3fSmrg 381501e04c3fSmrgstatic void 381601e04c3fSmrgemit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, 381701e04c3fSmrg struct anv_address addr) 381801e04c3fSmrg{ 38197ec681f3Smrg emit_vertex_bo(cmd_buffer, addr, addr.bo ? 
8 : 0, ANV_SVGS_VB_INDEX); 382001e04c3fSmrg} 382101e04c3fSmrg 382201e04c3fSmrgstatic void 382301e04c3fSmrgemit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, 382401e04c3fSmrg uint32_t base_vertex, uint32_t base_instance) 382501e04c3fSmrg{ 38267ec681f3Smrg if (base_vertex == 0 && base_instance == 0) { 38277ec681f3Smrg emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); 38287ec681f3Smrg } else { 38297ec681f3Smrg struct anv_state id_state = 38307ec681f3Smrg anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); 383101e04c3fSmrg 38327ec681f3Smrg ((uint32_t *)id_state.map)[0] = base_vertex; 38337ec681f3Smrg ((uint32_t *)id_state.map)[1] = base_instance; 383401e04c3fSmrg 38357ec681f3Smrg struct anv_address addr = { 38367ec681f3Smrg .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, 38377ec681f3Smrg .offset = id_state.offset, 38387ec681f3Smrg }; 383901e04c3fSmrg 38407ec681f3Smrg emit_base_vertex_instance_bo(cmd_buffer, addr); 38417ec681f3Smrg } 384201e04c3fSmrg} 384301e04c3fSmrg 384401e04c3fSmrgstatic void 384501e04c3fSmrgemit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) 384601e04c3fSmrg{ 384701e04c3fSmrg struct anv_state state = 384801e04c3fSmrg anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); 384901e04c3fSmrg 385001e04c3fSmrg ((uint32_t *)state.map)[0] = draw_index; 385101e04c3fSmrg 385201e04c3fSmrg struct anv_address addr = { 38539f464c52Smaya .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, 385401e04c3fSmrg .offset = state.offset, 385501e04c3fSmrg }; 385601e04c3fSmrg 385701e04c3fSmrg emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); 385801e04c3fSmrg} 385901e04c3fSmrg 38607ec681f3Smrgstatic void 38617ec681f3Smrgupdate_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer, 38627ec681f3Smrg uint32_t access_type) 38637ec681f3Smrg{ 38647ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 38657ec681f3Smrg const struct brw_vs_prog_data *vs_prog_data = 
get_vs_prog_data(pipeline); 38667ec681f3Smrg 38677ec681f3Smrg uint64_t vb_used = pipeline->vb_used; 38687ec681f3Smrg if (vs_prog_data->uses_firstvertex || 38697ec681f3Smrg vs_prog_data->uses_baseinstance) 38707ec681f3Smrg vb_used |= 1ull << ANV_SVGS_VB_INDEX; 38717ec681f3Smrg if (vs_prog_data->uses_drawid) 38727ec681f3Smrg vb_used |= 1ull << ANV_DRAWID_VB_INDEX; 38737ec681f3Smrg 38747ec681f3Smrg genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, 38757ec681f3Smrg access_type == RANDOM, 38767ec681f3Smrg vb_used); 38777ec681f3Smrg} 38787ec681f3Smrg 38797ec681f3SmrgALWAYS_INLINE static void 38807ec681f3Smrgcmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer, 38817ec681f3Smrg const struct brw_vs_prog_data *vs_prog_data, 38827ec681f3Smrg uint32_t base_vertex, 38837ec681f3Smrg uint32_t base_instance, 38847ec681f3Smrg uint32_t draw_id, 38857ec681f3Smrg bool force_flush) 38867ec681f3Smrg{ 38877ec681f3Smrg bool emitted = false; 38887ec681f3Smrg if (vs_prog_data->uses_firstvertex || 38897ec681f3Smrg vs_prog_data->uses_baseinstance) { 38907ec681f3Smrg emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance); 38917ec681f3Smrg emitted = true; 38927ec681f3Smrg } 38937ec681f3Smrg if (vs_prog_data->uses_drawid) { 38947ec681f3Smrg emit_draw_index(cmd_buffer, draw_id); 38957ec681f3Smrg emitted = true; 38967ec681f3Smrg } 38977ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 38987ec681f3Smrg * additional VF cache flushes. 
38997ec681f3Smrg */ 39007ec681f3Smrg if (emitted || force_flush) 39017ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 39027ec681f3Smrg} 39037ec681f3Smrg 390401e04c3fSmrgvoid genX(CmdDraw)( 390501e04c3fSmrg VkCommandBuffer commandBuffer, 390601e04c3fSmrg uint32_t vertexCount, 390701e04c3fSmrg uint32_t instanceCount, 390801e04c3fSmrg uint32_t firstVertex, 390901e04c3fSmrg uint32_t firstInstance) 391001e04c3fSmrg{ 391101e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 39127ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 391301e04c3fSmrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 391401e04c3fSmrg 391501e04c3fSmrg if (anv_batch_has_error(&cmd_buffer->batch)) 391601e04c3fSmrg return; 391701e04c3fSmrg 39187ec681f3Smrg const uint32_t count = (vertexCount * 39197ec681f3Smrg instanceCount * 39207ec681f3Smrg (pipeline->use_primitive_replication ? 39217ec681f3Smrg 1 : anv_subpass_view_count(cmd_buffer->state.subpass))); 39227ec681f3Smrg anv_measure_snapshot(cmd_buffer, 39237ec681f3Smrg INTEL_SNAPSHOT_DRAW, 39247ec681f3Smrg "draw", count); 39257ec681f3Smrg 392601e04c3fSmrg genX(cmd_buffer_flush_state)(cmd_buffer); 392701e04c3fSmrg 39289f464c52Smaya if (cmd_buffer->state.conditional_render_enabled) 39299f464c52Smaya genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 39309f464c52Smaya 39317ec681f3Smrg cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, 39327ec681f3Smrg firstVertex, firstInstance, 0, 39337ec681f3Smrg true); 393401e04c3fSmrg 393501e04c3fSmrg /* Our implementation of VK_KHR_multiview uses instancing to draw the 393601e04c3fSmrg * different views. We need to multiply instanceCount by the view count. 
393701e04c3fSmrg */ 39387ec681f3Smrg if (!pipeline->use_primitive_replication) 39397ec681f3Smrg instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 394001e04c3fSmrg 394101e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 39429f464c52Smaya prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 394301e04c3fSmrg prim.VertexAccessType = SEQUENTIAL; 39447ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 394501e04c3fSmrg prim.VertexCountPerInstance = vertexCount; 394601e04c3fSmrg prim.StartVertexLocation = firstVertex; 394701e04c3fSmrg prim.InstanceCount = instanceCount; 394801e04c3fSmrg prim.StartInstanceLocation = firstInstance; 394901e04c3fSmrg prim.BaseVertexLocation = 0; 395001e04c3fSmrg } 39517ec681f3Smrg 39527ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); 39537ec681f3Smrg} 39547ec681f3Smrg 39557ec681f3Smrgvoid genX(CmdDrawMultiEXT)( 39567ec681f3Smrg VkCommandBuffer commandBuffer, 39577ec681f3Smrg uint32_t drawCount, 39587ec681f3Smrg const VkMultiDrawInfoEXT *pVertexInfo, 39597ec681f3Smrg uint32_t instanceCount, 39607ec681f3Smrg uint32_t firstInstance, 39617ec681f3Smrg uint32_t stride) 39627ec681f3Smrg{ 39637ec681f3Smrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 39647ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 39657ec681f3Smrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 39667ec681f3Smrg 39677ec681f3Smrg if (anv_batch_has_error(&cmd_buffer->batch)) 39687ec681f3Smrg return; 39697ec681f3Smrg 39707ec681f3Smrg const uint32_t count = (drawCount * 39717ec681f3Smrg instanceCount * 39727ec681f3Smrg (pipeline->use_primitive_replication ? 
39737ec681f3Smrg 1 : anv_subpass_view_count(cmd_buffer->state.subpass))); 39747ec681f3Smrg anv_measure_snapshot(cmd_buffer, 39757ec681f3Smrg INTEL_SNAPSHOT_DRAW, 39767ec681f3Smrg "draw_multi", count); 39777ec681f3Smrg 39787ec681f3Smrg genX(cmd_buffer_flush_state)(cmd_buffer); 39797ec681f3Smrg 39807ec681f3Smrg if (cmd_buffer->state.conditional_render_enabled) 39817ec681f3Smrg genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 39827ec681f3Smrg 39837ec681f3Smrg /* Our implementation of VK_KHR_multiview uses instancing to draw the 39847ec681f3Smrg * different views. We need to multiply instanceCount by the view count. 39857ec681f3Smrg */ 39867ec681f3Smrg if (!pipeline->use_primitive_replication) 39877ec681f3Smrg instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 39887ec681f3Smrg 39897ec681f3Smrg uint32_t i = 0; 39907ec681f3Smrg vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { 39917ec681f3Smrg cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, 39927ec681f3Smrg draw->firstVertex, 39937ec681f3Smrg firstInstance, i, !i); 39947ec681f3Smrg 39957ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 39967ec681f3Smrg prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 39977ec681f3Smrg prim.VertexAccessType = SEQUENTIAL; 39987ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 39997ec681f3Smrg prim.VertexCountPerInstance = draw->vertexCount; 40007ec681f3Smrg prim.StartVertexLocation = draw->firstVertex; 40017ec681f3Smrg prim.InstanceCount = instanceCount; 40027ec681f3Smrg prim.StartInstanceLocation = firstInstance; 40037ec681f3Smrg prim.BaseVertexLocation = 0; 40047ec681f3Smrg } 40057ec681f3Smrg } 40067ec681f3Smrg 40077ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); 400801e04c3fSmrg} 400901e04c3fSmrg 401001e04c3fSmrgvoid genX(CmdDrawIndexed)( 401101e04c3fSmrg VkCommandBuffer commandBuffer, 401201e04c3fSmrg uint32_t indexCount, 
401301e04c3fSmrg uint32_t instanceCount, 401401e04c3fSmrg uint32_t firstIndex, 401501e04c3fSmrg int32_t vertexOffset, 401601e04c3fSmrg uint32_t firstInstance) 401701e04c3fSmrg{ 401801e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 40197ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 402001e04c3fSmrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 402101e04c3fSmrg 402201e04c3fSmrg if (anv_batch_has_error(&cmd_buffer->batch)) 402301e04c3fSmrg return; 402401e04c3fSmrg 40257ec681f3Smrg const uint32_t count = (indexCount * 40267ec681f3Smrg instanceCount * 40277ec681f3Smrg (pipeline->use_primitive_replication ? 40287ec681f3Smrg 1 : anv_subpass_view_count(cmd_buffer->state.subpass))); 40297ec681f3Smrg anv_measure_snapshot(cmd_buffer, 40307ec681f3Smrg INTEL_SNAPSHOT_DRAW, 40317ec681f3Smrg "draw indexed", 40327ec681f3Smrg count); 40337ec681f3Smrg 403401e04c3fSmrg genX(cmd_buffer_flush_state)(cmd_buffer); 403501e04c3fSmrg 40369f464c52Smaya if (cmd_buffer->state.conditional_render_enabled) 40379f464c52Smaya genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 40389f464c52Smaya 40397ec681f3Smrg cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true); 404001e04c3fSmrg 404101e04c3fSmrg /* Our implementation of VK_KHR_multiview uses instancing to draw the 404201e04c3fSmrg * different views. We need to multiply instanceCount by the view count. 
404301e04c3fSmrg */ 40447ec681f3Smrg if (!pipeline->use_primitive_replication) 40457ec681f3Smrg instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 404601e04c3fSmrg 404701e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 40489f464c52Smaya prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 404901e04c3fSmrg prim.VertexAccessType = RANDOM; 40507ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 405101e04c3fSmrg prim.VertexCountPerInstance = indexCount; 405201e04c3fSmrg prim.StartVertexLocation = firstIndex; 405301e04c3fSmrg prim.InstanceCount = instanceCount; 405401e04c3fSmrg prim.StartInstanceLocation = firstInstance; 405501e04c3fSmrg prim.BaseVertexLocation = vertexOffset; 405601e04c3fSmrg } 40577ec681f3Smrg 40587ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); 40597ec681f3Smrg} 40607ec681f3Smrg 40617ec681f3Smrgvoid genX(CmdDrawMultiIndexedEXT)( 40627ec681f3Smrg VkCommandBuffer commandBuffer, 40637ec681f3Smrg uint32_t drawCount, 40647ec681f3Smrg const VkMultiDrawIndexedInfoEXT *pIndexInfo, 40657ec681f3Smrg uint32_t instanceCount, 40667ec681f3Smrg uint32_t firstInstance, 40677ec681f3Smrg uint32_t stride, 40687ec681f3Smrg const int32_t *pVertexOffset) 40697ec681f3Smrg{ 40707ec681f3Smrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 40717ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 40727ec681f3Smrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 40737ec681f3Smrg 40747ec681f3Smrg if (anv_batch_has_error(&cmd_buffer->batch)) 40757ec681f3Smrg return; 40767ec681f3Smrg 40777ec681f3Smrg const uint32_t count = (drawCount * 40787ec681f3Smrg instanceCount * 40797ec681f3Smrg (pipeline->use_primitive_replication ? 
40807ec681f3Smrg 1 : anv_subpass_view_count(cmd_buffer->state.subpass))); 40817ec681f3Smrg anv_measure_snapshot(cmd_buffer, 40827ec681f3Smrg INTEL_SNAPSHOT_DRAW, 40837ec681f3Smrg "draw indexed_multi", 40847ec681f3Smrg count); 40857ec681f3Smrg 40867ec681f3Smrg genX(cmd_buffer_flush_state)(cmd_buffer); 40877ec681f3Smrg 40887ec681f3Smrg if (cmd_buffer->state.conditional_render_enabled) 40897ec681f3Smrg genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 40907ec681f3Smrg 40917ec681f3Smrg /* Our implementation of VK_KHR_multiview uses instancing to draw the 40927ec681f3Smrg * different views. We need to multiply instanceCount by the view count. 40937ec681f3Smrg */ 40947ec681f3Smrg if (!pipeline->use_primitive_replication) 40957ec681f3Smrg instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 40967ec681f3Smrg 40977ec681f3Smrg uint32_t i = 0; 40987ec681f3Smrg if (pVertexOffset) { 40997ec681f3Smrg if (vs_prog_data->uses_drawid) { 41007ec681f3Smrg bool emitted = true; 41017ec681f3Smrg if (vs_prog_data->uses_firstvertex || 41027ec681f3Smrg vs_prog_data->uses_baseinstance) { 41037ec681f3Smrg emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); 41047ec681f3Smrg emitted = true; 41057ec681f3Smrg } 41067ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { 41077ec681f3Smrg if (vs_prog_data->uses_drawid) { 41087ec681f3Smrg emit_draw_index(cmd_buffer, i); 41097ec681f3Smrg emitted = true; 41107ec681f3Smrg } 41117ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 41127ec681f3Smrg * additional VF cache flushes. 
41137ec681f3Smrg */ 41147ec681f3Smrg if (emitted) 41157ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 41167ec681f3Smrg 41177ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 41187ec681f3Smrg prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 41197ec681f3Smrg prim.VertexAccessType = RANDOM; 41207ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 41217ec681f3Smrg prim.VertexCountPerInstance = draw->indexCount; 41227ec681f3Smrg prim.StartVertexLocation = draw->firstIndex; 41237ec681f3Smrg prim.InstanceCount = instanceCount; 41247ec681f3Smrg prim.StartInstanceLocation = firstInstance; 41257ec681f3Smrg prim.BaseVertexLocation = *pVertexOffset; 41267ec681f3Smrg } 41277ec681f3Smrg emitted = false; 41287ec681f3Smrg } 41297ec681f3Smrg } else { 41307ec681f3Smrg if (vs_prog_data->uses_firstvertex || 41317ec681f3Smrg vs_prog_data->uses_baseinstance) { 41327ec681f3Smrg emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); 41337ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 41347ec681f3Smrg * additional VF cache flushes. 
41357ec681f3Smrg */ 41367ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 41377ec681f3Smrg } 41387ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { 41397ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 41407ec681f3Smrg prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 41417ec681f3Smrg prim.VertexAccessType = RANDOM; 41427ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 41437ec681f3Smrg prim.VertexCountPerInstance = draw->indexCount; 41447ec681f3Smrg prim.StartVertexLocation = draw->firstIndex; 41457ec681f3Smrg prim.InstanceCount = instanceCount; 41467ec681f3Smrg prim.StartInstanceLocation = firstInstance; 41477ec681f3Smrg prim.BaseVertexLocation = *pVertexOffset; 41487ec681f3Smrg } 41497ec681f3Smrg } 41507ec681f3Smrg } 41517ec681f3Smrg } else { 41527ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { 41537ec681f3Smrg cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, 41547ec681f3Smrg draw->vertexOffset, 41557ec681f3Smrg firstInstance, i, i != 0); 41567ec681f3Smrg 41577ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 41587ec681f3Smrg prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 41597ec681f3Smrg prim.VertexAccessType = RANDOM; 41607ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 41617ec681f3Smrg prim.VertexCountPerInstance = draw->indexCount; 41627ec681f3Smrg prim.StartVertexLocation = draw->firstIndex; 41637ec681f3Smrg prim.InstanceCount = instanceCount; 41647ec681f3Smrg prim.StartInstanceLocation = firstInstance; 41657ec681f3Smrg prim.BaseVertexLocation = draw->vertexOffset; 41667ec681f3Smrg } 41677ec681f3Smrg } 41687ec681f3Smrg } 41697ec681f3Smrg 41707ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); 417101e04c3fSmrg} 417201e04c3fSmrg 417301e04c3fSmrg/* Auto-Draw / Indirect Registers */ 
41747ec681f3Smrg#define GFX7_3DPRIM_END_OFFSET 0x2420 41757ec681f3Smrg#define GFX7_3DPRIM_START_VERTEX 0x2430 41767ec681f3Smrg#define GFX7_3DPRIM_VERTEX_COUNT 0x2434 41777ec681f3Smrg#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438 41787ec681f3Smrg#define GFX7_3DPRIM_START_INSTANCE 0x243C 41797ec681f3Smrg#define GFX7_3DPRIM_BASE_VERTEX 0x2440 418001e04c3fSmrg 41819f464c52Smayavoid genX(CmdDrawIndirectByteCountEXT)( 41829f464c52Smaya VkCommandBuffer commandBuffer, 41839f464c52Smaya uint32_t instanceCount, 41849f464c52Smaya uint32_t firstInstance, 41859f464c52Smaya VkBuffer counterBuffer, 41869f464c52Smaya VkDeviceSize counterBufferOffset, 41879f464c52Smaya uint32_t counterOffset, 41889f464c52Smaya uint32_t vertexStride) 418901e04c3fSmrg{ 41907ec681f3Smrg#if GFX_VERx10 >= 75 41919f464c52Smaya ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 41929f464c52Smaya ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer); 41937ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 41949f464c52Smaya const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 419501e04c3fSmrg 41969f464c52Smaya /* firstVertex is always zero for this draw function */ 41979f464c52Smaya const uint32_t firstVertex = 0; 419801e04c3fSmrg 41999f464c52Smaya if (anv_batch_has_error(&cmd_buffer->batch)) 42009f464c52Smaya return; 420101e04c3fSmrg 42027ec681f3Smrg anv_measure_snapshot(cmd_buffer, 42037ec681f3Smrg INTEL_SNAPSHOT_DRAW, 42047ec681f3Smrg "draw indirect byte count", 42057ec681f3Smrg instanceCount); 42067ec681f3Smrg 42079f464c52Smaya genX(cmd_buffer_flush_state)(cmd_buffer); 420801e04c3fSmrg 42099f464c52Smaya if (vs_prog_data->uses_firstvertex || 42109f464c52Smaya vs_prog_data->uses_baseinstance) 42119f464c52Smaya emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); 42129f464c52Smaya if (vs_prog_data->uses_drawid) 42139f464c52Smaya emit_draw_index(cmd_buffer, 0); 421401e04c3fSmrg 42157ec681f3Smrg /* Emitting draw index or vertex 
index BOs may result in needing 42167ec681f3Smrg * additional VF cache flushes. 42177ec681f3Smrg */ 42187ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 42197ec681f3Smrg 42209f464c52Smaya /* Our implementation of VK_KHR_multiview uses instancing to draw the 42219f464c52Smaya * different views. We need to multiply instanceCount by the view count. 42229f464c52Smaya */ 42237ec681f3Smrg if (!pipeline->use_primitive_replication) 42247ec681f3Smrg instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); 422501e04c3fSmrg 42267ec681f3Smrg struct mi_builder b; 42277ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 42287ec681f3Smrg struct mi_value count = 42297ec681f3Smrg mi_mem32(anv_address_add(counter_buffer->address, 42309f464c52Smaya counterBufferOffset)); 42319f464c52Smaya if (counterOffset) 42327ec681f3Smrg count = mi_isub(&b, count, mi_imm(counterOffset)); 42337ec681f3Smrg count = mi_udiv32_imm(&b, count, vertexStride); 42347ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count); 42357ec681f3Smrg 42367ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex)); 42377ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount)); 42387ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance)); 42397ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); 424001e04c3fSmrg 42419f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 42429f464c52Smaya prim.IndirectParameterEnable = true; 42439f464c52Smaya prim.VertexAccessType = SEQUENTIAL; 42447ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 42459f464c52Smaya } 42467ec681f3Smrg 42477ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); 42487ec681f3Smrg#endif /* GFX_VERx10 >= 75 */ 42499f464c52Smaya} 425001e04c3fSmrg 425101e04c3fSmrgstatic void 425201e04c3fSmrgload_indirect_parameters(struct 
anv_cmd_buffer *cmd_buffer, 425301e04c3fSmrg struct anv_address addr, 425401e04c3fSmrg bool indexed) 425501e04c3fSmrg{ 42567ec681f3Smrg struct mi_builder b; 42577ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 425801e04c3fSmrg 42597ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), 42607ec681f3Smrg mi_mem32(anv_address_add(addr, 0))); 426101e04c3fSmrg 42627ec681f3Smrg struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4)); 426301e04c3fSmrg unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass); 426401e04c3fSmrg if (view_count > 1) { 42657ec681f3Smrg#if GFX_VERx10 >= 75 42667ec681f3Smrg instance_count = mi_imul_imm(&b, instance_count, view_count); 426701e04c3fSmrg#else 426801e04c3fSmrg anv_finishme("Multiview + indirect draw requires MI_MATH; " 426901e04c3fSmrg "MI_MATH is not supported on Ivy Bridge"); 427001e04c3fSmrg#endif 427101e04c3fSmrg } 42727ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count); 427301e04c3fSmrg 42747ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), 42757ec681f3Smrg mi_mem32(anv_address_add(addr, 8))); 427601e04c3fSmrg 427701e04c3fSmrg if (indexed) { 42787ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), 42797ec681f3Smrg mi_mem32(anv_address_add(addr, 12))); 42807ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), 42817ec681f3Smrg mi_mem32(anv_address_add(addr, 16))); 428201e04c3fSmrg } else { 42837ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), 42847ec681f3Smrg mi_mem32(anv_address_add(addr, 12))); 42857ec681f3Smrg mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); 428601e04c3fSmrg } 428701e04c3fSmrg} 428801e04c3fSmrg 428901e04c3fSmrgvoid genX(CmdDrawIndirect)( 429001e04c3fSmrg VkCommandBuffer commandBuffer, 429101e04c3fSmrg VkBuffer _buffer, 429201e04c3fSmrg VkDeviceSize offset, 429301e04c3fSmrg uint32_t drawCount, 429401e04c3fSmrg uint32_t stride) 429501e04c3fSmrg{ 429601e04c3fSmrg 
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 429701e04c3fSmrg ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 42987ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 429901e04c3fSmrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 430001e04c3fSmrg 430101e04c3fSmrg if (anv_batch_has_error(&cmd_buffer->batch)) 430201e04c3fSmrg return; 430301e04c3fSmrg 430401e04c3fSmrg genX(cmd_buffer_flush_state)(cmd_buffer); 430501e04c3fSmrg 43069f464c52Smaya if (cmd_buffer->state.conditional_render_enabled) 43079f464c52Smaya genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 43089f464c52Smaya 430901e04c3fSmrg for (uint32_t i = 0; i < drawCount; i++) { 431001e04c3fSmrg struct anv_address draw = anv_address_add(buffer->address, offset); 431101e04c3fSmrg 431201e04c3fSmrg if (vs_prog_data->uses_firstvertex || 431301e04c3fSmrg vs_prog_data->uses_baseinstance) 431401e04c3fSmrg emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); 431501e04c3fSmrg if (vs_prog_data->uses_drawid) 431601e04c3fSmrg emit_draw_index(cmd_buffer, i); 431701e04c3fSmrg 43187ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 43197ec681f3Smrg * additional VF cache flushes. 
43207ec681f3Smrg */ 43217ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 43227ec681f3Smrg 432301e04c3fSmrg load_indirect_parameters(cmd_buffer, draw, false); 432401e04c3fSmrg 432501e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 432601e04c3fSmrg prim.IndirectParameterEnable = true; 43279f464c52Smaya prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 432801e04c3fSmrg prim.VertexAccessType = SEQUENTIAL; 43297ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 433001e04c3fSmrg } 433101e04c3fSmrg 43327ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); 43337ec681f3Smrg 433401e04c3fSmrg offset += stride; 433501e04c3fSmrg } 433601e04c3fSmrg} 433701e04c3fSmrg 433801e04c3fSmrgvoid genX(CmdDrawIndexedIndirect)( 433901e04c3fSmrg VkCommandBuffer commandBuffer, 434001e04c3fSmrg VkBuffer _buffer, 434101e04c3fSmrg VkDeviceSize offset, 434201e04c3fSmrg uint32_t drawCount, 434301e04c3fSmrg uint32_t stride) 434401e04c3fSmrg{ 434501e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 434601e04c3fSmrg ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 43477ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 434801e04c3fSmrg const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 434901e04c3fSmrg 435001e04c3fSmrg if (anv_batch_has_error(&cmd_buffer->batch)) 435101e04c3fSmrg return; 435201e04c3fSmrg 435301e04c3fSmrg genX(cmd_buffer_flush_state)(cmd_buffer); 435401e04c3fSmrg 43559f464c52Smaya if (cmd_buffer->state.conditional_render_enabled) 43569f464c52Smaya genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 43579f464c52Smaya 435801e04c3fSmrg for (uint32_t i = 0; i < drawCount; i++) { 435901e04c3fSmrg struct anv_address draw = anv_address_add(buffer->address, offset); 436001e04c3fSmrg 436101e04c3fSmrg /* TODO: We need to stomp base vertex to 0 somehow */ 436201e04c3fSmrg if (vs_prog_data->uses_firstvertex || 
436301e04c3fSmrg vs_prog_data->uses_baseinstance) 436401e04c3fSmrg emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); 436501e04c3fSmrg if (vs_prog_data->uses_drawid) 436601e04c3fSmrg emit_draw_index(cmd_buffer, i); 436701e04c3fSmrg 43687ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 43697ec681f3Smrg * additional VF cache flushes. 43707ec681f3Smrg */ 43717ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 43727ec681f3Smrg 437301e04c3fSmrg load_indirect_parameters(cmd_buffer, draw, true); 437401e04c3fSmrg 437501e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 437601e04c3fSmrg prim.IndirectParameterEnable = true; 43779f464c52Smaya prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; 43789f464c52Smaya prim.VertexAccessType = RANDOM; 43797ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 43809f464c52Smaya } 43819f464c52Smaya 43827ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); 43837ec681f3Smrg 43849f464c52Smaya offset += stride; 43859f464c52Smaya } 43869f464c52Smaya} 43879f464c52Smaya 43887ec681f3Smrgstatic struct mi_value 43899f464c52Smayaprepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, 43907ec681f3Smrg struct mi_builder *b, 43917ec681f3Smrg struct anv_buffer *count_buffer, 43927ec681f3Smrg uint64_t countBufferOffset) 43939f464c52Smaya{ 43947ec681f3Smrg struct anv_address count_address = 43957ec681f3Smrg anv_address_add(count_buffer->address, countBufferOffset); 43967ec681f3Smrg 43977ec681f3Smrg struct mi_value ret = mi_imm(0); 43989f464c52Smaya 43997ec681f3Smrg if (cmd_buffer->state.conditional_render_enabled) { 44007ec681f3Smrg#if GFX_VERx10 >= 75 44017ec681f3Smrg ret = mi_new_gpr(b); 44027ec681f3Smrg mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address)); 44039f464c52Smaya#endif 44049f464c52Smaya } else { 44059f464c52Smaya /* Upload the current draw count from the draw parameters buffer to 
44069f464c52Smaya * MI_PREDICATE_SRC0. 44079f464c52Smaya */ 44087ec681f3Smrg mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address)); 44097ec681f3Smrg mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0)); 44109f464c52Smaya } 44117ec681f3Smrg 44127ec681f3Smrg return ret; 44139f464c52Smaya} 44149f464c52Smaya 44159f464c52Smayastatic void 44169f464c52Smayaemit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, 44177ec681f3Smrg struct mi_builder *b, 44189f464c52Smaya uint32_t draw_index) 44199f464c52Smaya{ 44209f464c52Smaya /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */ 44217ec681f3Smrg mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index)); 44229f464c52Smaya 44239f464c52Smaya if (draw_index == 0) { 44249f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 44259f464c52Smaya mip.LoadOperation = LOAD_LOADINV; 44269f464c52Smaya mip.CombineOperation = COMBINE_SET; 44279f464c52Smaya mip.CompareOperation = COMPARE_SRCS_EQUAL; 44289f464c52Smaya } 44299f464c52Smaya } else { 44309f464c52Smaya /* While draw_index < draw_count the predicate's result will be 44319f464c52Smaya * (draw_index == draw_count) ^ TRUE = TRUE 44329f464c52Smaya * When draw_index == draw_count the result is 44339f464c52Smaya * (TRUE) ^ TRUE = FALSE 44349f464c52Smaya * After this all results will be: 44359f464c52Smaya * (FALSE) ^ FALSE = FALSE 44369f464c52Smaya */ 44379f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 44389f464c52Smaya mip.LoadOperation = LOAD_LOAD; 44399f464c52Smaya mip.CombineOperation = COMBINE_XOR; 44409f464c52Smaya mip.CompareOperation = COMPARE_SRCS_EQUAL; 44419f464c52Smaya } 44429f464c52Smaya } 44439f464c52Smaya} 44449f464c52Smaya 44457ec681f3Smrg#if GFX_VERx10 >= 75 44469f464c52Smayastatic void 44479f464c52Smayaemit_draw_count_predicate_with_conditional_render( 44489f464c52Smaya struct anv_cmd_buffer *cmd_buffer, 44497ec681f3Smrg struct mi_builder *b, 44507ec681f3Smrg uint32_t draw_index, 
44517ec681f3Smrg struct mi_value max) 44529f464c52Smaya{ 44537ec681f3Smrg struct mi_value pred = mi_ult(b, mi_imm(draw_index), max); 44547ec681f3Smrg pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG)); 44559f464c52Smaya 44567ec681f3Smrg#if GFX_VER >= 8 44577ec681f3Smrg mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred); 44589f464c52Smaya#else 44599f464c52Smaya /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser 44609f464c52Smaya * so we emit MI_PREDICATE to set it. 44619f464c52Smaya */ 44629f464c52Smaya 44637ec681f3Smrg mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred); 44647ec681f3Smrg mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); 44659f464c52Smaya 44669f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { 44679f464c52Smaya mip.LoadOperation = LOAD_LOADINV; 44689f464c52Smaya mip.CombineOperation = COMBINE_SET; 44699f464c52Smaya mip.CompareOperation = COMPARE_SRCS_EQUAL; 44709f464c52Smaya } 44719f464c52Smaya#endif 44729f464c52Smaya} 44739f464c52Smaya#endif 44749f464c52Smaya 44757ec681f3Smrgstatic void 44767ec681f3Smrgemit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer, 44777ec681f3Smrg struct mi_builder *b, 44787ec681f3Smrg uint32_t draw_index, 44797ec681f3Smrg struct mi_value max) 44807ec681f3Smrg{ 44817ec681f3Smrg#if GFX_VERx10 >= 75 44827ec681f3Smrg if (cmd_buffer->state.conditional_render_enabled) { 44837ec681f3Smrg emit_draw_count_predicate_with_conditional_render( 44847ec681f3Smrg cmd_buffer, b, draw_index, mi_value_ref(b, max)); 44857ec681f3Smrg } else { 44867ec681f3Smrg emit_draw_count_predicate(cmd_buffer, b, draw_index); 44877ec681f3Smrg } 44887ec681f3Smrg#else 44897ec681f3Smrg emit_draw_count_predicate(cmd_buffer, b, draw_index); 44907ec681f3Smrg#endif 44917ec681f3Smrg} 44927ec681f3Smrg 44937ec681f3Smrgvoid genX(CmdDrawIndirectCount)( 44949f464c52Smaya VkCommandBuffer commandBuffer, 44959f464c52Smaya VkBuffer _buffer, 44969f464c52Smaya VkDeviceSize offset, 44979f464c52Smaya VkBuffer _countBuffer, 
44989f464c52Smaya VkDeviceSize countBufferOffset, 44999f464c52Smaya uint32_t maxDrawCount, 45009f464c52Smaya uint32_t stride) 45019f464c52Smaya{ 45029f464c52Smaya ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 45039f464c52Smaya ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 45049f464c52Smaya ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); 45059f464c52Smaya struct anv_cmd_state *cmd_state = &cmd_buffer->state; 45067ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; 45079f464c52Smaya const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 45089f464c52Smaya 45099f464c52Smaya if (anv_batch_has_error(&cmd_buffer->batch)) 45109f464c52Smaya return; 45119f464c52Smaya 45129f464c52Smaya genX(cmd_buffer_flush_state)(cmd_buffer); 45139f464c52Smaya 45147ec681f3Smrg struct mi_builder b; 45157ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 45167ec681f3Smrg struct mi_value max = 45177ec681f3Smrg prepare_for_draw_count_predicate(cmd_buffer, &b, 45187ec681f3Smrg count_buffer, countBufferOffset); 45199f464c52Smaya 45209f464c52Smaya for (uint32_t i = 0; i < maxDrawCount; i++) { 45219f464c52Smaya struct anv_address draw = anv_address_add(buffer->address, offset); 45229f464c52Smaya 45237ec681f3Smrg emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); 45249f464c52Smaya 45259f464c52Smaya if (vs_prog_data->uses_firstvertex || 45269f464c52Smaya vs_prog_data->uses_baseinstance) 45279f464c52Smaya emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); 45289f464c52Smaya if (vs_prog_data->uses_drawid) 45299f464c52Smaya emit_draw_index(cmd_buffer, i); 45309f464c52Smaya 45317ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 45327ec681f3Smrg * additional VF cache flushes. 
45337ec681f3Smrg */ 45347ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 45357ec681f3Smrg 45369f464c52Smaya load_indirect_parameters(cmd_buffer, draw, false); 45379f464c52Smaya 45389f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 45399f464c52Smaya prim.IndirectParameterEnable = true; 45409f464c52Smaya prim.PredicateEnable = true; 45419f464c52Smaya prim.VertexAccessType = SEQUENTIAL; 45427ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 45439f464c52Smaya } 45449f464c52Smaya 45457ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); 45467ec681f3Smrg 45479f464c52Smaya offset += stride; 45489f464c52Smaya } 45497ec681f3Smrg 45507ec681f3Smrg mi_value_unref(&b, max); 45519f464c52Smaya} 45529f464c52Smaya 45537ec681f3Smrgvoid genX(CmdDrawIndexedIndirectCount)( 45549f464c52Smaya VkCommandBuffer commandBuffer, 45559f464c52Smaya VkBuffer _buffer, 45569f464c52Smaya VkDeviceSize offset, 45579f464c52Smaya VkBuffer _countBuffer, 45589f464c52Smaya VkDeviceSize countBufferOffset, 45599f464c52Smaya uint32_t maxDrawCount, 45609f464c52Smaya uint32_t stride) 45619f464c52Smaya{ 45629f464c52Smaya ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 45639f464c52Smaya ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 45649f464c52Smaya ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); 45659f464c52Smaya struct anv_cmd_state *cmd_state = &cmd_buffer->state; 45667ec681f3Smrg struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; 45679f464c52Smaya const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 45689f464c52Smaya 45699f464c52Smaya if (anv_batch_has_error(&cmd_buffer->batch)) 45709f464c52Smaya return; 45719f464c52Smaya 45729f464c52Smaya genX(cmd_buffer_flush_state)(cmd_buffer); 45739f464c52Smaya 45747ec681f3Smrg struct mi_builder b; 45757ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 45767ec681f3Smrg struct mi_value max = 
45777ec681f3Smrg prepare_for_draw_count_predicate(cmd_buffer, &b, 45787ec681f3Smrg count_buffer, countBufferOffset); 45799f464c52Smaya 45809f464c52Smaya for (uint32_t i = 0; i < maxDrawCount; i++) { 45819f464c52Smaya struct anv_address draw = anv_address_add(buffer->address, offset); 45829f464c52Smaya 45837ec681f3Smrg emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); 45849f464c52Smaya 45859f464c52Smaya /* TODO: We need to stomp base vertex to 0 somehow */ 45869f464c52Smaya if (vs_prog_data->uses_firstvertex || 45879f464c52Smaya vs_prog_data->uses_baseinstance) 45889f464c52Smaya emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); 45899f464c52Smaya if (vs_prog_data->uses_drawid) 45909f464c52Smaya emit_draw_index(cmd_buffer, i); 45919f464c52Smaya 45927ec681f3Smrg /* Emitting draw index or vertex index BOs may result in needing 45937ec681f3Smrg * additional VF cache flushes. 45947ec681f3Smrg */ 45957ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 45967ec681f3Smrg 45979f464c52Smaya load_indirect_parameters(cmd_buffer, draw, true); 45989f464c52Smaya 45999f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 46009f464c52Smaya prim.IndirectParameterEnable = true; 46019f464c52Smaya prim.PredicateEnable = true; 460201e04c3fSmrg prim.VertexAccessType = RANDOM; 46037ec681f3Smrg prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; 460401e04c3fSmrg } 460501e04c3fSmrg 46067ec681f3Smrg update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); 46077ec681f3Smrg 460801e04c3fSmrg offset += stride; 460901e04c3fSmrg } 46107ec681f3Smrg 46117ec681f3Smrg mi_value_unref(&b, max); 46129f464c52Smaya} 46139f464c52Smaya 46149f464c52Smayavoid genX(CmdBeginTransformFeedbackEXT)( 46159f464c52Smaya VkCommandBuffer commandBuffer, 46169f464c52Smaya uint32_t firstCounterBuffer, 46179f464c52Smaya uint32_t counterBufferCount, 46189f464c52Smaya const VkBuffer* pCounterBuffers, 46199f464c52Smaya const VkDeviceSize* 
pCounterBufferOffsets) 46209f464c52Smaya{ 46219f464c52Smaya ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 46229f464c52Smaya 46239f464c52Smaya assert(firstCounterBuffer < MAX_XFB_BUFFERS); 46249f464c52Smaya assert(counterBufferCount <= MAX_XFB_BUFFERS); 46259f464c52Smaya assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); 46269f464c52Smaya 46279f464c52Smaya /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: 46289f464c52Smaya * 46299f464c52Smaya * "Ssoftware must ensure that no HW stream output operations can be in 46309f464c52Smaya * process or otherwise pending at the point that the MI_LOAD/STORE 46319f464c52Smaya * commands are processed. This will likely require a pipeline flush." 46329f464c52Smaya */ 46337ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 46347ec681f3Smrg ANV_PIPE_CS_STALL_BIT, 46357ec681f3Smrg "begin transform feedback"); 46369f464c52Smaya genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 46379f464c52Smaya 46389f464c52Smaya for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) { 46399f464c52Smaya /* If we have a counter buffer, this is a resume so we need to load the 46409f464c52Smaya * value into the streamout offset register. Otherwise, this is a begin 46419f464c52Smaya * and we need to reset it to zero. 46429f464c52Smaya */ 46439f464c52Smaya if (pCounterBuffers && 46449f464c52Smaya idx >= firstCounterBuffer && 46459f464c52Smaya idx - firstCounterBuffer < counterBufferCount && 46469f464c52Smaya pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) { 46479f464c52Smaya uint32_t cb_idx = idx - firstCounterBuffer; 46489f464c52Smaya ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]); 46499f464c52Smaya uint64_t offset = pCounterBufferOffsets ? 
46509f464c52Smaya pCounterBufferOffsets[cb_idx] : 0; 465101e04c3fSmrg 46529f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { 46539f464c52Smaya lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; 46549f464c52Smaya lrm.MemoryAddress = anv_address_add(counter_buffer->address, 46559f464c52Smaya offset); 46569f464c52Smaya } 46579f464c52Smaya } else { 46589f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { 46599f464c52Smaya lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4; 46609f464c52Smaya lri.DataDWord = 0; 46619f464c52Smaya } 46629f464c52Smaya } 46639f464c52Smaya } 46649f464c52Smaya 46659f464c52Smaya cmd_buffer->state.xfb_enabled = true; 46669f464c52Smaya cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; 46679f464c52Smaya} 46689f464c52Smaya 46699f464c52Smayavoid genX(CmdEndTransformFeedbackEXT)( 46709f464c52Smaya VkCommandBuffer commandBuffer, 46719f464c52Smaya uint32_t firstCounterBuffer, 46729f464c52Smaya uint32_t counterBufferCount, 46739f464c52Smaya const VkBuffer* pCounterBuffers, 46749f464c52Smaya const VkDeviceSize* pCounterBufferOffsets) 46759f464c52Smaya{ 46769f464c52Smaya ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 46779f464c52Smaya 46789f464c52Smaya assert(firstCounterBuffer < MAX_XFB_BUFFERS); 46799f464c52Smaya assert(counterBufferCount <= MAX_XFB_BUFFERS); 46809f464c52Smaya assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); 46819f464c52Smaya 46829f464c52Smaya /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: 46839f464c52Smaya * 46849f464c52Smaya * "Ssoftware must ensure that no HW stream output operations can be in 46859f464c52Smaya * process or otherwise pending at the point that the MI_LOAD/STORE 46869f464c52Smaya * commands are processed. This will likely require a pipeline flush." 
46879f464c52Smaya */ 46887ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 46897ec681f3Smrg ANV_PIPE_CS_STALL_BIT, 46907ec681f3Smrg "end transform feedback"); 46919f464c52Smaya genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 46929f464c52Smaya 46937ec681f3Smrg for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) { 46947ec681f3Smrg unsigned idx = firstCounterBuffer + cb_idx; 469501e04c3fSmrg 46967ec681f3Smrg /* If we have a counter buffer, this is a resume so we need to load the 46977ec681f3Smrg * value into the streamout offset register. Otherwise, this is a begin 46987ec681f3Smrg * and we need to reset it to zero. 46997ec681f3Smrg */ 47007ec681f3Smrg if (pCounterBuffers && 47017ec681f3Smrg cb_idx < counterBufferCount && 47027ec681f3Smrg pCounterBuffers[cb_idx] != VK_NULL_HANDLE) { 47037ec681f3Smrg ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]); 47047ec681f3Smrg uint64_t offset = pCounterBufferOffsets ? 47057ec681f3Smrg pCounterBufferOffsets[cb_idx] : 0; 470601e04c3fSmrg 47077ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { 47087ec681f3Smrg srm.MemoryAddress = anv_address_add(counter_buffer->address, 47097ec681f3Smrg offset); 47107ec681f3Smrg srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; 47117ec681f3Smrg } 47127ec681f3Smrg } 471301e04c3fSmrg } 471401e04c3fSmrg 47157ec681f3Smrg cmd_buffer->state.xfb_enabled = false; 47167ec681f3Smrg cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; 471701e04c3fSmrg} 471801e04c3fSmrg 471901e04c3fSmrgvoid 472001e04c3fSmrggenX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) 472101e04c3fSmrg{ 47227ec681f3Smrg struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; 47237ec681f3Smrg struct anv_compute_pipeline *pipeline = comp_state->pipeline; 472401e04c3fSmrg 47257ec681f3Smrg assert(pipeline->cs); 472601e04c3fSmrg 47277ec681f3Smrg genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); 472801e04c3fSmrg 
   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      /* Replay the pipeline's pre-baked state batch. */
      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   /* Descriptors must also be re-flushed on a pipeline change, not just when
    * the descriptor sets themselves changed.
    */
   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      flush_descriptor_sets(cmd_buffer,
                            &cmd_buffer->state.compute.base,
                            VK_SHADER_STAGE_COMPUTE_BIT,
                            &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      /* Pre-Xe-HP path: merge the binding-table/sampler pointers into the
       * pipeline's INTERFACE_DESCRIPTOR_DATA and load it via
       * MEDIA_INTERFACE_DESCRIPTOR_LOAD.  (Xe-HP+ passes the interface
       * descriptor inline in COMPUTE_WALKER instead.)
       */
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength        = size;
         mid.InterfaceDescriptorDataStartAddress   = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
      comp_state->push_data =
         anv_cmd_buffer_cs_push_constants(cmd_buffer);

#if GFX_VERx10 < 125
      /* Pre-Xe-HP: upload push constants through MEDIA_CURBE_LOAD.  (On
       * Xe-HP+ the walker's IndirectData fields carry them instead.)
       */
      if (comp_state->push_data.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
            curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

#if GFX_VER == 7

/* Check that the kernel's i915 command parser is at least
 * required_version; some indirect-dispatch registers are only writable
 * with a new enough parser.  Returns VK_ERROR_FEATURE_NOT_PRESENT when
 * the parser is too old.
 */
static VkResult
verify_cmd_parser(const struct anv_device *device,
                  int required_version,
                  const char *function)
{
   if (device->physical->cmd_parser_version < required_version) {
      return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
                       "cmd parser version %d is required for %s",
                       required_version, function);
   } else {
      return VK_SUCCESS;
   }
}

#endif

/* Record the vkCmdDispatchBase base workgroup ID into the compute push
 * constants, marking them dirty only when the value actually changes.
 */
static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }
}

/* vkCmdDispatch is just vkCmdDispatchBase with a zero base. */
void genX(CmdDispatch)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    x,
    uint32_t                                    y,
    uint32_t                                    z)
{
   genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
}

#if GFX_VERx10 >= 125

/* Xe-HP+ dispatch: emit a COMPUTE_WALKER with an inline interface
 * descriptor.  When indirect, the group counts come from the
 * GPGPU_DISPATCHDIM registers rather than the packet fields.
 */
static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const struct anv_shader_bin *cs_bin = pipeline->cs;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable        = indirect;
      cw.PredicateEnable                = predicate;
      /* simd_size is 8/16/32; the field encodes it as 0/1/2 */
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress       = comp_state->push_data.offset;
48847ec681f3Smrg cw.IndirectDataLength = comp_state->push_data.alloc_size; 48857ec681f3Smrg cw.LocalXMaximum = prog_data->local_size[0] - 1; 48867ec681f3Smrg cw.LocalYMaximum = prog_data->local_size[1] - 1; 48877ec681f3Smrg cw.LocalZMaximum = prog_data->local_size[2] - 1; 48887ec681f3Smrg cw.ThreadGroupIDXDimension = groupCountX; 48897ec681f3Smrg cw.ThreadGroupIDYDimension = groupCountY; 48907ec681f3Smrg cw.ThreadGroupIDZDimension = groupCountZ; 48917ec681f3Smrg cw.ExecutionMask = dispatch.right_mask; 48927ec681f3Smrg 48937ec681f3Smrg cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { 48947ec681f3Smrg .KernelStartPointer = cs_bin->kernel.offset, 48957ec681f3Smrg .SamplerStatePointer = 48967ec681f3Smrg cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, 48977ec681f3Smrg .BindingTablePointer = 48987ec681f3Smrg cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, 48997ec681f3Smrg .BindingTableEntryCount = 49007ec681f3Smrg 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), 49017ec681f3Smrg .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, 49027ec681f3Smrg .SharedLocalMemorySize = encode_slm_size(GFX_VER, 49037ec681f3Smrg prog_data->base.total_shared), 49047ec681f3Smrg .NumberOfBarriers = prog_data->uses_barrier, 49057ec681f3Smrg }; 49067ec681f3Smrg } 49077ec681f3Smrg} 49087ec681f3Smrg 49097ec681f3Smrg#else /* #if GFX_VERx10 >= 125 */ 49107ec681f3Smrg 49117ec681f3Smrgstatic inline void 49127ec681f3Smrgemit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer, 49137ec681f3Smrg const struct anv_compute_pipeline *pipeline, bool indirect, 49147ec681f3Smrg const struct brw_cs_prog_data *prog_data, 49157ec681f3Smrg uint32_t groupCountX, uint32_t groupCountY, 49167ec681f3Smrg uint32_t groupCountZ) 49177ec681f3Smrg{ 49187ec681f3Smrg bool predicate = (GFX_VER <= 7 && indirect) || 49197ec681f3Smrg cmd_buffer->state.conditional_render_enabled; 49207ec681f3Smrg 49217ec681f3Smrg const struct intel_device_info *devinfo = &pipeline->base.device->info; 
49227ec681f3Smrg const struct brw_cs_dispatch_info dispatch = 49237ec681f3Smrg brw_cs_get_dispatch_info(devinfo, prog_data, NULL); 49247ec681f3Smrg 49257ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { 49267ec681f3Smrg ggw.IndirectParameterEnable = indirect; 49277ec681f3Smrg ggw.PredicateEnable = predicate; 49287ec681f3Smrg ggw.SIMDSize = dispatch.simd_size / 16; 49297ec681f3Smrg ggw.ThreadDepthCounterMaximum = 0; 49307ec681f3Smrg ggw.ThreadHeightCounterMaximum = 0; 49317ec681f3Smrg ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; 49327ec681f3Smrg ggw.ThreadGroupIDXDimension = groupCountX; 49337ec681f3Smrg ggw.ThreadGroupIDYDimension = groupCountY; 49347ec681f3Smrg ggw.ThreadGroupIDZDimension = groupCountZ; 49357ec681f3Smrg ggw.RightExecutionMask = dispatch.right_mask; 49367ec681f3Smrg ggw.BottomExecutionMask = 0xffffffff; 49377ec681f3Smrg } 49387ec681f3Smrg 49397ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf); 49407ec681f3Smrg} 49417ec681f3Smrg 49427ec681f3Smrg#endif /* #if GFX_VERx10 >= 125 */ 49437ec681f3Smrg 49447ec681f3Smrgstatic inline void 49457ec681f3Smrgemit_cs_walker(struct anv_cmd_buffer *cmd_buffer, 49467ec681f3Smrg const struct anv_compute_pipeline *pipeline, bool indirect, 49477ec681f3Smrg const struct brw_cs_prog_data *prog_data, 49487ec681f3Smrg uint32_t groupCountX, uint32_t groupCountY, 49497ec681f3Smrg uint32_t groupCountZ) 49507ec681f3Smrg{ 49517ec681f3Smrg#if GFX_VERx10 >= 125 49527ec681f3Smrg emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, 49537ec681f3Smrg groupCountY, groupCountZ); 49547ec681f3Smrg#else 49557ec681f3Smrg emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, 49567ec681f3Smrg groupCountY, groupCountZ); 49577ec681f3Smrg#endif 49587ec681f3Smrg} 49597ec681f3Smrg 496001e04c3fSmrgvoid genX(CmdDispatchBase)( 496101e04c3fSmrg VkCommandBuffer commandBuffer, 496201e04c3fSmrg uint32_t baseGroupX, 496301e04c3fSmrg uint32_t baseGroupY, 
496401e04c3fSmrg uint32_t baseGroupZ, 496501e04c3fSmrg uint32_t groupCountX, 496601e04c3fSmrg uint32_t groupCountY, 496701e04c3fSmrg uint32_t groupCountZ) 496801e04c3fSmrg{ 496901e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 49707ec681f3Smrg struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; 497101e04c3fSmrg const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); 497201e04c3fSmrg 497301e04c3fSmrg anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX, 497401e04c3fSmrg baseGroupY, baseGroupZ); 497501e04c3fSmrg 497601e04c3fSmrg if (anv_batch_has_error(&cmd_buffer->batch)) 497701e04c3fSmrg return; 497801e04c3fSmrg 49797ec681f3Smrg anv_measure_snapshot(cmd_buffer, 49807ec681f3Smrg INTEL_SNAPSHOT_COMPUTE, 49817ec681f3Smrg "compute", 49827ec681f3Smrg groupCountX * groupCountY * groupCountZ * 49837ec681f3Smrg prog_data->local_size[0] * prog_data->local_size[1] * 49847ec681f3Smrg prog_data->local_size[2]); 49857ec681f3Smrg 498601e04c3fSmrg if (prog_data->uses_num_work_groups) { 498701e04c3fSmrg struct anv_state state = 498801e04c3fSmrg anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4); 498901e04c3fSmrg uint32_t *sizes = state.map; 499001e04c3fSmrg sizes[0] = groupCountX; 499101e04c3fSmrg sizes[1] = groupCountY; 499201e04c3fSmrg sizes[2] = groupCountZ; 499301e04c3fSmrg cmd_buffer->state.compute.num_workgroups = (struct anv_address) { 49949f464c52Smaya .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, 499501e04c3fSmrg .offset = state.offset, 499601e04c3fSmrg }; 49977ec681f3Smrg 49987ec681f3Smrg /* The num_workgroups buffer goes in the binding table */ 49997ec681f3Smrg cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; 500001e04c3fSmrg } 500101e04c3fSmrg 500201e04c3fSmrg genX(cmd_buffer_flush_compute_state)(cmd_buffer); 500301e04c3fSmrg 50049f464c52Smaya if (cmd_buffer->state.conditional_render_enabled) 50059f464c52Smaya genX(cmd_emit_conditional_render_predicate)(cmd_buffer); 

   emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
                  groupCountY, groupCountZ);
}

/* MMIO addresses of the GPGPU indirect-dispatch dimension registers. */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

#if GFX_VER == 7
   /* Linux 4.4 added command parser version 5 which allows the GPGPU
    * indirect dispatch registers to be written.
    */
   if (verify_cmd_parser(cmd_buffer->device, 5,
                         "vkCmdDispatchIndirect") != VK_SUCCESS)
      return;
#endif

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);

   if (prog_data->uses_num_work_groups) {
      /* The shader reads gl_NumWorkGroups straight from the app's indirect
       * buffer in this path.
       */
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Copy the three group counts into the dispatch dimension registers. */
   struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);

#if GFX_VER <= 7
   /* Gfx7 has no hardware zero-size skip for indirect dispatch, so build a
    * predicate that is set when any dimension is zero, then invert it so the
    * walker only runs for a non-degenerate dispatch.
    */

   /* predicate = (compute_dispatch_indirect_x_size == 0); */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate = !predicate; */
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_FALSE;
   }

#if GFX_VERx10 == 75
   /* Haswell also supports conditional rendering; AND it into the result. */
   if (cmd_buffer->state.conditional_render_enabled) {
      /* predicate &= !(conditional_rendering_predicate == 0); */
      mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
                   mi_reg32(ANV_PREDICATE_RESULT_REG));
      anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_AND;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      }
   }
#endif

#else /* GFX_VER > 7 */
   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
#endif

   emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
}

51177ec681f3Smrg#if GFX_VERx10 >= 125 51187ec681f3Smrgstatic void 51197ec681f3Smrgcalc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3]) 51207ec681f3Smrg{ 51217ec681f3Smrg unsigned total_shift = 0; 51227ec681f3Smrg memset(local_shift, 0, 3); 51237ec681f3Smrg 51247ec681f3Smrg bool progress; 51257ec681f3Smrg do { 51267ec681f3Smrg progress = false; 51277ec681f3Smrg for (unsigned i = 0; i < 3; i++) { 51287ec681f3Smrg assert(global[i] > 0); 51297ec681f3Smrg if ((1 << local_shift[i]) < global[i]) { 51307ec681f3Smrg progress = true; 51317ec681f3Smrg local_shift[i]++; 51327ec681f3Smrg total_shift++; 51337ec681f3Smrg } 51347ec681f3Smrg 51357ec681f3Smrg if (total_shift == 3) 51367ec681f3Smrg return; 51377ec681f3Smrg } 51387ec681f3Smrg } while(progress); 51397ec681f3Smrg 51407ec681f3Smrg /* Assign whatever's left to x */ 51417ec681f3Smrg local_shift[0] += 3 - total_shift; 51427ec681f3Smrg} 51437ec681f3Smrg 51447ec681f3Smrgstatic struct GFX_RT_SHADER_TABLE 51457ec681f3Smrgvk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region) 51467ec681f3Smrg{ 51477ec681f3Smrg return (struct GFX_RT_SHADER_TABLE) { 51487ec681f3Smrg .BaseAddress = anv_address_from_u64(region->deviceAddress), 51497ec681f3Smrg .Stride = region->stride, 51507ec681f3Smrg }; 51517ec681f3Smrg} 51527ec681f3Smrg 51537ec681f3Smrgstatic void 51547ec681f3Smrgcmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, 51557ec681f3Smrg const VkStridedDeviceAddressRegionKHR *raygen_sbt, 51567ec681f3Smrg const VkStridedDeviceAddressRegionKHR *miss_sbt, 51577ec681f3Smrg const VkStridedDeviceAddressRegionKHR *hit_sbt, 51587ec681f3Smrg const VkStridedDeviceAddressRegionKHR *callable_sbt, 51597ec681f3Smrg bool is_indirect, 51607ec681f3Smrg uint32_t launch_width, 51617ec681f3Smrg uint32_t launch_height, 51627ec681f3Smrg uint32_t launch_depth, 51637ec681f3Smrg uint64_t launch_size_addr) 51647ec681f3Smrg{ 51657ec681f3Smrg struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt; 51667ec681f3Smrg 
   struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!is_indirect &&
       (launch_width == 0 || launch_height == 0 || launch_depth == 0))
      return;

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Add these to the reloc list as they're internal buffers that don't
    * actually have relocs to pick them up manually.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->batch.alloc,
                         rt->scratch.bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS.  The push constants live
    * in the same allocation, at BRW_RT_PUSH_CONST_OFFSET.
    */
   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_PUSH_CONST_OFFSET +
                                         sizeof(struct anv_push_constants),
                                         64);

   struct GFX_RT_DISPATCH_GLOBALS rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler =
         anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
      /* Stack sizes are expressed in 64B units in the HW structure. */
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
      .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth = launch_width,
      .LaunchHeight = launch_height,
      .LaunchDepth = launch_depth,
      .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
   };
   GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);

   /* Push constants go after the RT_DISPATCH_GLOBALS */
   assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr = {
      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
      .offset = rtdg_state.offset,
   };

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (is_indirect) {
      /* Pick a local size that's probably ok.  We assume most TraceRays
       * calls will use a two-dimensional dispatch size.  Worst case, our
       * initial dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS
       *
       * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
       * moved into a genX version.
       */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
                   mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
                   mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
                   mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us MI_ALU math is 64-bit and mi_ushr32_imm
          * has the semantics of shifting the entire 64-bit value and
          * taking the bottom 32 so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                      mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                            local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
   } else {
      uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
      calc_local_trace_size(local_size_log2, launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because the addition
          * DIV_ROUND_UP performs on the numerator may overflow.  Cast to
          * uint64_t to avoid this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
      }
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable        = is_indirect;
      cw.PredicateEnable                = false;
      cw.SIMDSize                       = SIMD8;
      cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
      cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
      cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
      cw.ThreadGroupIDXDimension        = global_size[0];
      cw.ThreadGroupIDYDimension        = global_size[1];
      cw.ThreadGroupIDZDimension        = global_size[2];
      cw.ExecutionMask                  = 0xff;
      cw.EmitInlineParameter            = true;

      const gl_shader_stage s = MESA_SHADER_RAYGEN;
      struct anv_device *device = cmd_buffer->device;
      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
      /* The walker launches the raygen trampoline, which in turn invokes
       * the application's raygen shader via the BTD unit.
       */
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
      };

      /* Trampoline parameters are passed inline in the walker packet. */
      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr = raygen_sbt->deviceAddress,
         .is_indirect = is_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }
}

/* vkCmdTraceRaysKHR: direct ray dispatch with CPU-known launch size. */
void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    uint32_t                                    width,
    uint32_t                                    height,
    uint32_t                                    depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer_trace_rays(cmd_buffer,
                         pRaygenShaderBindingTable,
                         pMissShaderBindingTable,
                         pHitShaderBindingTable,
                         pCallableShaderBindingTable,
                         false /* is_indirect */,
                         width, height, depth,
                         0 /* launch_size_addr */);
}

/* vkCmdTraceRaysIndirectKHR: launch size is read from GPU memory at
 * indirectDeviceAddress (a VkTraceRaysIndirectCommandKHR).
 */
void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer_trace_rays(cmd_buffer,
                         pRaygenShaderBindingTable,
                         pMissShaderBindingTable,
                         pHitShaderBindingTable,
                         pCallableShaderBindingTable,
                         true /* is_indirect */,
                         0, 0, 0, /* width, height, depth, */
                         indirectDeviceAddress);
}
#endif /* GFX_VERx10 >= 125 */

/* Switch the PIPELINE_SELECT between 3D and GPGPU, applying the
 * per-generation workarounds required around the switch.
 */
static void
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                            uint32_t pipeline)
{
   UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;

   if (cmd_buffer->state.current_pipeline == pipeline)
      return;

#if GFX_VER >= 8 && GFX_VER < 10
   /* From the Broadwell PRM, Volume
2a: Instructions, PIPELINE_SELECT: 539501e04c3fSmrg * 539601e04c3fSmrg * Software must clear the COLOR_CALC_STATE Valid field in 539701e04c3fSmrg * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT 539801e04c3fSmrg * with Pipeline Select set to GPGPU. 539901e04c3fSmrg * 54007ec681f3Smrg * The internal hardware docs recommend the same workaround for Gfx9 540101e04c3fSmrg * hardware too. 540201e04c3fSmrg */ 540301e04c3fSmrg if (pipeline == GPGPU) 540401e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); 540501e04c3fSmrg#endif 540601e04c3fSmrg 54077ec681f3Smrg#if GFX_VER == 9 54089f464c52Smaya if (pipeline == _3D) { 54099f464c52Smaya /* There is a mid-object preemption workaround which requires you to 54109f464c52Smaya * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However, 54119f464c52Smaya * even without preemption, we have issues with geometry flickering when 54129f464c52Smaya * GPGPU and 3D are back-to-back and this seems to fix it. We don't 54139f464c52Smaya * really know why. 54149f464c52Smaya */ 54159f464c52Smaya anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { 54169f464c52Smaya vfe.MaximumNumberofThreads = 54177ec681f3Smrg devinfo->max_cs_threads * devinfo->subslice_total - 1; 54189f464c52Smaya vfe.NumberofURBEntries = 2; 54199f464c52Smaya vfe.URBEntryAllocationSize = 2; 54209f464c52Smaya } 54217ec681f3Smrg 54227ec681f3Smrg /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is 54237ec681f3Smrg * invalid. Set the compute pipeline to dirty to force a re-emit of the 54247ec681f3Smrg * pipeline in case we get back-to-back dispatch calls with the same 54257ec681f3Smrg * pipeline and a PIPELINE_SELECT in between. 
54267ec681f3Smrg */ 54277ec681f3Smrg cmd_buffer->state.compute.pipeline_dirty = true; 54289f464c52Smaya } 54299f464c52Smaya#endif 54309f464c52Smaya 543101e04c3fSmrg /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] 543201e04c3fSmrg * PIPELINE_SELECT [DevBWR+]": 543301e04c3fSmrg * 543401e04c3fSmrg * Project: DEVSNB+ 543501e04c3fSmrg * 543601e04c3fSmrg * Software must ensure all the write caches are flushed through a 543701e04c3fSmrg * stalling PIPE_CONTROL command followed by another PIPE_CONTROL 543801e04c3fSmrg * command to invalidate read only caches prior to programming 543901e04c3fSmrg * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. 544001e04c3fSmrg */ 544101e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 544201e04c3fSmrg pc.RenderTargetCacheFlushEnable = true; 544301e04c3fSmrg pc.DepthCacheFlushEnable = true; 54447ec681f3Smrg#if GFX_VER >= 12 54457ec681f3Smrg pc.HDCPipelineFlushEnable = true; 54467ec681f3Smrg#else 544701e04c3fSmrg pc.DCFlushEnable = true; 54487ec681f3Smrg#endif 544901e04c3fSmrg pc.PostSyncOperation = NoWrite; 545001e04c3fSmrg pc.CommandStreamerStallEnable = true; 54517ec681f3Smrg#if GFX_VER >= 12 54527ec681f3Smrg /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be 54537ec681f3Smrg * set with any PIPE_CONTROL with Depth Flush Enable bit set. 
54547ec681f3Smrg */ 54557ec681f3Smrg pc.DepthStallEnable = true; 54567ec681f3Smrg#endif 54577ec681f3Smrg anv_debug_dump_pc(pc); 545801e04c3fSmrg } 545901e04c3fSmrg 546001e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 546101e04c3fSmrg pc.TextureCacheInvalidationEnable = true; 546201e04c3fSmrg pc.ConstantCacheInvalidationEnable = true; 546301e04c3fSmrg pc.StateCacheInvalidationEnable = true; 546401e04c3fSmrg pc.InstructionCacheInvalidateEnable = true; 546501e04c3fSmrg pc.PostSyncOperation = NoWrite; 54667ec681f3Smrg anv_debug_dump_pc(pc); 546701e04c3fSmrg } 546801e04c3fSmrg 546901e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { 54707ec681f3Smrg#if GFX_VER >= 9 54717ec681f3Smrg ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3; 54727ec681f3Smrg ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; 547301e04c3fSmrg#endif 547401e04c3fSmrg ps.PipelineSelection = pipeline; 547501e04c3fSmrg } 547601e04c3fSmrg 54777ec681f3Smrg#if GFX_VER == 9 547801e04c3fSmrg if (devinfo->is_geminilake) { 547901e04c3fSmrg /* Project: DevGLK 548001e04c3fSmrg * 548101e04c3fSmrg * "This chicken bit works around a hardware issue with barrier logic 548201e04c3fSmrg * encountered when switching between GPGPU and 3D pipelines. To 548301e04c3fSmrg * workaround the issue, this mode bit should be set after a pipeline 548401e04c3fSmrg * is selected." 548501e04c3fSmrg */ 54867ec681f3Smrg anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) { 54877ec681f3Smrg scec1.GLKBarrierMode = pipeline == GPGPU ? 
GLK_BARRIER_MODE_GPGPU 54887ec681f3Smrg : GLK_BARRIER_MODE_3D_HULL; 54897ec681f3Smrg scec1.GLKBarrierModeMask = 1; 54907ec681f3Smrg } 549101e04c3fSmrg } 549201e04c3fSmrg#endif 549301e04c3fSmrg 549401e04c3fSmrg cmd_buffer->state.current_pipeline = pipeline; 549501e04c3fSmrg} 549601e04c3fSmrg 549701e04c3fSmrgvoid 549801e04c3fSmrggenX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer) 549901e04c3fSmrg{ 550001e04c3fSmrg genX(flush_pipeline_select)(cmd_buffer, _3D); 550101e04c3fSmrg} 550201e04c3fSmrg 550301e04c3fSmrgvoid 550401e04c3fSmrggenX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) 550501e04c3fSmrg{ 550601e04c3fSmrg genX(flush_pipeline_select)(cmd_buffer, GPGPU); 550701e04c3fSmrg} 550801e04c3fSmrg 550901e04c3fSmrgvoid 55107ec681f3SmrggenX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) 551101e04c3fSmrg{ 55127ec681f3Smrg if (GFX_VER >= 8) 551301e04c3fSmrg return; 551401e04c3fSmrg 551501e04c3fSmrg /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: 551601e04c3fSmrg * 551701e04c3fSmrg * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any 551801e04c3fSmrg * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 551901e04c3fSmrg * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first 552001e04c3fSmrg * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit 552101e04c3fSmrg * set), followed by a pipelined depth cache flush (PIPE_CONTROL with 552201e04c3fSmrg * Depth Flush Bit set, followed by another pipelined depth stall 552301e04c3fSmrg * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise 552401e04c3fSmrg * guarantee that the pipeline from WM onwards is already flushed (e.g., 552501e04c3fSmrg * via a preceding MI_FLUSH)." 
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
      anv_debug_dump_pc(pipe);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthCacheFlushEnable = true;
#if GFX_VER >= 12
      pipe.TileCacheFlushEnable = true;
#endif
      anv_debug_dump_pc(pipe);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
      anv_debug_dump_pc(pipe);
   }
}

/* Gfx12.0-only workaround: reprogram chicken registers whenever the bound
 * depth buffer format transitions to or from D16_UNORM (Wa_14010455700 and
 * Wa_1806527549).  The current mode is cached in state.depth_reg_mode so the
 * expensive flush + register writes only happen on an actual transition.
 * Compiled out (empty) on every other gen.
 */
void
genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
                                     const struct isl_surf *surf)
{
#if GFX_VERx10 == 120
   const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;

   /* Early-out when the registers already match the new format; only an
    * UNKNOWN mode (or a real format transition) falls through.
    */
   switch (cmd_buffer->state.depth_reg_mode) {
   case ANV_DEPTH_REG_MODE_HW_DEFAULT:
      if (!fmt_is_d16)
         return;
      break;
   case ANV_DEPTH_REG_MODE_D16:
      if (fmt_is_d16)
         return;
      break;
   case ANV_DEPTH_REG_MODE_UNKNOWN:
      break;
   }

   /* We'll change some CHICKEN registers depending on the depth surface
    * format.  Do a depth flush and stall so the pipeline is not using these
    * settings while we change the registers.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
                             ANV_PIPE_DEPTH_STALL_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "Workaround: Stop pipeline for 14010455700");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Wa_14010455700
    *
    * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
    * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
    */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
      reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
      reg.HIZPlaneOptimizationdisablebitMask = true;
   }

   /* Wa_1806527549
    *
    * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
    */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
      reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
      reg.HZDepthTestLEGEOptimizationDisableMask = true;
   }

   /* Remember which mode the registers are now programmed for. */
   cmd_buffer->state.depth_reg_mode =
      fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
#endif
}

/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
 *
 *    "The VF cache needs to be invalidated before binding and then using
 *    Vertex Buffers that overlap with any previously bound Vertex Buffer
 *    (at a 64B granularity) since the last invalidation.  A VF cache
 *    invalidate is performed by setting the "VF Cache Invalidation Enable"
 *    bit in PIPE_CONTROL."
 *
 * This is implemented by carefully tracking all vertex and index buffer
 * bindings and flushing if the cache ever ends up with a range in the cache
 * that would exceed 4 GiB.  This is implemented in three parts:
 *
 *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
 *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
 *       tracking code of the new binding.  If this new binding would cause
 *       the cache to have a too-large range on the next draw call, a pipeline
 *       stall and VF cache invalidate are added to pending_pipeline_bits.
 *
 *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
 *       empty whenever we emit a VF invalidate.
 *
 *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
 *       after every 3DPRIMITIVE and copies the bound range into the dirty
 *       range for each used buffer.  This has to be a separate step because
 *       we don't always re-bind all buffers and so 1. can't know which
 *       buffers are actually bound.
 */
void
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
                                               int vb_index,
                                               struct anv_address vb_address,
                                               uint32_t vb_size)
{
   /* Tracking is only needed on Gfx8/9 with softpin (fixed GPU addresses). */
   if (GFX_VER < 8 || GFX_VER > 9 ||
       !anv_use_softpin(cmd_buffer->device->physical))
      return;

   /* vb_index == -1 is the sentinel for the index buffer binding. */
   struct anv_vb_cache_range *bound, *dirty;
   if (vb_index == -1) {
      bound = &cmd_buffer->state.gfx.ib_bound_range;
      dirty = &cmd_buffer->state.gfx.ib_dirty_range;
   } else {
      assert(vb_index >= 0);
      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
      assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
      bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
   }

   /* An empty binding contributes nothing to the tracked range. */
   if (vb_size == 0) {
      bound->start = 0;
      bound->end = 0;
      return;
   }

   assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
   bound->start = intel_48b_address(anv_address_physical(vb_address));
   bound->end = bound->start + vb_size;
   assert(bound->end > bound->start); /* No overflow */

   /* Align everything to a cache line */
   bound->start &= ~(64ull - 1ull);
   bound->end = align_u64(bound->end, 64);

   /* Compute the dirty range */
   dirty->start = MIN2(dirty->start, bound->start);
   dirty->end = MAX2(dirty->end, bound->end);

   /* If our range is larger than 32 bits, we have to flush */
   assert(bound->end - bound->start <= (1ull << 32));
   if (dirty->end - dirty->start > (1ull << 32)) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                                "vb > 32b range");
   }
}

/* Part 3 of the VF-cache tracking scheme above: after a 3DPRIMITIVE, fold
 * the bound range of the index buffer (when access_type == RANDOM) and of
 * every vertex buffer referenced in the vb_used bitmask into the
 * corresponding dirty ranges.
 */
void
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
                                                    uint32_t access_type,
                                                    uint64_t vb_used)
{
   if (GFX_VER < 8 || GFX_VER > 9 ||
       !anv_use_softpin(cmd_buffer->device->physical))
      return;

   if (access_type == RANDOM) {
      /* We have an index buffer */
      struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
      struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;

      if (bound->end > bound->start) {
         dirty->start = MIN2(dirty->start, bound->start);
         dirty->end = MAX2(dirty->end, bound->end);
      }
   }

   /* Walk the set bits of vb_used; each bit is a vertex buffer slot. */
   uint64_t mask = vb_used;
   while (mask) {
      int i = u_bit_scan64(&mask);
      assert(i >= 0);
      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
      assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));

      struct anv_vb_cache_range *bound, *dirty;
      bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
      dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];

      if (bound->end > bound->start) {
         dirty->start = MIN2(dirty->start, bound->start);
         dirty->end = MAX2(dirty->end, bound->end);
      }
   }
}

/**
 * Update the pixel hashing modes that determine the balancing of PS threads
 * across subslices and slices.
 *
 * \param width Width bound of the rendering area (already scaled down if \p
 *              scale is greater than 1).
 * \param height Height bound of the rendering area (already scaled down if \p
 *               scale is greater than 1).
 * \param scale The number of framebuffer samples that could potentially be
 *              affected by an individual channel of the PS thread.  This is
 *              typically one for single-sampled rendering, but for operations
 *              like CCS resolves and fast clears a single PS invocation may
 *              update a huge number of pixels, in which case a finer
 *              balancing is desirable in order to maximally utilize the
 *              bandwidth available.  UINT_MAX can be used as shorthand for
 *              "finest hashing mode available".
 */
void
genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
                                   unsigned width, unsigned height,
                                   unsigned scale)
{
#if GFX_VER == 9
   const struct intel_device_info *devinfo = &cmd_buffer->device->info;
   const unsigned slice_hashing[] = {
      /* Because all Gfx9 platforms with more than one slice require
       * three-way subslice hashing, a single "normal" 16x16 slice hashing
       * block is guaranteed to suffer from substantial imbalance, with one
       * subslice receiving twice as much work as the other two in the
       * slice.
       *
       * The performance impact of that would be particularly severe when
       * three-way hashing is also in use for slice balancing (which is the
       * case for all Gfx9 GT4 platforms), because one of the slices
       * receives one every three 16x16 blocks in either direction, which
       * is roughly the periodicity of the underlying subslice imbalance
       * pattern ("roughly" because in reality the hardware's
       * implementation of three-way hashing doesn't do exact modulo 3
       * arithmetic, which somewhat decreases the magnitude of this effect
       * in practice).  This leads to a systematic subslice imbalance
       * within that slice regardless of the size of the primitive.  The
       * 32x32 hashing mode guarantees that the subslice imbalance within a
       * single slice hashing block is minimal, largely eliminating this
       * effect.
       */
      _32x32,
      /* Finest slice hashing mode available. */
      NORMAL
   };
   const unsigned subslice_hashing[] = {
      /* 16x16 would provide a slight cache locality benefit especially
       * visible in the sampler L1 cache efficiency of low-bandwidth
       * non-LLC platforms, but it comes at the cost of greater subslice
       * imbalance for primitives of dimensions approximately intermediate
       * between 16x4 and 16x16.
       */
      _16x4,
      /* Finest subslice hashing mode available. */
      _8x4
   };
   /* Dimensions of the smallest hashing block of a given hashing mode.  If
    * the rendering area is smaller than this there can't possibly be any
    * benefit from switching to this mode, so we optimize out the
    * transition.
    */
   const unsigned min_size[][2] = {
      { 16, 4 },
      { 8, 4 }
   };
   /* idx 0 = coarse (scale <= 1), idx 1 = fine (scale > 1) hashing. */
   const unsigned idx = scale > 1;

   if (cmd_buffer->state.current_hash_scale != scale &&
       (width > min_size[idx][0] || height > min_size[idx][1])) {
      /* GT_MODE may not be changed while the pipeline is busy. */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                                "change pixel hash mode");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
         gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
         gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
         gt.SubsliceHashing = subslice_hashing[idx];
         gt.SubsliceHashingMask = -1;
      }

      cmd_buffer->state.current_hash_scale = scale;
   }
#endif
}

/* Emit the depth/stencil buffer state (3DSTATE_DEPTH_BUFFER and friends via
 * isl_emit_depth_stencil_hiz_s) for the current subpass's depth/stencil
 * attachment, or NULL surfaces when the subpass has none.  Also programs HiZ
 * when the attachment's aux usage enables it, and applies the Gfx12 depth
 * format workarounds and Wa_1408224581 afterwards.
 */
static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   const struct anv_image *image = iview ? iview->image : NULL;

   /* FIXME: Width and Height are wrong */

   genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);

   uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
                                        device->isl_dev.ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   if (iview)
      info.view = &iview->planes[0].isl;

   if (image && (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      const uint32_t depth_plane =
         anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_surface *depth_surface =
         &image->planes[depth_plane].primary_surface;
      const struct anv_address depth_address =
         anv_image_address(image, &depth_surface->memory_range);

      info.depth_surf = &depth_surface->isl;

      info.depth_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.depth_offset / 4,
                              depth_address.bo, depth_address.offset);
      info.mocs =
         anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);

      const uint32_t ds =
         cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
      info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
      if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
         assert(isl_aux_usage_has_hiz(info.hiz_usage));

         const struct anv_surface *hiz_surface =
            &image->planes[depth_plane].aux_surface;
         const struct anv_address hiz_address =
            anv_image_address(image, &hiz_surface->memory_range);

         info.hiz_surf = &hiz_surface->isl;

         info.hiz_address =
            anv_batch_emit_reloc(&cmd_buffer->batch,
                                 dw + device->isl_dev.ds.hiz_offset / 4,
                                 hiz_address.bo, hiz_address.offset);

         info.depth_clear_value = ANV_HZ_FC_VAL;
      }
   }

   if (image && (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
      const uint32_t stencil_plane =
         anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_surface *stencil_surface =
         &image->planes[stencil_plane].primary_surface;
      const struct anv_address stencil_address =
         anv_image_address(image, &stencil_surface->memory_range);

      info.stencil_surf = &stencil_surface->isl;

      info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
      info.stencil_address =
         anv_batch_emit_reloc(&cmd_buffer->batch,
                              dw + device->isl_dev.ds.stencil_offset / 4,
                              stencil_address.bo, stencil_address.offset);
      info.mocs =
         anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
   }

   isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);

   if (info.depth_surf)
      genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);

   if (GFX_VER >= 12) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      /* Wa_1408224581
       *
       * Workaround: Gfx12LP Astep only An additional pipe control with
       * post-sync = store dword operation would be required.( w/a is to
       * have an additional pipe control after the stencil state whenever
       * the surface state bits of this state is changing).
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = cmd_buffer->device->workaround_address;
      }
   }
   cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
}

/**
 * This ANDs the view mask of the current subpass with the pending clear
 * views in the attachment to get the mask of views active in the subpass
 * that still need to be cleared.
 */
static inline uint32_t
get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,
                                 const struct anv_attachment_state *att_state)
{
   return cmd_state->subpass->view_mask & att_state->pending_clear_views;
}

/* True when the first layer (view 0) of the attachment should be cleared in
 * this subpass: always for non-multiview, otherwise only when view 0 is both
 * active in the subpass and still pending a clear.
 */
static inline bool
do_first_layer_clear(const struct anv_cmd_state *cmd_state,
                     const struct anv_attachment_state *att_state)
{
   if (!cmd_state->subpass->view_mask)
      return true;

   uint32_t pending_clear_mask =
      get_multiview_subpass_clear_mask(cmd_state, att_state);

   return pending_clear_mask & 1;
}

/* True when the current subpass is the last subpass of the render pass that
 * uses the given attachment.
 */
static inline bool
current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
                                       uint32_t att_idx)
{
   const uint32_t last_subpass_idx =
      cmd_state->pass->attachments[att_idx].last_subpass_idx;
   const struct anv_subpass *last_subpass =
      &cmd_state->pass->subpasses[last_subpass_idx];
   return last_subpass == cmd_state->subpass;
}

/**
 * Begin the given subpass: transition each used attachment to the layout
 * the subpass expects, execute any pending attachment clears (fast or
 * slow, multiview-aware), then allocate and fill the RENDER_SURFACE_STATE
 * objects used as render targets / input attachments for the subpass.
 */
static void
cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
                         uint32_t subpass_id)
{
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_render_pass *pass = cmd_state->pass;
   struct anv_subpass *subpass = &pass->subpasses[subpass_id];
   cmd_state->subpass = subpass;

   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  If the client asks for instancing, we need to use the
    * Instance Data Step Rate to ensure that we repeat the client's
    * per-instance data once for each view.  Since this bit is in
    * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
    * of each subpass.
    */
   if (GFX_VER == 7)
      cmd_buffer->state.gfx.vb_dirty |= ~0;

   /* It is possible to start a render pass with an old pipeline.  Because the
    * render pass and subpass index are both baked into the pipeline, this is
    * highly unlikely.  In order to do so, it requires that you have a render
    * pass with a single subpass and that you use that render pass twice
    * back-to-back and use the same pipeline at the start of the second render
    * pass as at the end of the first.  In order to avoid unpredictable issues
    * with this edge case, we just dirty the pipeline at the start of every
    * subpass.
    */
   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;

   /* Accumulate any subpass flushes that need to happen before the subpass */
   anv_add_pending_pipe_bits(cmd_buffer,
                             cmd_buffer->state.pass->subpass_flushes[subpass_id],
                             "begin subpass deps/attachments");

   VkRect2D render_area = cmd_buffer->state.render_area;
   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;

   bool is_multiview = subpass->view_mask != 0;

   /* Pass 1: layout transitions and pending clears for every attachment
    * used by this subpass.
    */
   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      assert(a < cmd_state->pass->attachment_count);
      struct anv_attachment_state *att_state = &cmd_state->attachments[a];

      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;

      VkImageLayout target_layout = subpass->attachments[i].layout;
      VkImageLayout target_stencil_layout =
         subpass->attachments[i].stencil_layout;

      /* A fast clear may only be treated as a full-surface clear when the
       * render area covers the whole LOD we render to.
       */
      uint32_t level = iview->planes[0].isl.base_level;
      uint32_t width = anv_minify(iview->image->vk.extent.width, level);
      uint32_t height = anv_minify(iview->image->vk.extent.height, level);
      bool full_surface_draw =
         render_area.offset.x == 0 && render_area.offset.y == 0 &&
         render_area.extent.width == width &&
         render_area.extent.height == height;

      /* For 3D images the "layers" are the depth slices of the LOD; for
       * array images they come from the view and framebuffer.
       */
      uint32_t base_layer, layer_count;
      if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
         base_layer = 0;
         layer_count = anv_minify(iview->image->vk.extent.depth, level);
      } else {
         base_layer = iview->planes[0].isl.base_array_layer;
         layer_count = fb->layers;
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
         bool will_full_fast_clear =
            (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) &&
            att_state->fast_clear && full_surface_draw;

         assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
         transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                 level, 1, base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 will_full_fast_clear);
         att_state->aux_usage =
            anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
                                    target_layout);
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
         bool will_full_fast_clear =
            (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
            att_state->fast_clear && full_surface_draw;

         transition_depth_buffer(cmd_buffer, image,
                                 base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 will_full_fast_clear);
         att_state->aux_usage =
            anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                                    VK_IMAGE_ASPECT_DEPTH_BIT,
                                    VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                                    target_layout);
      }

      if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
         bool will_full_fast_clear =
            (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
            att_state->fast_clear && full_surface_draw;

         transition_stencil_buffer(cmd_buffer, image,
                                   level, 1, base_layer, layer_count,
                                   att_state->current_stencil_layout,
                                   target_stencil_layout,
                                   will_full_fast_clear);
      }
      att_state->current_layout = target_layout;
      att_state->current_stencil_layout = target_stencil_layout;

      if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);

         /* Multi-planar images are not supported as attachments */
         assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
         assert(image->n_planes == 1);

         uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer;
         uint32_t clear_layer_count = fb->layers;

         if (att_state->fast_clear &&
             do_first_layer_clear(cmd_state, att_state)) {
            /* We only support fast-clears on the first layer */
            assert(level == 0 && base_layer == 0);

            union isl_color_value clear_color = {};
            anv_clear_color_from_att_state(&clear_color, att_state, iview);
            /* CCS fast-clear for single-sampled, MCS for multisampled. */
            if (iview->image->vk.samples == 1) {
               anv_image_ccs_op(cmd_buffer, image,
                                iview->planes[0].isl.format,
                                iview->planes[0].isl.swizzle,
                                VK_IMAGE_ASPECT_COLOR_BIT,
                                0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
                                &clear_color,
                                false);
            } else {
               anv_image_mcs_op(cmd_buffer, image,
                                iview->planes[0].isl.format,
                                iview->planes[0].isl.swizzle,
                                VK_IMAGE_ASPECT_COLOR_BIT,
                                0, 1, ISL_AUX_OP_FAST_CLEAR,
                                &clear_color,
                                false);
            }
            /* Layer 0 is now handled; remaining layers get a slow clear
             * below.
             */
            base_clear_layer++;
            clear_layer_count--;
            if (is_multiview)
               att_state->pending_clear_views &= ~1;

            if (isl_color_value_is_zero(clear_color,
                                        iview->planes[0].isl.format)) {
               /* This image has the auxiliary buffer enabled. We can mark the
                * subresource as not needing a resolve because the clear color
                * will match what's in every RENDER_SURFACE_STATE object when
                * it's being used for sampling.
                */
               set_image_fast_clear_state(cmd_buffer, iview->image,
                                          VK_IMAGE_ASPECT_COLOR_BIT,
                                          ANV_FAST_CLEAR_DEFAULT_VALUE);
            } else {
               set_image_fast_clear_state(cmd_buffer, iview->image,
                                          VK_IMAGE_ASPECT_COLOR_BIT,
                                          ANV_FAST_CLEAR_ANY);
            }
         }

         /* From the VkFramebufferCreateInfo spec:
          *
          *    "If the render pass uses multiview, then layers must be one
          *    and each attachment requires a number of layers that is greater
          *    than the maximum bit index set in the view mask in the
          *    subpasses in which it is used."
          *
          * So if multiview is active we ignore the number of layers in the
          * framebuffer and instead we honor the view mask from the subpass.
          */
         if (is_multiview) {
            assert(image->n_planes == 1);
            uint32_t pending_clear_mask =
               get_multiview_subpass_clear_mask(cmd_state, att_state);

            u_foreach_bit(layer_idx, pending_clear_mask) {
               uint32_t layer =
                  iview->planes[0].isl.base_array_layer + layer_idx;

               anv_image_clear_color(cmd_buffer, image,
                                     VK_IMAGE_ASPECT_COLOR_BIT,
                                     att_state->aux_usage,
                                     iview->planes[0].isl.format,
                                     iview->planes[0].isl.swizzle,
                                     level, layer, 1,
                                     render_area,
                                     vk_to_isl_color(att_state->clear_value.color));
            }

            att_state->pending_clear_views &= ~pending_clear_mask;
         } else if (clear_layer_count > 0) {
            assert(image->n_planes == 1);
            anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                  att_state->aux_usage,
                                  iview->planes[0].isl.format,
                                  iview->planes[0].isl.swizzle,
                                  level, base_clear_layer, clear_layer_count,
                                  render_area,
                                  vk_to_isl_color(att_state->clear_value.color));
         }
      } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                                     VK_IMAGE_ASPECT_STENCIL_BIT)) {
         if (att_state->fast_clear &&
             (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
            /* We currently only support HiZ for single-LOD images */
            assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage));
            assert(iview->planes[0].isl.base_level == 0);
            assert(iview->planes[0].isl.levels == 1);
         }

         if (is_multiview) {
            uint32_t pending_clear_mask =
               get_multiview_subpass_clear_mask(cmd_state, att_state);

            u_foreach_bit(layer_idx, pending_clear_mask) {
               uint32_t layer =
                  iview->planes[0].isl.base_array_layer + layer_idx;

               if (att_state->fast_clear) {
                  anv_image_hiz_clear(cmd_buffer, image,
                                      att_state->pending_clear_aspects,
                                      level, layer, 1, render_area,
                                      att_state->clear_value.depthStencil.stencil);
               } else {
                  anv_image_clear_depth_stencil(cmd_buffer, image,
                                                att_state->pending_clear_aspects,
                                                att_state->aux_usage,
                                                level, layer, 1, render_area,
                                                att_state->clear_value.depthStencil.depth,
                                                att_state->clear_value.depthStencil.stencil);
               }
            }

            att_state->pending_clear_views &= ~pending_clear_mask;
         } else {
            if (att_state->fast_clear) {
               anv_image_hiz_clear(cmd_buffer, image,
                                   att_state->pending_clear_aspects,
                                   level, base_layer, layer_count,
                                   render_area,
                                   att_state->clear_value.depthStencil.stencil);
            } else {
               anv_image_clear_depth_stencil(cmd_buffer, image,
                                             att_state->pending_clear_aspects,
                                             att_state->aux_usage,
                                             level, base_layer, layer_count,
                                             render_area,
                                             att_state->clear_value.depthStencil.depth,
                                             att_state->clear_value.depthStencil.stencil);
            }
         }
      } else {
         assert(att_state->pending_clear_aspects == 0);
      }

      /* If multiview is enabled, then we are only done clearing when we no
       * longer have pending layers to clear, or when we have processed the
       * last subpass that uses this attachment.
       */
      if (!is_multiview ||
          att_state->pending_clear_views == 0 ||
          current_subpass_is_last_for_attachment(cmd_state, a)) {
         att_state->pending_clear_aspects = 0;
      }

      att_state->pending_load_aspects = 0;
   }

   /* We've transitioned all our images possibly fast clearing them.  Now we
    * can fill out the surface states that we will use as render targets
    * during actual subpass rendering.
    */
   VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,
                                                            pass, subpass);
   if (result != VK_SUCCESS)
      return;

   isl_null_fill_state(&cmd_buffer->device->isl_dev,
                       cmd_state->null_surface_state.map,
                       .size = isl_extent3d(fb->width, fb->height, fb->layers));

   /* Pass 2: fill surface states for color render targets and input
    * attachments.
    */
   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t att = subpass->attachments[i].attachment;
      if (att == VK_ATTACHMENT_UNUSED)
         continue;

      assert(att < cmd_state->pass->attachment_count);
      struct anv_render_pass_attachment *pass_att = &pass->attachments[att];
      struct anv_attachment_state *att_state = &cmd_state->attachments[att];
      struct anv_image_view *iview = att_state->image_view;

      if (!vk_format_is_color(pass_att->format))
         continue;

      const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
      assert(util_bitcount(att_usage) == 1);

      struct anv_surface_state *surface_state;
      isl_surf_usage_flags_t isl_surf_usage;
      enum isl_aux_usage isl_aux_usage;
      if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
         surface_state = &att_state->color;
         isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
         isl_aux_usage = att_state->aux_usage;
      } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
         surface_state = &att_state->input;
         isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;
         isl_aux_usage =
            anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
                                    att_state->current_layout);
      } else {
         continue;
      }

      /* We had better have a surface state when we get here */
      assert(surface_state->state.map);

      union isl_color_value clear_color = { .u32 = { 0, } };
      if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&
          att_state->fast_clear)
         anv_clear_color_from_att_state(&clear_color, att_state, iview);

      anv_image_fill_surface_state(cmd_buffer->device,
                                   iview->image,
                                   VK_IMAGE_ASPECT_COLOR_BIT,
                                   &iview->planes[0].isl,
                                   isl_surf_usage,
                                   isl_aux_usage,
                                   &clear_color,
                                   0,
                                   surface_state,
                                   NULL);

      add_surface_state_relocs(cmd_buffer, *surface_state);

      /* Pre-gfx10 hardware reads the fast-clear color out of the surface
       * state, so for LOAD_OP_LOAD we copy the clear dwords from the image
       * into the freshly-built surface state.
       */
      if (GFX_VER < 10 &&
          pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&
          iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
          iview->planes[0].isl.base_level == 0 &&
          iview->planes[0].isl.base_array_layer == 0) {
         genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,
                                      iview->image,
                                      VK_IMAGE_ASPECT_COLOR_BIT,
                                      false /* copy to ss */);
      }
   }

#if GFX_VER >= 11
   /* The PIPE_CONTROL command description says:
    *
    *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
    *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
    *     Target Cache Flush by
 enabling this bit. When render target flush
    *     is set due to new association of BTI, PS Scoreboard Stall bit must
    *     be set in this packet."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                             "change RT");
#endif

   cmd_buffer_emit_depth_stencil(cmd_buffer);
}

/**
 * Map a Vulkan resolve mode onto the blorp filter that implements it.
 * Modes with no blorp equivalent (including VK_RESOLVE_MODE_NONE) map to
 * BLORP_FILTER_NONE.
 */
static enum blorp_filter
vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
{
   switch (vk_mode) {
   case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
      return BLORP_FILTER_SAMPLE_0;
   case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
      return BLORP_FILTER_AVERAGE;
   case VK_RESOLVE_MODE_MIN_BIT_KHR:
      return BLORP_FILTER_MIN_SAMPLE;
   case VK_RESOLVE_MODE_MAX_BIT_KHR:
      return BLORP_FILTER_MAX_SAMPLE;
   default:
      return BLORP_FILTER_NONE;
   }
}

static void
cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_subpass *subpass = cmd_state->subpass;
   uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;

   /* We are done with the previous subpass and all rendering directly to that
    * subpass is now complete.  Zero out all the surface states so we don't
    * accidentally use them between now and the next subpass.
63607ec681f3Smrg */ 63617ec681f3Smrg for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) { 63627ec681f3Smrg memset(&cmd_state->attachments[i].color, 0, 63637ec681f3Smrg sizeof(cmd_state->attachments[i].color)); 63647ec681f3Smrg memset(&cmd_state->attachments[i].input, 0, 63657ec681f3Smrg sizeof(cmd_state->attachments[i].input)); 63667ec681f3Smrg } 63677ec681f3Smrg cmd_state->null_surface_state = ANV_STATE_NULL; 63687ec681f3Smrg cmd_state->attachment_states = ANV_STATE_NULL; 63697ec681f3Smrg 63707ec681f3Smrg for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 63717ec681f3Smrg const uint32_t a = subpass->attachments[i].attachment; 63727ec681f3Smrg if (a == VK_ATTACHMENT_UNUSED) 63737ec681f3Smrg continue; 63747ec681f3Smrg 63757ec681f3Smrg assert(a < cmd_state->pass->attachment_count); 63767ec681f3Smrg struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 63777ec681f3Smrg struct anv_image_view *iview = att_state->image_view; 63787ec681f3Smrg 63797ec681f3Smrg assert(util_bitcount(subpass->attachments[i].usage) == 1); 63807ec681f3Smrg if (subpass->attachments[i].usage == 63817ec681f3Smrg VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { 63827ec681f3Smrg /* We assume that if we're ending a subpass, we did do some rendering 63837ec681f3Smrg * so we may end up with compressed data. 63847ec681f3Smrg */ 63857ec681f3Smrg genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, 63867ec681f3Smrg VK_IMAGE_ASPECT_COLOR_BIT, 63877ec681f3Smrg att_state->aux_usage, 63887ec681f3Smrg iview->planes[0].isl.base_level, 63897ec681f3Smrg iview->planes[0].isl.base_array_layer, 63907ec681f3Smrg fb->layers); 63917ec681f3Smrg } else if (subpass->attachments[i].usage == 63927ec681f3Smrg VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { 63937ec681f3Smrg /* We may be writing depth or stencil so we need to mark the surface. 
63947ec681f3Smrg * Unfortunately, there's no way to know at this point whether the 63957ec681f3Smrg * depth or stencil tests used will actually write to the surface. 63967ec681f3Smrg * 63977ec681f3Smrg * Even though stencil may be plane 1, it always shares a base_level 63987ec681f3Smrg * with depth. 63997ec681f3Smrg */ 64007ec681f3Smrg const struct isl_view *ds_view = &iview->planes[0].isl; 64017ec681f3Smrg if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 64027ec681f3Smrg genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, 64037ec681f3Smrg VK_IMAGE_ASPECT_DEPTH_BIT, 64047ec681f3Smrg att_state->aux_usage, 64057ec681f3Smrg ds_view->base_level, 64067ec681f3Smrg ds_view->base_array_layer, 64077ec681f3Smrg fb->layers); 64087ec681f3Smrg } 64097ec681f3Smrg if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 64107ec681f3Smrg /* Even though stencil may be plane 1, it always shares a 64117ec681f3Smrg * base_level with depth. 64127ec681f3Smrg */ 64137ec681f3Smrg genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, 64147ec681f3Smrg VK_IMAGE_ASPECT_STENCIL_BIT, 64157ec681f3Smrg ISL_AUX_USAGE_NONE, 64167ec681f3Smrg ds_view->base_level, 64177ec681f3Smrg ds_view->base_array_layer, 64187ec681f3Smrg fb->layers); 64197ec681f3Smrg } 64207ec681f3Smrg } 64217ec681f3Smrg } 64227ec681f3Smrg 64239f464c52Smaya if (subpass->has_color_resolve) { 64249f464c52Smaya /* We are about to do some MSAA resolves. We need to flush so that the 64259f464c52Smaya * result of writes to the MSAA color attachments show up in the sampler 64269f464c52Smaya * when we blit to the single-sampled resolve target. 
64279f464c52Smaya */ 64287ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 64297ec681f3Smrg ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | 64307ec681f3Smrg ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, 64317ec681f3Smrg "MSAA resolve"); 64329f464c52Smaya 64339f464c52Smaya for (uint32_t i = 0; i < subpass->color_count; ++i) { 64349f464c52Smaya uint32_t src_att = subpass->color_attachments[i].attachment; 64359f464c52Smaya uint32_t dst_att = subpass->resolve_attachments[i].attachment; 64369f464c52Smaya 64379f464c52Smaya if (dst_att == VK_ATTACHMENT_UNUSED) 64389f464c52Smaya continue; 64399f464c52Smaya 64409f464c52Smaya assert(src_att < cmd_buffer->state.pass->attachment_count); 64419f464c52Smaya assert(dst_att < cmd_buffer->state.pass->attachment_count); 64429f464c52Smaya 64439f464c52Smaya if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) { 64449f464c52Smaya /* From the Vulkan 1.0 spec: 64459f464c52Smaya * 64469f464c52Smaya * If the first use of an attachment in a render pass is as a 64479f464c52Smaya * resolve attachment, then the loadOp is effectively ignored 64489f464c52Smaya * as the resolve is guaranteed to overwrite all pixels in the 64499f464c52Smaya * render area. 
64509f464c52Smaya */ 64519f464c52Smaya cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; 64529f464c52Smaya } 64539f464c52Smaya 64547ec681f3Smrg struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; 64557ec681f3Smrg struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; 64569f464c52Smaya 64579f464c52Smaya const VkRect2D render_area = cmd_buffer->state.render_area; 64589f464c52Smaya 64599f464c52Smaya enum isl_aux_usage src_aux_usage = 64609f464c52Smaya cmd_buffer->state.attachments[src_att].aux_usage; 64619f464c52Smaya enum isl_aux_usage dst_aux_usage = 64629f464c52Smaya cmd_buffer->state.attachments[dst_att].aux_usage; 64639f464c52Smaya 64647ec681f3Smrg assert(src_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT && 64657ec681f3Smrg dst_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); 64669f464c52Smaya 64679f464c52Smaya anv_image_msaa_resolve(cmd_buffer, 64689f464c52Smaya src_iview->image, src_aux_usage, 64699f464c52Smaya src_iview->planes[0].isl.base_level, 64709f464c52Smaya src_iview->planes[0].isl.base_array_layer, 64719f464c52Smaya dst_iview->image, dst_aux_usage, 64729f464c52Smaya dst_iview->planes[0].isl.base_level, 64739f464c52Smaya dst_iview->planes[0].isl.base_array_layer, 64749f464c52Smaya VK_IMAGE_ASPECT_COLOR_BIT, 64759f464c52Smaya render_area.offset.x, render_area.offset.y, 64769f464c52Smaya render_area.offset.x, render_area.offset.y, 64779f464c52Smaya render_area.extent.width, 64789f464c52Smaya render_area.extent.height, 64799f464c52Smaya fb->layers, BLORP_FILTER_NONE); 64809f464c52Smaya } 64819f464c52Smaya } 64829f464c52Smaya 64839f464c52Smaya if (subpass->ds_resolve_attachment) { 64849f464c52Smaya /* We are about to do some MSAA resolves. We need to flush so that the 64859f464c52Smaya * result of writes to the MSAA depth attachments show up in the sampler 64869f464c52Smaya * when we blit to the single-sampled resolve target. 
64879f464c52Smaya */ 64887ec681f3Smrg anv_add_pending_pipe_bits(cmd_buffer, 64897ec681f3Smrg ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | 64907ec681f3Smrg ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, 64917ec681f3Smrg "MSAA resolve"); 64929f464c52Smaya 64939f464c52Smaya uint32_t src_att = subpass->depth_stencil_attachment->attachment; 64949f464c52Smaya uint32_t dst_att = subpass->ds_resolve_attachment->attachment; 64959f464c52Smaya 64969f464c52Smaya assert(src_att < cmd_buffer->state.pass->attachment_count); 64979f464c52Smaya assert(dst_att < cmd_buffer->state.pass->attachment_count); 64989f464c52Smaya 64999f464c52Smaya if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) { 65009f464c52Smaya /* From the Vulkan 1.0 spec: 65019f464c52Smaya * 65029f464c52Smaya * If the first use of an attachment in a render pass is as a 65039f464c52Smaya * resolve attachment, then the loadOp is effectively ignored 65049f464c52Smaya * as the resolve is guaranteed to overwrite all pixels in the 65059f464c52Smaya * render area. 65069f464c52Smaya */ 65079f464c52Smaya cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0; 65089f464c52Smaya } 65099f464c52Smaya 65107ec681f3Smrg struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view; 65117ec681f3Smrg struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view; 65129f464c52Smaya 65139f464c52Smaya const VkRect2D render_area = cmd_buffer->state.render_area; 65149f464c52Smaya 65157ec681f3Smrg struct anv_attachment_state *src_state = 65167ec681f3Smrg &cmd_state->attachments[src_att]; 65177ec681f3Smrg struct anv_attachment_state *dst_state = 65187ec681f3Smrg &cmd_state->attachments[dst_att]; 65199f464c52Smaya 65207ec681f3Smrg if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 65217ec681f3Smrg subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { 65229f464c52Smaya 65239f464c52Smaya /* MSAA resolves sample from the source attachment. 
Transition the 65249f464c52Smaya * depth attachment first to get rid of any HiZ that we may not be 65259f464c52Smaya * able to handle. 65269f464c52Smaya */ 65279f464c52Smaya transition_depth_buffer(cmd_buffer, src_iview->image, 65287ec681f3Smrg src_iview->planes[0].isl.base_array_layer, 65297ec681f3Smrg fb->layers, 65309f464c52Smaya src_state->current_layout, 65317ec681f3Smrg VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 65327ec681f3Smrg false /* will_full_fast_clear */); 65339f464c52Smaya src_state->aux_usage = 65349f464c52Smaya anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image, 65359f464c52Smaya VK_IMAGE_ASPECT_DEPTH_BIT, 65367ec681f3Smrg VK_IMAGE_USAGE_TRANSFER_SRC_BIT, 65377ec681f3Smrg VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); 65387ec681f3Smrg src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; 65399f464c52Smaya 65409f464c52Smaya /* MSAA resolves write to the resolve attachment as if it were any 65419f464c52Smaya * other transfer op. Transition the resolve attachment accordingly. 65429f464c52Smaya */ 65439f464c52Smaya VkImageLayout dst_initial_layout = dst_state->current_layout; 65449f464c52Smaya 65459f464c52Smaya /* If our render area is the entire size of the image, we're going to 65469f464c52Smaya * blow it all away so we can claim the initial layout is UNDEFINED 65479f464c52Smaya * and we'll get a HiZ ambiguate instead of a resolve. 
65489f464c52Smaya */ 65497ec681f3Smrg if (dst_iview->image->vk.image_type != VK_IMAGE_TYPE_3D && 65509f464c52Smaya render_area.offset.x == 0 && render_area.offset.y == 0 && 65517ec681f3Smrg render_area.extent.width == dst_iview->vk.extent.width && 65527ec681f3Smrg render_area.extent.height == dst_iview->vk.extent.height) 65539f464c52Smaya dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED; 65549f464c52Smaya 65559f464c52Smaya transition_depth_buffer(cmd_buffer, dst_iview->image, 65567ec681f3Smrg dst_iview->planes[0].isl.base_array_layer, 65577ec681f3Smrg fb->layers, 65589f464c52Smaya dst_initial_layout, 65597ec681f3Smrg VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 65607ec681f3Smrg false /* will_full_fast_clear */); 65619f464c52Smaya dst_state->aux_usage = 65629f464c52Smaya anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image, 65639f464c52Smaya VK_IMAGE_ASPECT_DEPTH_BIT, 65647ec681f3Smrg VK_IMAGE_USAGE_TRANSFER_DST_BIT, 65659f464c52Smaya VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); 65669f464c52Smaya dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; 65679f464c52Smaya 65689f464c52Smaya enum blorp_filter filter = 65699f464c52Smaya vk_to_blorp_resolve_mode(subpass->depth_resolve_mode); 65709f464c52Smaya 65719f464c52Smaya anv_image_msaa_resolve(cmd_buffer, 65729f464c52Smaya src_iview->image, src_state->aux_usage, 65739f464c52Smaya src_iview->planes[0].isl.base_level, 65749f464c52Smaya src_iview->planes[0].isl.base_array_layer, 65759f464c52Smaya dst_iview->image, dst_state->aux_usage, 65769f464c52Smaya dst_iview->planes[0].isl.base_level, 65779f464c52Smaya dst_iview->planes[0].isl.base_array_layer, 65789f464c52Smaya VK_IMAGE_ASPECT_DEPTH_BIT, 65799f464c52Smaya render_area.offset.x, render_area.offset.y, 65809f464c52Smaya render_area.offset.x, render_area.offset.y, 65819f464c52Smaya render_area.extent.width, 65829f464c52Smaya render_area.extent.height, 65839f464c52Smaya fb->layers, filter); 65849f464c52Smaya } 65859f464c52Smaya 65867ec681f3Smrg if 
((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 65879f464c52Smaya subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { 65889f464c52Smaya 65897ec681f3Smrg src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; 65907ec681f3Smrg dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; 65917ec681f3Smrg 65929f464c52Smaya enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE; 65937ec681f3Smrg const uint32_t plane = 65947ec681f3Smrg anv_image_aspect_to_plane(dst_iview->image, VK_IMAGE_ASPECT_STENCIL_BIT); 65957ec681f3Smrg enum isl_aux_usage dst_aux_usage = 65967ec681f3Smrg dst_iview->image->planes[plane].aux_usage; 65979f464c52Smaya 65989f464c52Smaya enum blorp_filter filter = 65999f464c52Smaya vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode); 66009f464c52Smaya 66019f464c52Smaya anv_image_msaa_resolve(cmd_buffer, 66029f464c52Smaya src_iview->image, src_aux_usage, 66039f464c52Smaya src_iview->planes[0].isl.base_level, 66049f464c52Smaya src_iview->planes[0].isl.base_array_layer, 66059f464c52Smaya dst_iview->image, dst_aux_usage, 66069f464c52Smaya dst_iview->planes[0].isl.base_level, 66079f464c52Smaya dst_iview->planes[0].isl.base_array_layer, 66089f464c52Smaya VK_IMAGE_ASPECT_STENCIL_BIT, 66099f464c52Smaya render_area.offset.x, render_area.offset.y, 66109f464c52Smaya render_area.offset.x, render_area.offset.y, 66119f464c52Smaya render_area.extent.width, 66129f464c52Smaya render_area.extent.height, 66139f464c52Smaya fb->layers, filter); 66149f464c52Smaya } 66159f464c52Smaya } 661601e04c3fSmrg 66177ec681f3Smrg#if GFX_VER == 7 66187ec681f3Smrg /* On gfx7, we have to store a texturable version of the stencil buffer in 66197ec681f3Smrg * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and 66207ec681f3Smrg * forth at strategic points. 
Stencil writes are only allowed in following 66217ec681f3Smrg * layouts: 66227ec681f3Smrg * 66237ec681f3Smrg * - VK_IMAGE_LAYOUT_GENERAL 66247ec681f3Smrg * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL 66257ec681f3Smrg * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL 66267ec681f3Smrg * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL 66277ec681f3Smrg * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR 66287ec681f3Smrg * 66297ec681f3Smrg * For general, we have no nice opportunity to transition so we do the copy 66307ec681f3Smrg * to the shadow unconditionally at the end of the subpass. For transfer 66317ec681f3Smrg * destinations, we can update it as part of the transfer op. For the other 66327ec681f3Smrg * layouts, we delay the copy until a transition into some other layout. 66337ec681f3Smrg */ 66347ec681f3Smrg if (subpass->depth_stencil_attachment) { 66357ec681f3Smrg uint32_t a = subpass->depth_stencil_attachment->attachment; 66367ec681f3Smrg assert(a != VK_ATTACHMENT_UNUSED); 66377ec681f3Smrg 66387ec681f3Smrg struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 66397ec681f3Smrg struct anv_image_view *iview = cmd_state->attachments[a].image_view;; 66407ec681f3Smrg const struct anv_image *image = iview->image; 66417ec681f3Smrg 66427ec681f3Smrg if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 66437ec681f3Smrg const uint32_t plane = 66447ec681f3Smrg anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); 66457ec681f3Smrg 66467ec681f3Smrg if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && 66477ec681f3Smrg att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) { 66487ec681f3Smrg assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT); 66497ec681f3Smrg anv_image_copy_to_shadow(cmd_buffer, image, 66507ec681f3Smrg VK_IMAGE_ASPECT_STENCIL_BIT, 66517ec681f3Smrg iview->planes[plane].isl.base_level, 1, 66527ec681f3Smrg iview->planes[plane].isl.base_array_layer, 66537ec681f3Smrg fb->layers); 66547ec681f3Smrg } 
66557ec681f3Smrg } 66567ec681f3Smrg } 66577ec681f3Smrg#endif /* GFX_VER == 7 */ 66587ec681f3Smrg 665901e04c3fSmrg for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 666001e04c3fSmrg const uint32_t a = subpass->attachments[i].attachment; 666101e04c3fSmrg if (a == VK_ATTACHMENT_UNUSED) 666201e04c3fSmrg continue; 666301e04c3fSmrg 666401e04c3fSmrg if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id) 666501e04c3fSmrg continue; 666601e04c3fSmrg 666701e04c3fSmrg assert(a < cmd_state->pass->attachment_count); 666801e04c3fSmrg struct anv_attachment_state *att_state = &cmd_state->attachments[a]; 66697ec681f3Smrg struct anv_image_view *iview = cmd_state->attachments[a].image_view; 667001e04c3fSmrg const struct anv_image *image = iview->image; 667101e04c3fSmrg 667201e04c3fSmrg /* Transition the image into the final layout for this render pass */ 667301e04c3fSmrg VkImageLayout target_layout = 667401e04c3fSmrg cmd_state->pass->attachments[a].final_layout; 66757ec681f3Smrg VkImageLayout target_stencil_layout = 66767ec681f3Smrg cmd_state->pass->attachments[a].stencil_final_layout; 66777ec681f3Smrg 66787ec681f3Smrg uint32_t base_layer, layer_count; 66797ec681f3Smrg if (image->vk.image_type == VK_IMAGE_TYPE_3D) { 66807ec681f3Smrg base_layer = 0; 66817ec681f3Smrg layer_count = anv_minify(iview->image->vk.extent.depth, 66827ec681f3Smrg iview->planes[0].isl.base_level); 66837ec681f3Smrg } else { 66847ec681f3Smrg base_layer = iview->planes[0].isl.base_array_layer; 66857ec681f3Smrg layer_count = fb->layers; 66867ec681f3Smrg } 668701e04c3fSmrg 66887ec681f3Smrg if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { 66897ec681f3Smrg assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); 669001e04c3fSmrg transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, 669101e04c3fSmrg iview->planes[0].isl.base_level, 1, 669201e04c3fSmrg base_layer, layer_count, 66937ec681f3Smrg att_state->current_layout, target_layout, 66947ec681f3Smrg 
VK_QUEUE_FAMILY_IGNORED, 66957ec681f3Smrg VK_QUEUE_FAMILY_IGNORED, 66967ec681f3Smrg false /* will_full_fast_clear */); 66977ec681f3Smrg } 66987ec681f3Smrg 66997ec681f3Smrg if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { 670001e04c3fSmrg transition_depth_buffer(cmd_buffer, image, 67017ec681f3Smrg base_layer, layer_count, 67027ec681f3Smrg att_state->current_layout, target_layout, 67037ec681f3Smrg false /* will_full_fast_clear */); 67047ec681f3Smrg } 67057ec681f3Smrg 67067ec681f3Smrg if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 67077ec681f3Smrg transition_stencil_buffer(cmd_buffer, image, 67087ec681f3Smrg iview->planes[0].isl.base_level, 1, 67097ec681f3Smrg base_layer, layer_count, 67107ec681f3Smrg att_state->current_stencil_layout, 67117ec681f3Smrg target_stencil_layout, 67127ec681f3Smrg false /* will_full_fast_clear */); 671301e04c3fSmrg } 671401e04c3fSmrg } 671501e04c3fSmrg 671601e04c3fSmrg /* Accumulate any subpass flushes that need to happen after the subpass. 671701e04c3fSmrg * Yes, they do get accumulated twice in the NextSubpass case but since 671801e04c3fSmrg * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up 671901e04c3fSmrg * ORing the bits in twice so it's harmless. 
 */
   anv_add_pending_pipe_bits(cmd_buffer,
                             cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
                             "end subpass deps/attachments");
}

/* vkCmdBeginRenderPass2 entry point: records the framebuffer, render pass
 * and render area into the command buffer state, sets up per-attachment
 * state, switches the hardware pipeline to 3D, and begins subpass 0.
 * Render passes are rejected on non-render queues (batch put in error state).
 */
void genX(CmdBeginRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
   VkResult result;

   if (!is_render_queue_cmd_buffer(cmd_buffer)) {
      assert(!"Trying to start a render pass on non-render queue!");
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
      return;
   }

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;

   anv_measure_beginrenderpass(cmd_buffer);

   /* Allocates and initializes attachment state (clear values, aux usage,
    * current layouts) for every attachment in the pass.  On failure the
    * batch already carries the error, so just bail.
    */
   result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                               framebuffer,
                                               pRenderPassBeginInfo);
   if (result != VK_SUCCESS) {
      assert(anv_batch_has_error(&cmd_buffer->batch));
      return;
   }

   genX(flush_pipeline_select_3d)(cmd_buffer);

   cmd_buffer_begin_subpass(cmd_buffer, 0);
}

/* vkCmdNextSubpass2 entry point: ends the current subpass and begins the
 * next one.  Only valid on primary command buffers.
 */
void genX(CmdNextSubpass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state);
   cmd_buffer_end_subpass(cmd_buffer);
   cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

/* vkCmdEndRenderPass2 entry point: ends the last subpass and clears the
 * render-pass-specific command buffer state.
 */
void genX(CmdEndRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   cmd_buffer_end_subpass(cmd_buffer);

   cmd_buffer->state.hiz_enabled = false;

   /* Remove references to render pass specific state. This enables us to
    * detect whether or not we're in a renderpass.
    */
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
}

/* Loads the MI_PREDICATE source registers from the precomputed conditional
 * rendering result (ANV_PREDICATE_RESULT_REG, written by
 * CmdBeginConditionalRenderingEXT below) and emits MI_PREDICATE so that
 * subsequent predicated commands are skipped when the predicate is 0.
 * No-op before Haswell (GFX_VERx10 < 75), which lacks the needed MI support.
 */
void
genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 75
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
                mi_reg32(ANV_PREDICATE_RESULT_REG));
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   /* LOADINV + SRCS_EQUAL: predicate fires when result == 0, i.e. commands
    * execute when the stored predicate result is non-zero.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
#endif
}

#if GFX_VERx10 >= 75
/* vkCmdBeginConditionalRenderingEXT: latches the 32-bit predicate value from
 * buffer memory and stores the boolean result (value != 0, or value == 0
 * when VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT is set) into
 * ANV_PREDICATE_RESULT_REG for later use by the predicate emission above.
 */
void genX(CmdBeginConditionalRenderingEXT)(
   VkCommandBuffer                             commandBuffer,
   const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_address value_address =
      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);

   const bool isInverted = pConditionalRenderingBegin->flags &
                           VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   cmd_state->conditional_render_enabled = true;

   /* Make sure any prior writes to the predicate buffer are visible to the
    * command streamer before we read it.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Section 19.4 of the Vulkan 1.1.85 spec says:
    *
    *    If the value of the predicate in buffer memory changes
    *    while conditional rendering is active, the rendering commands
    *    may be discarded in an implementation-dependent way.
    *    Some implementations may latch the value of the predicate
    *    upon beginning conditional rendering while others
    *    may read it before every rendering command.
    *
    * So it's perfectly fine to read a value from the buffer once.
    */
   struct mi_value value =  mi_mem32(value_address);

   /* Precompute predicate result, it is necessary to support secondary
    * command buffers since it is unknown if conditional rendering is
    * inverted when populating them.
    */
   mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
                /* Unsigned compares against 0: uge(0, v) == (v == 0),
                 * ult(0, v) == (v != 0).
                 */
                isInverted ? mi_uge(&b, mi_imm(0), value) :
                             mi_ult(&b, mi_imm(0), value));
}

/* vkCmdEndConditionalRenderingEXT: just clears the CPU-side flag; the
 * latched predicate register is left as-is.
 */
void genX(CmdEndConditionalRenderingEXT)(
	VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;

   cmd_state->conditional_render_enabled = false;
}
#endif

/* Set of stage bits for which are pipelined, i.e. they get queued
 * by the command streamer for later execution.
 */
#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
   ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | \
     VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR | \
     VK_PIPELINE_STAGE_2_HOST_BIT_KHR | \
     VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)

/* vkCmdSetEvent2KHR: signals the event from the GPU by writing VK_EVENT_SET
 * into the event's slot in the dynamic state pool via a PIPE_CONTROL
 * post-sync immediate write.  If any source stage is a pipelined one, stall
 * first so the write happens after that work completes.
 */
void genX(CmdSetEvent2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    const VkDependencyInfoKHR*                  pDependencyInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   /* Union of all source stage masks across every barrier in the
    * dependency info.
    */
   VkPipelineStageFlags2KHR src_stages = 0;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      /* NOTE: comma operators (not typos) — this is one statement chaining
       * the field assignments, matching the surrounding driver style.
       */
      pc.DestinationAddressType = DAT_PPGTT,
      pc.PostSyncOperation = WriteImmediateData,
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_SET;
      anv_debug_dump_pc(pc);
   }
}

/* vkCmdResetEvent2KHR: mirror of SetEvent2 above, writing VK_EVENT_RESET
 * into the event slot instead.
 */
void genX(CmdResetEvent2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags2KHR                    stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_RESET;
      anv_debug_dump_pc(pc);
   }
}

/* vkCmdWaitEvents2KHR: on gfx8+ emits a polling MI_SEMAPHORE_WAIT per event
 * until its slot equals VK_EVENT_SET, then applies the dependency as a
 * regular barrier.  Not implemented on gfx7.
 */
void genX(CmdWaitEvents2KHR)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    eventCount,
    const VkEvent*                              pEvents,
    const VkDependencyInfoKHR*                  pDependencyInfos)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

#if GFX_VER >= 8
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
         /* Comma-chained field assignments, same style as the PIPE_CONTROL
          * programming above.
          */
         sem.WaitMode = PollingMode,
         sem.CompareOperation = COMPARE_SAD_EQUAL_SDD,
         sem.SemaphoreDataDword = VK_EVENT_SET,
         sem.SemaphoreAddress = (struct anv_address) {
            cmd_buffer->device->dynamic_state_pool.block_pool.bo,
            event->state.offset
         };
      }
   }
#else
   anv_finishme("Implement events on gfx7");
#endif

   cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
}

/* vkCmdSetPerformanceOverrideINTEL: toggles hardware overrides used by the
 * INTEL performance query extension — either disabling 3D/media instruction
 * execution (NULL_HARDWARE) or flushing/invalidating all caches
 * (FLUSH_GPU_CACHES) for perf-counter isolation.
 */
VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
      /* The register holding these disable bits moved: CS_DEBUG_MODE2 on
       * gfx9+, INSTPM before that.  The *Mask fields must be set for the
       * corresponding bits to take effect.
       */
#if GFX_VER >= 9
      anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
         csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
         csdm2.MediaInstructionDisable = pOverrideInfo->enable;
         csdm2._3DRenderingInstructionDisableMask = true;
         csdm2.MediaInstructionDisableMask = true;
      }
#else
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

/* vkCmdSetPerformanceStreamMarkerINTEL: currently a no-op stub. */
VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */

   return VK_SUCCESS;
}

/* Emits a CS-stalling PIPE_CONTROL that writes the GPU timestamp to the
 * given offset within bo.
 */
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_bo *bo,
                              uint32_t offset) {
   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.PostSyncOperation = WriteTimestamp;
      pc.Address = (struct anv_address) {bo, offset};
      anv_debug_dump_pc(pc);
   }
}