/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"

#include "ac_debug.h"

#include "util/fast_idiv_by_const.h"

enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS)
};

enum {
   RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
                         VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
                         VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family,
                                         uint32_t dst_family, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

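   /* Note: each block below follows the same copy-on-change pattern: copy one
    * piece of dynamic state from src and set its dirty bit only if the value
    * actually changed, so unchanged state is not re-emitted on the next draw.
    */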
   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

static void
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radv_shader_info *info;

   if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
      return;

   info = &pipeline->streamout_shader->info;
   for (int i = 0; i < MAX_SO_BUFFERS; i++)
      so->stride_in_dw[i] = info->so.strides[i];

   so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
}

bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
}

enum ring_type
radv_queue_family_to_ring(int f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return RING_GFX;
   case RADV_QUEUE_COMPUTE:
      return RING_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return RING_DMA;
   default:
      unreachable("Unknown queue family");
   }
}

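/* Note on the packet below: WRITE_DATA is one header dword plus three
 * control dwords (DST_SEL/ENGINE_SEL, then the low and high halves of the
 * destination VA), followed by the payload. That is why the PKT3 count
 * field is "2 + count" while the space check reserves "4 + count" dwords.
 */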
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
      vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      /* cmd_buffer->pool is not initialized yet at this point, so free
       * through the pool that is in scope instead. */
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->queue_family_index = pool->queue_family_index;

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
       cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

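/* Illustrative numbers for the allocator below (GFX10+ uses 64-byte scalar
 * cache lines): with offset == 40 and size == 32, the gap to the next line
 * is 24 bytes and the 32-byte sub-line tail does not fit, so the offset is
 * bumped to 64; with size == 16 the tail fits in the gap and the offset
 * stays at 40. When the upload BO runs out of space, it is replaced by one
 * at least twice as large (16 KiB minimum) and the old BO is kept on
 * upload.list until reset, because earlier parts of the CS may still
 * reference it.
 */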
bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in fewer of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;

   if (ptr)
      memcpy(ptr, data, size);

   return true;
}

void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

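/* Rough layout of the per-device trace BO as used by the helpers in this
 * file: dword 0 holds the last trace id of the primary command buffer and
 * dword 1 that of the secondary (see radv_cmd_buffer_trace_emit() above);
 * offsets 8 and 16 hold the last bound graphics and compute pipeline
 * pointers, offset 24 the last vertex descriptor pointer, and offset 32
 * onwards mirrors the bound descriptor set pointers.
 */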
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.chip_class,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum ring_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   switch (ring) {
   case RING_GFX:
      va += 8;
      break;
   case RING_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid ring type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 32;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                           gl_shader_stage stage, int idx, uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
                            false);
}

static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

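/* Worked example for the two helpers above (values are illustrative, taken
 * from the standard 4x pattern): the user location (0.375, 0.125) becomes
 * (-0.125, -0.375) relative to the pixel center, scales to the
 * 1/16th-subpixel pair (-2, -6), and is packed as the nibbles 0xE and 0xA,
 * i.e. 0xAE in the low byte of PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 for
 * sample 0.
 */
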
/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

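/* The selection sort above yields, for each priority slot, the index of the
 * closest remaining sample (ties keep the lower index); overwriting the
 * chosen distance with 0xffffffff removes it from later rounds. The slot
 * values are then replicated across all eight nibbles with "i & sample_mask"
 * and mirrored into both 32-bit halves of PA_SC_CENTROID_PRIORITY.
 */
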
/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                             gl_shader_stage stage, int idx, uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   int num_samples = pipeline->graphics.ms.num_samples;
   struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

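/* Vega12, Vega20, Raven2 and GFX10+ also need a flush when the binning mode
 * changes, which the function below requests via the
 * FLUSH_ON_BINNING_TRANSITION field of PA_SC_BINNER_CNTL_0.
 */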
static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
          pipeline->graphics.binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->graphics.binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_variant_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                      bool vertex_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   if (vertex_stage_only) {
      /* Fast prefetch path for starting draws as soon as possible. */
      mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
   }

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(pipeline))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

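/* Rough summary of the RB+ programming below: SX_PS_DOWNCONVERT selects a
 * packed export format per color target, SX_BLEND_OPT_EPSILON the matching
 * rounding epsilon for blend optimizations, and SX_BLEND_OPT_CONTROL
 * disables the per-channel value checks for channels that are masked off or
 * absent from the export format.
 */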
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   if (!cmd_buffer->state.attachments || !subpass)
      return;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

      unsigned format = G_028C70_FORMAT(cb->cb_color_info);
      unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
      uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;

      bool has_alpha, has_rgb;

      /* Set if RGB and A are present. */
      has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);

      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_5_6_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_1_5_5_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_4_4_4_4:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_32:
         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
         break;

      case V_028C70_COLOR_16:
      case V_028C70_COLOR_16_16:
         /* For 1-channel formats, use the superset thereof. */
*/ 12307ec681f3Smrg if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || 12317ec681f3Smrg spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || 12327ec681f3Smrg spi_format == V_028714_SPI_SHADER_UINT16_ABGR || 12337ec681f3Smrg spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { 12347ec681f3Smrg if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) 12357ec681f3Smrg sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); 12367ec681f3Smrg else 12377ec681f3Smrg sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); 12387ec681f3Smrg } 12397ec681f3Smrg break; 12407ec681f3Smrg 12417ec681f3Smrg case V_028C70_COLOR_10_11_11: 12427ec681f3Smrg if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) 12437ec681f3Smrg sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); 12447ec681f3Smrg break; 12457ec681f3Smrg 12467ec681f3Smrg case V_028C70_COLOR_2_10_10_10: 12477ec681f3Smrg if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { 12487ec681f3Smrg sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); 12497ec681f3Smrg sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); 12507ec681f3Smrg } 12517ec681f3Smrg break; 12527ec681f3Smrg case V_028C70_COLOR_5_9_9_9: 12537ec681f3Smrg if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) 12547ec681f3Smrg sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4); 12557ec681f3Smrg break; 12567ec681f3Smrg } 12577ec681f3Smrg } 12587ec681f3Smrg 12597ec681f3Smrg /* Do not set the DISABLE bits for the unused attachments, as that 12607ec681f3Smrg * breaks dual source blending in SkQP and does not seem to improve 12617ec681f3Smrg * performance. */ 12627ec681f3Smrg 12637ec681f3Smrg if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert && 12647ec681f3Smrg sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon && 12657ec681f3Smrg sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control) 12667ec681f3Smrg return; 12677ec681f3Smrg 12687ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); 12697ec681f3Smrg radeon_emit(cmd_buffer->cs, sx_ps_downconvert); 12707ec681f3Smrg radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon); 12717ec681f3Smrg radeon_emit(cmd_buffer->cs, sx_blend_opt_control); 12727ec681f3Smrg 12737ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 12747ec681f3Smrg 12757ec681f3Smrg cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert; 12767ec681f3Smrg cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon; 12777ec681f3Smrg cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control; 12787ec681f3Smrg} 127901e04c3fSmrg 12807ec681f3Smrgstatic void 12817ec681f3Smrgradv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer) 12827ec681f3Smrg{ 12837ec681f3Smrg if (!cmd_buffer->device->pbb_allowed) 12847ec681f3Smrg return; 128501e04c3fSmrg 12867ec681f3Smrg struct radv_binning_settings settings = 12877ec681f3Smrg radv_get_binning_settings(cmd_buffer->device->physical_device); 12887ec681f3Smrg bool break_for_new_ps = 12897ec681f3Smrg (!cmd_buffer->state.emitted_pipeline || 12907ec681f3Smrg cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] != 12917ec681f3Smrg cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) && 12927ec681f3Smrg (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1); 12937ec681f3Smrg bool break_for_new_cb_target_mask = 12947ec681f3Smrg (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) && 12957ec681f3Smrg settings.context_states_per_bin 
> 1; 129601e04c3fSmrg 12977ec681f3Smrg if (!break_for_new_ps && !break_for_new_cb_target_mask) 12987ec681f3Smrg return; 129901e04c3fSmrg 13007ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 13017ec681f3Smrg radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); 13027ec681f3Smrg} 130301e04c3fSmrg 13047ec681f3Smrgstatic void 13057ec681f3Smrgradv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) 13067ec681f3Smrg{ 13077ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 13087ec681f3Smrg 13097ec681f3Smrg if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) 13107ec681f3Smrg return; 13117ec681f3Smrg 13127ec681f3Smrg radv_update_multisample_state(cmd_buffer, pipeline); 13137ec681f3Smrg radv_update_binning_state(cmd_buffer, pipeline); 13147ec681f3Smrg 13157ec681f3Smrg cmd_buffer->scratch_size_per_wave_needed = 13167ec681f3Smrg MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave); 13177ec681f3Smrg cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves); 13187ec681f3Smrg 13197ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13207ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband != 13217ec681f3Smrg pipeline->graphics.can_use_guardband) 13227ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; 13237ec681f3Smrg 13247ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13257ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl != 13267ec681f3Smrg pipeline->graphics.pa_su_sc_mode_cntl) 13277ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | 13287ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 13297ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; 13307ec681f3Smrg 13317ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13327ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl != 13337ec681f3Smrg pipeline->graphics.pa_cl_clip_cntl) 13347ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE; 13357ec681f3Smrg 13367ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13377ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.cb_color_control != 13387ec681f3Smrg pipeline->graphics.cb_color_control) 13397ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP; 13407ec681f3Smrg 13417ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline) 13427ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | 13437ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS | 13447ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS | 13457ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE; 13467ec681f3Smrg 13477ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13487ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.db_depth_control != 13497ec681f3Smrg pipeline->graphics.db_depth_control) 13507ec681f3Smrg cmd_buffer->state.dirty |= 13517ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | 13527ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | 13537ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP; 13547ec681f3Smrg 13557ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline) 13567ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP; 13577ec681f3Smrg 13587ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13597ec681f3Smrg 
cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask != 13607ec681f3Smrg pipeline->graphics.cb_target_mask) { 13617ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE; 13627ec681f3Smrg } 13637ec681f3Smrg 13647ec681f3Smrg radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); 13657ec681f3Smrg 13667ec681f3Smrg if (pipeline->graphics.has_ngg_culling && 13677ec681f3Smrg pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY && 13687ec681f3Smrg !cmd_buffer->state.last_nggc_settings) { 13697ec681f3Smrg /* The already emitted RSRC2 contains the LDS required for NGG culling. 13707ec681f3Smrg * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage. 13717ec681f3Smrg * API GS always needs LDS, so this isn't useful there. 13727ec681f3Smrg */ 13737ec681f3Smrg struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage]; 13747ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, 13757ec681f3Smrg (v->config.rsrc2 & C_00B22C_LDS_SIZE) | 13767ec681f3Smrg S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling)); 13777ec681f3Smrg } 13787ec681f3Smrg 13797ec681f3Smrg if (!cmd_buffer->state.emitted_pipeline || 13807ec681f3Smrg cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || 13817ec681f3Smrg cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || 13827ec681f3Smrg memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf, 13837ec681f3Smrg pipeline->ctx_cs.cdw * 4)) { 13847ec681f3Smrg radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw); 13857ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 13867ec681f3Smrg } 13877ec681f3Smrg 13887ec681f3Smrg radv_emit_batch_break_on_new_ps(cmd_buffer); 13897ec681f3Smrg 13907ec681f3Smrg for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { 13917ec681f3Smrg if (!pipeline->shaders[i]) 13927ec681f3Smrg continue; 13937ec681f3Smrg 13947ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo); 13957ec681f3Smrg } 13967ec681f3Smrg 13977ec681f3Smrg if (radv_pipeline_has_gs_copy_shader(pipeline)) 13987ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo); 13997ec681f3Smrg 14007ec681f3Smrg if (unlikely(cmd_buffer->device->trace_bo)) 14017ec681f3Smrg radv_save_pipeline(cmd_buffer, pipeline); 14027ec681f3Smrg 14037ec681f3Smrg cmd_buffer->state.emitted_pipeline = pipeline; 14047ec681f3Smrg 14057ec681f3Smrg cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; 140601e04c3fSmrg} 140701e04c3fSmrg 140801e04c3fSmrgstatic void 140901e04c3fSmrgradv_emit_viewport(struct radv_cmd_buffer *cmd_buffer) 141001e04c3fSmrg{ 14117ec681f3Smrg const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport; 14127ec681f3Smrg int i; 14137ec681f3Smrg const unsigned count = viewport->count; 14147ec681f3Smrg 14157ec681f3Smrg assert(count); 14167ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6); 14177ec681f3Smrg 14187ec681f3Smrg for (i = 0; i < count; i++) { 14197ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0])); 14207ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0])); 14217ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1])); 14227ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1])); 14237ec681f3Smrg radeon_emit(cmd_buffer->cs, 
fui(viewport->xform[i].scale[2])); 14247ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[2])); 14257ec681f3Smrg } 14267ec681f3Smrg 14277ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2); 14287ec681f3Smrg for (i = 0; i < count; i++) { 14297ec681f3Smrg float zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth); 14307ec681f3Smrg float zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth); 14317ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(zmin)); 14327ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(zmax)); 14337ec681f3Smrg } 143401e04c3fSmrg} 143501e04c3fSmrg 143601e04c3fSmrgstatic void 143701e04c3fSmrgradv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) 143801e04c3fSmrg{ 14397ec681f3Smrg uint32_t count = cmd_buffer->state.dynamic.scissor.count; 144001e04c3fSmrg 14417ec681f3Smrg si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors, 14427ec681f3Smrg cmd_buffer->state.dynamic.viewport.viewports, 14437ec681f3Smrg cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband); 1444ed98bd31Smaya 14457ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = false; 144601e04c3fSmrg} 144701e04c3fSmrg 144801e04c3fSmrgstatic void 144901e04c3fSmrgradv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer) 145001e04c3fSmrg{ 14517ec681f3Smrg if (!cmd_buffer->state.dynamic.discard_rectangle.count) 14527ec681f3Smrg return; 145301e04c3fSmrg 14547ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, 14557ec681f3Smrg cmd_buffer->state.dynamic.discard_rectangle.count * 2); 14567ec681f3Smrg for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) { 14577ec681f3Smrg VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i]; 14587ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y)); 14597ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) | 14607ec681f3Smrg S_028214_BR_Y(rect.offset.y + rect.extent.height)); 14617ec681f3Smrg } 146201e04c3fSmrg} 146301e04c3fSmrg 146401e04c3fSmrgstatic void 146501e04c3fSmrgradv_emit_line_width(struct radv_cmd_buffer *cmd_buffer) 146601e04c3fSmrg{ 14677ec681f3Smrg unsigned width = cmd_buffer->state.dynamic.line_width * 8; 146801e04c3fSmrg 14697ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, 14707ec681f3Smrg S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF))); 147101e04c3fSmrg} 147201e04c3fSmrg 147301e04c3fSmrgstatic void 147401e04c3fSmrgradv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer) 147501e04c3fSmrg{ 14767ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 147701e04c3fSmrg 14787ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); 14797ec681f3Smrg radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4); 148001e04c3fSmrg} 148101e04c3fSmrg 148201e04c3fSmrgstatic void 148301e04c3fSmrgradv_emit_stencil(struct radv_cmd_buffer *cmd_buffer) 148401e04c3fSmrg{ 14857ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 148601e04c3fSmrg 14877ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2); 14887ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) | 14897ec681f3Smrg S_028430_STENCILMASK(d->stencil_compare_mask.front) | 14907ec681f3Smrg S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | 
14917ec681f3Smrg S_028430_STENCILOPVAL(1)); 14927ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | 14937ec681f3Smrg S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | 14947ec681f3Smrg S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | 14957ec681f3Smrg S_028434_STENCILOPVAL_BF(1)); 149601e04c3fSmrg} 149701e04c3fSmrg 149801e04c3fSmrgstatic void 149901e04c3fSmrgradv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer) 150001e04c3fSmrg{ 15017ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 150201e04c3fSmrg 15037ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2); 15047ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min)); 15057ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max)); 150601e04c3fSmrg} 150701e04c3fSmrg 150801e04c3fSmrgstatic void 150901e04c3fSmrgradv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer) 151001e04c3fSmrg{ 15117ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 15127ec681f3Smrg unsigned slope = fui(d->depth_bias.slope * 16.0f); 15137ec681f3Smrg 15147ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); 15157ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ 15167ec681f3Smrg radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ 15177ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* FRONT OFFSET */ 15187ec681f3Smrg radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ 15197ec681f3Smrg radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* BACK OFFSET */ 15207ec681f3Smrg} 15217ec681f3Smrg 15227ec681f3Smrgstatic void 15237ec681f3Smrgradv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer) 15247ec681f3Smrg{ 15257ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 15267ec681f3Smrg uint32_t auto_reset_cntl = 1; 15277ec681f3Smrg 15287ec681f3Smrg if (d->primitive_topology == V_008958_DI_PT_LINESTRIP) 15297ec681f3Smrg auto_reset_cntl = 2; 15307ec681f3Smrg 15317ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE, 15327ec681f3Smrg S_028A0C_LINE_PATTERN(d->line_stipple.pattern) | 15337ec681f3Smrg S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) | 15347ec681f3Smrg S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl)); 15357ec681f3Smrg} 15367ec681f3Smrg 15377ec681f3Smrgstatic void 15387ec681f3Smrgradv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states) 15397ec681f3Smrg{ 15407ec681f3Smrg unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl; 15417ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 15427ec681f3Smrg 15437ec681f3Smrg pa_su_sc_mode_cntl &= C_028814_CULL_FRONT & 15447ec681f3Smrg C_028814_CULL_BACK & 15457ec681f3Smrg C_028814_FACE & 15467ec681f3Smrg C_028814_POLY_OFFSET_FRONT_ENABLE & 15477ec681f3Smrg C_028814_POLY_OFFSET_BACK_ENABLE & 15487ec681f3Smrg C_028814_POLY_OFFSET_PARA_ENABLE; 15497ec681f3Smrg 15507ec681f3Smrg pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) | 15517ec681f3Smrg S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) | 15527ec681f3Smrg S_028814_FACE(d->front_face) | 15537ec681f3Smrg S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) | 15547ec681f3Smrg S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) | 15557ec681f3Smrg S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable); 15567ec681f3Smrg 15577ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, 
R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl); 15587ec681f3Smrg} 15597ec681f3Smrg 15607ec681f3Smrgstatic void 15617ec681f3Smrgradv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer) 15627ec681f3Smrg{ 15637ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 15647ec681f3Smrg 15657ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 15667ec681f3Smrg radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs, 15677ec681f3Smrg R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology); 15687ec681f3Smrg } else { 15697ec681f3Smrg radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology); 15707ec681f3Smrg } 15717ec681f3Smrg} 15727ec681f3Smrg 15737ec681f3Smrgstatic void 15747ec681f3Smrgradv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states) 15757ec681f3Smrg{ 15767ec681f3Smrg unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control; 15777ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 15787ec681f3Smrg 15797ec681f3Smrg db_depth_control &= C_028800_Z_ENABLE & 15807ec681f3Smrg C_028800_Z_WRITE_ENABLE & 15817ec681f3Smrg C_028800_ZFUNC & 15827ec681f3Smrg C_028800_DEPTH_BOUNDS_ENABLE & 15837ec681f3Smrg C_028800_STENCIL_ENABLE & 15847ec681f3Smrg C_028800_BACKFACE_ENABLE & 15857ec681f3Smrg C_028800_STENCILFUNC & 15867ec681f3Smrg C_028800_STENCILFUNC_BF; 15877ec681f3Smrg 15887ec681f3Smrg db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) | 15897ec681f3Smrg S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) | 15907ec681f3Smrg S_028800_ZFUNC(d->depth_compare_op) | 15917ec681f3Smrg S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) | 15927ec681f3Smrg S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) | 15937ec681f3Smrg S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 
1 : 0) | 15947ec681f3Smrg S_028800_STENCILFUNC(d->stencil_op.front.compare_op) | 15957ec681f3Smrg S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op); 15967ec681f3Smrg 15977ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control); 15987ec681f3Smrg} 15997ec681f3Smrg 16007ec681f3Smrgstatic void 16017ec681f3Smrgradv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer) 16027ec681f3Smrg{ 16037ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 16047ec681f3Smrg 16057ec681f3Smrg radeon_set_context_reg( 16067ec681f3Smrg cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, 16077ec681f3Smrg S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) | 16087ec681f3Smrg S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) | 16097ec681f3Smrg S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) | 16107ec681f3Smrg S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) | 16117ec681f3Smrg S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) | 16127ec681f3Smrg S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op))); 16137ec681f3Smrg} 16147ec681f3Smrg 16157ec681f3Smrgstatic void 16167ec681f3Smrgradv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer) 16177ec681f3Smrg{ 16187ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 16197ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 16207ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 16217ec681f3Smrg uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1; 16227ec681f3Smrg uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1; 16237ec681f3Smrg uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl; 16247ec681f3Smrg uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0]; 16257ec681f3Smrg uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1]; 16267ec681f3Smrg 16277ec681f3Smrg if (subpass && !subpass->vrs_attachment) { 16287ec681f3Smrg /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we 16297ec681f3Smrg * can cheat by tweaking the different combiner modes. 16307ec681f3Smrg */ 16317ec681f3Smrg switch (htile_comb_mode) { 16327ec681f3Smrg case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR: 16337ec681f3Smrg /* The result of min(A, 1x1) is always 1x1. */ 16347ec681f3Smrg FALLTHROUGH; 16357ec681f3Smrg case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR: 16367ec681f3Smrg /* Force the per-draw VRS rate to 1x1. */ 16377ec681f3Smrg rate_x = rate_y = 0; 16387ec681f3Smrg 16397ec681f3Smrg /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate 16407ec681f3Smrg * combiner mode as passthrough. 16417ec681f3Smrg */ 16427ec681f3Smrg vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU; 16437ec681f3Smrg break; 16447ec681f3Smrg case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR: 16457ec681f3Smrg /* The result of max(A, 1x1) is always A. */ 16467ec681f3Smrg FALLTHROUGH; 16477ec681f3Smrg case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR: 16487ec681f3Smrg /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */ 16497ec681f3Smrg break; 16507ec681f3Smrg default: 16517ec681f3Smrg break; 16527ec681f3Smrg } 16537ec681f3Smrg } 16547ec681f3Smrg 16557ec681f3Smrg /* Emit per-draw VRS rate which is the first combiner. 
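      (The rates are log2-encoded: the MIN2(2, ...) - 1 computation above maps
      a 2x2 request to RATE_X = RATE_Y = 1 and a 1x1 request to 0.)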
*/ 16567ec681f3Smrg radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, 16577ec681f3Smrg S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y)); 16587ec681f3Smrg 16597ec681f3Smrg /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the 16607ec681f3Smrg * draw rate and the vertex rate. 16617ec681f3Smrg */ 16627ec681f3Smrg pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode); 16637ec681f3Smrg 16647ec681f3Smrg /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE 16657ec681f3Smrg * rate. 16667ec681f3Smrg */ 16677ec681f3Smrg pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode); 16687ec681f3Smrg 16697ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl); 16707ec681f3Smrg} 16717ec681f3Smrg 16727ec681f3Smrgstatic void 16737ec681f3Smrgradv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer) 16747ec681f3Smrg{ 16757ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 16767ec681f3Smrg 16777ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 16787ec681f3Smrg radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, 16797ec681f3Smrg d->primitive_restart_enable); 16807ec681f3Smrg } else { 16817ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 16827ec681f3Smrg d->primitive_restart_enable); 16837ec681f3Smrg } 16847ec681f3Smrg} 16857ec681f3Smrg 16867ec681f3Smrgstatic void 16877ec681f3Smrgradv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer) 16887ec681f3Smrg{ 16897ec681f3Smrg unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl; 16907ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 16917ec681f3Smrg 16927ec681f3Smrg pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL; 16937ec681f3Smrg pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable); 16947ec681f3Smrg 16957ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl); 16967ec681f3Smrg} 16977ec681f3Smrg 16987ec681f3Smrgstatic void 16997ec681f3Smrgradv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer) 17007ec681f3Smrg{ 17017ec681f3Smrg unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control; 17027ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 17037ec681f3Smrg 17047ec681f3Smrg cb_color_control &= C_028808_ROP3; 17057ec681f3Smrg cb_color_control |= S_028808_ROP3(d->logic_op); 17067ec681f3Smrg 17077ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control); 17087ec681f3Smrg} 170901e04c3fSmrg 17107ec681f3Smrgstatic void 17117ec681f3Smrgradv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer) 17127ec681f3Smrg{ 17137ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 17147ec681f3Smrg struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 171501e04c3fSmrg 17167ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, 17177ec681f3Smrg pipeline->graphics.cb_target_mask & d->color_write_enable); 171801e04c3fSmrg} 171901e04c3fSmrg 172001e04c3fSmrgstatic void 17217ec681f3Smrgradv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, 17227ec681f3Smrg struct radv_color_buffer_info *cb, struct radv_image_view *iview, 17237ec681f3Smrg VkImageLayout layout, bool in_render_loop, bool disable_dcc) 17247ec681f3Smrg{ 17257ec681f3Smrg bool is_vi = 
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8; 17267ec681f3Smrg uint32_t cb_color_info = cb->cb_color_info; 17277ec681f3Smrg struct radv_image *image = iview->image; 17287ec681f3Smrg 17297ec681f3Smrg if (!radv_layout_dcc_compressed( 17307ec681f3Smrg cmd_buffer->device, image, iview->base_mip, layout, in_render_loop, 17317ec681f3Smrg radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 17327ec681f3Smrg cmd_buffer->queue_family_index)) || 17337ec681f3Smrg disable_dcc) { 17347ec681f3Smrg cb_color_info &= C_028C70_DCC_ENABLE; 17357ec681f3Smrg } 17367ec681f3Smrg 17377ec681f3Smrg if (!radv_layout_fmask_compressed( 17387ec681f3Smrg cmd_buffer->device, image, layout, 17397ec681f3Smrg radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 17407ec681f3Smrg cmd_buffer->queue_family_index))) { 17417ec681f3Smrg cb_color_info &= C_028C70_COMPRESSION; 17427ec681f3Smrg } 17437ec681f3Smrg 17447ec681f3Smrg if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) || 17457ec681f3Smrg radv_is_dcc_decompress_pipeline(cmd_buffer))) { 17467ec681f3Smrg /* If this bit is set, the FMASK decompression operation 17477ec681f3Smrg * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS). 17487ec681f3Smrg */ 17497ec681f3Smrg cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY; 17507ec681f3Smrg } 17517ec681f3Smrg 17527ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 17537ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 17547ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_base); 17557ec681f3Smrg radeon_emit(cmd_buffer->cs, 0); 17567ec681f3Smrg radeon_emit(cmd_buffer->cs, 0); 17577ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_view); 17587ec681f3Smrg radeon_emit(cmd_buffer->cs, cb_color_info); 17597ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 17607ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 17617ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 17627ec681f3Smrg radeon_emit(cmd_buffer->cs, 0); 17637ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 17647ec681f3Smrg radeon_emit(cmd_buffer->cs, 0); 17657ec681f3Smrg 17667ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); 17677ec681f3Smrg 17687ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, 17697ec681f3Smrg cb->cb_color_base >> 32); 17707ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4, 17717ec681f3Smrg cb->cb_color_cmask >> 32); 17727ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4, 17737ec681f3Smrg cb->cb_color_fmask >> 32); 17747ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, 17757ec681f3Smrg cb->cb_dcc_base >> 32); 17767ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, 17777ec681f3Smrg cb->cb_color_attrib2); 17787ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, 17797ec681f3Smrg cb->cb_color_attrib3); 17807ec681f3Smrg } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { 17817ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 17827ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_base); 17837ec681f3Smrg radeon_emit(cmd_buffer->cs, 
S_028C64_BASE_256B(cb->cb_color_base >> 32)); 17847ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2); 17857ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_view); 17867ec681f3Smrg radeon_emit(cmd_buffer->cs, cb_color_info); 17877ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 17887ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 17897ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 17907ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32)); 17917ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 17927ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32)); 17937ec681f3Smrg 17947ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2); 17957ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_dcc_base); 17967ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32)); 17977ec681f3Smrg 17987ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, 17997ec681f3Smrg cb->cb_mrt_epitch); 18007ec681f3Smrg } else { 18017ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 18027ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_base); 18037ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); 18047ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_slice); 18057ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_view); 18067ec681f3Smrg radeon_emit(cmd_buffer->cs, cb_color_info); 18077ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 18087ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 18097ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 18107ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); 18117ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 18127ec681f3Smrg radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); 18137ec681f3Smrg 18147ec681f3Smrg if (is_vi) { /* DCC BASE */ 18157ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 18167ec681f3Smrg cb->cb_dcc_base); 18177ec681f3Smrg } 18187ec681f3Smrg } 18197ec681f3Smrg 18207ec681f3Smrg if (G_028C70_DCC_ENABLE(cb_color_info)) { 18217ec681f3Smrg /* Drawing with DCC enabled also compresses colorbuffers. 
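         The DCC metadata for every level/layer this view can write is flagged
         as compressed below via radv_update_dcc_metadata().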
*/ 18227ec681f3Smrg VkImageSubresourceRange range = { 18237ec681f3Smrg .aspectMask = iview->aspect_mask, 18247ec681f3Smrg .baseMipLevel = iview->base_mip, 18257ec681f3Smrg .levelCount = iview->level_count, 18267ec681f3Smrg .baseArrayLayer = iview->base_layer, 18277ec681f3Smrg .layerCount = iview->layer_count, 18287ec681f3Smrg }; 18297ec681f3Smrg 18307ec681f3Smrg radv_update_dcc_metadata(cmd_buffer, image, &range, true); 18317ec681f3Smrg } 183201e04c3fSmrg} 183301e04c3fSmrg 183401e04c3fSmrgstatic void 18357ec681f3Smrgradv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 18367ec681f3Smrg const struct radv_image_view *iview, VkImageLayout layout, 18377ec681f3Smrg bool in_render_loop, bool requires_cond_exec) 18387ec681f3Smrg{ 18397ec681f3Smrg const struct radv_image *image = iview->image; 18407ec681f3Smrg uint32_t db_z_info = ds->db_z_info; 18417ec681f3Smrg uint32_t db_z_info_reg; 18427ec681f3Smrg 18437ec681f3Smrg if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || 18447ec681f3Smrg !radv_image_is_tc_compat_htile(image)) 18457ec681f3Smrg return; 18467ec681f3Smrg 18477ec681f3Smrg if (!radv_layout_is_htile_compressed( 18487ec681f3Smrg cmd_buffer->device, image, layout, in_render_loop, 18497ec681f3Smrg radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 18507ec681f3Smrg cmd_buffer->queue_family_index))) { 18517ec681f3Smrg db_z_info &= C_028040_TILE_SURFACE_ENABLE; 18527ec681f3Smrg } 18537ec681f3Smrg 18547ec681f3Smrg db_z_info &= C_028040_ZRANGE_PRECISION; 18557ec681f3Smrg 18567ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { 18577ec681f3Smrg db_z_info_reg = R_028038_DB_Z_INFO; 18587ec681f3Smrg } else { 18597ec681f3Smrg db_z_info_reg = R_028040_DB_Z_INFO; 18607ec681f3Smrg } 18617ec681f3Smrg 18627ec681f3Smrg /* When we don't know the last fast clear value we need to emit a 18637ec681f3Smrg * conditional packet that will eventually skip the following 18647ec681f3Smrg * SET_CONTEXT_REG packet. 
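    * The predicate lives at the per-mip TC-compat zrange VA and is armed
    * (set to UINT_MAX) only when the last depth clear wrote 0.0f; see
    * radv_update_tc_compat_zrange_metadata() below.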
18657ec681f3Smrg */ 18667ec681f3Smrg if (requires_cond_exec) { 18677ec681f3Smrg uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip); 18687ec681f3Smrg 18697ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0)); 18707ec681f3Smrg radeon_emit(cmd_buffer->cs, va); 18717ec681f3Smrg radeon_emit(cmd_buffer->cs, va >> 32); 18727ec681f3Smrg radeon_emit(cmd_buffer->cs, 0); 18737ec681f3Smrg radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */ 18747ec681f3Smrg } 18757ec681f3Smrg 18767ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info); 187701e04c3fSmrg} 187801e04c3fSmrg 187901e04c3fSmrgstatic void 18807ec681f3Smrgradv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 18817ec681f3Smrg struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop) 18827ec681f3Smrg{ 18837ec681f3Smrg const struct radv_image *image = iview->image; 18847ec681f3Smrg uint32_t db_z_info = ds->db_z_info; 18857ec681f3Smrg uint32_t db_stencil_info = ds->db_stencil_info; 18867ec681f3Smrg 18877ec681f3Smrg if (!radv_layout_is_htile_compressed( 18887ec681f3Smrg cmd_buffer->device, image, layout, in_render_loop, 18897ec681f3Smrg radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, 18907ec681f3Smrg cmd_buffer->queue_family_index))) { 18917ec681f3Smrg db_z_info &= C_028040_TILE_SURFACE_ENABLE; 18927ec681f3Smrg db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); 18937ec681f3Smrg } 18947ec681f3Smrg 18957ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); 18967ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); 18977ec681f3Smrg 18987ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 18997ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); 19007ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size); 19017ec681f3Smrg 19027ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7); 19037ec681f3Smrg radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1)); 19047ec681f3Smrg radeon_emit(cmd_buffer->cs, db_z_info); 19057ec681f3Smrg radeon_emit(cmd_buffer->cs, db_stencil_info); 19067ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base); 19077ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); 19087ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base); 19097ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); 19107ec681f3Smrg 19117ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5); 19127ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); 19137ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); 19147ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); 19157ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); 19167ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32); 19177ec681f3Smrg } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) { 19187ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3); 19197ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_htile_data_base); 19207ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32)); 19217ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_depth_size); 19227ec681f3Smrg 19237ec681f3Smrg 
radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10); 19247ec681f3Smrg radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */ 19257ec681f3Smrg radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */ 19267ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */ 19277ec681f3Smrg radeon_emit(cmd_buffer->cs, 19287ec681f3Smrg S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */ 19297ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */ 19307ec681f3Smrg radeon_emit(cmd_buffer->cs, 19317ec681f3Smrg S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ 19327ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */ 19337ec681f3Smrg radeon_emit(cmd_buffer->cs, 19347ec681f3Smrg S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */ 19357ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */ 19367ec681f3Smrg radeon_emit(cmd_buffer->cs, 19377ec681f3Smrg S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ 19387ec681f3Smrg 19397ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2); 19407ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_info2); 19417ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_info2); 19427ec681f3Smrg } else { 19437ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); 19447ec681f3Smrg 19457ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); 19467ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ 19477ec681f3Smrg radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ 19487ec681f3Smrg radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */ 19497ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ 19507ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ 19517ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ 19527ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ 19537ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ 19547ec681f3Smrg radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ 19557ec681f3Smrg } 19567ec681f3Smrg 19577ec681f3Smrg /* Update the ZRANGE_PRECISION value for the TC-compat bug. */ 19587ec681f3Smrg radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true); 19597ec681f3Smrg 19607ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, 19617ec681f3Smrg ds->pa_su_poly_offset_db_fmt_cntl); 196201e04c3fSmrg} 196301e04c3fSmrg 196401e04c3fSmrg/** 196501e04c3fSmrg * Update the fast clear depth/stencil values if the image is bound as a 196601e04c3fSmrg * depth/stencil buffer. 
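 * Only the DB_*_CLEAR context registers are updated here; the clear values in
 * the image metadata are written separately by radv_set_ds_clear_metadata().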
196701e04c3fSmrg */ 196801e04c3fSmrgstatic void 196901e04c3fSmrgradv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, 19707ec681f3Smrg const struct radv_image_view *iview, 19717ec681f3Smrg VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 197201e04c3fSmrg{ 19737ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 19747ec681f3Smrg const struct radv_image *image = iview->image; 19757ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 19767ec681f3Smrg uint32_t att_idx; 197701e04c3fSmrg 19787ec681f3Smrg if (!cmd_buffer->state.attachments || !subpass) 19797ec681f3Smrg return; 198001e04c3fSmrg 19817ec681f3Smrg if (!subpass->depth_stencil_attachment) 19827ec681f3Smrg return; 198301e04c3fSmrg 19847ec681f3Smrg att_idx = subpass->depth_stencil_attachment->attachment; 19857ec681f3Smrg if (cmd_buffer->state.attachments[att_idx].iview->image != image) 19867ec681f3Smrg return; 198701e04c3fSmrg 19887ec681f3Smrg if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 19897ec681f3Smrg radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); 19907ec681f3Smrg radeon_emit(cs, ds_clear_value.stencil); 19917ec681f3Smrg radeon_emit(cs, fui(ds_clear_value.depth)); 19927ec681f3Smrg } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 19937ec681f3Smrg radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth)); 19947ec681f3Smrg } else { 19957ec681f3Smrg assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 19967ec681f3Smrg radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil); 19977ec681f3Smrg } 199801e04c3fSmrg 19997ec681f3Smrg /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is 20007ec681f3Smrg * only needed when clearing Z to 0.0. 20017ec681f3Smrg */ 20027ec681f3Smrg if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) { 20037ec681f3Smrg VkImageLayout layout = subpass->depth_stencil_attachment->layout; 20047ec681f3Smrg bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 200501e04c3fSmrg 20067ec681f3Smrg radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview, 20077ec681f3Smrg layout, in_render_loop, false); 20087ec681f3Smrg } 2009ed98bd31Smaya 20107ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 201101e04c3fSmrg} 201201e04c3fSmrg 201301e04c3fSmrg/** 201401e04c3fSmrg * Set the clear depth/stencil values to the image's metadata. 201501e04c3fSmrg */ 201601e04c3fSmrgstatic void 20177ec681f3Smrgradv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 20187ec681f3Smrg const VkImageSubresourceRange *range, 20197ec681f3Smrg VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 20207ec681f3Smrg{ 20217ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 20227ec681f3Smrg uint32_t level_count = radv_get_levelCount(image, range); 20237ec681f3Smrg 20247ec681f3Smrg if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 20257ec681f3Smrg uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel); 20267ec681f3Smrg 20277ec681f3Smrg /* Use the fastest way when both aspects are used. 
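         A single WRITE_DATA packet covers every level, emitting one
         (stencil, depth) pair per mip instead of one packet per level.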
*/ 20287ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating)); 20297ec681f3Smrg radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 20307ec681f3Smrg radeon_emit(cs, va); 20317ec681f3Smrg radeon_emit(cs, va >> 32); 20327ec681f3Smrg 20337ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 20347ec681f3Smrg radeon_emit(cs, ds_clear_value.stencil); 20357ec681f3Smrg radeon_emit(cs, fui(ds_clear_value.depth)); 20367ec681f3Smrg } 20377ec681f3Smrg } else { 20387ec681f3Smrg /* Otherwise we need one WRITE_DATA packet per level. */ 20397ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 20407ec681f3Smrg uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l); 20417ec681f3Smrg unsigned value; 20427ec681f3Smrg 20437ec681f3Smrg if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 20447ec681f3Smrg value = fui(ds_clear_value.depth); 20457ec681f3Smrg va += 4; 20467ec681f3Smrg } else { 20477ec681f3Smrg assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 20487ec681f3Smrg value = ds_clear_value.stencil; 20497ec681f3Smrg } 20507ec681f3Smrg 20517ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); 20527ec681f3Smrg radeon_emit(cs, 20537ec681f3Smrg S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 20547ec681f3Smrg radeon_emit(cs, va); 20557ec681f3Smrg radeon_emit(cs, va >> 32); 20567ec681f3Smrg radeon_emit(cs, value); 20577ec681f3Smrg } 20587ec681f3Smrg } 205901e04c3fSmrg} 206001e04c3fSmrg 206101e04c3fSmrg/** 206201e04c3fSmrg * Update the TC-compat metadata value for this image. 206301e04c3fSmrg */ 206401e04c3fSmrgstatic void 20657ec681f3Smrgradv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 20667ec681f3Smrg const VkImageSubresourceRange *range, uint32_t value) 206701e04c3fSmrg{ 20687ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 20697ec681f3Smrg 20707ec681f3Smrg if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug) 20717ec681f3Smrg return; 207201e04c3fSmrg 20737ec681f3Smrg uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel); 20747ec681f3Smrg uint32_t level_count = radv_get_levelCount(image, range); 20757ec681f3Smrg 20767ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating)); 20777ec681f3Smrg radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 20787ec681f3Smrg radeon_emit(cs, va); 20797ec681f3Smrg radeon_emit(cs, va >> 32); 20807ec681f3Smrg 20817ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) 20827ec681f3Smrg radeon_emit(cs, value); 208301e04c3fSmrg} 208401e04c3fSmrg 208501e04c3fSmrgstatic void 208601e04c3fSmrgradv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, 20877ec681f3Smrg const struct radv_image_view *iview, 20887ec681f3Smrg VkClearDepthStencilValue ds_clear_value) 208901e04c3fSmrg{ 20907ec681f3Smrg VkImageSubresourceRange range = { 20917ec681f3Smrg .aspectMask = iview->aspect_mask, 20927ec681f3Smrg .baseMipLevel = iview->base_mip, 20937ec681f3Smrg .levelCount = iview->level_count, 20947ec681f3Smrg .baseArrayLayer = iview->base_layer, 20957ec681f3Smrg .layerCount = iview->layer_count, 20967ec681f3Smrg }; 20977ec681f3Smrg uint32_t cond_val; 209801e04c3fSmrg 20997ec681f3Smrg /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last 21007ec681f3Smrg * depth clear value is 0.0f. 
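    * UINT_MAX arms the COND_EXEC emitted by radv_update_zrange_precision();
    * 0 leaves the conditional re-emit disabled.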
21017ec681f3Smrg */ 21027ec681f3Smrg cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0; 210301e04c3fSmrg 21047ec681f3Smrg radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val); 210501e04c3fSmrg} 210601e04c3fSmrg 210701e04c3fSmrg/** 210801e04c3fSmrg * Update the clear depth/stencil values for this image. 210901e04c3fSmrg */ 211001e04c3fSmrgvoid 211101e04c3fSmrgradv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 21127ec681f3Smrg const struct radv_image_view *iview, 21137ec681f3Smrg VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 211401e04c3fSmrg{ 21157ec681f3Smrg VkImageSubresourceRange range = { 21167ec681f3Smrg .aspectMask = iview->aspect_mask, 21177ec681f3Smrg .baseMipLevel = iview->base_mip, 21187ec681f3Smrg .levelCount = iview->level_count, 21197ec681f3Smrg .baseArrayLayer = iview->base_layer, 21207ec681f3Smrg .layerCount = iview->layer_count, 21217ec681f3Smrg }; 21227ec681f3Smrg struct radv_image *image = iview->image; 212301e04c3fSmrg 21247ec681f3Smrg assert(radv_htile_enabled(image, range.baseMipLevel)); 212501e04c3fSmrg 21267ec681f3Smrg radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects); 212701e04c3fSmrg 21287ec681f3Smrg if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { 21297ec681f3Smrg radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value); 21307ec681f3Smrg } 21317ec681f3Smrg 21327ec681f3Smrg radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects); 213301e04c3fSmrg} 213401e04c3fSmrg 213501e04c3fSmrg/** 213601e04c3fSmrg * Load the clear depth/stencil values from the image's metadata. 213701e04c3fSmrg */ 213801e04c3fSmrgstatic void 21397ec681f3Smrgradv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview) 21407ec681f3Smrg{ 21417ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 21427ec681f3Smrg const struct radv_image *image = iview->image; 21437ec681f3Smrg VkImageAspectFlags aspects = vk_format_aspects(image->vk_format); 21447ec681f3Smrg uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip); 21457ec681f3Smrg unsigned reg_offset = 0, reg_count = 0; 21467ec681f3Smrg 21477ec681f3Smrg assert(radv_image_has_htile(image)); 21487ec681f3Smrg 21497ec681f3Smrg if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 21507ec681f3Smrg ++reg_count; 21517ec681f3Smrg } else { 21527ec681f3Smrg ++reg_offset; 21537ec681f3Smrg va += 4; 21547ec681f3Smrg } 21557ec681f3Smrg if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) 21567ec681f3Smrg ++reg_count; 21577ec681f3Smrg 21587ec681f3Smrg uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; 21597ec681f3Smrg 21607ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 21617ec681f3Smrg radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0)); 21627ec681f3Smrg radeon_emit(cs, va); 21637ec681f3Smrg radeon_emit(cs, va >> 32); 21647ec681f3Smrg radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 21657ec681f3Smrg radeon_emit(cs, reg_count); 21667ec681f3Smrg } else { 21677ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 21687ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 21697ec681f3Smrg (reg_count == 2 ? 
COPY_DATA_COUNT_SEL : 0)); 21707ec681f3Smrg radeon_emit(cs, va); 21717ec681f3Smrg radeon_emit(cs, va >> 32); 21727ec681f3Smrg radeon_emit(cs, reg >> 2); 21737ec681f3Smrg radeon_emit(cs, 0); 21747ec681f3Smrg 21757ec681f3Smrg radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 21767ec681f3Smrg radeon_emit(cs, 0); 21777ec681f3Smrg } 217801e04c3fSmrg} 217901e04c3fSmrg 218001e04c3fSmrg/* 218101e04c3fSmrg * With DCC some colors don't require CMASK elimination before being 218201e04c3fSmrg * used as a texture. This sets a predicate value to determine if the 218301e04c3fSmrg * cmask eliminate is required. 218401e04c3fSmrg */ 218501e04c3fSmrgvoid 21867ec681f3Smrgradv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 21877ec681f3Smrg const VkImageSubresourceRange *range, bool value) 2188ed98bd31Smaya{ 21897ec681f3Smrg if (!image->fce_pred_offset) 21907ec681f3Smrg return; 21917ec681f3Smrg 21927ec681f3Smrg uint64_t pred_val = value; 21937ec681f3Smrg uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel); 21947ec681f3Smrg uint32_t level_count = radv_get_levelCount(image, range); 21957ec681f3Smrg uint32_t count = 2 * level_count; 2196ed98bd31Smaya 21977ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 21987ec681f3Smrg radeon_emit(cmd_buffer->cs, 21997ec681f3Smrg S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 22007ec681f3Smrg radeon_emit(cmd_buffer->cs, va); 22017ec681f3Smrg radeon_emit(cmd_buffer->cs, va >> 32); 2202ed98bd31Smaya 22037ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 22047ec681f3Smrg radeon_emit(cmd_buffer->cs, pred_val); 22057ec681f3Smrg radeon_emit(cmd_buffer->cs, pred_val >> 32); 22067ec681f3Smrg } 2207ed98bd31Smaya} 2208ed98bd31Smaya 2209ed98bd31Smaya/** 2210ed98bd31Smaya * Update the DCC predicate to reflect the compression state. 2211ed98bd31Smaya */ 2212ed98bd31Smayavoid 22137ec681f3Smrgradv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 22147ec681f3Smrg const VkImageSubresourceRange *range, bool value) 221501e04c3fSmrg{ 22167ec681f3Smrg if (image->dcc_pred_offset == 0) 22177ec681f3Smrg return; 221801e04c3fSmrg 22197ec681f3Smrg uint64_t pred_val = value; 22207ec681f3Smrg uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel); 22217ec681f3Smrg uint32_t level_count = radv_get_levelCount(image, range); 22227ec681f3Smrg uint32_t count = 2 * level_count; 222301e04c3fSmrg 22247ec681f3Smrg assert(radv_dcc_enabled(image, range->baseMipLevel)); 22257ec681f3Smrg 22267ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 22277ec681f3Smrg radeon_emit(cmd_buffer->cs, 22287ec681f3Smrg S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 22297ec681f3Smrg radeon_emit(cmd_buffer->cs, va); 22307ec681f3Smrg radeon_emit(cmd_buffer->cs, va >> 32); 22317ec681f3Smrg 22327ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 22337ec681f3Smrg radeon_emit(cmd_buffer->cs, pred_val); 22347ec681f3Smrg radeon_emit(cmd_buffer->cs, pred_val >> 32); 22357ec681f3Smrg } 223601e04c3fSmrg} 223701e04c3fSmrg 223801e04c3fSmrg/** 223901e04c3fSmrg * Update the fast clear color values if the image is bound as a color buffer. 
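 * This is the color counterpart of radv_update_bound_fast_clear_ds(): it only
 * refreshes the CB_COLOR*_CLEAR_WORD* registers of the bound subpass
 * attachment.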
224001e04c3fSmrg */ 224101e04c3fSmrgstatic void 22427ec681f3Smrgradv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 22437ec681f3Smrg int cb_idx, uint32_t color_values[2]) 224401e04c3fSmrg{ 22457ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 22467ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 22477ec681f3Smrg uint32_t att_idx; 224801e04c3fSmrg 22497ec681f3Smrg if (!cmd_buffer->state.attachments || !subpass) 22507ec681f3Smrg return; 225101e04c3fSmrg 22527ec681f3Smrg att_idx = subpass->color_attachments[cb_idx].attachment; 22537ec681f3Smrg if (att_idx == VK_ATTACHMENT_UNUSED) 22547ec681f3Smrg return; 225501e04c3fSmrg 22567ec681f3Smrg if (cmd_buffer->state.attachments[att_idx].iview->image != image) 22577ec681f3Smrg return; 225801e04c3fSmrg 22597ec681f3Smrg radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); 22607ec681f3Smrg radeon_emit(cs, color_values[0]); 22617ec681f3Smrg radeon_emit(cs, color_values[1]); 2262ed98bd31Smaya 22637ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 226401e04c3fSmrg} 226501e04c3fSmrg 226601e04c3fSmrg/** 226701e04c3fSmrg * Set the clear color values to the image's metadata. 226801e04c3fSmrg */ 226901e04c3fSmrgstatic void 22707ec681f3Smrgradv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 22717ec681f3Smrg const VkImageSubresourceRange *range, uint32_t color_values[2]) 227201e04c3fSmrg{ 22737ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 22747ec681f3Smrg uint32_t level_count = radv_get_levelCount(image, range); 22757ec681f3Smrg uint32_t count = 2 * level_count; 227601e04c3fSmrg 22777ec681f3Smrg assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)); 227801e04c3fSmrg 22797ec681f3Smrg if (radv_image_has_clear_value(image)) { 22807ec681f3Smrg uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel); 228101e04c3fSmrg 22827ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating)); 22837ec681f3Smrg radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 22847ec681f3Smrg radeon_emit(cs, va); 22857ec681f3Smrg radeon_emit(cs, va >> 32); 22867ec681f3Smrg 22877ec681f3Smrg for (uint32_t l = 0; l < level_count; l++) { 22887ec681f3Smrg radeon_emit(cs, color_values[0]); 22897ec681f3Smrg radeon_emit(cs, color_values[1]); 22907ec681f3Smrg } 22917ec681f3Smrg } else { 22927ec681f3Smrg /* Some default value we can set in the update. */ 22937ec681f3Smrg assert(color_values[0] == 0 && color_values[1] == 0); 22947ec681f3Smrg } 229501e04c3fSmrg} 229601e04c3fSmrg 229701e04c3fSmrg/** 229801e04c3fSmrg * Update the clear color values for this image. 
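 * The values are written into the image metadata and, if the image is bound
 * as a color buffer, into the CB clear-word registers as well. Images fast
 * cleared with comp-to-single are skipped because the hardware reads the
 * value from the image directly.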
229901e04c3fSmrg */ 230001e04c3fSmrgvoid 230101e04c3fSmrgradv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 23027ec681f3Smrg const struct radv_image_view *iview, int cb_idx, 23037ec681f3Smrg uint32_t color_values[2]) 230401e04c3fSmrg{ 23057ec681f3Smrg struct radv_image *image = iview->image; 23067ec681f3Smrg VkImageSubresourceRange range = { 23077ec681f3Smrg .aspectMask = iview->aspect_mask, 23087ec681f3Smrg .baseMipLevel = iview->base_mip, 23097ec681f3Smrg .levelCount = iview->level_count, 23107ec681f3Smrg .baseArrayLayer = iview->base_layer, 23117ec681f3Smrg .layerCount = iview->layer_count, 23127ec681f3Smrg }; 23137ec681f3Smrg 23147ec681f3Smrg assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip)); 231501e04c3fSmrg 23167ec681f3Smrg /* Do not need to update the clear value for images that are fast cleared with the comp-to-single 23177ec681f3Smrg * mode because the hardware gets the value from the image directly. 23187ec681f3Smrg */ 23197ec681f3Smrg if (iview->image->support_comp_to_single) 23207ec681f3Smrg return; 232101e04c3fSmrg 23227ec681f3Smrg radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values); 23237ec681f3Smrg 23247ec681f3Smrg radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 232501e04c3fSmrg} 232601e04c3fSmrg 232701e04c3fSmrg/** 232801e04c3fSmrg * Load the clear color values from the image's metadata. 232901e04c3fSmrg */ 233001e04c3fSmrgstatic void 23317ec681f3Smrgradv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, 23327ec681f3Smrg int cb_idx) 23337ec681f3Smrg{ 23347ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 23357ec681f3Smrg struct radv_image *image = iview->image; 23367ec681f3Smrg 23377ec681f3Smrg if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip)) 23387ec681f3Smrg return; 23397ec681f3Smrg 23407ec681f3Smrg if (iview->image->support_comp_to_single) 23417ec681f3Smrg return; 23427ec681f3Smrg 23437ec681f3Smrg if (!radv_image_has_clear_value(image)) { 23447ec681f3Smrg uint32_t color_values[2] = {0, 0}; 23457ec681f3Smrg radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 23467ec681f3Smrg return; 23477ec681f3Smrg } 23487ec681f3Smrg 23497ec681f3Smrg uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip); 23507ec681f3Smrg uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; 23517ec681f3Smrg 23527ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 23537ec681f3Smrg radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating)); 23547ec681f3Smrg radeon_emit(cs, va); 23557ec681f3Smrg radeon_emit(cs, va >> 32); 23567ec681f3Smrg radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 23577ec681f3Smrg radeon_emit(cs, 2); 23587ec681f3Smrg } else { 23597ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 23607ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 23617ec681f3Smrg COPY_DATA_COUNT_SEL); 23627ec681f3Smrg radeon_emit(cs, va); 23637ec681f3Smrg radeon_emit(cs, va >> 32); 23647ec681f3Smrg radeon_emit(cs, reg >> 2); 23657ec681f3Smrg radeon_emit(cs, 0); 23667ec681f3Smrg 23677ec681f3Smrg radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 23687ec681f3Smrg radeon_emit(cs, 0); 23697ec681f3Smrg } 23707ec681f3Smrg} 23717ec681f3Smrg 23727ec681f3Smrg/* GFX9+ metadata cache flushing workaround. 
Metadata cache coherency is
23737ec681f3Smrg * broken if the CB caches data of multiple mips of the same image at the
23747ec681f3Smrg * same time.
23757ec681f3Smrg *
23767ec681f3Smrg * Insert some flushes to avoid this.
23777ec681f3Smrg */
23787ec681f3Smrgstatic void
23797ec681f3Smrgradv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
238001e04c3fSmrg{
23817ec681f3Smrg   struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
23827ec681f3Smrg   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
23837ec681f3Smrg   bool color_mip_changed = false;
238401e04c3fSmrg
23857ec681f3Smrg   /* Entire workaround is not applicable before GFX9 */
23867ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
23877ec681f3Smrg      return;
238801e04c3fSmrg
23897ec681f3Smrg   if (!framebuffer)
23907ec681f3Smrg      return;
239101e04c3fSmrg
23927ec681f3Smrg   for (int i = 0; i < subpass->color_count; ++i) {
23937ec681f3Smrg      int idx = subpass->color_attachments[i].attachment;
23947ec681f3Smrg      if (idx == VK_ATTACHMENT_UNUSED)
23957ec681f3Smrg         continue;
239601e04c3fSmrg
23977ec681f3Smrg      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
239801e04c3fSmrg
23997ec681f3Smrg      if ((radv_image_has_CB_metadata(iview->image) ||
24007ec681f3Smrg           radv_dcc_enabled(iview->image, iview->base_mip) ||
24017ec681f3Smrg           radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
24027ec681f3Smrg          cmd_buffer->state.cb_mip[i] != iview->base_mip)
24037ec681f3Smrg         color_mip_changed = true;
240401e04c3fSmrg
24057ec681f3Smrg      cmd_buffer->state.cb_mip[i] = iview->base_mip;
24067ec681f3Smrg   }
240701e04c3fSmrg
24087ec681f3Smrg   if (color_mip_changed) {
24097ec681f3Smrg      cmd_buffer->state.flush_bits |=
24107ec681f3Smrg         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
24117ec681f3Smrg   }
241201e04c3fSmrg}
241301e04c3fSmrg
24147ec681f3Smrg/* This function does the flushes for mip changes if the levels are not zero for
24157ec681f3Smrg * all render targets. This way we can assume at the start of the next cmd_buffer
24167ec681f3Smrg * that rendering to mip 0 doesn't need any flushes. Since that is the most common
24177ec681f3Smrg * case, this saves some flushes.
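 * Concretely: cb_mip[] is zeroed after the flush below, so
 * radv_emit_fb_mip_change_flush() starts from the mip-0 assumption again.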
*/ 241801e04c3fSmrgstatic void 24197ec681f3Smrgradv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer) 242001e04c3fSmrg{ 24217ec681f3Smrg /* Entire workaround is not applicable before GFX9 */ 24227ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) 24237ec681f3Smrg return; 242401e04c3fSmrg 24257ec681f3Smrg bool need_color_mip_flush = false; 24267ec681f3Smrg for (unsigned i = 0; i < 8; ++i) { 24277ec681f3Smrg if (cmd_buffer->state.cb_mip[i]) { 24287ec681f3Smrg need_color_mip_flush = true; 24297ec681f3Smrg break; 24307ec681f3Smrg } 24317ec681f3Smrg } 243201e04c3fSmrg 24337ec681f3Smrg if (need_color_mip_flush) { 24347ec681f3Smrg cmd_buffer->state.flush_bits |= 24357ec681f3Smrg RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 24367ec681f3Smrg } 243701e04c3fSmrg 24387ec681f3Smrg memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip)); 243901e04c3fSmrg} 244001e04c3fSmrg 24417ec681f3Smrgstatic struct radv_image * 24427ec681f3Smrgradv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer) 244301e04c3fSmrg{ 24447ec681f3Smrg struct radv_device *device = cmd_buffer->device; 24457ec681f3Smrg 24467ec681f3Smrg if (!device->vrs.image) { 24477ec681f3Smrg VkResult result; 244801e04c3fSmrg 24497ec681f3Smrg /* The global VRS state is initialized on-demand to avoid wasting VRAM. */ 24507ec681f3Smrg result = radv_device_init_vrs_state(device); 24517ec681f3Smrg if (result != VK_SUCCESS) { 24527ec681f3Smrg cmd_buffer->record_result = result; 24537ec681f3Smrg return NULL; 24547ec681f3Smrg } 24557ec681f3Smrg } 245601e04c3fSmrg 24577ec681f3Smrg return device->vrs.image; 245801e04c3fSmrg} 245901e04c3fSmrg 246001e04c3fSmrgstatic void 24617ec681f3Smrgradv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) 24627ec681f3Smrg{ 24637ec681f3Smrg int i; 24647ec681f3Smrg struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 24657ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 24667ec681f3Smrg 24677ec681f3Smrg /* this may happen for inherited secondary recording */ 24687ec681f3Smrg if (!framebuffer) 24697ec681f3Smrg return; 24707ec681f3Smrg 24717ec681f3Smrg for (i = 0; i < 8; ++i) { 24727ec681f3Smrg if (i >= subpass->color_count || 24737ec681f3Smrg subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { 24747ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 24757ec681f3Smrg S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 24767ec681f3Smrg continue; 24777ec681f3Smrg } 24787ec681f3Smrg 24797ec681f3Smrg int idx = subpass->color_attachments[i].attachment; 24807ec681f3Smrg struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 24817ec681f3Smrg VkImageLayout layout = subpass->color_attachments[i].layout; 24827ec681f3Smrg bool in_render_loop = subpass->color_attachments[i].in_render_loop; 24837ec681f3Smrg 24847ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo); 24857ec681f3Smrg 24867ec681f3Smrg assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT | 24877ec681f3Smrg VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)); 24887ec681f3Smrg radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, 24897ec681f3Smrg in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc); 24907ec681f3Smrg 24917ec681f3Smrg radv_load_color_clear_metadata(cmd_buffer, iview, i); 24927ec681f3Smrg } 24937ec681f3Smrg 24947ec681f3Smrg if 
(subpass->depth_stencil_attachment) { 24957ec681f3Smrg int idx = subpass->depth_stencil_attachment->attachment; 24967ec681f3Smrg VkImageLayout layout = subpass->depth_stencil_attachment->layout; 24977ec681f3Smrg bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 24987ec681f3Smrg struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 24997ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 25007ec681f3Smrg cmd_buffer->state.attachments[idx].iview->image->bo); 25017ec681f3Smrg 25027ec681f3Smrg radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, 25037ec681f3Smrg in_render_loop); 25047ec681f3Smrg 25057ec681f3Smrg if (radv_layout_is_htile_compressed( 25067ec681f3Smrg cmd_buffer->device, iview->image, layout, in_render_loop, 25077ec681f3Smrg radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, 25087ec681f3Smrg cmd_buffer->queue_family_index))) { 25097ec681f3Smrg /* Only load the depth/stencil fast clear values when 25107ec681f3Smrg * compressed rendering is enabled. 25117ec681f3Smrg */ 25127ec681f3Smrg radv_load_ds_clear_metadata(cmd_buffer, iview); 25137ec681f3Smrg } 25147ec681f3Smrg } else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) { 25157ec681f3Smrg /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to 25167ec681f3Smrg * bind our internal depth buffer that contains the VRS data as part of HTILE. 25177ec681f3Smrg */ 25187ec681f3Smrg VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; 25197ec681f3Smrg struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 25207ec681f3Smrg struct radv_image *image = cmd_buffer->device->vrs.image; 25217ec681f3Smrg struct radv_ds_buffer_info ds; 25227ec681f3Smrg struct radv_image_view iview; 25237ec681f3Smrg 25247ec681f3Smrg radv_image_view_init(&iview, cmd_buffer->device, 25257ec681f3Smrg &(VkImageViewCreateInfo){ 25267ec681f3Smrg .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, 25277ec681f3Smrg .image = radv_image_to_handle(image), 25287ec681f3Smrg .viewType = radv_meta_get_view_type(image), 25297ec681f3Smrg .format = image->vk_format, 25307ec681f3Smrg .subresourceRange = 25317ec681f3Smrg { 25327ec681f3Smrg .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, 25337ec681f3Smrg .baseMipLevel = 0, 25347ec681f3Smrg .levelCount = 1, 25357ec681f3Smrg .baseArrayLayer = 0, 25367ec681f3Smrg .layerCount = 1, 25377ec681f3Smrg }, 25387ec681f3Smrg }, 25397ec681f3Smrg NULL); 25407ec681f3Smrg 25417ec681f3Smrg radv_initialise_vrs_surface(image, htile_buffer, &ds); 25427ec681f3Smrg 25437ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo); 25447ec681f3Smrg 25457ec681f3Smrg radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false); 25467ec681f3Smrg 25477ec681f3Smrg radv_image_view_finish(&iview); 25487ec681f3Smrg } else { 25497ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) 25507ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); 25517ec681f3Smrg else 25527ec681f3Smrg radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); 25537ec681f3Smrg 25547ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ 25557ec681f3Smrg radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ 25567ec681f3Smrg } 25577ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, 25587ec681f3Smrg 
S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
25597ec681f3Smrg
25607ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
25617ec681f3Smrg      bool disable_constant_encode =
25627ec681f3Smrg         cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
25637ec681f3Smrg      enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
25647ec681f3Smrg      uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
25657ec681f3Smrg
25667ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
25677ec681f3Smrg                             S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
25687ec681f3Smrg                                S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
25697ec681f3Smrg                                S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
25707ec681f3Smrg   }
25717ec681f3Smrg
25727ec681f3Smrg   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
257301e04c3fSmrg}
257401e04c3fSmrg
257501e04c3fSmrgstatic void
25767ec681f3Smrgradv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
257701e04c3fSmrg{
25787ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
25797ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
258001e04c3fSmrg
25817ec681f3Smrg   if (state->index_type != state->last_index_type) {
25827ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
25837ec681f3Smrg         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
25847ec681f3Smrg                                    R_03090C_VGT_INDEX_TYPE, 2, state->index_type);
25857ec681f3Smrg      } else {
25867ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
25877ec681f3Smrg         radeon_emit(cs, state->index_type);
25887ec681f3Smrg      }
258901e04c3fSmrg
25907ec681f3Smrg      state->last_index_type = state->index_type;
25917ec681f3Smrg   }
259201e04c3fSmrg
25937ec681f3Smrg   /* For direct indexed draws we use DRAW_INDEX_2, which includes
25947ec681f3Smrg    * the index_va and max_index_count already. */
25957ec681f3Smrg   if (!indirect)
25967ec681f3Smrg      return;
259701e04c3fSmrg
25987ec681f3Smrg   radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
25997ec681f3Smrg   radeon_emit(cs, state->index_va);
26007ec681f3Smrg   radeon_emit(cs, state->index_va >> 32);
260101e04c3fSmrg
26027ec681f3Smrg   radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
26037ec681f3Smrg   radeon_emit(cs, state->max_index_count);
260401e04c3fSmrg
26057ec681f3Smrg   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
26067ec681f3Smrg}
260701e04c3fSmrg
26087ec681f3Smrgvoid
26097ec681f3Smrgradv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
26107ec681f3Smrg{
26117ec681f3Smrg   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
26127ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
26137ec681f3Smrg   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
26147ec681f3Smrg   uint32_t db_count_control;
26157ec681f3Smrg
26167ec681f3Smrg   if (!cmd_buffer->state.active_occlusion_queries) {
26177ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
26187ec681f3Smrg         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
26197ec681f3Smrg             pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
26207ec681f3Smrg            /* Re-enable out-of-order rasterization if the
26217ec681f3Smrg             * bound pipeline supports it and if it has
26227ec681f3Smrg             * been disabled before starting any perfect
26237ec681f3Smrg             * occlusion queries.
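             * (Presumably because out-of-order rasterization can perturb the
             * ZPASS sample counts that precise occlusion queries rely on.)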
26247ec681f3Smrg */ 26257ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1); 26267ec681f3Smrg } 26277ec681f3Smrg } 26287ec681f3Smrg db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); 26297ec681f3Smrg } else { 26307ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 26317ec681f3Smrg uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0; 26327ec681f3Smrg bool gfx10_perfect = 26337ec681f3Smrg cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries; 26347ec681f3Smrg 26357ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 26367ec681f3Smrg /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially 26377ec681f3Smrg * covered tiles, discards, and early depth testing. For more details, 26387ec681f3Smrg * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */ 26397ec681f3Smrg db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | 26407ec681f3Smrg S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | 26417ec681f3Smrg S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) | 26427ec681f3Smrg S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1); 26437ec681f3Smrg 26447ec681f3Smrg if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) && 26457ec681f3Smrg pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) { 26467ec681f3Smrg /* If the bound pipeline has enabled 26477ec681f3Smrg * out-of-order rasterization, we should 26487ec681f3Smrg * disable it before starting any perfect 26497ec681f3Smrg * occlusion queries. 26507ec681f3Smrg */ 26517ec681f3Smrg pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE; 26527ec681f3Smrg 26537ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1); 26547ec681f3Smrg } 26557ec681f3Smrg } else { 26567ec681f3Smrg db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate); 26577ec681f3Smrg } 26587ec681f3Smrg } 26597ec681f3Smrg 26607ec681f3Smrg radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); 26617ec681f3Smrg 26627ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 26637ec681f3Smrg} 26647ec681f3Smrg 26657ec681f3Smrgunsigned 26667ec681f3Smrgradv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs) 26677ec681f3Smrg{ 26687ec681f3Smrg /* instance_rate_vs_prologs is a flattened array of array of arrays of different sizes, or a 26697ec681f3Smrg * single array sorted in ascending order using: 26707ec681f3Smrg * - total number of attributes 26717ec681f3Smrg * - number of instanced attributes 26727ec681f3Smrg * - index of first instanced attribute 26737ec681f3Smrg */ 26747ec681f3Smrg 26757ec681f3Smrg /* From total number of attributes to offset. */ 26767ec681f3Smrg static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 26777ec681f3Smrg 120, 165, 220, 286, 364, 455, 560, 680}; 26787ec681f3Smrg unsigned start_index = total_to_offset[num_attributes - 1]; 26797ec681f3Smrg 26807ec681f3Smrg /* From number of instanced attributes to offset. This would require a different LUT depending on 26817ec681f3Smrg * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total 26827ec681f3Smrg * attributes. 
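 *
 * A quick sanity-check example (inputs made up): num_attributes = 4 and
 * instance_rate_inputs = 0b0110 give start_index = total_to_offset[3] = 10,
 * count = 2, first = 1, and offset_from_start_index =
 * count_to_offset_total16[1] - (16 - 4) * (2 - 1) = 16 - 12 = 4, so the
 * returned index is 10 + 4 + 1 = 15.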
26837ec681f3Smrg */ 26847ec681f3Smrg static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91, 26857ec681f3Smrg 100, 108, 115, 121, 126, 130, 133, 135}; 26867ec681f3Smrg unsigned count = util_bitcount(instance_rate_inputs); 26877ec681f3Smrg unsigned offset_from_start_index = 26887ec681f3Smrg count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1)); 26897ec681f3Smrg 26907ec681f3Smrg unsigned first = ffs(instance_rate_inputs) - 1; 26917ec681f3Smrg return start_index + offset_from_start_index + first; 26927ec681f3Smrg} 26937ec681f3Smrg 26947ec681f3Smrgunion vs_prolog_key_header { 26957ec681f3Smrg struct { 26967ec681f3Smrg uint32_t key_size : 8; 26977ec681f3Smrg uint32_t num_attributes : 6; 26987ec681f3Smrg uint32_t as_ls : 1; 26997ec681f3Smrg uint32_t is_ngg : 1; 27007ec681f3Smrg uint32_t wave32 : 1; 27017ec681f3Smrg uint32_t next_stage : 3; 27027ec681f3Smrg uint32_t instance_rate_inputs : 1; 27037ec681f3Smrg uint32_t alpha_adjust_lo : 1; 27047ec681f3Smrg uint32_t alpha_adjust_hi : 1; 27057ec681f3Smrg uint32_t misaligned_mask : 1; 27067ec681f3Smrg uint32_t post_shuffle : 1; 27077ec681f3Smrg uint32_t nontrivial_divisors : 1; 27087ec681f3Smrg /* We need this to ensure the padding is zero. It's useful even if it's unused. */ 27097ec681f3Smrg uint32_t padding0 : 6; 27107ec681f3Smrg }; 27117ec681f3Smrg uint32_t v; 27127ec681f3Smrg}; 271301e04c3fSmrg 27147ec681f3Smrguint32_t 27157ec681f3Smrgradv_hash_vs_prolog(const void *key_) 27167ec681f3Smrg{ 27177ec681f3Smrg const uint32_t *key = key_; 27187ec681f3Smrg union vs_prolog_key_header header; 27197ec681f3Smrg header.v = key[0]; 27207ec681f3Smrg return _mesa_hash_data(key, header.key_size); 27217ec681f3Smrg} 272201e04c3fSmrg 27237ec681f3Smrgbool 27247ec681f3Smrgradv_cmp_vs_prolog(const void *a_, const void *b_) 27257ec681f3Smrg{ 27267ec681f3Smrg const uint32_t *a = a_; 27277ec681f3Smrg const uint32_t *b = b_; 27287ec681f3Smrg if (a[0] != b[0]) 27297ec681f3Smrg return false; 27307ec681f3Smrg 27317ec681f3Smrg union vs_prolog_key_header header; 27327ec681f3Smrg header.v = a[0]; 27337ec681f3Smrg return memcmp(a, b, header.key_size) == 0; 27347ec681f3Smrg} 27357ec681f3Smrg 27367ec681f3Smrgstatic struct radv_shader_prolog * 27377ec681f3Smrglookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, 27387ec681f3Smrg uint32_t *nontrivial_divisors) 27397ec681f3Smrg{ 27407ec681f3Smrg STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4); 27417ec681f3Smrg assert(vs_shader->info.vs.dynamic_inputs); 27427ec681f3Smrg 27437ec681f3Smrg const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 27447ec681f3Smrg const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 27457ec681f3Smrg struct radv_device *device = cmd_buffer->device; 27467ec681f3Smrg 27477ec681f3Smrg unsigned num_attributes = pipeline->last_vertex_attrib_bit; 27487ec681f3Smrg uint32_t attribute_mask = BITFIELD_MASK(num_attributes); 27497ec681f3Smrg 27507ec681f3Smrg uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask; 27517ec681f3Smrg *nontrivial_divisors = state->nontrivial_divisors & attribute_mask; 27527ec681f3Smrg enum chip_class chip = device->physical_device->rad_info.chip_class; 27537ec681f3Smrg const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? 
cmd_buffer->state.vbo_misaligned_mask : 0; 27547ec681f3Smrg 27557ec681f3Smrg /* try to use a pre-compiled prolog first */ 27567ec681f3Smrg struct radv_shader_prolog *prolog = NULL; 27577ec681f3Smrg if (pipeline->can_use_simple_input && 27587ec681f3Smrg (!vs_shader->info.vs.as_ls || !instance_rate_inputs) && 27597ec681f3Smrg !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) { 27607ec681f3Smrg if (!instance_rate_inputs) { 27617ec681f3Smrg prolog = device->simple_vs_prologs[num_attributes - 1]; 27627ec681f3Smrg } else if (num_attributes <= 16 && !*nontrivial_divisors && 27637ec681f3Smrg util_bitcount(instance_rate_inputs) == 27647ec681f3Smrg (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) { 27657ec681f3Smrg unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs); 27667ec681f3Smrg prolog = device->instance_rate_vs_prologs[index]; 27677ec681f3Smrg } 27687ec681f3Smrg } 27697ec681f3Smrg if (prolog) 27707ec681f3Smrg return prolog; 27717ec681f3Smrg 27727ec681f3Smrg /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */ 27737ec681f3Smrg uint32_t key_words[16]; 27747ec681f3Smrg unsigned key_size = 1; 27757ec681f3Smrg 27767ec681f3Smrg struct radv_vs_prolog_key key; 27777ec681f3Smrg key.state = state; 27787ec681f3Smrg key.num_attributes = num_attributes; 27797ec681f3Smrg key.misaligned_mask = misaligned_mask; 27807ec681f3Smrg /* The instance ID input VGPR is placed differently when as_ls=true. */ 27817ec681f3Smrg key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs; 27827ec681f3Smrg key.is_ngg = vs_shader->info.is_ngg; 27837ec681f3Smrg key.wave32 = vs_shader->info.wave_size == 32; 27847ec681f3Smrg key.next_stage = pipeline->next_vertex_stage; 27857ec681f3Smrg 27867ec681f3Smrg union vs_prolog_key_header header; 27877ec681f3Smrg header.v = 0; 27887ec681f3Smrg header.num_attributes = num_attributes; 27897ec681f3Smrg header.as_ls = key.as_ls; 27907ec681f3Smrg header.is_ngg = key.is_ngg; 27917ec681f3Smrg header.wave32 = key.wave32; 27927ec681f3Smrg header.next_stage = key.next_stage; 27937ec681f3Smrg 27947ec681f3Smrg if (instance_rate_inputs & ~*nontrivial_divisors) { 27957ec681f3Smrg header.instance_rate_inputs = true; 27967ec681f3Smrg key_words[key_size++] = instance_rate_inputs; 27977ec681f3Smrg } 27987ec681f3Smrg if (*nontrivial_divisors) { 27997ec681f3Smrg header.nontrivial_divisors = true; 28007ec681f3Smrg key_words[key_size++] = *nontrivial_divisors; 28017ec681f3Smrg } 28027ec681f3Smrg if (misaligned_mask) { 28037ec681f3Smrg header.misaligned_mask = true; 28047ec681f3Smrg key_words[key_size++] = misaligned_mask; 28057ec681f3Smrg 28067ec681f3Smrg uint8_t *formats = (uint8_t *)&key_words[key_size]; 28077ec681f3Smrg unsigned num_formats = 0; 28087ec681f3Smrg u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index]; 28097ec681f3Smrg while (num_formats & 0x3) 28107ec681f3Smrg formats[num_formats++] = 0; 28117ec681f3Smrg key_size += num_formats / 4u; 28127ec681f3Smrg 28137ec681f3Smrg if (state->post_shuffle & attribute_mask) { 28147ec681f3Smrg header.post_shuffle = true; 28157ec681f3Smrg key_words[key_size++] = state->post_shuffle & attribute_mask; 28167ec681f3Smrg } 28177ec681f3Smrg } 28187ec681f3Smrg if (state->alpha_adjust_lo & attribute_mask) { 28197ec681f3Smrg header.alpha_adjust_lo = true; 28207ec681f3Smrg key_words[key_size++] = state->alpha_adjust_lo & attribute_mask; 28217ec681f3Smrg } 28227ec681f3Smrg if (state->alpha_adjust_hi & attribute_mask) { 
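      /* Presumably the second bit of the per-attribute alpha-adjust code; the
       * first bit is emitted in the alpha_adjust_lo block above (used e.g. for
       * signed 2_10_10_10 vertex formats).
       */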
28237ec681f3Smrg header.alpha_adjust_hi = true; 28247ec681f3Smrg key_words[key_size++] = state->alpha_adjust_hi & attribute_mask; 28257ec681f3Smrg } 28267ec681f3Smrg 28277ec681f3Smrg header.key_size = key_size * sizeof(key_words[0]); 28287ec681f3Smrg key_words[0] = header.v; 28297ec681f3Smrg 28307ec681f3Smrg uint32_t hash = radv_hash_vs_prolog(key_words); 28317ec681f3Smrg 28327ec681f3Smrg if (cmd_buffer->state.emitted_vs_prolog && 28337ec681f3Smrg cmd_buffer->state.emitted_vs_prolog_key_hash == hash && 28347ec681f3Smrg radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key)) 28357ec681f3Smrg return cmd_buffer->state.emitted_vs_prolog; 28367ec681f3Smrg 28377ec681f3Smrg u_rwlock_rdlock(&device->vs_prologs_lock); 28387ec681f3Smrg struct hash_entry *prolog_entry = 28397ec681f3Smrg _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); 28407ec681f3Smrg u_rwlock_rdunlock(&device->vs_prologs_lock); 28417ec681f3Smrg 28427ec681f3Smrg if (!prolog_entry) { 28437ec681f3Smrg u_rwlock_wrlock(&device->vs_prologs_lock); 28447ec681f3Smrg prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); 28457ec681f3Smrg if (prolog_entry) { 28467ec681f3Smrg u_rwlock_wrunlock(&device->vs_prologs_lock); 28477ec681f3Smrg return prolog_entry->data; 28487ec681f3Smrg } 28497ec681f3Smrg 28507ec681f3Smrg prolog = radv_create_vs_prolog(device, &key); 28517ec681f3Smrg uint32_t *key2 = malloc(key_size * 4); 28527ec681f3Smrg if (!prolog || !key2) { 28537ec681f3Smrg radv_prolog_destroy(device, prolog); 28547ec681f3Smrg free(key2); 28557ec681f3Smrg u_rwlock_wrunlock(&device->vs_prologs_lock); 28567ec681f3Smrg return NULL; 28577ec681f3Smrg } 28587ec681f3Smrg memcpy(key2, key_words, key_size * 4); 28597ec681f3Smrg _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog); 28607ec681f3Smrg 28617ec681f3Smrg u_rwlock_wrunlock(&device->vs_prologs_lock); 28627ec681f3Smrg return prolog; 28637ec681f3Smrg } 28647ec681f3Smrg 28657ec681f3Smrg return prolog_entry->data; 286601e04c3fSmrg} 286701e04c3fSmrg 286801e04c3fSmrgstatic void 28697ec681f3Smrgemit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, 28707ec681f3Smrg struct radv_shader_prolog *prolog, bool pipeline_is_dirty) 28717ec681f3Smrg{ 28727ec681f3Smrg /* no need to re-emit anything in this case */ 28737ec681f3Smrg if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty) 28747ec681f3Smrg return; 28757ec681f3Smrg 28767ec681f3Smrg enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; 28777ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 28787ec681f3Smrg uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset; 28797ec681f3Smrg 28807ec681f3Smrg assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline); 28817ec681f3Smrg assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs); 28827ec681f3Smrg 28837ec681f3Smrg uint32_t rsrc1 = vs_shader->config.rsrc1; 28847ec681f3Smrg if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1)) 28857ec681f3Smrg rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS); 28867ec681f3Smrg 28877ec681f3Smrg /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not 28887ec681f3Smrg * work. 
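 * (The prolog is presumably executed with the main shader's register
 * allocation, so its VGPR demand has to fit within the main shader's.)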
28897ec681f3Smrg */ 28907ec681f3Smrg assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1)); 28917ec681f3Smrg 28927ec681f3Smrg unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS; 28937ec681f3Smrg unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; 28947ec681f3Smrg if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) { 28957ec681f3Smrg pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES; 28967ec681f3Smrg rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; 28977ec681f3Smrg } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) { 28987ec681f3Smrg pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS; 28997ec681f3Smrg rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS; 29007ec681f3Smrg } else if (vs_shader->info.vs.as_ls) { 29017ec681f3Smrg pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS; 29027ec681f3Smrg rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS; 29037ec681f3Smrg } else if (vs_shader->info.vs.as_es) { 29047ec681f3Smrg pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES; 29057ec681f3Smrg rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES; 29067ec681f3Smrg } 29077ec681f3Smrg 29087ec681f3Smrg radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2); 29097ec681f3Smrg radeon_emit(cmd_buffer->cs, prolog_va >> 8); 29107ec681f3Smrg radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40)); 29117ec681f3Smrg 29127ec681f3Smrg if (chip < GFX10) 29137ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1); 29147ec681f3Smrg else 29157ec681f3Smrg assert(rsrc1 == vs_shader->config.rsrc1); 29167ec681f3Smrg 29177ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo); 291801e04c3fSmrg} 291901e04c3fSmrg 292001e04c3fSmrgstatic void 29217ec681f3Smrgemit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, 29227ec681f3Smrg uint32_t nontrivial_divisors, bool pipeline_is_dirty) 29237ec681f3Smrg{ 29247ec681f3Smrg /* no need to re-emit anything in this case */ 29257ec681f3Smrg if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog && 29267ec681f3Smrg !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors) 29277ec681f3Smrg return; 29287ec681f3Smrg 29297ec681f3Smrg struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 29307ec681f3Smrg uint64_t input_va = radv_shader_variant_get_va(vs_shader); 29317ec681f3Smrg 29327ec681f3Smrg if (nontrivial_divisors) { 29337ec681f3Smrg unsigned inputs_offset; 29347ec681f3Smrg uint32_t *inputs; 29357ec681f3Smrg unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8; 29367ec681f3Smrg if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs)) 29377ec681f3Smrg return; 29387ec681f3Smrg 29397ec681f3Smrg *(inputs++) = input_va; 29407ec681f3Smrg *(inputs++) = input_va >> 32; 29417ec681f3Smrg 29427ec681f3Smrg u_foreach_bit(index, nontrivial_divisors) 29437ec681f3Smrg { 29447ec681f3Smrg uint32_t div = state->divisors[index]; 29457ec681f3Smrg if (div == 0) { 29467ec681f3Smrg *(inputs++) = 0; 29477ec681f3Smrg *(inputs++) = 1; 29487ec681f3Smrg } else if (util_is_power_of_two_or_zero(div)) { 29497ec681f3Smrg *(inputs++) = util_logbase2(div) | (1 << 8); 29507ec681f3Smrg *(inputs++) = 0xffffffffu; 29517ec681f3Smrg } else { 29527ec681f3Smrg struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32); 29537ec681f3Smrg *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16); 29547ec681f3Smrg 
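            /* The second dword of the pair is the 32-bit magic multiplier
             * computed by util_compute_fast_udiv_info(); the first dword
             * (stored above) packs pre_shift (bits 0-7), increment (bit 8) and
             * post_shift (bits 16-23), so the prolog can divide by the
             * instance divisor with a multiply-and-shift instead of an integer
             * division.
             */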
*(inputs++) = info.multiplier; 29557ec681f3Smrg } 29567ec681f3Smrg } 29577ec681f3Smrg 29587ec681f3Smrg input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset; 29597ec681f3Smrg } 29607ec681f3Smrg 29617ec681f3Smrg struct radv_userdata_info *loc = 29627ec681f3Smrg &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS]; 29637ec681f3Smrg uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX]; 29647ec681f3Smrg assert(loc->sgpr_idx != -1); 29657ec681f3Smrg assert(loc->num_sgprs == 2); 29667ec681f3Smrg radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 29677ec681f3Smrg input_va, true); 296801e04c3fSmrg} 296901e04c3fSmrg 297001e04c3fSmrgstatic void 29717ec681f3Smrgradv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 297201e04c3fSmrg{ 29737ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 29747ec681f3Smrg struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); 297501e04c3fSmrg 29767ec681f3Smrg if (!vs_shader->info.vs.has_prolog) 29777ec681f3Smrg return; 297801e04c3fSmrg 29797ec681f3Smrg uint32_t nontrivial_divisors; 29807ec681f3Smrg struct radv_shader_prolog *prolog = 29817ec681f3Smrg lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors); 29827ec681f3Smrg if (!prolog) { 29837ec681f3Smrg cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 29847ec681f3Smrg return; 29857ec681f3Smrg } 29867ec681f3Smrg emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty); 29877ec681f3Smrg emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty); 298801e04c3fSmrg 29897ec681f3Smrg cmd_buffer->state.emitted_vs_prolog = prolog; 299001e04c3fSmrg} 299101e04c3fSmrg 299201e04c3fSmrgstatic void 29937ec681f3Smrgradv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 299401e04c3fSmrg{ 29957ec681f3Smrg uint64_t states = 29967ec681f3Smrg cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state; 299701e04c3fSmrg 29987ec681f3Smrg if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) 29997ec681f3Smrg radv_emit_viewport(cmd_buffer); 300001e04c3fSmrg 30017ec681f3Smrg if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) && 30027ec681f3Smrg !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 30037ec681f3Smrg radv_emit_scissor(cmd_buffer); 300401e04c3fSmrg 30057ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) 30067ec681f3Smrg radv_emit_line_width(cmd_buffer); 300701e04c3fSmrg 30087ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) 30097ec681f3Smrg radv_emit_blend_constants(cmd_buffer); 301001e04c3fSmrg 30117ec681f3Smrg if (states & 30127ec681f3Smrg (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | 30137ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) 30147ec681f3Smrg radv_emit_stencil(cmd_buffer); 301501e04c3fSmrg 30167ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS) 30177ec681f3Smrg radv_emit_depth_bounds(cmd_buffer); 301801e04c3fSmrg 30197ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS) 30207ec681f3Smrg radv_emit_depth_bias(cmd_buffer); 302101e04c3fSmrg 30227ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE) 30237ec681f3Smrg radv_emit_discard_rectangle(cmd_buffer); 302401e04c3fSmrg 30257ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) 30267ec681f3Smrg 
radv_emit_sample_locations(cmd_buffer); 302701e04c3fSmrg 30287ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) 30297ec681f3Smrg radv_emit_line_stipple(cmd_buffer); 303001e04c3fSmrg 30317ec681f3Smrg if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 30327ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE)) 30337ec681f3Smrg radv_emit_culling(cmd_buffer, states); 3034ed98bd31Smaya 30357ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) 30367ec681f3Smrg radv_emit_primitive_topology(cmd_buffer); 303701e04c3fSmrg 30387ec681f3Smrg if (states & 30397ec681f3Smrg (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | 30407ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | 30417ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) 30427ec681f3Smrg radv_emit_depth_control(cmd_buffer, states); 304301e04c3fSmrg 30447ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP) 30457ec681f3Smrg radv_emit_stencil_control(cmd_buffer); 304601e04c3fSmrg 30477ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) 30487ec681f3Smrg radv_emit_fragment_shading_rate(cmd_buffer); 304901e04c3fSmrg 30507ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE) 30517ec681f3Smrg radv_emit_primitive_restart_enable(cmd_buffer); 305201e04c3fSmrg 30537ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) 30547ec681f3Smrg radv_emit_rasterizer_discard_enable(cmd_buffer); 305501e04c3fSmrg 30567ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP) 30577ec681f3Smrg radv_emit_logic_op(cmd_buffer); 305801e04c3fSmrg 30597ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) 30607ec681f3Smrg radv_emit_color_write_enable(cmd_buffer); 306101e04c3fSmrg 30627ec681f3Smrg if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT) 30637ec681f3Smrg radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty); 306401e04c3fSmrg 30657ec681f3Smrg cmd_buffer->state.dirty &= ~states; 30667ec681f3Smrg} 306701e04c3fSmrg 30687ec681f3Smrgstatic void 30697ec681f3Smrgradv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) 30707ec681f3Smrg{ 30717ec681f3Smrg struct radv_descriptor_state *descriptors_state = 30727ec681f3Smrg radv_get_descriptors_state(cmd_buffer, bind_point); 30737ec681f3Smrg struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set; 30747ec681f3Smrg unsigned bo_offset; 307501e04c3fSmrg 30767ec681f3Smrg if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, 30777ec681f3Smrg &bo_offset)) 30787ec681f3Smrg return; 307901e04c3fSmrg 30807ec681f3Smrg set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 30817ec681f3Smrg set->header.va += bo_offset; 30827ec681f3Smrg} 308301e04c3fSmrg 30847ec681f3Smrgstatic void 30857ec681f3Smrgradv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, 30867ec681f3Smrg struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 30877ec681f3Smrg{ 30887ec681f3Smrg struct radv_descriptor_state *descriptors_state = 30897ec681f3Smrg radv_get_descriptors_state(cmd_buffer, bind_point); 30907ec681f3Smrg uint32_t size = MAX_SETS * 4; 30917ec681f3Smrg uint32_t offset; 30927ec681f3Smrg void *ptr; 30937ec681f3Smrg 30947ec681f3Smrg if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr)) 30957ec681f3Smrg return; 30967ec681f3Smrg 
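   /* Each entry below is a single dword: only the low 32 bits of every set's
    * VA are written, which assumes descriptor sets are allocated in 32-bit
    * addressable memory.
    */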
30977ec681f3Smrg for (unsigned i = 0; i < MAX_SETS; i++) { 30987ec681f3Smrg uint32_t *uptr = ((uint32_t *)ptr) + i; 30997ec681f3Smrg uint64_t set_va = 0; 31007ec681f3Smrg struct radv_descriptor_set *set = descriptors_state->sets[i]; 31017ec681f3Smrg if (descriptors_state->valid & (1u << i)) 31027ec681f3Smrg set_va = set->header.va; 31037ec681f3Smrg uptr[0] = set_va & 0xffffffff; 31047ec681f3Smrg } 31057ec681f3Smrg 31067ec681f3Smrg uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 31077ec681f3Smrg va += offset; 31087ec681f3Smrg 31097ec681f3Smrg if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { 31107ec681f3Smrg if (pipeline->shaders[MESA_SHADER_VERTEX]) 31117ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, 31127ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31137ec681f3Smrg 31147ec681f3Smrg if (pipeline->shaders[MESA_SHADER_FRAGMENT]) 31157ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, 31167ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31177ec681f3Smrg 31187ec681f3Smrg if (radv_pipeline_has_gs(pipeline)) 31197ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY, 31207ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31217ec681f3Smrg 31227ec681f3Smrg if (radv_pipeline_has_tess(pipeline)) 31237ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL, 31247ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31257ec681f3Smrg 31267ec681f3Smrg if (radv_pipeline_has_tess(pipeline)) 31277ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL, 31287ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31297ec681f3Smrg } else { 31307ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE, 31317ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 31327ec681f3Smrg } 31337ec681f3Smrg} 313401e04c3fSmrg 31357ec681f3Smrgstatic void 31367ec681f3Smrgradv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 31377ec681f3Smrg struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 31387ec681f3Smrg{ 31397ec681f3Smrg struct radv_descriptor_state *descriptors_state = 31407ec681f3Smrg radv_get_descriptors_state(cmd_buffer, bind_point); 31417ec681f3Smrg bool flush_indirect_descriptors; 314201e04c3fSmrg 31437ec681f3Smrg if (!descriptors_state->dirty) 31447ec681f3Smrg return; 314501e04c3fSmrg 31467ec681f3Smrg if (descriptors_state->push_dirty) 31477ec681f3Smrg radv_flush_push_descriptors(cmd_buffer, bind_point); 314801e04c3fSmrg 31497ec681f3Smrg flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets; 315001e04c3fSmrg 31517ec681f3Smrg if (flush_indirect_descriptors) 31527ec681f3Smrg radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point); 315301e04c3fSmrg 31547ec681f3Smrg ASSERTED unsigned cdw_max = 31557ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4); 315601e04c3fSmrg 31577ec681f3Smrg if (pipeline) { 31587ec681f3Smrg if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 31597ec681f3Smrg radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, 31607ec681f3Smrg MESA_SHADER_COMPUTE); 31617ec681f3Smrg } else { 31627ec681f3Smrg radv_foreach_stage(stage, stages) 31637ec681f3Smrg { 31647ec681f3Smrg if (!cmd_buffer->state.pipeline->shaders[stage]) 31657ec681f3Smrg continue; 316601e04c3fSmrg 31677ec681f3Smrg radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage); 31687ec681f3Smrg } 
31697ec681f3Smrg } 31707ec681f3Smrg } 317101e04c3fSmrg 31727ec681f3Smrg descriptors_state->dirty = 0; 31737ec681f3Smrg descriptors_state->push_dirty = false; 317401e04c3fSmrg 31757ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 317601e04c3fSmrg 31777ec681f3Smrg if (unlikely(cmd_buffer->device->trace_bo)) 31787ec681f3Smrg radv_save_descriptors(cmd_buffer, bind_point); 317901e04c3fSmrg} 318001e04c3fSmrg 31817ec681f3Smrgstatic bool 31827ec681f3Smrgradv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage) 318301e04c3fSmrg{ 31847ec681f3Smrg struct radv_userdata_info *loc = 31857ec681f3Smrg radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS); 31867ec681f3Smrg return loc->sgpr_idx != -1; 31877ec681f3Smrg} 318801e04c3fSmrg 31897ec681f3Smrgstatic void 31907ec681f3Smrgradv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 31917ec681f3Smrg struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 31927ec681f3Smrg{ 31937ec681f3Smrg struct radv_descriptor_state *descriptors_state = 31947ec681f3Smrg radv_get_descriptors_state(cmd_buffer, bind_point); 31957ec681f3Smrg struct radv_shader_variant *shader, *prev_shader; 31967ec681f3Smrg bool need_push_constants = false; 31977ec681f3Smrg unsigned offset; 31987ec681f3Smrg void *ptr; 31997ec681f3Smrg uint64_t va; 32007ec681f3Smrg uint32_t internal_stages; 32017ec681f3Smrg uint32_t dirty_stages = 0; 32027ec681f3Smrg 32037ec681f3Smrg stages &= cmd_buffer->push_constant_stages; 32047ec681f3Smrg if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count)) 32057ec681f3Smrg return; 32067ec681f3Smrg 32077ec681f3Smrg internal_stages = stages; 32087ec681f3Smrg switch (bind_point) { 32097ec681f3Smrg case VK_PIPELINE_BIND_POINT_GRAPHICS: 32107ec681f3Smrg break; 32117ec681f3Smrg case VK_PIPELINE_BIND_POINT_COMPUTE: 32127ec681f3Smrg dirty_stages = RADV_RT_STAGE_BITS; 32137ec681f3Smrg break; 32147ec681f3Smrg case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: 32157ec681f3Smrg internal_stages = VK_SHADER_STAGE_COMPUTE_BIT; 32167ec681f3Smrg dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT; 32177ec681f3Smrg break; 32187ec681f3Smrg default: 32197ec681f3Smrg unreachable("Unhandled bind point"); 32207ec681f3Smrg } 32217ec681f3Smrg 32227ec681f3Smrg radv_foreach_stage(stage, internal_stages) 32237ec681f3Smrg { 32247ec681f3Smrg shader = radv_get_shader(pipeline, stage); 32257ec681f3Smrg if (!shader) 32267ec681f3Smrg continue; 32277ec681f3Smrg 32287ec681f3Smrg need_push_constants |= radv_shader_loads_push_constants(pipeline, stage); 32297ec681f3Smrg 32307ec681f3Smrg uint8_t base = shader->info.min_push_constant_used / 4; 32317ec681f3Smrg 32327ec681f3Smrg radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, 32337ec681f3Smrg (uint32_t *)&cmd_buffer->push_constants[base * 4]); 32347ec681f3Smrg } 32357ec681f3Smrg 32367ec681f3Smrg if (need_push_constants) { 32377ec681f3Smrg if (!radv_cmd_buffer_upload_alloc( 32387ec681f3Smrg cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset, 32397ec681f3Smrg &ptr)) 32407ec681f3Smrg return; 32417ec681f3Smrg 32427ec681f3Smrg memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size); 32437ec681f3Smrg memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers, 32447ec681f3Smrg 16 * pipeline->dynamic_offset_count); 32457ec681f3Smrg 32467ec681f3Smrg va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 32477ec681f3Smrg va += offset; 32487ec681f3Smrg 32497ec681f3Smrg 
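      /* va now points at the packed upload: push_constant_size bytes of push
       * constants immediately followed by one 16-byte descriptor per dynamic
       * buffer (copied above).
       */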
ASSERTED unsigned cdw_max = 32507ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4); 32517ec681f3Smrg 32527ec681f3Smrg prev_shader = NULL; 32537ec681f3Smrg radv_foreach_stage(stage, internal_stages) 32547ec681f3Smrg { 32557ec681f3Smrg shader = radv_get_shader(pipeline, stage); 32567ec681f3Smrg 32577ec681f3Smrg /* Avoid redundantly emitting the address for merged stages. */ 32587ec681f3Smrg if (shader && shader != prev_shader) { 32597ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va); 32607ec681f3Smrg 32617ec681f3Smrg prev_shader = shader; 32627ec681f3Smrg } 32637ec681f3Smrg } 32647ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 32657ec681f3Smrg } 32667ec681f3Smrg 32677ec681f3Smrg cmd_buffer->push_constant_stages &= ~stages; 32687ec681f3Smrg cmd_buffer->push_constant_stages |= dirty_stages; 32697ec681f3Smrg} 32707ec681f3Smrg 32717ec681f3Smrgenum radv_dst_sel { 32727ec681f3Smrg DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 32737ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 32747ec681f3Smrg DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 32757ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 32767ec681f3Smrg DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 32777ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 32787ec681f3Smrg DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 32797ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 32807ec681f3Smrg DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 32817ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 32827ec681f3Smrg DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 32837ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 32847ec681f3Smrg}; 328501e04c3fSmrg 32867ec681f3Smrgstatic const uint32_t data_format_dst_sel[] = { 32877ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001, 32887ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001, 32897ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001, 32907ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01, 32917ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001, 32927ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01, 32937ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1, 32947ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1, 32957ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW, 32967ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW, 32977ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW, 32987ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01, 32997ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW, 33007ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1, 33017ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW, 33027ec681f3Smrg}; 330301e04c3fSmrg 33047ec681f3Smrgstatic void 33057ec681f3Smrgradv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 33067ec681f3Smrg{ 33077ec681f3Smrg if ((pipeline_is_dirty || 
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && 33087ec681f3Smrg cmd_buffer->state.pipeline->vb_desc_usage_mask) { 33097ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 33107ec681f3Smrg struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); 33117ec681f3Smrg enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; 33127ec681f3Smrg unsigned vb_offset; 33137ec681f3Smrg void *vb_ptr; 33147ec681f3Smrg unsigned desc_index = 0; 33157ec681f3Smrg uint32_t mask = pipeline->vb_desc_usage_mask; 33167ec681f3Smrg uint64_t va; 33177ec681f3Smrg struct radv_vs_input_state *vs_state = 33187ec681f3Smrg vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL; 33197ec681f3Smrg 33207ec681f3Smrg /* allocate some descriptor state for vertex buffers */ 33217ec681f3Smrg if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr)) 33227ec681f3Smrg return; 33237ec681f3Smrg 33247ec681f3Smrg assert(!vs_state || pipeline->use_per_attribute_vb_descs); 33257ec681f3Smrg 33267ec681f3Smrg while (mask) { 33277ec681f3Smrg unsigned i = u_bit_scan(&mask); 33287ec681f3Smrg uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4]; 33297ec681f3Smrg uint32_t offset, rsrc_word3; 33307ec681f3Smrg unsigned binding = 33317ec681f3Smrg vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i] 33327ec681f3Smrg : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i); 33337ec681f3Smrg struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer; 33347ec681f3Smrg unsigned num_records; 33357ec681f3Smrg unsigned stride; 33367ec681f3Smrg 33377ec681f3Smrg if (vs_state) { 33387ec681f3Smrg unsigned format = vs_state->formats[i]; 33397ec681f3Smrg unsigned dfmt = format & 0xf; 33407ec681f3Smrg unsigned nfmt = (format >> 4) & 0x7; 33417ec681f3Smrg 33427ec681f3Smrg rsrc_word3 = 33437ec681f3Smrg vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt]; 33447ec681f3Smrg 33457ec681f3Smrg if (chip >= GFX10) 33467ec681f3Smrg rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt)); 33477ec681f3Smrg else 33487ec681f3Smrg rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt); 33497ec681f3Smrg } else { 33507ec681f3Smrg if (chip >= GFX10) 33517ec681f3Smrg rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT); 33527ec681f3Smrg else 33537ec681f3Smrg rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | 33547ec681f3Smrg S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 33557ec681f3Smrg } 33567ec681f3Smrg 33577ec681f3Smrg if (!buffer) { 33587ec681f3Smrg if (vs_state) { 33597ec681f3Smrg /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need 33607ec681f3Smrg * to include the format/word3 so that the alpha channel is 1 for formats without an 33617ec681f3Smrg * alpha channel. 
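 * (Vulkan fills components missing from a vertex format with defaults, 0
 * for missing G/B and 1 for missing A, so a three-component format must
 * still return alpha = 1.)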
33627ec681f3Smrg             */
33637ec681f3Smrg            desc[0] = 0;
33647ec681f3Smrg            desc[1] = S_008F04_STRIDE(16);
33657ec681f3Smrg            desc[2] = 0;
33667ec681f3Smrg            desc[3] = rsrc_word3;
33677ec681f3Smrg         } else {
33687ec681f3Smrg            memset(desc, 0, 4 * 4);
33697ec681f3Smrg         }
33707ec681f3Smrg         continue;
33717ec681f3Smrg      }
33727ec681f3Smrg
33737ec681f3Smrg      va = radv_buffer_get_va(buffer->bo);
33747ec681f3Smrg
33757ec681f3Smrg      offset = cmd_buffer->vertex_bindings[binding].offset;
33767ec681f3Smrg      va += offset + buffer->offset;
33777ec681f3Smrg      if (vs_state)
33787ec681f3Smrg         va += vs_state->offsets[i];
33797ec681f3Smrg
33807ec681f3Smrg      if (cmd_buffer->vertex_bindings[binding].size) {
33817ec681f3Smrg         num_records = cmd_buffer->vertex_bindings[binding].size;
33827ec681f3Smrg      } else {
33837ec681f3Smrg         num_records = buffer->size - offset;
33847ec681f3Smrg      }
33857ec681f3Smrg
33867ec681f3Smrg      if (pipeline->graphics.uses_dynamic_stride) {
33877ec681f3Smrg         stride = cmd_buffer->vertex_bindings[binding].stride;
33887ec681f3Smrg      } else {
33897ec681f3Smrg         stride = pipeline->binding_stride[binding];
33907ec681f3Smrg      }
33917ec681f3Smrg
33927ec681f3Smrg      if (pipeline->use_per_attribute_vb_descs) {
33937ec681f3Smrg         uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
33947ec681f3Smrg                                        : pipeline->attrib_ends[i];
33957ec681f3Smrg
33967ec681f3Smrg         if (num_records < attrib_end) {
33977ec681f3Smrg            num_records = 0; /* not enough space for one vertex */
33987ec681f3Smrg         } else if (stride == 0) {
33997ec681f3Smrg            num_records = 1; /* only one vertex */
34007ec681f3Smrg         } else {
34017ec681f3Smrg            num_records = (num_records - attrib_end) / stride + 1;
34027ec681f3Smrg            /* If attrib_offset>stride, then the compiler will increase the vertex index by
34037ec681f3Smrg             * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
34047ec681f3Smrg             * only allowed with static strides.
34057ec681f3Smrg             */
34067ec681f3Smrg            num_records += pipeline->attrib_index_offset[i];
34077ec681f3Smrg         }
34087ec681f3Smrg
34097ec681f3Smrg         /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
34107ec681f3Smrg          * into bytes in that case. GFX8 always uses bytes.
34117ec681f3Smrg          */
34127ec681f3Smrg         if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
34137ec681f3Smrg            num_records = (num_records - 1) * stride + attrib_end;
34147ec681f3Smrg         } else if (!num_records) {
34157ec681f3Smrg            /* On GFX9, it seems bounds checking is disabled if both
34167ec681f3Smrg             * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
34177ec681f3Smrg             * GFX10.3 but it doesn't hurt.
34187ec681f3Smrg             */
34197ec681f3Smrg            if (vs_state) {
34207ec681f3Smrg               desc[0] = 0;
34217ec681f3Smrg               desc[1] = S_008F04_STRIDE(16);
34227ec681f3Smrg               desc[2] = 0;
34237ec681f3Smrg               desc[3] = rsrc_word3;
34247ec681f3Smrg            } else {
34257ec681f3Smrg               memset(desc, 0, 16);
34267ec681f3Smrg            }
34277ec681f3Smrg            continue;
34287ec681f3Smrg         }
34297ec681f3Smrg      } else {
34307ec681f3Smrg         if (chip != GFX8 && stride)
34317ec681f3Smrg            num_records = DIV_ROUND_UP(num_records, stride);
34327ec681f3Smrg      }
34337ec681f3Smrg
34347ec681f3Smrg      if (chip >= GFX10) {
34357ec681f3Smrg         /* OOB_SELECT chooses the out-of-bounds check:
34367ec681f3Smrg          * - 1: index >= NUM_RECORDS (Structured)
34377ec681f3Smrg          * - 3: offset >= NUM_RECORDS (Raw)
34387ec681f3Smrg          */
34397ec681f3Smrg         int oob_select = stride ?
V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW; 34407ec681f3Smrg rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1); 34417ec681f3Smrg } 34427ec681f3Smrg 34437ec681f3Smrg desc[0] = va; 34447ec681f3Smrg desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); 34457ec681f3Smrg desc[2] = num_records; 34467ec681f3Smrg desc[3] = rsrc_word3; 34477ec681f3Smrg } 34487ec681f3Smrg 34497ec681f3Smrg va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 34507ec681f3Smrg va += vb_offset; 34517ec681f3Smrg 34527ec681f3Smrg radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, 34537ec681f3Smrg va); 34547ec681f3Smrg 34557ec681f3Smrg cmd_buffer->state.vb_va = va; 34567ec681f3Smrg cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS; 34577ec681f3Smrg 34587ec681f3Smrg if (unlikely(cmd_buffer->device->trace_bo)) 34597ec681f3Smrg radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr); 34607ec681f3Smrg } 34617ec681f3Smrg cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER; 346201e04c3fSmrg} 346301e04c3fSmrg 346401e04c3fSmrgstatic void 34657ec681f3Smrgradv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) 346601e04c3fSmrg{ 34677ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 34687ec681f3Smrg struct radv_userdata_info *loc; 34697ec681f3Smrg uint32_t base_reg; 347001e04c3fSmrg 34717ec681f3Smrg for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { 34727ec681f3Smrg if (!radv_get_shader(pipeline, stage)) 34737ec681f3Smrg continue; 347401e04c3fSmrg 34757ec681f3Smrg loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS); 34767ec681f3Smrg if (loc->sgpr_idx == -1) 34777ec681f3Smrg continue; 347801e04c3fSmrg 34797ec681f3Smrg base_reg = pipeline->user_data_0[stage]; 348001e04c3fSmrg 34817ec681f3Smrg radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, 34827ec681f3Smrg false); 34837ec681f3Smrg } 348401e04c3fSmrg 34857ec681f3Smrg if (radv_pipeline_has_gs_copy_shader(pipeline)) { 34867ec681f3Smrg loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS]; 34877ec681f3Smrg if (loc->sgpr_idx != -1) { 34887ec681f3Smrg base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; 348901e04c3fSmrg 34907ec681f3Smrg radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 34917ec681f3Smrg va, false); 34927ec681f3Smrg } 34937ec681f3Smrg } 349401e04c3fSmrg} 349501e04c3fSmrg 34967ec681f3Smrgstatic void 34977ec681f3Smrgradv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) 349801e04c3fSmrg{ 34997ec681f3Smrg if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) { 35007ec681f3Smrg struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 35017ec681f3Smrg struct radv_streamout_state *so = &cmd_buffer->state.streamout; 35027ec681f3Smrg unsigned so_offset; 35037ec681f3Smrg void *so_ptr; 35047ec681f3Smrg uint64_t va; 350501e04c3fSmrg 35067ec681f3Smrg /* Allocate some descriptor state for streamout buffers. 
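 * Each descriptor written below is 4 dwords (16 bytes), hence the
 * MAX_SO_BUFFERS * 16 allocation size.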
*/ 35077ec681f3Smrg if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr)) 35087ec681f3Smrg return; 350901e04c3fSmrg 35107ec681f3Smrg for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { 35117ec681f3Smrg struct radv_buffer *buffer = sb[i].buffer; 35127ec681f3Smrg uint32_t *desc = &((uint32_t *)so_ptr)[i * 4]; 351301e04c3fSmrg 35147ec681f3Smrg if (!(so->enabled_mask & (1 << i))) 35157ec681f3Smrg continue; 351601e04c3fSmrg 35177ec681f3Smrg va = radv_buffer_get_va(buffer->bo) + buffer->offset; 351801e04c3fSmrg 35197ec681f3Smrg va += sb[i].offset; 352001e04c3fSmrg 35217ec681f3Smrg /* Set the descriptor. 35227ec681f3Smrg * 35237ec681f3Smrg * On GFX8, the format must be non-INVALID, otherwise 35247ec681f3Smrg * the buffer will be considered not bound and store 35257ec681f3Smrg * instructions will be no-ops. 35267ec681f3Smrg */ 35277ec681f3Smrg uint32_t size = 0xffffffff; 35287ec681f3Smrg 35297ec681f3Smrg /* Compute the correct buffer size for NGG streamout 35307ec681f3Smrg * because it's used to determine the max emit per 35317ec681f3Smrg * buffer. 35327ec681f3Smrg */ 35337ec681f3Smrg if (cmd_buffer->device->physical_device->use_ngg_streamout) 35347ec681f3Smrg size = buffer->size - sb[i].offset; 353501e04c3fSmrg 35367ec681f3Smrg uint32_t rsrc_word3 = 35377ec681f3Smrg S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 35387ec681f3Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 353901e04c3fSmrg 35407ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { 35417ec681f3Smrg rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 35427ec681f3Smrg S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 35437ec681f3Smrg } else { 35447ec681f3Smrg rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 35457ec681f3Smrg } 354601e04c3fSmrg 35477ec681f3Smrg desc[0] = va; 35487ec681f3Smrg desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); 35497ec681f3Smrg desc[2] = size; 35507ec681f3Smrg desc[3] = rsrc_word3; 35517ec681f3Smrg } 355201e04c3fSmrg 35537ec681f3Smrg va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 35547ec681f3Smrg va += so_offset; 355501e04c3fSmrg 35567ec681f3Smrg radv_emit_streamout_buffers(cmd_buffer, va); 35577ec681f3Smrg } 355801e04c3fSmrg 35597ec681f3Smrg cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; 356001e04c3fSmrg} 356101e04c3fSmrg 356201e04c3fSmrgstatic void 35637ec681f3Smrgradv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer) 356401e04c3fSmrg{ 35657ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 35667ec681f3Smrg struct radv_userdata_info *loc; 35677ec681f3Smrg uint32_t ngg_gs_state = 0; 35687ec681f3Smrg uint32_t base_reg; 356901e04c3fSmrg 35707ec681f3Smrg if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg) 35717ec681f3Smrg return; 357201e04c3fSmrg 35737ec681f3Smrg /* By default NGG GS queries are disabled but they are enabled if the 35747ec681f3Smrg * command buffer has active GDS queries or if it's a secondary command 35757ec681f3Smrg * buffer that inherits the number of generated primitives. 
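 * The flag is handed to the shader through the AC_UD_NGG_GS_STATE user
 * SGPR written below; presumably the NGG GS code keys off it to decide
 * whether to accumulate the generated-primitive counts in GDS.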
35767ec681f3Smrg */ 35777ec681f3Smrg if (cmd_buffer->state.active_pipeline_gds_queries || 35787ec681f3Smrg (cmd_buffer->state.inherited_pipeline_statistics & 35797ec681f3Smrg VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) 35807ec681f3Smrg ngg_gs_state = 1; 3581ed98bd31Smaya 35827ec681f3Smrg loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE); 35837ec681f3Smrg base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY]; 35847ec681f3Smrg assert(loc->sgpr_idx != -1); 358501e04c3fSmrg 35867ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state); 358701e04c3fSmrg} 358801e04c3fSmrg 35897ec681f3Smrgstatic void 35907ec681f3Smrgradv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 359101e04c3fSmrg{ 35927ec681f3Smrg radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty); 35937ec681f3Smrg radv_flush_streamout_descriptors(cmd_buffer); 35947ec681f3Smrg radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline, 35957ec681f3Smrg VK_PIPELINE_BIND_POINT_GRAPHICS); 35967ec681f3Smrg radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline, 35977ec681f3Smrg VK_PIPELINE_BIND_POINT_GRAPHICS); 35987ec681f3Smrg radv_flush_ngg_gs_state(cmd_buffer); 359901e04c3fSmrg} 360001e04c3fSmrg 36017ec681f3Smrgstruct radv_draw_info { 36027ec681f3Smrg /** 36037ec681f3Smrg * Number of vertices. 36047ec681f3Smrg */ 36057ec681f3Smrg uint32_t count; 36067ec681f3Smrg 36077ec681f3Smrg /** 36087ec681f3Smrg * First instance id. 36097ec681f3Smrg */ 36107ec681f3Smrg uint32_t first_instance; 36117ec681f3Smrg 36127ec681f3Smrg /** 36137ec681f3Smrg * Number of instances. 36147ec681f3Smrg */ 36157ec681f3Smrg uint32_t instance_count; 36167ec681f3Smrg 36177ec681f3Smrg /** 36187ec681f3Smrg * Whether it's an indexed draw. 36197ec681f3Smrg */ 36207ec681f3Smrg bool indexed; 36217ec681f3Smrg 36227ec681f3Smrg /** 36237ec681f3Smrg * Indirect draw parameters resource. 36247ec681f3Smrg */ 36257ec681f3Smrg struct radv_buffer *indirect; 36267ec681f3Smrg uint64_t indirect_offset; 36277ec681f3Smrg uint32_t stride; 36287ec681f3Smrg 36297ec681f3Smrg /** 36307ec681f3Smrg * Draw count parameters resource. 36317ec681f3Smrg */ 36327ec681f3Smrg struct radv_buffer *count_buffer; 36337ec681f3Smrg uint64_t count_buffer_offset; 36347ec681f3Smrg 36357ec681f3Smrg /** 36367ec681f3Smrg * Stream output parameters resource. 
36377ec681f3Smrg */ 36387ec681f3Smrg struct radv_buffer *strmout_buffer; 36397ec681f3Smrg uint64_t strmout_buffer_offset; 36407ec681f3Smrg}; 364101e04c3fSmrg 36427ec681f3Smrgstatic uint32_t 36437ec681f3Smrgradv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer) 36447ec681f3Smrg{ 36457ec681f3Smrg switch (cmd_buffer->state.index_type) { 36467ec681f3Smrg case V_028A7C_VGT_INDEX_8: 36477ec681f3Smrg return 0xffu; 36487ec681f3Smrg case V_028A7C_VGT_INDEX_16: 36497ec681f3Smrg return 0xffffu; 36507ec681f3Smrg case V_028A7C_VGT_INDEX_32: 36517ec681f3Smrg return 0xffffffffu; 36527ec681f3Smrg default: 36537ec681f3Smrg unreachable("invalid index type"); 36547ec681f3Smrg } 36557ec681f3Smrg} 365601e04c3fSmrg 36577ec681f3Smrgstatic void 36587ec681f3Smrgsi_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, 36597ec681f3Smrg bool indirect_draw, bool count_from_stream_output, 36607ec681f3Smrg uint32_t draw_vertex_count) 36617ec681f3Smrg{ 36627ec681f3Smrg struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 36637ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 36647ec681f3Smrg unsigned topology = state->dynamic.primitive_topology; 36657ec681f3Smrg bool prim_restart_enable = state->dynamic.primitive_restart_enable; 36667ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 36677ec681f3Smrg unsigned ia_multi_vgt_param; 36687ec681f3Smrg 36697ec681f3Smrg ia_multi_vgt_param = 36707ec681f3Smrg si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output, 36717ec681f3Smrg draw_vertex_count, topology, prim_restart_enable); 36727ec681f3Smrg 36737ec681f3Smrg if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { 36747ec681f3Smrg if (info->chip_class == GFX9) { 36757ec681f3Smrg radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, 36767ec681f3Smrg R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); 36777ec681f3Smrg } else if (info->chip_class >= GFX7) { 36787ec681f3Smrg radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); 36797ec681f3Smrg } else { 36807ec681f3Smrg radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); 36817ec681f3Smrg } 36827ec681f3Smrg state->last_ia_multi_vgt_param = ia_multi_vgt_param; 36837ec681f3Smrg } 368401e04c3fSmrg} 368501e04c3fSmrg 36867ec681f3Smrgstatic void 36877ec681f3Smrgradv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 368801e04c3fSmrg{ 36897ec681f3Smrg struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 36907ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 36917ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 369201e04c3fSmrg 36937ec681f3Smrg /* Draw state. */ 36947ec681f3Smrg if (info->chip_class < GFX10) { 36957ec681f3Smrg si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect, 36967ec681f3Smrg !!draw_info->strmout_buffer, 36977ec681f3Smrg draw_info->indirect ? 
0 : draw_info->count); 36987ec681f3Smrg } 369901e04c3fSmrg 37007ec681f3Smrg if (state->dynamic.primitive_restart_enable) { 37017ec681f3Smrg uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer); 3702ed98bd31Smaya 37037ec681f3Smrg if (primitive_reset_index != state->last_primitive_reset_index) { 37047ec681f3Smrg radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index); 37057ec681f3Smrg state->last_primitive_reset_index = primitive_reset_index; 37067ec681f3Smrg } 37077ec681f3Smrg } 370801e04c3fSmrg 37097ec681f3Smrg if (draw_info->strmout_buffer) { 37107ec681f3Smrg uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo); 371101e04c3fSmrg 37127ec681f3Smrg va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset; 371301e04c3fSmrg 37147ec681f3Smrg radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride); 371501e04c3fSmrg 37167ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 37177ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 37187ec681f3Smrg COPY_DATA_WR_CONFIRM); 37197ec681f3Smrg radeon_emit(cs, va); 37207ec681f3Smrg radeon_emit(cs, va >> 32); 37217ec681f3Smrg radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); 37227ec681f3Smrg radeon_emit(cs, 0); /* unused */ 3723ed98bd31Smaya 37247ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo); 37257ec681f3Smrg } 37267ec681f3Smrg} 372701e04c3fSmrg 37287ec681f3Smrgstatic void 37297ec681f3Smrgradv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask) 37307ec681f3Smrg{ 37317ec681f3Smrg if (src_stage_mask & 37327ec681f3Smrg (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT | 37337ec681f3Smrg VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | 37347ec681f3Smrg VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | 37357ec681f3Smrg VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 37367ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 37377ec681f3Smrg } 37387ec681f3Smrg 37397ec681f3Smrg if (src_stage_mask & 37407ec681f3Smrg (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | 37417ec681f3Smrg VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | 37427ec681f3Smrg VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | 37437ec681f3Smrg VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 37447ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; 37457ec681f3Smrg } else if (src_stage_mask & 37467ec681f3Smrg (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | 37477ec681f3Smrg VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 37487ec681f3Smrg VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 37497ec681f3Smrg VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | 37507ec681f3Smrg VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 37517ec681f3Smrg VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) { 37527ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; 37537ec681f3Smrg } 375401e04c3fSmrg} 375501e04c3fSmrg 37567ec681f3Smrgstatic bool 37577ec681f3Smrgcan_skip_buffer_l2_flushes(struct radv_device *device) 375801e04c3fSmrg{ 37597ec681f3Smrg return device->physical_device->rad_info.chip_class == GFX9 || 37607ec681f3Smrg (device->physical_device->rad_info.chip_class >= GFX10 && 
37617ec681f3Smrg !device->physical_device->rad_info.tcc_rb_non_coherent);
376201e04c3fSmrg}
376301e04c3fSmrg
37647ec681f3Smrg/*
37657ec681f3Smrg * In Vulkan, barriers have two kinds of operations:
37667ec681f3Smrg *
37677ec681f3Smrg * - availability (implemented with radv_src_access_flush)
37687ec681f3Smrg * - visibility (implemented with radv_dst_access_flush)
37697ec681f3Smrg *
37707ec681f3Smrg * For a memory operation to observe the result of a previous memory operation,
37717ec681f3Smrg * one needs to do an availability operation from the source memory and then a
37727ec681f3Smrg * visibility operation to the target memory.
37737ec681f3Smrg *
37747ec681f3Smrg * The complication is that the availability and visibility operations do not
37757ec681f3Smrg * need to be in the same barrier.
37767ec681f3Smrg *
37777ec681f3Smrg * The cleanest way to implement this is to define the availability operation
37787ec681f3Smrg * as bringing the caches to a "state of rest", in which none of the caches
37797ec681f3Smrg * below that level are dirty.
37807ec681f3Smrg *
37817ec681f3Smrg * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
37827ec681f3Smrg *
37837ec681f3Smrg * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
37847ec681f3Smrg * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
37857ec681f3Smrg * images. However, given the existence of memory barriers which do not specify
37867ec681f3Smrg * the image/buffer, it often devolves to just VRAM/GTT anyway.
37877ec681f3Smrg *
37887ec681f3Smrg * To help reduce the invalidations for GPUs that have L2 coherency between the
37897ec681f3Smrg * RB and the shader caches, we always invalidate L2 on the src side, as we can
37907ec681f3Smrg * use our knowledge of past usage to optimize flushes away.
37917ec681f3Smrg */
379201e04c3fSmrg
37937ec681f3Smrgenum radv_cmd_flush_bits
37947ec681f3Smrgradv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
37957ec681f3Smrg const struct radv_image *image)
37967ec681f3Smrg{
37977ec681f3Smrg bool has_CB_meta = true, has_DB_meta = true;
37987ec681f3Smrg bool image_is_coherent = image ? image->l2_coherent : false;
37997ec681f3Smrg enum radv_cmd_flush_bits flush_bits = 0;
38007ec681f3Smrg
38017ec681f3Smrg if (image) {
38027ec681f3Smrg if (!radv_image_has_CB_metadata(image))
38037ec681f3Smrg has_CB_meta = false;
38047ec681f3Smrg if (!radv_image_has_htile(image))
38057ec681f3Smrg has_DB_meta = false;
38067ec681f3Smrg }
38077ec681f3Smrg
38087ec681f3Smrg u_foreach_bit(b, src_flags)
38097ec681f3Smrg {
38107ec681f3Smrg switch ((VkAccessFlagBits)(1 << b)) {
38117ec681f3Smrg case VK_ACCESS_SHADER_WRITE_BIT:
38127ec681f3Smrg /* Since the STORAGE bit isn't set, we know that this is a meta operation.
38137ec681f3Smrg * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
38147ec681f3Smrg * set it here. */
38157ec681f3Smrg if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
38167ec681f3Smrg if (vk_format_is_depth_or_stencil(image->vk_format)) {
38177ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38187ec681f3Smrg } else {
38197ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
38207ec681f3Smrg }
38217ec681f3Smrg }
38227ec681f3Smrg
38237ec681f3Smrg /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
38247ec681f3Smrg * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
38257ec681f3Smrg * invalidating.
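 * (As an aside: RADV_CMD_FLAG_WB_L2 is a writeback, flushing dirty L2 lines
 * to memory without invalidating them, which is all the availability side
 * needs here.)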
*/
38267ec681f3Smrg if (!image_is_coherent)
38277ec681f3Smrg flush_bits |= RADV_CMD_FLAG_WB_L2;
38287ec681f3Smrg break;
38297ec681f3Smrg case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
38307ec681f3Smrg case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
38317ec681f3Smrg case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
38327ec681f3Smrg if (!image_is_coherent)
38337ec681f3Smrg flush_bits |= RADV_CMD_FLAG_WB_L2;
38347ec681f3Smrg break;
38357ec681f3Smrg case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
38367ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
38377ec681f3Smrg if (has_CB_meta)
38387ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38397ec681f3Smrg break;
38407ec681f3Smrg case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
38417ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38427ec681f3Smrg if (has_DB_meta)
38437ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38447ec681f3Smrg break;
38457ec681f3Smrg case VK_ACCESS_TRANSFER_WRITE_BIT:
38467ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38477ec681f3Smrg
38487ec681f3Smrg if (!image_is_coherent)
38497ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2;
38507ec681f3Smrg if (has_CB_meta)
38517ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38527ec681f3Smrg if (has_DB_meta)
38537ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38547ec681f3Smrg break;
38557ec681f3Smrg case VK_ACCESS_MEMORY_WRITE_BIT:
38567ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38577ec681f3Smrg
38587ec681f3Smrg if (!image_is_coherent)
38597ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2;
38607ec681f3Smrg if (has_CB_meta)
38617ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38627ec681f3Smrg if (has_DB_meta)
38637ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38647ec681f3Smrg break;
38657ec681f3Smrg default:
38667ec681f3Smrg break;
38677ec681f3Smrg }
38687ec681f3Smrg }
38697ec681f3Smrg return flush_bits;
38707ec681f3Smrg}
38717ec681f3Smrg
38727ec681f3Smrgenum radv_cmd_flush_bits
38737ec681f3Smrgradv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags,
38747ec681f3Smrg const struct radv_image *image)
38757ec681f3Smrg{
38767ec681f3Smrg bool has_CB_meta = true, has_DB_meta = true;
38777ec681f3Smrg enum radv_cmd_flush_bits flush_bits = 0;
38787ec681f3Smrg bool flush_CB = true, flush_DB = true;
38797ec681f3Smrg bool image_is_coherent = image ? image->l2_coherent : false;
38807ec681f3Smrg
38817ec681f3Smrg if (image) {
38827ec681f3Smrg if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
38837ec681f3Smrg flush_CB = false;
38847ec681f3Smrg flush_DB = false;
38857ec681f3Smrg }
38867ec681f3Smrg
38877ec681f3Smrg if (!radv_image_has_CB_metadata(image))
38887ec681f3Smrg has_CB_meta = false;
38897ec681f3Smrg if (!radv_image_has_htile(image))
38907ec681f3Smrg has_DB_meta = false;
38917ec681f3Smrg }
38927ec681f3Smrg
38937ec681f3Smrg /* All the L2 invalidations below are for caches other than the CB/DB. So if there are
38947ec681f3Smrg * no incoherent images in the L2 cache in CB/DB mode, the data is already usable by all the other L2 clients.
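 * As a hypothetical example, a barrier from VK_ACCESS_TRANSFER_WRITE_BIT to
 * VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT can then skip RADV_CMD_FLAG_INV_L2
 * entirely, as long as nothing was written through the non-coherent RB path
 * (i.e. rb_noncoherent_dirty is false).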
*/ 38957ec681f3Smrg image_is_coherent |= 38967ec681f3Smrg can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty; 38977ec681f3Smrg 38987ec681f3Smrg u_foreach_bit(b, dst_flags) 38997ec681f3Smrg { 39007ec681f3Smrg switch ((VkAccessFlagBits)(1 << b)) { 39017ec681f3Smrg case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: 39027ec681f3Smrg case VK_ACCESS_INDEX_READ_BIT: 39037ec681f3Smrg case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: 39047ec681f3Smrg break; 39057ec681f3Smrg case VK_ACCESS_UNIFORM_READ_BIT: 39067ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 39077ec681f3Smrg break; 39087ec681f3Smrg case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: 39097ec681f3Smrg case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: 39107ec681f3Smrg case VK_ACCESS_TRANSFER_READ_BIT: 39117ec681f3Smrg case VK_ACCESS_TRANSFER_WRITE_BIT: 39127ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 39137ec681f3Smrg 39147ec681f3Smrg if (has_CB_meta || has_DB_meta) 39157ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 39167ec681f3Smrg if (!image_is_coherent) 39177ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2; 39187ec681f3Smrg break; 39197ec681f3Smrg case VK_ACCESS_SHADER_READ_BIT: 39207ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 39217ec681f3Smrg /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to 39227ec681f3Smrg * invalidate the scalar cache. */ 39237ec681f3Smrg if (!cmd_buffer->device->physical_device->use_llvm && !image) 39247ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_SCACHE; 39257ec681f3Smrg 39267ec681f3Smrg if (has_CB_meta || has_DB_meta) 39277ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 39287ec681f3Smrg if (!image_is_coherent) 39297ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2; 39307ec681f3Smrg break; 39317ec681f3Smrg case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR: 39327ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 39337ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) 39347ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2; 39357ec681f3Smrg break; 39367ec681f3Smrg case VK_ACCESS_SHADER_WRITE_BIT: 39377ec681f3Smrg case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: 39387ec681f3Smrg break; 39397ec681f3Smrg case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: 39407ec681f3Smrg case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: 39417ec681f3Smrg if (flush_CB) 39427ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 39437ec681f3Smrg if (has_CB_meta) 39447ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 39457ec681f3Smrg break; 39467ec681f3Smrg case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT: 39477ec681f3Smrg case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: 39487ec681f3Smrg if (flush_DB) 39497ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 39507ec681f3Smrg if (has_DB_meta) 39517ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 39527ec681f3Smrg break; 39537ec681f3Smrg case VK_ACCESS_MEMORY_READ_BIT: 39547ec681f3Smrg case VK_ACCESS_MEMORY_WRITE_BIT: 39557ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 39567ec681f3Smrg if (!image_is_coherent) 39577ec681f3Smrg flush_bits |= RADV_CMD_FLAG_INV_L2; 39587ec681f3Smrg if (flush_CB) 39597ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 39607ec681f3Smrg if (has_CB_meta) 39617ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 39627ec681f3Smrg if (flush_DB) 39637ec681f3Smrg flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 39647ec681f3Smrg if (has_DB_meta) 39657ec681f3Smrg flush_bits |= 
RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
39667ec681f3Smrg break;
39677ec681f3Smrg default:
39687ec681f3Smrg break;
39697ec681f3Smrg }
39707ec681f3Smrg }
39717ec681f3Smrg return flush_bits;
39727ec681f3Smrg}
3973ed98bd31Smaya
39747ec681f3Smrgvoid
39757ec681f3Smrgradv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
39767ec681f3Smrg{
39777ec681f3Smrg struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
39787ec681f3Smrg if (fb && !fb->imageless) {
39797ec681f3Smrg for (int i = 0; i < fb->attachment_count; ++i) {
39807ec681f3Smrg cmd_buffer->state.flush_bits |=
39817ec681f3Smrg radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image);
39827ec681f3Smrg }
39837ec681f3Smrg } else {
39847ec681f3Smrg cmd_buffer->state.flush_bits |=
39857ec681f3Smrg radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL);
39867ec681f3Smrg }
39877ec681f3Smrg
39887ec681f3Smrg radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
39897ec681f3Smrg
39907ec681f3Smrg if (fb && !fb->imageless) {
39917ec681f3Smrg for (int i = 0; i < fb->attachment_count; ++i) {
39927ec681f3Smrg cmd_buffer->state.flush_bits |=
39937ec681f3Smrg radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image);
39947ec681f3Smrg }
39957ec681f3Smrg } else {
39967ec681f3Smrg cmd_buffer->state.flush_bits |=
39977ec681f3Smrg radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL);
39987ec681f3Smrg }
39997ec681f3Smrg}
40007ec681f3Smrg
40017ec681f3Smrguint32_t
40027ec681f3Smrgradv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
40037ec681f3Smrg{
40047ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state;
40057ec681f3Smrg uint32_t subpass_id = state->subpass - state->pass->subpasses;
40067ec681f3Smrg
40077ec681f3Smrg /* The id of this subpass shouldn't exceed the number of subpasses in
40087ec681f3Smrg * this render pass minus 1.
40097ec681f3Smrg */
40107ec681f3Smrg assert(subpass_id < state->pass->subpass_count);
40117ec681f3Smrg return subpass_id;
40127ec681f3Smrg}
40137ec681f3Smrg
40147ec681f3Smrgstatic struct radv_sample_locations_state *
40157ec681f3Smrgradv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
40167ec681f3Smrg bool begin_subpass)
40177ec681f3Smrg{
40187ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state;
40197ec681f3Smrg uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
40207ec681f3Smrg struct radv_image_view *view = state->attachments[att_idx].iview;
40217ec681f3Smrg
40227ec681f3Smrg if (view->image->info.samples == 1)
40237ec681f3Smrg return NULL;
40247ec681f3Smrg
40257ec681f3Smrg if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
40267ec681f3Smrg /* Return the initial sample locations if this is the initial
40277ec681f3Smrg * layout transition of the given subpass attachment.
40287ec681f3Smrg */
40297ec681f3Smrg if (state->attachments[att_idx].sample_location.count > 0)
40307ec681f3Smrg return &state->attachments[att_idx].sample_location;
40317ec681f3Smrg } else {
40327ec681f3Smrg /* Otherwise return the subpass sample locations if defined. */
40337ec681f3Smrg if (state->subpass_sample_locs) {
40347ec681f3Smrg /* Because the driver sets the current subpass before
40357ec681f3Smrg * initial layout transitions, we should use the sample
40367ec681f3Smrg * locations from the previous subpass to avoid an
40377ec681f3Smrg * off-by-one problem.
Otherwise, use the sample 40387ec681f3Smrg * locations for the current subpass for final layout 40397ec681f3Smrg * transitions. 40407ec681f3Smrg */ 40417ec681f3Smrg if (begin_subpass) 40427ec681f3Smrg subpass_id--; 40437ec681f3Smrg 40447ec681f3Smrg for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) { 40457ec681f3Smrg if (state->subpass_sample_locs[i].subpass_idx == subpass_id) 40467ec681f3Smrg return &state->subpass_sample_locs[i].sample_location; 40477ec681f3Smrg } 40487ec681f3Smrg } 40497ec681f3Smrg } 40507ec681f3Smrg 40517ec681f3Smrg return NULL; 40527ec681f3Smrg} 405301e04c3fSmrg 40547ec681f3Smrgstatic void 40557ec681f3Smrgradv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer, 40567ec681f3Smrg struct radv_subpass_attachment att, bool begin_subpass) 40577ec681f3Smrg{ 40587ec681f3Smrg unsigned idx = att.attachment; 40597ec681f3Smrg struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview; 40607ec681f3Smrg struct radv_sample_locations_state *sample_locs; 40617ec681f3Smrg VkImageSubresourceRange range; 40627ec681f3Smrg range.aspectMask = view->aspect_mask; 40637ec681f3Smrg range.baseMipLevel = view->base_mip; 40647ec681f3Smrg range.levelCount = 1; 40657ec681f3Smrg range.baseArrayLayer = view->base_layer; 40667ec681f3Smrg range.layerCount = cmd_buffer->state.framebuffer->layers; 40677ec681f3Smrg 40687ec681f3Smrg if (cmd_buffer->state.subpass->view_mask) { 40697ec681f3Smrg /* If the current subpass uses multiview, the driver might have 40707ec681f3Smrg * performed a fast color/depth clear to the whole image 40717ec681f3Smrg * (including all layers). To make sure the driver will 40727ec681f3Smrg * decompress the image correctly (if needed), we have to 40737ec681f3Smrg * account for the "real" number of layers. If the view mask is 40747ec681f3Smrg * sparse, this will decompress more layers than needed. 40757ec681f3Smrg */ 40767ec681f3Smrg range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask); 40777ec681f3Smrg } 40787ec681f3Smrg 40797ec681f3Smrg /* Get the subpass sample locations for the given attachment, if NULL 40807ec681f3Smrg * is returned the driver will use the default HW locations. 40817ec681f3Smrg */ 40827ec681f3Smrg sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass); 40837ec681f3Smrg 40847ec681f3Smrg /* Determine if the subpass uses separate depth/stencil layouts. */ 40857ec681f3Smrg bool uses_separate_depth_stencil_layouts = false; 40867ec681f3Smrg if ((cmd_buffer->state.attachments[idx].current_layout != 40877ec681f3Smrg cmd_buffer->state.attachments[idx].current_stencil_layout) || 40887ec681f3Smrg (att.layout != att.stencil_layout)) { 40897ec681f3Smrg uses_separate_depth_stencil_layouts = true; 40907ec681f3Smrg } 40917ec681f3Smrg 40927ec681f3Smrg /* For separate layouts, perform depth and stencil transitions 40937ec681f3Smrg * separately. 40947ec681f3Smrg */ 40957ec681f3Smrg if (uses_separate_depth_stencil_layouts && 40967ec681f3Smrg (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) { 40977ec681f3Smrg /* Depth-only transitions. */ 40987ec681f3Smrg range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; 40997ec681f3Smrg radv_handle_image_transition(cmd_buffer, view->image, 41007ec681f3Smrg cmd_buffer->state.attachments[idx].current_layout, 41017ec681f3Smrg cmd_buffer->state.attachments[idx].current_in_render_loop, 41027ec681f3Smrg att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 41037ec681f3Smrg 41047ec681f3Smrg /* Stencil-only transitions. 
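 * (the same transition as the depth-only one above, with the stencil
 * aspect and the stencil-specific layouts swapped in)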
*/ 41057ec681f3Smrg range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; 41067ec681f3Smrg radv_handle_image_transition( 41077ec681f3Smrg cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout, 41087ec681f3Smrg cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout, 41097ec681f3Smrg att.in_render_loop, 0, 0, &range, sample_locs); 41107ec681f3Smrg } else { 41117ec681f3Smrg radv_handle_image_transition(cmd_buffer, view->image, 41127ec681f3Smrg cmd_buffer->state.attachments[idx].current_layout, 41137ec681f3Smrg cmd_buffer->state.attachments[idx].current_in_render_loop, 41147ec681f3Smrg att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 41157ec681f3Smrg } 41167ec681f3Smrg 41177ec681f3Smrg cmd_buffer->state.attachments[idx].current_layout = att.layout; 41187ec681f3Smrg cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout; 41197ec681f3Smrg cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop; 412001e04c3fSmrg} 412101e04c3fSmrg 41227ec681f3Smrgvoid 41237ec681f3Smrgradv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass) 412401e04c3fSmrg{ 41257ec681f3Smrg cmd_buffer->state.subpass = subpass; 412601e04c3fSmrg 41277ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER; 41287ec681f3Smrg} 4129ed98bd31Smaya 41307ec681f3Smrgstatic VkResult 41317ec681f3Smrgradv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer, 41327ec681f3Smrg struct radv_render_pass *pass, 41337ec681f3Smrg const VkRenderPassBeginInfo *info) 41347ec681f3Smrg{ 41357ec681f3Smrg const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs = 41367ec681f3Smrg vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT); 41377ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 41387ec681f3Smrg 41397ec681f3Smrg if (!sample_locs) { 41407ec681f3Smrg state->subpass_sample_locs = NULL; 41417ec681f3Smrg return VK_SUCCESS; 41427ec681f3Smrg } 41437ec681f3Smrg 41447ec681f3Smrg for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) { 41457ec681f3Smrg const VkAttachmentSampleLocationsEXT *att_sample_locs = 41467ec681f3Smrg &sample_locs->pAttachmentInitialSampleLocations[i]; 41477ec681f3Smrg uint32_t att_idx = att_sample_locs->attachmentIndex; 41487ec681f3Smrg struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image; 41497ec681f3Smrg 41507ec681f3Smrg assert(vk_format_is_depth_or_stencil(image->vk_format)); 41517ec681f3Smrg 41527ec681f3Smrg /* From the Vulkan spec 1.1.108: 41537ec681f3Smrg * 41547ec681f3Smrg * "If the image referenced by the framebuffer attachment at 41557ec681f3Smrg * index attachmentIndex was not created with 41567ec681f3Smrg * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT 41577ec681f3Smrg * then the values specified in sampleLocationsInfo are 41587ec681f3Smrg * ignored." 
41597ec681f3Smrg */ 41607ec681f3Smrg if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT)) 41617ec681f3Smrg continue; 41627ec681f3Smrg 41637ec681f3Smrg const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo; 41647ec681f3Smrg 41657ec681f3Smrg state->attachments[att_idx].sample_location.per_pixel = 41667ec681f3Smrg sample_locs_info->sampleLocationsPerPixel; 41677ec681f3Smrg state->attachments[att_idx].sample_location.grid_size = 41687ec681f3Smrg sample_locs_info->sampleLocationGridSize; 41697ec681f3Smrg state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount; 41707ec681f3Smrg typed_memcpy(&state->attachments[att_idx].sample_location.locations[0], 41717ec681f3Smrg sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 41727ec681f3Smrg } 41737ec681f3Smrg 41747ec681f3Smrg state->subpass_sample_locs = 41757ec681f3Smrg vk_alloc(&cmd_buffer->pool->alloc, 41767ec681f3Smrg sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]), 41777ec681f3Smrg 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 41787ec681f3Smrg if (state->subpass_sample_locs == NULL) { 41797ec681f3Smrg cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 41807ec681f3Smrg return cmd_buffer->record_result; 41817ec681f3Smrg } 41827ec681f3Smrg 41837ec681f3Smrg state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount; 41847ec681f3Smrg 41857ec681f3Smrg for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) { 41867ec681f3Smrg const VkSubpassSampleLocationsEXT *subpass_sample_locs_info = 41877ec681f3Smrg &sample_locs->pPostSubpassSampleLocations[i]; 41887ec681f3Smrg const VkSampleLocationsInfoEXT *sample_locs_info = 41897ec681f3Smrg &subpass_sample_locs_info->sampleLocationsInfo; 41907ec681f3Smrg 41917ec681f3Smrg state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex; 41927ec681f3Smrg state->subpass_sample_locs[i].sample_location.per_pixel = 41937ec681f3Smrg sample_locs_info->sampleLocationsPerPixel; 41947ec681f3Smrg state->subpass_sample_locs[i].sample_location.grid_size = 41957ec681f3Smrg sample_locs_info->sampleLocationGridSize; 41967ec681f3Smrg state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount; 41977ec681f3Smrg typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0], 41987ec681f3Smrg sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 41997ec681f3Smrg } 42007ec681f3Smrg 42017ec681f3Smrg return VK_SUCCESS; 42027ec681f3Smrg} 420301e04c3fSmrg 42047ec681f3Smrgstatic VkResult 42057ec681f3Smrgradv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass, 42067ec681f3Smrg const VkRenderPassBeginInfo *info, 42077ec681f3Smrg const struct radv_extra_render_pass_begin_info *extra) 42087ec681f3Smrg{ 42097ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 42107ec681f3Smrg const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL; 42117ec681f3Smrg 42127ec681f3Smrg if (info) { 42137ec681f3Smrg attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO); 42147ec681f3Smrg } 42157ec681f3Smrg 42167ec681f3Smrg if (pass->attachment_count == 0) { 42177ec681f3Smrg state->attachments = NULL; 42187ec681f3Smrg return VK_SUCCESS; 42197ec681f3Smrg } 42207ec681f3Smrg 42217ec681f3Smrg state->attachments = 42227ec681f3Smrg vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * 
sizeof(state->attachments[0]), 8, 42237ec681f3Smrg VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 42247ec681f3Smrg if (state->attachments == NULL) { 42257ec681f3Smrg cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 42267ec681f3Smrg return cmd_buffer->record_result; 42277ec681f3Smrg } 42287ec681f3Smrg 42297ec681f3Smrg for (uint32_t i = 0; i < pass->attachment_count; ++i) { 42307ec681f3Smrg struct radv_render_pass_attachment *att = &pass->attachments[i]; 42317ec681f3Smrg VkImageAspectFlags att_aspects = vk_format_aspects(att->format); 42327ec681f3Smrg VkImageAspectFlags clear_aspects = 0; 42337ec681f3Smrg 42347ec681f3Smrg if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 42357ec681f3Smrg /* color attachment */ 42367ec681f3Smrg if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 42377ec681f3Smrg clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; 42387ec681f3Smrg } 42397ec681f3Smrg } else { 42407ec681f3Smrg /* depthstencil attachment */ 42417ec681f3Smrg if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 42427ec681f3Smrg att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 42437ec681f3Smrg clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; 42447ec681f3Smrg if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 42457ec681f3Smrg att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) 42467ec681f3Smrg clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 42477ec681f3Smrg } 42487ec681f3Smrg if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 42497ec681f3Smrg att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 42507ec681f3Smrg clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 42517ec681f3Smrg } 42527ec681f3Smrg } 42537ec681f3Smrg 42547ec681f3Smrg state->attachments[i].pending_clear_aspects = clear_aspects; 42557ec681f3Smrg state->attachments[i].cleared_views = 0; 42567ec681f3Smrg if (clear_aspects && info) { 42577ec681f3Smrg assert(info->clearValueCount > i); 42587ec681f3Smrg state->attachments[i].clear_value = info->pClearValues[i]; 42597ec681f3Smrg } 42607ec681f3Smrg 42617ec681f3Smrg state->attachments[i].current_layout = att->initial_layout; 42627ec681f3Smrg state->attachments[i].current_in_render_loop = false; 42637ec681f3Smrg state->attachments[i].current_stencil_layout = att->stencil_initial_layout; 42647ec681f3Smrg state->attachments[i].disable_dcc = extra && extra->disable_dcc; 42657ec681f3Smrg state->attachments[i].sample_location.count = 0; 42667ec681f3Smrg 42677ec681f3Smrg struct radv_image_view *iview; 42687ec681f3Smrg if (attachment_info && attachment_info->attachmentCount > i) { 42697ec681f3Smrg iview = radv_image_view_from_handle(attachment_info->pAttachments[i]); 42707ec681f3Smrg } else { 42717ec681f3Smrg iview = state->framebuffer->attachments[i]; 42727ec681f3Smrg } 42737ec681f3Smrg 42747ec681f3Smrg state->attachments[i].iview = iview; 42757ec681f3Smrg if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 42767ec681f3Smrg radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview); 42777ec681f3Smrg } else { 42787ec681f3Smrg radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview); 42797ec681f3Smrg } 42807ec681f3Smrg } 42817ec681f3Smrg 42827ec681f3Smrg return VK_SUCCESS; 42837ec681f3Smrg} 42847ec681f3Smrg 42857ec681f3SmrgVkResult 42867ec681f3Smrgradv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo, 42877ec681f3Smrg VkCommandBuffer *pCommandBuffers) 42887ec681f3Smrg{ 42897ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 42907ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_pool, pool, 
pAllocateInfo->commandPool); 42917ec681f3Smrg 42927ec681f3Smrg VkResult result = VK_SUCCESS; 42937ec681f3Smrg uint32_t i; 42947ec681f3Smrg 42957ec681f3Smrg for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { 42967ec681f3Smrg 42977ec681f3Smrg if (!list_is_empty(&pool->free_cmd_buffers)) { 42987ec681f3Smrg struct radv_cmd_buffer *cmd_buffer = 42997ec681f3Smrg list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link); 43007ec681f3Smrg 43017ec681f3Smrg list_del(&cmd_buffer->pool_link); 43027ec681f3Smrg list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); 43037ec681f3Smrg 43047ec681f3Smrg result = radv_reset_cmd_buffer(cmd_buffer); 43057ec681f3Smrg cmd_buffer->level = pAllocateInfo->level; 43067ec681f3Smrg vk_command_buffer_finish(&cmd_buffer->vk); 43077ec681f3Smrg VkResult init_result = 43087ec681f3Smrg vk_command_buffer_init(&cmd_buffer->vk, &device->vk); 43097ec681f3Smrg if (init_result != VK_SUCCESS) 43107ec681f3Smrg result = init_result; 43117ec681f3Smrg 43127ec681f3Smrg pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); 43137ec681f3Smrg } else { 43147ec681f3Smrg result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]); 43157ec681f3Smrg } 43167ec681f3Smrg if (result != VK_SUCCESS) 43177ec681f3Smrg break; 43187ec681f3Smrg } 43197ec681f3Smrg 43207ec681f3Smrg if (result != VK_SUCCESS) { 43217ec681f3Smrg radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers); 43227ec681f3Smrg 43237ec681f3Smrg /* From the Vulkan 1.0.66 spec: 43247ec681f3Smrg * 43257ec681f3Smrg * "vkAllocateCommandBuffers can be used to create multiple 43267ec681f3Smrg * command buffers. If the creation of any of those command 43277ec681f3Smrg * buffers fails, the implementation must destroy all 43287ec681f3Smrg * successfully created command buffer objects from this 43297ec681f3Smrg * command, set all entries of the pCommandBuffers array to 43307ec681f3Smrg * NULL and return the error." 
43317ec681f3Smrg */
43327ec681f3Smrg memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
43337ec681f3Smrg }
43347ec681f3Smrg
43357ec681f3Smrg return result;
433601e04c3fSmrg}
433701e04c3fSmrg
43387ec681f3Smrgvoid
43397ec681f3Smrgradv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
43407ec681f3Smrg const VkCommandBuffer *pCommandBuffers)
434101e04c3fSmrg{
43427ec681f3Smrg for (uint32_t i = 0; i < commandBufferCount; i++) {
43437ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
434401e04c3fSmrg
43457ec681f3Smrg if (cmd_buffer) {
43467ec681f3Smrg if (cmd_buffer->pool) {
43477ec681f3Smrg list_del(&cmd_buffer->pool_link);
43487ec681f3Smrg list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
43497ec681f3Smrg } else
43507ec681f3Smrg radv_destroy_cmd_buffer(cmd_buffer);
43517ec681f3Smrg }
43527ec681f3Smrg }
43537ec681f3Smrg}
435401e04c3fSmrg
43557ec681f3SmrgVkResult
43567ec681f3Smrgradv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
43577ec681f3Smrg{
43587ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
43597ec681f3Smrg return radv_reset_cmd_buffer(cmd_buffer);
436001e04c3fSmrg}
436101e04c3fSmrg
43627ec681f3SmrgVkResult
43637ec681f3Smrgradv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
436401e04c3fSmrg{
43657ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
43667ec681f3Smrg VkResult result = VK_SUCCESS;
43677ec681f3Smrg
43687ec681f3Smrg if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
43697ec681f3Smrg /* If the command buffer has already been reset with
43707ec681f3Smrg * vkResetCommandBuffer, no need to do it again.
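 * (RADV_CMD_BUFFER_STATUS_INITIAL means the buffer is freshly allocated
 * or was just reset, so only non-INITIAL buffers are reset here.)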
43717ec681f3Smrg */ 43727ec681f3Smrg result = radv_reset_cmd_buffer(cmd_buffer); 43737ec681f3Smrg if (result != VK_SUCCESS) 43747ec681f3Smrg return result; 43757ec681f3Smrg } 43767ec681f3Smrg 43777ec681f3Smrg memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); 43787ec681f3Smrg cmd_buffer->state.last_primitive_reset_en = -1; 43797ec681f3Smrg cmd_buffer->state.last_index_type = -1; 43807ec681f3Smrg cmd_buffer->state.last_num_instances = -1; 43817ec681f3Smrg cmd_buffer->state.last_vertex_offset = -1; 43827ec681f3Smrg cmd_buffer->state.last_first_instance = -1; 43837ec681f3Smrg cmd_buffer->state.last_drawid = -1; 43847ec681f3Smrg cmd_buffer->state.predication_type = -1; 43857ec681f3Smrg cmd_buffer->state.last_sx_ps_downconvert = -1; 43867ec681f3Smrg cmd_buffer->state.last_sx_blend_opt_epsilon = -1; 43877ec681f3Smrg cmd_buffer->state.last_sx_blend_opt_control = -1; 43887ec681f3Smrg cmd_buffer->state.last_nggc_settings = -1; 43897ec681f3Smrg cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 43907ec681f3Smrg cmd_buffer->usage_flags = pBeginInfo->flags; 43917ec681f3Smrg 43927ec681f3Smrg if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && 43937ec681f3Smrg (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { 43947ec681f3Smrg assert(pBeginInfo->pInheritanceInfo); 43957ec681f3Smrg cmd_buffer->state.framebuffer = 43967ec681f3Smrg radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); 43977ec681f3Smrg cmd_buffer->state.pass = 43987ec681f3Smrg radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); 43997ec681f3Smrg 44007ec681f3Smrg struct radv_subpass *subpass = 44017ec681f3Smrg &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; 44027ec681f3Smrg 44037ec681f3Smrg if (cmd_buffer->state.framebuffer) { 44047ec681f3Smrg result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL); 44057ec681f3Smrg if (result != VK_SUCCESS) 44067ec681f3Smrg return result; 44077ec681f3Smrg } 44087ec681f3Smrg 44097ec681f3Smrg cmd_buffer->state.inherited_pipeline_statistics = 44107ec681f3Smrg pBeginInfo->pInheritanceInfo->pipelineStatistics; 44117ec681f3Smrg 44127ec681f3Smrg radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 44137ec681f3Smrg } 4414ed98bd31Smaya 44157ec681f3Smrg if (unlikely(cmd_buffer->device->trace_bo)) 44167ec681f3Smrg radv_cmd_buffer_trace_emit(cmd_buffer); 441701e04c3fSmrg 44187ec681f3Smrg radv_describe_begin_cmd_buffer(cmd_buffer); 441901e04c3fSmrg 44207ec681f3Smrg cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING; 44217ec681f3Smrg 44227ec681f3Smrg return result; 442301e04c3fSmrg} 442401e04c3fSmrg 44257ec681f3Smrgvoid 44267ec681f3Smrgradv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding, 44277ec681f3Smrg uint32_t bindingCount, const VkBuffer *pBuffers, 44287ec681f3Smrg const VkDeviceSize *pOffsets) 442901e04c3fSmrg{ 44307ec681f3Smrg radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets, 44317ec681f3Smrg NULL, NULL); 44327ec681f3Smrg} 4433ed98bd31Smaya 44347ec681f3Smrgvoid 44357ec681f3Smrgradv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, 44367ec681f3Smrg uint32_t bindingCount, const VkBuffer *pBuffers, 44377ec681f3Smrg const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes, 44387ec681f3Smrg const VkDeviceSize *pStrides) 44397ec681f3Smrg{ 44407ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 44417ec681f3Smrg struct radv_vertex_binding *vb = 
cmd_buffer->vertex_bindings;
44427ec681f3Smrg struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
44437ec681f3Smrg bool changed = false;
44447ec681f3Smrg
44457ec681f3Smrg /* We have to defer setting up the vertex buffers since we need the buffer
44467ec681f3Smrg * stride from the pipeline. */
44477ec681f3Smrg
44487ec681f3Smrg assert(firstBinding + bindingCount <= MAX_VBS);
44497ec681f3Smrg cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask;
44507ec681f3Smrg enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
44517ec681f3Smrg for (uint32_t i = 0; i < bindingCount; i++) {
44527ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
44537ec681f3Smrg uint32_t idx = firstBinding + i;
44547ec681f3Smrg VkDeviceSize size = pSizes ? pSizes[i] : 0;
44557ec681f3Smrg VkDeviceSize stride = pStrides ? pStrides[i] : 0;
44567ec681f3Smrg
44577ec681f3Smrg /* pSizes and pStrides are optional. */
44587ec681f3Smrg if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] ||
44597ec681f3Smrg vb[idx].size != size || (pStrides && vb[idx].stride != stride))) {
44607ec681f3Smrg changed = true;
44617ec681f3Smrg }
44627ec681f3Smrg
44637ec681f3Smrg vb[idx].buffer = buffer;
44647ec681f3Smrg vb[idx].offset = pOffsets[i];
44657ec681f3Smrg vb[idx].size = size;
44667ec681f3Smrg /* if pStrides=NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT */
44677ec681f3Smrg
44687ec681f3Smrg if (chip == GFX6 || chip >= GFX10) {
44697ec681f3Smrg const uint32_t bit = 1u << idx;
44707ec681f3Smrg if (!buffer) {
44717ec681f3Smrg cmd_buffer->state.vbo_misaligned_mask &= ~bit;
44727ec681f3Smrg cmd_buffer->state.vbo_bound_mask &= ~bit;
44737ec681f3Smrg } else {
44747ec681f3Smrg cmd_buffer->state.vbo_bound_mask |= bit;
44757ec681f3Smrg if (pStrides && vb[idx].stride != stride) {
44767ec681f3Smrg if (stride & state->format_align_req_minus_1[idx])
44777ec681f3Smrg cmd_buffer->state.vbo_misaligned_mask |= bit;
44787ec681f3Smrg else
44797ec681f3Smrg cmd_buffer->state.vbo_misaligned_mask &= ~bit;
44807ec681f3Smrg }
44817ec681f3Smrg if (state->possibly_misaligned_mask & bit &&
44827ec681f3Smrg (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx])
44837ec681f3Smrg cmd_buffer->state.vbo_misaligned_mask |= bit;
44847ec681f3Smrg }
44857ec681f3Smrg }
44867ec681f3Smrg
44877ec681f3Smrg if (pStrides)
44887ec681f3Smrg vb[idx].stride = stride;
44897ec681f3Smrg
44907ec681f3Smrg if (buffer) {
44917ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo);
44927ec681f3Smrg }
44937ec681f3Smrg }
44947ec681f3Smrg
44957ec681f3Smrg if (!changed) {
44967ec681f3Smrg /* No state changes.
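 * Returning early keeps RADV_CMD_DIRTY_VERTEX_BUFFER clear, so the VBO
 * descriptors are not rebuilt and re-uploaded on the next draw.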
*/ 44977ec681f3Smrg return; 44987ec681f3Smrg } 44997ec681f3Smrg 45007ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | 45017ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; 45027ec681f3Smrg} 450301e04c3fSmrg 45047ec681f3Smrgstatic uint32_t 45057ec681f3Smrgvk_to_index_type(VkIndexType type) 45067ec681f3Smrg{ 45077ec681f3Smrg switch (type) { 45087ec681f3Smrg case VK_INDEX_TYPE_UINT8_EXT: 45097ec681f3Smrg return V_028A7C_VGT_INDEX_8; 45107ec681f3Smrg case VK_INDEX_TYPE_UINT16: 45117ec681f3Smrg return V_028A7C_VGT_INDEX_16; 45127ec681f3Smrg case VK_INDEX_TYPE_UINT32: 45137ec681f3Smrg return V_028A7C_VGT_INDEX_32; 45147ec681f3Smrg default: 45157ec681f3Smrg unreachable("invalid index type"); 45167ec681f3Smrg } 45177ec681f3Smrg} 451801e04c3fSmrg 45197ec681f3Smrgstatic uint32_t 45207ec681f3Smrgradv_get_vgt_index_size(uint32_t type) 45217ec681f3Smrg{ 45227ec681f3Smrg switch (type) { 45237ec681f3Smrg case V_028A7C_VGT_INDEX_8: 45247ec681f3Smrg return 1; 45257ec681f3Smrg case V_028A7C_VGT_INDEX_16: 45267ec681f3Smrg return 2; 45277ec681f3Smrg case V_028A7C_VGT_INDEX_32: 45287ec681f3Smrg return 4; 45297ec681f3Smrg default: 45307ec681f3Smrg unreachable("invalid index type"); 45317ec681f3Smrg } 453201e04c3fSmrg} 453301e04c3fSmrg 45347ec681f3Smrgvoid 45357ec681f3Smrgradv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, 45367ec681f3Smrg VkIndexType indexType) 453701e04c3fSmrg{ 45387ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 45397ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer); 4540ed98bd31Smaya 45417ec681f3Smrg if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset && 45427ec681f3Smrg cmd_buffer->state.index_type == indexType) { 45437ec681f3Smrg /* No state changes. 
*/ 45447ec681f3Smrg return; 45457ec681f3Smrg } 454601e04c3fSmrg 45477ec681f3Smrg cmd_buffer->state.index_buffer = index_buffer; 45487ec681f3Smrg cmd_buffer->state.index_offset = offset; 45497ec681f3Smrg cmd_buffer->state.index_type = vk_to_index_type(indexType); 45507ec681f3Smrg cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo); 45517ec681f3Smrg cmd_buffer->state.index_va += index_buffer->offset + offset; 455201e04c3fSmrg 45537ec681f3Smrg int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType)); 45547ec681f3Smrg cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size; 45557ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 45567ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo); 455701e04c3fSmrg} 455801e04c3fSmrg 45597ec681f3Smrgstatic void 45607ec681f3Smrgradv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point, 45617ec681f3Smrg struct radv_descriptor_set *set, unsigned idx) 456201e04c3fSmrg{ 45637ec681f3Smrg struct radeon_winsys *ws = cmd_buffer->device->ws; 456401e04c3fSmrg 45657ec681f3Smrg radv_set_descriptor_set(cmd_buffer, bind_point, set, idx); 4566ed98bd31Smaya 45677ec681f3Smrg assert(set); 45687ec681f3Smrg 45697ec681f3Smrg if (!cmd_buffer->device->use_global_bo_list) { 45707ec681f3Smrg for (unsigned j = 0; j < set->header.buffer_count; ++j) 45717ec681f3Smrg if (set->descriptors[j]) 45727ec681f3Smrg radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]); 45737ec681f3Smrg } 45747ec681f3Smrg 45757ec681f3Smrg if (set->header.bo) 45767ec681f3Smrg radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo); 457701e04c3fSmrg} 457801e04c3fSmrg 45797ec681f3Smrgvoid 45807ec681f3Smrgradv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 45817ec681f3Smrg VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount, 45827ec681f3Smrg const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount, 45837ec681f3Smrg const uint32_t *pDynamicOffsets) 45847ec681f3Smrg{ 45857ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 45867ec681f3Smrg RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 45877ec681f3Smrg unsigned dyn_idx = 0; 45887ec681f3Smrg 45897ec681f3Smrg const bool no_dynamic_bounds = 45907ec681f3Smrg cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS; 45917ec681f3Smrg struct radv_descriptor_state *descriptors_state = 45927ec681f3Smrg radv_get_descriptors_state(cmd_buffer, pipelineBindPoint); 45937ec681f3Smrg 45947ec681f3Smrg for (unsigned i = 0; i < descriptorSetCount; ++i) { 45957ec681f3Smrg unsigned set_idx = i + firstSet; 45967ec681f3Smrg RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); 45977ec681f3Smrg 45987ec681f3Smrg /* If the set is already bound we only need to update the 45997ec681f3Smrg * (potentially changed) dynamic offsets. 
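 * Each dynamic offset below is folded into a 4-dword buffer descriptor
 * (address, size and dst_sel/format words) stored in
 * descriptors_state->dynamic_buffers.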
void
radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
                           const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
                           const uint32_t *pDynamicOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   const bool no_dynamic_bounds =
      cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned set_idx = i + firstSet;
      RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);

      /* If the set is already bound we only need to update the
       * (potentially changed) dynamic offsets.
       */
      if (descriptors_state->sets[set_idx] != set ||
          !(descriptors_state->valid & (1u << set_idx))) {
         radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
      }

      for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++dyn_idx) {
         unsigned idx = j + layout->set[set_idx].dynamic_offset_start;
         uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
         assert(dyn_idx < dynamicOffsetCount);

         struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;

         if (!range->va) {
            memset(dst, 0, 4 * 4);
         } else {
            uint64_t va = range->va + pDynamicOffsets[dyn_idx];
            dst[0] = va;
            dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
            dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
            dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

            if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
            } else {
               dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
            }
         }

         cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages;
      }
   }
}

static bool
radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
                              struct radv_descriptor_set_layout *layout,
                              VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   set->header.size = layout->size;
   set->header.layout = layout;

   if (descriptors_state->push_set.capacity < set->header.size) {
      size_t new_size = MAX2(set->header.size, 1024);
      new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
      new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);

      free(set->header.mapped_ptr);
      set->header.mapped_ptr = malloc(new_size);

      if (!set->header.mapped_ptr) {
         descriptors_state->push_set.capacity = 0;
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         return false;
      }

      descriptors_state->push_set.capacity = new_size;
   }

   return true;
}
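/* Push descriptor path used internally by meta operations. The set is
 * allocated out of the command buffer's upload BO, so the descriptors can be
 * written on the CPU and read directly by the GPU without a descriptor pool.
 */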
void
radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
                              VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
                              uint32_t set, uint32_t descriptorWriteCount,
                              const VkWriteDescriptorSet *pDescriptorWrites)
{
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
   unsigned bo_offset;

   assert(set == 0);
   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   push_set->header.size = layout->set[set].layout->size;
   push_set->header.layout = layout->set[set].layout;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
                                     (void **)&push_set->header.mapped_ptr))
      return;

   push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   push_set->header.va += bo_offset;

   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
                               pDescriptorWrites, 0, NULL);

   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
}
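/* vkCmdPushDescriptorSetKHR: the descriptors are written into a
 * host-allocated shadow set and only marked dirty here; the actual upload to
 * GPU-visible memory presumably happens when descriptor state is flushed
 * before the next draw/dispatch.
 */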
void
radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                             VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
                             const VkWriteDescriptorSet *pDescriptorWrites)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&descriptors_state->push_set.set;

   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
                                      pipelineBindPoint))
      return;

   /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
    * because it is invalid, according to the Vulkan spec.
    */
   for (int i = 0; i < descriptorWriteCount; i++) {
      ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
      assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
   }

   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
                               pDescriptorWrites, 0, NULL);

   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
   descriptors_state->push_dirty = true;
}

void
radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
                                         VkDescriptorUpdateTemplate descriptorUpdateTemplate,
                                         VkPipelineLayout _layout, uint32_t set, const void *pData)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, templ->bind_point);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&descriptors_state->push_set.set;

   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
                                      templ->bind_point))
      return;

   radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
                                            descriptorUpdateTemplate, pData);

   radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
   descriptors_state->push_dirty = true;
}

void
radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
                      VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
                      const void *pValues)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   memcpy(cmd_buffer->push_constants + offset, pValues, size);
   cmd_buffer->push_constant_stages |= stageFlags;
}
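/* Finalize the command buffer: emit the end-of-IB flushes the kernel and
 * subsequent submissions rely on, release per-recording state, and hand the
 * CS to the winsys for finalization.
 */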
VkResult
radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_emit_mip_change_flush_default(cmd_buffer);

   if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
         cmd_buffer->state.flush_bits |=
            RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;

      /* Make sure to sync all pending active queries at the end of
       * the command buffer.
       */
      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

      /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
       * command buffer.
       */
      if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
         cmd_buffer->state.flush_bits |= radv_src_access_flush(
            cmd_buffer,
            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            NULL);

      /* Since NGG streamout uses GDS, we need to make GDS idle when
       * we leave the IB, otherwise another process might overwrite
       * it while our shaders are busy.
       */
      if (cmd_buffer->gds_needed)
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;

      si_emit_cache_flush(cmd_buffer);
   }

   /* Make sure CP DMA is idle at the end of IBs because the kernel
    * doesn't wait for it.
    */
   si_cp_dma_wait_for_idle(cmd_buffer);

   radv_describe_end_cmd_buffer(cmd_buffer);

   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);

   VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
   if (result != VK_SUCCESS)
      return vk_error(cmd_buffer, result);

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

   return cmd_buffer->record_result;
}

static void
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
      return;

   assert(!pipeline->ctx_cs.cdw);

   cmd_buffer->state.emitted_compute_pipeline = pipeline;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

   cmd_buffer->compute_scratch_size_per_wave_needed =
      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
   cmd_buffer->compute_scratch_waves_wanted =
      MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                      pipeline->shaders[MESA_SHADER_COMPUTE]->bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, pipeline);
}
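/* Force every currently valid descriptor set to be re-emitted, e.g. after a
 * pipeline bind, since the new pipeline may assign different user SGPR
 * locations to the sets.
 */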
static void
radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->dirty |= descriptors_state->valid;
}

void
radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                     VkPipeline _pipeline)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      if (cmd_buffer->state.compute_pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.compute_pipeline = pipeline;
      cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
      break;
   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
      if (cmd_buffer->state.rt_pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.rt_pipeline = pipeline;
      cmd_buffer->push_constant_stages |=
         (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
          VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
          VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR);
      radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
      break;
   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      if (cmd_buffer->state.pipeline == pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      bool vtx_emit_count_changed =
         !pipeline || !cmd_buffer->state.pipeline ||
         cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num ||
         cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr;
      cmd_buffer->state.pipeline = pipeline;
      if (!pipeline)
         break;

      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
      cmd_buffer->push_constant_stages |= pipeline->active_stages;

      /* the new vertex shader might not have the same user regs */
      if (vtx_emit_count_changed) {
         cmd_buffer->state.last_first_instance = -1;
         cmd_buffer->state.last_vertex_offset = -1;
         cmd_buffer->state.last_drawid = -1;
      }

      /* Prefetch all pipeline shaders at first draw time. */
      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

      if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
          cmd_buffer->state.emitted_pipeline &&
          cmd_buffer->state.emitted_pipeline->graphics.is_ngg &&
          !cmd_buffer->state.pipeline->graphics.is_ngg) {
         /* Transitioning from NGG to legacy GS requires
          * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
          * is also emitted at the beginning of IBs when legacy
          * GS ring pointers are set.
          */
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
      }

      radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
      radv_bind_streamout_state(cmd_buffer, pipeline);

      if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
         cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
      if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
         cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;

      if (radv_pipeline_has_tess(pipeline))
         cmd_buffer->tess_rings_needed = true;
      break;
   default:
      assert(!"invalid bind point");
      break;
   }
}
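/* The viewport transform (scale/translate pairs) is precomputed here via
 * radv_get_viewport_xform() so it does not have to be derived again when the
 * viewport state is emitted.
 */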
void
radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
                    const VkViewport *pViewports)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstViewport + viewportCount;

   assert(firstViewport < MAX_VIEWPORTS);
   assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);

   if (total_count <= state->dynamic.viewport.count &&
       !memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports,
               viewportCount * sizeof(*pViewports))) {
      return;
   }

   if (state->dynamic.viewport.count < total_count)
      state->dynamic.viewport.count = total_count;

   memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
          viewportCount * sizeof(*pViewports));
   for (unsigned i = 0; i < viewportCount; i++) {
      radv_get_viewport_xform(&pViewports[i],
                              state->dynamic.viewport.xform[i + firstViewport].scale,
                              state->dynamic.viewport.xform[i + firstViewport].translate);
   }

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
}

void
radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
                   const VkRect2D *pScissors)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstScissor + scissorCount;

   assert(firstScissor < MAX_SCISSORS);
   assert(total_count >= 1 && total_count <= MAX_SCISSORS);

   if (total_count <= state->dynamic.scissor.count &&
       !memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
               scissorCount * sizeof(*pScissors))) {
      return;
   }

   if (state->dynamic.scissor.count < total_count)
      state->dynamic.scissor.count = total_count;

   memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
          scissorCount * sizeof(*pScissors));

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
}

void
radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->state.dynamic.line_width == lineWidth)
      return;

   cmd_buffer->state.dynamic.line_width = lineWidth;
   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
}

void
radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
                     float depthBiasClamp, float depthBiasSlopeFactor)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
       state->dynamic.depth_bias.clamp == depthBiasClamp &&
       state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
      return;
   }

   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
   state->dynamic.depth_bias.clamp = depthBiasClamp;
   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
}

void
radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
      return;

   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
}

void
radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bounds.min == minDepthBounds &&
       state->dynamic.depth_bounds.max == maxDepthBounds) {
      return;
   }

   state->dynamic.depth_bounds.min = minDepthBounds;
   state->dynamic.depth_bounds.max = maxDepthBounds;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
}
void
radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                              uint32_t compareMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
   bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.stencil_compare_mask.front = compareMask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.stencil_compare_mask.back = compareMask;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
}

void
radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                            uint32_t writeMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
   bool back_same = state->dynamic.stencil_write_mask.back == writeMask;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.stencil_write_mask.front = writeMask;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.stencil_write_mask.back = writeMask;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
}

void
radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                            uint32_t reference)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_reference.front == reference;
   bool back_same = state->dynamic.stencil_reference.back == reference;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
      return;
   }

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      cmd_buffer->state.dynamic.stencil_reference.front = reference;
   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      cmd_buffer->state.dynamic.stencil_reference.back = reference;

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
}

void
radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
                               uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;

   assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
   assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);

   if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
               pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
      return;
   }

   typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
                pDiscardRectangles, discardRectangleCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}

void
radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
                              const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);

   state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
   state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
   state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
   typed_memcpy(&state->dynamic.sample_location.locations[0],
                pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
}

void
radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
                          uint16_t lineStipplePattern)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.line_stipple.factor == lineStippleFactor &&
       state->dynamic.line_stipple.pattern == lineStipplePattern)
      return;

   state->dynamic.line_stipple.factor = lineStippleFactor;
   state->dynamic.line_stipple.pattern = lineStipplePattern;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
}

void
radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.cull_mode == cullMode)
      return;

   state->dynamic.cull_mode = cullMode;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
}

void
radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.front_face == frontFace)
      return;

   state->dynamic.front_face = frontFace;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
}

void
radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
                                VkPrimitiveTopology primitiveTopology)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned primitive_topology = si_translate_prim(primitiveTopology);

   if (state->dynamic.primitive_topology == primitive_topology)
      return;

   state->dynamic.primitive_topology = primitive_topology;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
}

void
radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount,
                                const VkViewport *pViewports)
{
   radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}

void
radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount,
                               const VkRect2D *pScissors)
{
   radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}

void
radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_test_enable == depthTestEnable)
      return;

   state->dynamic.depth_test_enable = depthTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
}

void
radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_write_enable == depthWriteEnable)
      return;

   state->dynamic.depth_write_enable = depthWriteEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
}

void
radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_compare_op == depthCompareOp)
      return;

   state->dynamic.depth_compare_op = depthCompareOp;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
}

void
radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable)
      return;

   state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
}

void
radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.stencil_test_enable == stencilTestEnable)
      return;

   state->dynamic.stencil_test_enable = stencilTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
}

void
radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                        VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
                        VkCompareOp compareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_op.front.fail_op == failOp &&
                     state->dynamic.stencil_op.front.pass_op == passOp &&
                     state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
                     state->dynamic.stencil_op.front.compare_op == compareOp;
   bool back_same = state->dynamic.stencil_op.back.fail_op == failOp &&
                    state->dynamic.stencil_op.back.pass_op == passOp &&
                    state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
                    state->dynamic.stencil_op.back.compare_op == compareOp;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same))
      return;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      state->dynamic.stencil_op.front.fail_op = failOp;
      state->dynamic.stencil_op.front.pass_op = passOp;
      state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.front.compare_op = compareOp;
   }

   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      state->dynamic.stencil_op.back.fail_op = failOp;
      state->dynamic.stencil_op.back.pass_op = passOp;
      state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.back.compare_op = compareOp;
   }

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
}
void
radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
                                  const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width &&
       state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height &&
       state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] &&
       state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1])
      return;

   state->dynamic.fragment_shading_rate.size = *pFragmentSize;
   for (unsigned i = 0; i < 2; i++)
      state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
}

void
radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bias_enable == depthBiasEnable)
      return;

   state->dynamic.depth_bias_enable = depthBiasEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
}

void
radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.primitive_restart_enable == primitiveRestartEnable)
      return;

   state->dynamic.primitive_restart_enable = primitiveRestartEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
}

void
radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
                                      VkBool32 rasterizerDiscardEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable)
      return;

   state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
}

void
radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
{
   /* not implemented */
}

void
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned logic_op = si_translate_blend_logic_op(logicOp);

   if (state->dynamic.logic_op == logic_op)
      return;

   state->dynamic.logic_op = logic_op;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
}

void
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
                               const VkBool32 *pColorWriteEnables)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t color_write_enable = 0;

   assert(attachmentCount < MAX_RTS);

   for (uint32_t i = 0; i < attachmentCount; i++) {
      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
   }

   if (state->dynamic.color_write_enable == color_write_enable)
      return;

   state->dynamic.color_write_enable = color_write_enable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
}
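/* Dynamic vertex input state (VK_EXT_vertex_input_dynamic_state). Besides
 * translating the attribute formats, this tracks per-attribute alignment on
 * GFX6 and GFX10+, where a stride or offset that is not aligned to the
 * format's requirement is recorded in the misaligned masks, presumably so
 * the vertex fetch code can fall back to a safe path for those attributes.
 */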
void
radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
                          uint32_t vertexAttributeDescriptionCount,
                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];

   cmd_buffer->state.vbo_misaligned_mask = 0;

   memset(state, 0, sizeof(*state));

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
      unsigned loc = attrib->location;
      const struct util_format_description *format_desc = vk_format_description(attrib->format);
      unsigned nfmt, dfmt;
      bool post_shuffle;
      enum radv_vs_input_alpha_adjust alpha_adjust;

      state->attribute_mask |= 1u << loc;
      state->bindings[loc] = attrib->binding;
      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
         state->instance_rate_inputs |= 1u << loc;
         state->divisors[loc] = binding->divisor;
         if (binding->divisor != 1)
            state->nontrivial_divisors |= 1u << loc;
      }
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                   &dfmt, &nfmt, &post_shuffle, &alpha_adjust);

      state->formats[loc] = dfmt | (nfmt << 4);
      const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
                                               (format_desc->block.bits / 8u - 1);
      state->format_align_req_minus_1[loc] = format_align_req_minus_1;
      state->format_sizes[loc] = format_desc->block.bits / 8u;

      if (chip == GFX6 || chip >= GFX10) {
         struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
         unsigned bit = 1u << loc;
         if (binding->stride & format_align_req_minus_1) {
            state->misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit)
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         } else {
            state->possibly_misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit &&
                ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1))
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         }
      }

      if (alpha_adjust) {
         state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc;
         state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc;
      }

      if (post_shuffle)
         state->post_shuffle |= 1u << loc;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}
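/* Execute secondary command buffers: the primary inherits the secondaries'
 * scratch/ring requirements and their draw-state tracking, and once all of
 * them have run, most state is marked dirty again because the secondaries
 * may have changed it.
 */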
void
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on primary prior to executing secondary */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on primary prior to executing secondary. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->queue_family_index == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL. */
         allow_ib2 = false;
      }

      primary->scratch_size_per_wave_needed =
         MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
      primary->scratch_waves_wanted =
         MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
      primary->compute_scratch_size_per_wave_needed =
         MAX2(primary->compute_scratch_size_per_wave_needed,
              secondary->compute_scratch_size_per_wave_needed);
      primary->compute_scratch_waves_wanted =
         MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);

      if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
      if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
      if (secondary->tess_rings_needed)
         primary->tess_rings_needed = true;
      if (secondary->sample_positions_needed)
         primary->sample_positions_needed = true;
      if (secondary->gds_needed)
         primary->gds_needed = true;

      if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
         /* Emit the framebuffer state from primary if secondary
          * has been recorded without a framebuffer, otherwise
          * fast color/depth clears can't work.
          */
         radv_emit_fb_mip_change_flush(primary);
         radv_emit_framebuffer_state(primary);
      }

      primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);

      /* When the secondary command buffer is compute only we don't
       * need to re-emit the current graphics pipeline.
       */
      if (secondary->state.emitted_pipeline) {
         primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
      }

      /* When the secondary command buffer is graphics only we don't
       * need to re-emit the current compute pipeline.
       */
      if (secondary->state.emitted_compute_pipeline) {
         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
      }

      /* Only re-emit the draw packets when needed. */
      if (secondary->state.last_primitive_reset_en != -1) {
         primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
      }

      if (secondary->state.last_primitive_reset_index) {
         primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
      }

      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
      }

      primary->state.last_first_instance = secondary->state.last_first_instance;
      primary->state.last_num_instances = secondary->state.last_num_instances;
      primary->state.last_drawid = secondary->state.last_drawid;
      primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
      primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
      primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
      primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;

      if (secondary->state.last_index_type != -1) {
         primary->state.last_index_type = secondary->state.last_index_type;
      }

      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
   }

   /* After executing commands from secondary buffers we have to mark
    * some states dirty again.
    */
56407ec681f3Smrg */ 56417ec681f3Smrg primary->state.dirty |= 56427ec681f3Smrg RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL; 56437ec681f3Smrg radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS); 56447ec681f3Smrg radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE); 56457ec681f3Smrg} 56467ec681f3Smrg 56477ec681f3SmrgVkResult 56487ec681f3Smrgradv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo, 56497ec681f3Smrg const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool) 56507ec681f3Smrg{ 56517ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 56527ec681f3Smrg struct radv_cmd_pool *pool; 56537ec681f3Smrg 56547ec681f3Smrg pool = 56557ec681f3Smrg vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 56567ec681f3Smrg if (pool == NULL) 56577ec681f3Smrg return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 56587ec681f3Smrg 56597ec681f3Smrg vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL); 56607ec681f3Smrg 56617ec681f3Smrg if (pAllocator) 56627ec681f3Smrg pool->alloc = *pAllocator; 56637ec681f3Smrg else 56647ec681f3Smrg pool->alloc = device->vk.alloc; 56657ec681f3Smrg 56667ec681f3Smrg list_inithead(&pool->cmd_buffers); 56677ec681f3Smrg list_inithead(&pool->free_cmd_buffers); 56687ec681f3Smrg 56697ec681f3Smrg pool->queue_family_index = pCreateInfo->queueFamilyIndex; 56707ec681f3Smrg 56717ec681f3Smrg *pCmdPool = radv_cmd_pool_to_handle(pool); 56727ec681f3Smrg 56737ec681f3Smrg return VK_SUCCESS; 567401e04c3fSmrg} 567501e04c3fSmrg 56767ec681f3Smrgvoid 56777ec681f3Smrgradv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, 56787ec681f3Smrg const VkAllocationCallbacks *pAllocator) 56797ec681f3Smrg{ 56807ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 56817ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 568201e04c3fSmrg 56837ec681f3Smrg if (!pool) 56847ec681f3Smrg return; 56857ec681f3Smrg 56867ec681f3Smrg list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 56877ec681f3Smrg { 56887ec681f3Smrg radv_destroy_cmd_buffer(cmd_buffer); 56897ec681f3Smrg } 56907ec681f3Smrg 56917ec681f3Smrg list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link) 56927ec681f3Smrg { 56937ec681f3Smrg radv_destroy_cmd_buffer(cmd_buffer); 56947ec681f3Smrg } 56957ec681f3Smrg 56967ec681f3Smrg vk_object_base_finish(&pool->base); 56977ec681f3Smrg vk_free2(&device->vk.alloc, pAllocator, pool); 569801e04c3fSmrg} 569901e04c3fSmrg 57007ec681f3SmrgVkResult 57017ec681f3Smrgradv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags) 570201e04c3fSmrg{ 57037ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 57047ec681f3Smrg VkResult result; 570501e04c3fSmrg 57067ec681f3Smrg list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 57077ec681f3Smrg { 57087ec681f3Smrg result = radv_reset_cmd_buffer(cmd_buffer); 57097ec681f3Smrg if (result != VK_SUCCESS) 57107ec681f3Smrg return result; 57117ec681f3Smrg } 57127ec681f3Smrg 57137ec681f3Smrg return VK_SUCCESS; 571401e04c3fSmrg} 571501e04c3fSmrg 57167ec681f3Smrgvoid 57177ec681f3Smrgradv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags) 571801e04c3fSmrg{ 57197ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 572001e04c3fSmrg 57217ec681f3Smrg if (!pool) 57227ec681f3Smrg 
void
radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }
}

static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);

   radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);

   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
   }

   if (subpass->vrs_attachment) {
      int idx = subpass->vrs_attachment->attachment;
      struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;

      if (subpass->depth_stencil_attachment) {
         /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
          * copy the VRS rates to the HTILE buffer of the attachment.
          */
         int ds_idx = subpass->depth_stencil_attachment->attachment;
         struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
         struct radv_image *ds_image = ds_iview->image;

         VkExtent2D extent = {
            .width = ds_image->info.width,
            .height = ds_image->info.height,
         };

         /* HTILE buffer */
         uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset;
         uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size;
         struct radv_buffer htile_buffer;

         radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset);

         /* Copy the VRS rates to the HTILE buffer. */
         radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);

         radv_buffer_finish(&htile_buffer);
      } else {
         /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we
          * have to copy the VRS rates to our internal HTILE buffer.
          */
57837ec681f3Smrg */ 57847ec681f3Smrg struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; 57857ec681f3Smrg struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer); 578601e04c3fSmrg 57877ec681f3Smrg if (ds_image) { 57887ec681f3Smrg /* HTILE buffer */ 57897ec681f3Smrg struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 57907ec681f3Smrg 57917ec681f3Smrg VkExtent2D extent = { 57927ec681f3Smrg .width = MIN2(fb->width, ds_image->info.width), 57937ec681f3Smrg .height = MIN2(fb->height, ds_image->info.height), 57947ec681f3Smrg }; 57957ec681f3Smrg 57967ec681f3Smrg /* Copy the VRS rates to the HTILE buffer. */ 57977ec681f3Smrg radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false); 57987ec681f3Smrg } 57997ec681f3Smrg } 58007ec681f3Smrg } 58017ec681f3Smrg 58027ec681f3Smrg radv_describe_barrier_end(cmd_buffer); 58037ec681f3Smrg 58047ec681f3Smrg radv_cmd_buffer_clear_subpass(cmd_buffer); 58057ec681f3Smrg 58067ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 58077ec681f3Smrg} 58087ec681f3Smrg 58097ec681f3Smrgstatic void 58107ec681f3Smrgradv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer) 58117ec681f3Smrg{ 58127ec681f3Smrg const struct radv_subpass *subpass = cmd_buffer->state.subpass; 58137ec681f3Smrg 58147ec681f3Smrg /* Have to be conservative in cmdbuffers with inherited attachments. */ 58157ec681f3Smrg if (!cmd_buffer->state.attachments) { 58167ec681f3Smrg cmd_buffer->state.rb_noncoherent_dirty = true; 58177ec681f3Smrg return; 58187ec681f3Smrg } 58197ec681f3Smrg 58207ec681f3Smrg for (uint32_t i = 0; i < subpass->color_count; ++i) { 58217ec681f3Smrg const uint32_t a = subpass->color_attachments[i].attachment; 58227ec681f3Smrg if (a == VK_ATTACHMENT_UNUSED) 58237ec681f3Smrg continue; 58247ec681f3Smrg if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) { 58257ec681f3Smrg cmd_buffer->state.rb_noncoherent_dirty = true; 58267ec681f3Smrg return; 58277ec681f3Smrg } 58287ec681f3Smrg } 58297ec681f3Smrg if (subpass->depth_stencil_attachment && 58307ec681f3Smrg !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment] 58317ec681f3Smrg .iview->image->l2_coherent) 58327ec681f3Smrg cmd_buffer->state.rb_noncoherent_dirty = true; 583301e04c3fSmrg} 583401e04c3fSmrg 58357ec681f3Smrgvoid 58367ec681f3Smrgradv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer, 58377ec681f3Smrg const struct radv_subpass *subpass) 583801e04c3fSmrg{ 58397ec681f3Smrg radv_mark_noncoherent_rb(cmd_buffer); 58407ec681f3Smrg radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 584101e04c3fSmrg} 584201e04c3fSmrg 58437ec681f3Smrgstatic void 58447ec681f3Smrgradv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer) 58457ec681f3Smrg{ 58467ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 58477ec681f3Smrg const struct radv_subpass *subpass = state->subpass; 58487ec681f3Smrg uint32_t subpass_id = radv_get_subpass_id(cmd_buffer); 58497ec681f3Smrg 58507ec681f3Smrg radv_cmd_buffer_resolve_subpass(cmd_buffer); 58517ec681f3Smrg 58527ec681f3Smrg radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC); 58537ec681f3Smrg 58547ec681f3Smrg for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 58557ec681f3Smrg const uint32_t a = subpass->attachments[i].attachment; 58567ec681f3Smrg if (a == VK_ATTACHMENT_UNUSED) 58577ec681f3Smrg continue; 58587ec681f3Smrg 58597ec681f3Smrg if (state->pass->attachments[a].last_subpass_idx != subpass_id) 58607ec681f3Smrg continue; 58617ec681f3Smrg 58627ec681f3Smrg 
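      /* Only the last subpass that uses an attachment transitions it to its final
       * layout, which is what the last_subpass_idx check above guarantees. */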
VkImageLayout layout = state->pass->attachments[a].final_layout; 58637ec681f3Smrg VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout; 58647ec681f3Smrg struct radv_subpass_attachment att = {a, layout, stencil_layout}; 58657ec681f3Smrg radv_handle_subpass_image_transition(cmd_buffer, att, false); 58667ec681f3Smrg } 58677ec681f3Smrg 58687ec681f3Smrg radv_describe_barrier_end(cmd_buffer); 586901e04c3fSmrg} 587001e04c3fSmrg 58717ec681f3Smrgvoid 58727ec681f3Smrgradv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer, 58737ec681f3Smrg const VkRenderPassBeginInfo *pRenderPassBegin, 58747ec681f3Smrg const struct radv_extra_render_pass_begin_info *extra_info) 587501e04c3fSmrg{ 58767ec681f3Smrg RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass); 58777ec681f3Smrg RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); 58787ec681f3Smrg VkResult result; 587901e04c3fSmrg 58807ec681f3Smrg cmd_buffer->state.framebuffer = framebuffer; 58817ec681f3Smrg cmd_buffer->state.pass = pass; 58827ec681f3Smrg cmd_buffer->state.render_area = pRenderPassBegin->renderArea; 588301e04c3fSmrg 58847ec681f3Smrg result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info); 58857ec681f3Smrg if (result != VK_SUCCESS) 58867ec681f3Smrg return; 588701e04c3fSmrg 58887ec681f3Smrg result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin); 58897ec681f3Smrg if (result != VK_SUCCESS) 58907ec681f3Smrg return; 58917ec681f3Smrg} 589201e04c3fSmrg 58937ec681f3Smrgvoid 58947ec681f3Smrgradv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, 58957ec681f3Smrg const VkRenderPassBeginInfo *pRenderPassBeginInfo, 58967ec681f3Smrg const VkSubpassBeginInfo *pSubpassBeginInfo) 58977ec681f3Smrg{ 58987ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 589901e04c3fSmrg 59007ec681f3Smrg radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL); 590101e04c3fSmrg 59027ec681f3Smrg radv_cmd_buffer_begin_subpass(cmd_buffer, 0); 590301e04c3fSmrg} 590401e04c3fSmrg 59057ec681f3Smrgvoid 59067ec681f3Smrgradv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo, 59077ec681f3Smrg const VkSubpassEndInfo *pSubpassEndInfo) 590801e04c3fSmrg{ 59097ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 591001e04c3fSmrg 59117ec681f3Smrg radv_mark_noncoherent_rb(cmd_buffer); 5912ed98bd31Smaya 59137ec681f3Smrg uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer); 59147ec681f3Smrg radv_cmd_buffer_end_subpass(cmd_buffer); 59157ec681f3Smrg radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); 591601e04c3fSmrg} 591701e04c3fSmrg 591801e04c3fSmrgstatic void 59197ec681f3Smrgradv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) 59207ec681f3Smrg{ 59217ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 59227ec681f3Smrg for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { 59237ec681f3Smrg if (!radv_get_shader(pipeline, stage)) 59247ec681f3Smrg continue; 59257ec681f3Smrg 59267ec681f3Smrg struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX); 59277ec681f3Smrg if (loc->sgpr_idx == -1) 59287ec681f3Smrg continue; 59297ec681f3Smrg uint32_t base_reg = pipeline->user_data_0[stage]; 59307ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); 59317ec681f3Smrg } 59327ec681f3Smrg if (radv_pipeline_has_gs_copy_shader(pipeline)) { 
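      /* The GS copy shader runs on the hardware VS stage, hence the fixed
       * SPI_SHADER_USER_DATA_VS register base used here. */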
59337ec681f3Smrg struct radv_userdata_info *loc =
59347ec681f3Smrg &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
59357ec681f3Smrg if (loc->sgpr_idx != -1) {
59367ec681f3Smrg uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
59377ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
59387ec681f3Smrg }
59397ec681f3Smrg }
59407ec681f3Smrg}
59417ec681f3Smrg
59427ec681f3Smrgstatic void
59437ec681f3Smrgradv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
59447ec681f3Smrg uint32_t use_opaque)
594501e04c3fSmrg{
59467ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
59477ec681f3Smrg radeon_emit(cmd_buffer->cs, vertex_count);
59487ec681f3Smrg radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
59497ec681f3Smrg}
59507ec681f3Smrg
59517ec681f3Smrg/**
59527ec681f3Smrg * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
59537ec681f3Smrg *
59547ec681f3Smrg * The starting address "index_va" may point anywhere within the index buffer. The number of
59557ec681f3Smrg * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
59567ec681f3Smrg * Hardware uses this information to return 0 for out-of-bounds reads.
59577ec681f3Smrg */
59587ec681f3Smrgstatic void
59597ec681f3Smrgradv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
59607ec681f3Smrg uint32_t max_index_count, uint32_t index_count, bool not_eop)
59617ec681f3Smrg{
59627ec681f3Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
59637ec681f3Smrg radeon_emit(cmd_buffer->cs, max_index_count);
59647ec681f3Smrg radeon_emit(cmd_buffer->cs, index_va);
59657ec681f3Smrg radeon_emit(cmd_buffer->cs, index_va >> 32);
59667ec681f3Smrg radeon_emit(cmd_buffer->cs, index_count);
59677ec681f3Smrg /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
59687ec681f3Smrg * can be changed between draws and GS fast launch must be disabled.
59697ec681f3Smrg * NOT_EOP doesn't work on gfx9 and older.
59707ec681f3Smrg */
59717ec681f3Smrg radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
59727ec681f3Smrg}
59737ec681f3Smrg
59747ec681f3Smrg/* MUST inline this function to avoid massive perf loss in drawoverhead */
59757ec681f3SmrgALWAYS_INLINE static void
59767ec681f3Smrgradv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
59777ec681f3Smrg uint32_t draw_count, uint64_t count_va, uint32_t stride)
59787ec681f3Smrg{
59797ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs;
59807ec681f3Smrg const unsigned di_src_sel = indexed ?
V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; 59817ec681f3Smrg bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid; 59827ec681f3Smrg uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; 59837ec681f3Smrg uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0; 59847ec681f3Smrg bool predicating = cmd_buffer->state.predicating; 59857ec681f3Smrg assert(base_reg); 59867ec681f3Smrg 59877ec681f3Smrg /* just reset draw state for vertex data */ 59887ec681f3Smrg cmd_buffer->state.last_first_instance = -1; 59897ec681f3Smrg cmd_buffer->state.last_num_instances = -1; 59907ec681f3Smrg cmd_buffer->state.last_drawid = -1; 59917ec681f3Smrg cmd_buffer->state.last_vertex_offset = -1; 59927ec681f3Smrg 59937ec681f3Smrg vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2; 59947ec681f3Smrg if (cmd_buffer->state.pipeline->graphics.uses_baseinstance) 59957ec681f3Smrg start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2; 59967ec681f3Smrg if (draw_id_enable) 59977ec681f3Smrg draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2; 59987ec681f3Smrg 59997ec681f3Smrg if (draw_count == 1 && !count_va && !draw_id_enable) { 60007ec681f3Smrg radeon_emit(cs, 60017ec681f3Smrg PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating)); 60027ec681f3Smrg radeon_emit(cs, 0); 60037ec681f3Smrg radeon_emit(cs, vertex_offset_reg); 60047ec681f3Smrg radeon_emit(cs, start_instance_reg); 60057ec681f3Smrg radeon_emit(cs, di_src_sel); 60067ec681f3Smrg } else { 60077ec681f3Smrg radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, 60087ec681f3Smrg predicating)); 60097ec681f3Smrg radeon_emit(cs, 0); 60107ec681f3Smrg radeon_emit(cs, vertex_offset_reg); 60117ec681f3Smrg radeon_emit(cs, start_instance_reg); 60127ec681f3Smrg radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | 60137ec681f3Smrg S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); 60147ec681f3Smrg radeon_emit(cs, draw_count); /* count */ 60157ec681f3Smrg radeon_emit(cs, count_va); /* count_addr */ 60167ec681f3Smrg radeon_emit(cs, count_va >> 32); 60177ec681f3Smrg radeon_emit(cs, stride); /* stride */ 60187ec681f3Smrg radeon_emit(cs, di_src_sel); 60197ec681f3Smrg 60207ec681f3Smrg cmd_buffer->state.uses_draw_indirect_multi = true; 60217ec681f3Smrg } 60227ec681f3Smrg} 60237ec681f3Smrg 60247ec681f3Smrgstatic inline void 60257ec681f3Smrgradv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, 60267ec681f3Smrg const struct radv_draw_info *info, const uint32_t vertex_offset) 60277ec681f3Smrg{ 60287ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 60297ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 60307ec681f3Smrg const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance; 60317ec681f3Smrg const bool uses_drawid = state->pipeline->graphics.uses_drawid; 60327ec681f3Smrg radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 60337ec681f3Smrg state->pipeline->graphics.vtx_emit_num); 60347ec681f3Smrg 60357ec681f3Smrg radeon_emit(cs, vertex_offset); 60367ec681f3Smrg state->last_vertex_offset = vertex_offset; 60377ec681f3Smrg if (uses_drawid) { 60387ec681f3Smrg radeon_emit(cs, 0); 60397ec681f3Smrg state->last_drawid = 0; 60407ec681f3Smrg } 60417ec681f3Smrg if (uses_baseinstance) { 60427ec681f3Smrg radeon_emit(cs, info->first_instance); 60437ec681f3Smrg state->last_first_instance = info->first_instance; 60447ec681f3Smrg } 60457ec681f3Smrg} 60467ec681f3Smrg 
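/* A sketch of the vertex user-SGPR layout that the helpers above and below assume;
 * the offsets are for illustration only (the real base is vtx_base_sgpr and the
 * count is vtx_emit_num):
 *
 *   vtx_base_sgpr + 0:      vertex offset / first vertex  (always emitted)
 *   vtx_base_sgpr + 4:      draw id                       (only when uses_drawid)
 *   vtx_base_sgpr + 4 or 8: first instance                (only when uses_baseinstance,
 *                                                          pushed back when draw id is used)
 *
 * This is consistent with the (draw_id_enable ? 8 : 4) offset computed when building
 * indirect draw packets.
 */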
60477ec681f3SmrgALWAYS_INLINE static void 60487ec681f3Smrgradv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 60497ec681f3Smrg const uint32_t vertex_offset) 60507ec681f3Smrg{ 60517ec681f3Smrg const struct radv_cmd_state *state = &cmd_buffer->state; 60527ec681f3Smrg const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance; 60537ec681f3Smrg const bool uses_drawid = state->pipeline->graphics.uses_drawid; 60547ec681f3Smrg 60557ec681f3Smrg /* this looks very dumb, but it allows the compiler to optimize better and yields 60567ec681f3Smrg * ~3-4% perf increase in drawoverhead 60577ec681f3Smrg */ 60587ec681f3Smrg if (vertex_offset != state->last_vertex_offset) { 60597ec681f3Smrg radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 60607ec681f3Smrg } else if (uses_drawid && 0 != state->last_drawid) { 60617ec681f3Smrg radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 60627ec681f3Smrg } else if (uses_baseinstance && info->first_instance != state->last_first_instance) { 60637ec681f3Smrg radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 60647ec681f3Smrg } 60657ec681f3Smrg} 60667ec681f3Smrg 60677ec681f3SmrgALWAYS_INLINE static void 60687ec681f3Smrgradv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid) 60697ec681f3Smrg{ 60707ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 60717ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 60727ec681f3Smrg radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid); 60737ec681f3Smrg radeon_emit(cs, vertex_offset); 60747ec681f3Smrg state->last_vertex_offset = vertex_offset; 60757ec681f3Smrg if (drawid) 60767ec681f3Smrg radeon_emit(cs, drawid); 60777ec681f3Smrg 60787ec681f3Smrg} 60797ec681f3Smrg 60807ec681f3SmrgALWAYS_INLINE static void 60817ec681f3Smrgradv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, 60827ec681f3Smrg const struct radv_draw_info *info, 60837ec681f3Smrg uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, 60847ec681f3Smrg uint32_t stride, 60857ec681f3Smrg const int32_t *vertexOffset) 60867ec681f3Smrg 60877ec681f3Smrg{ 60887ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 60897ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 60907ec681f3Smrg const int index_size = radv_get_vgt_index_size(state->index_type); 60917ec681f3Smrg unsigned i = 0; 60927ec681f3Smrg const bool uses_drawid = state->pipeline->graphics.uses_drawid; 60937ec681f3Smrg const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10; 60947ec681f3Smrg 60957ec681f3Smrg if (uses_drawid) { 60967ec681f3Smrg if (vertexOffset) { 60977ec681f3Smrg radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 60987ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 60997ec681f3Smrg const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 61007ec681f3Smrg 61017ec681f3Smrg /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 61027ec681f3Smrg if (!remaining_indexes && 61037ec681f3Smrg cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 61047ec681f3Smrg continue; 61057ec681f3Smrg 61067ec681f3Smrg if (i > 0) 61077ec681f3Smrg radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i); 61087ec681f3Smrg 61097ec681f3Smrg const uint64_t index_va = state->index_va + 
draw->firstIndex * index_size; 61107ec681f3Smrg 61117ec681f3Smrg if (!state->subpass->view_mask) { 61127ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 61137ec681f3Smrg } else { 61147ec681f3Smrg u_foreach_bit(view, state->subpass->view_mask) { 61157ec681f3Smrg radv_emit_view_index(cmd_buffer, view); 61167ec681f3Smrg 61177ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 61187ec681f3Smrg } 61197ec681f3Smrg } 61207ec681f3Smrg } 61217ec681f3Smrg } else { 61227ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 61237ec681f3Smrg const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 61247ec681f3Smrg 61257ec681f3Smrg /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 61267ec681f3Smrg if (!remaining_indexes && 61277ec681f3Smrg cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 61287ec681f3Smrg continue; 61297ec681f3Smrg 61307ec681f3Smrg if (i > 0) { 61317ec681f3Smrg if (state->last_vertex_offset != draw->vertexOffset) 61327ec681f3Smrg radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i); 61337ec681f3Smrg else 61347ec681f3Smrg radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i); 61357ec681f3Smrg } else 61367ec681f3Smrg radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 61377ec681f3Smrg 61387ec681f3Smrg const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 61397ec681f3Smrg 61407ec681f3Smrg if (!state->subpass->view_mask) { 61417ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 61427ec681f3Smrg } else { 61437ec681f3Smrg u_foreach_bit(view, state->subpass->view_mask) { 61447ec681f3Smrg radv_emit_view_index(cmd_buffer, view); 61457ec681f3Smrg 61467ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 61477ec681f3Smrg } 61487ec681f3Smrg } 61497ec681f3Smrg } 61507ec681f3Smrg } 61517ec681f3Smrg if (drawCount > 1) { 61527ec681f3Smrg state->last_drawid = drawCount - 1; 61537ec681f3Smrg } 61547ec681f3Smrg } else { 61557ec681f3Smrg if (vertexOffset) { 61567ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) { 61577ec681f3Smrg /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have 61587ec681f3Smrg * count == 0 for the last draw that doesn't have NOT_EOP. 
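              * In other words, in a chain of draws merged with NOT_EOP the final draw
              * (the one that clears NOT_EOP) must have a nonzero index count, so the
              * loop below trims trailing empty draws from drawCount.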
61597ec681f3Smrg */ 61607ec681f3Smrg while (drawCount > 1) { 61617ec681f3Smrg const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride); 61627ec681f3Smrg if (last->indexCount) 61637ec681f3Smrg break; 61647ec681f3Smrg drawCount--; 61657ec681f3Smrg } 61667ec681f3Smrg } 61677ec681f3Smrg 61687ec681f3Smrg radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 61697ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 61707ec681f3Smrg const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 61717ec681f3Smrg 61727ec681f3Smrg /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 61737ec681f3Smrg if (!remaining_indexes && 61747ec681f3Smrg cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 61757ec681f3Smrg continue; 61767ec681f3Smrg 61777ec681f3Smrg const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 61787ec681f3Smrg 61797ec681f3Smrg if (!state->subpass->view_mask) { 61807ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1); 61817ec681f3Smrg } else { 61827ec681f3Smrg u_foreach_bit(view, state->subpass->view_mask) { 61837ec681f3Smrg radv_emit_view_index(cmd_buffer, view); 61847ec681f3Smrg 61857ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 61867ec681f3Smrg } 61877ec681f3Smrg } 61887ec681f3Smrg } 61897ec681f3Smrg } else { 61907ec681f3Smrg vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 61917ec681f3Smrg const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 61927ec681f3Smrg 61937ec681f3Smrg /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 61947ec681f3Smrg if (!remaining_indexes && 61957ec681f3Smrg cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 61967ec681f3Smrg continue; 61977ec681f3Smrg 61987ec681f3Smrg const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? 
((uint8_t*)draw + stride) : NULL); 61997ec681f3Smrg const bool offset_changes = next && next->vertexOffset != draw->vertexOffset; 62007ec681f3Smrg radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 62017ec681f3Smrg 62027ec681f3Smrg const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 62037ec681f3Smrg 62047ec681f3Smrg if (!state->subpass->view_mask) { 62057ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1); 62067ec681f3Smrg } else { 62077ec681f3Smrg u_foreach_bit(view, state->subpass->view_mask) { 62087ec681f3Smrg radv_emit_view_index(cmd_buffer, view); 62097ec681f3Smrg 62107ec681f3Smrg radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 62117ec681f3Smrg } 62127ec681f3Smrg } 62137ec681f3Smrg } 62147ec681f3Smrg } 62157ec681f3Smrg if (drawCount > 1) { 62167ec681f3Smrg state->last_drawid = drawCount - 1; 62177ec681f3Smrg } 62187ec681f3Smrg } 62197ec681f3Smrg} 62207ec681f3Smrg 62217ec681f3SmrgALWAYS_INLINE static void 62227ec681f3Smrgradv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 62237ec681f3Smrg uint32_t drawCount, const VkMultiDrawInfoEXT *minfo, 62247ec681f3Smrg uint32_t use_opaque, uint32_t stride) 62257ec681f3Smrg{ 62267ec681f3Smrg unsigned i = 0; 62277ec681f3Smrg const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 62287ec681f3Smrg const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid; 62297ec681f3Smrg uint32_t last_start = 0; 62307ec681f3Smrg 62317ec681f3Smrg vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) { 62327ec681f3Smrg if (!i) 62337ec681f3Smrg radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex); 62347ec681f3Smrg else 62357ec681f3Smrg radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0); 62367ec681f3Smrg 62377ec681f3Smrg if (!view_mask) { 62387ec681f3Smrg radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 62397ec681f3Smrg } else { 62407ec681f3Smrg u_foreach_bit(view, view_mask) { 62417ec681f3Smrg radv_emit_view_index(cmd_buffer, view); 62427ec681f3Smrg radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 62437ec681f3Smrg } 62447ec681f3Smrg } 62457ec681f3Smrg last_start = draw->firstVertex; 62467ec681f3Smrg } 62477ec681f3Smrg if (drawCount > 1) { 62487ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 62497ec681f3Smrg state->last_vertex_offset = last_start; 62507ec681f3Smrg if (uses_drawid) 62517ec681f3Smrg state->last_drawid = drawCount - 1; 62527ec681f3Smrg } 62537ec681f3Smrg} 62547ec681f3Smrg 62557ec681f3Smrgstatic void 62567ec681f3Smrgradv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, 62577ec681f3Smrg const struct radv_draw_info *info) 62587ec681f3Smrg{ 62597ec681f3Smrg const struct radv_cmd_state *state = &cmd_buffer->state; 62607ec681f3Smrg struct radeon_winsys *ws = cmd_buffer->device->ws; 62617ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 62627ec681f3Smrg const uint64_t va = 62637ec681f3Smrg radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset; 62647ec681f3Smrg const uint64_t count_va = info->count_buffer 62657ec681f3Smrg ? 
radv_buffer_get_va(info->count_buffer->bo) + 62667ec681f3Smrg info->count_buffer->offset + info->count_buffer_offset 62677ec681f3Smrg : 0; 62687ec681f3Smrg 62697ec681f3Smrg radv_cs_add_buffer(ws, cs, info->indirect->bo); 62707ec681f3Smrg 62717ec681f3Smrg radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); 62727ec681f3Smrg radeon_emit(cs, 1); 62737ec681f3Smrg radeon_emit(cs, va); 62747ec681f3Smrg radeon_emit(cs, va >> 32); 62757ec681f3Smrg 62767ec681f3Smrg if (info->count_buffer) { 62777ec681f3Smrg radv_cs_add_buffer(ws, cs, info->count_buffer->bo); 62787ec681f3Smrg } 62797ec681f3Smrg 62807ec681f3Smrg if (!state->subpass->view_mask) { 62817ec681f3Smrg radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 62827ec681f3Smrg info->stride); 62837ec681f3Smrg } else { 62847ec681f3Smrg u_foreach_bit(i, state->subpass->view_mask) 62857ec681f3Smrg { 62867ec681f3Smrg radv_emit_view_index(cmd_buffer, i); 62877ec681f3Smrg 62887ec681f3Smrg radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 62897ec681f3Smrg info->stride); 62907ec681f3Smrg } 62917ec681f3Smrg } 62927ec681f3Smrg} 62937ec681f3Smrg 62947ec681f3Smrg/* 62957ec681f3Smrg * Vega and raven have a bug which triggers if there are multiple context 62967ec681f3Smrg * register contexts active at the same time with different scissor values. 62977ec681f3Smrg * 62987ec681f3Smrg * There are two possible workarounds: 62997ec681f3Smrg * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way 63007ec681f3Smrg * there is only ever 1 active set of scissor values at the same time. 63017ec681f3Smrg * 63027ec681f3Smrg * 2) Whenever the hardware switches contexts we have to set the scissor 63037ec681f3Smrg * registers again even if it is a noop. That way the new context gets 63047ec681f3Smrg * the correct scissor values. 63057ec681f3Smrg * 63067ec681f3Smrg * This implements option 2. radv_need_late_scissor_emission needs to 63077ec681f3Smrg * return true on affected HW if radv_emit_all_graphics_states sets 63087ec681f3Smrg * any context registers. 63097ec681f3Smrg */ 63107ec681f3Smrgstatic bool 63117ec681f3Smrgradv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, 63127ec681f3Smrg const struct radv_draw_info *info) 63137ec681f3Smrg{ 63147ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 63157ec681f3Smrg 63167ec681f3Smrg if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 63177ec681f3Smrg return false; 63187ec681f3Smrg 63197ec681f3Smrg if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer) 63207ec681f3Smrg return true; 63217ec681f3Smrg 63227ec681f3Smrg uint64_t used_states = 63237ec681f3Smrg cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; 63247ec681f3Smrg 63257ec681f3Smrg /* Index, vertex and streamout buffers don't change context regs, and 63267ec681f3Smrg * pipeline is already handled. 
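    * They are therefore masked out of used_states below.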
63277ec681f3Smrg */ 63287ec681f3Smrg used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | 63297ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER | 63307ec681f3Smrg RADV_CMD_DIRTY_PIPELINE); 63317ec681f3Smrg 63327ec681f3Smrg if (cmd_buffer->state.dirty & used_states) 63337ec681f3Smrg return true; 63347ec681f3Smrg 63357ec681f3Smrg uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer); 63367ec681f3Smrg 63377ec681f3Smrg if (info->indexed && state->dynamic.primitive_restart_enable && 63387ec681f3Smrg primitive_reset_index != state->last_primitive_reset_index) 63397ec681f3Smrg return true; 63407ec681f3Smrg 63417ec681f3Smrg return false; 63427ec681f3Smrg} 63437ec681f3Smrg 63447ec681f3Smrgenum { 63457ec681f3Smrg ngg_cull_none = 0, 63467ec681f3Smrg ngg_cull_front_face = 1, 63477ec681f3Smrg ngg_cull_back_face = 2, 63487ec681f3Smrg ngg_cull_face_is_ccw = 4, 63497ec681f3Smrg ngg_cull_small_primitives = 8, 63507ec681f3Smrg}; 63517ec681f3Smrg 63527ec681f3SmrgALWAYS_INLINE static bool 63537ec681f3Smrgradv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt, 63547ec681f3Smrg bool indirect) 63557ec681f3Smrg{ 63567ec681f3Smrg /* If we have to draw only a few vertices, we get better latency if 63577ec681f3Smrg * we disable NGG culling. 63587ec681f3Smrg * 63597ec681f3Smrg * When tessellation is used, what matters is the number of tessellated 63607ec681f3Smrg * vertices, so let's always assume it's not a small draw. 63617ec681f3Smrg */ 63627ec681f3Smrg return !has_tess && !indirect && vtx_cnt < 128; 63637ec681f3Smrg} 63647ec681f3Smrg 63657ec681f3SmrgALWAYS_INLINE static uint32_t 63667ec681f3Smrgradv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted) 63677ec681f3Smrg{ 63687ec681f3Smrg const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 63697ec681f3Smrg const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 63707ec681f3Smrg 63717ec681f3Smrg /* Cull every triangle when rasterizer discard is enabled. */ 63727ec681f3Smrg if (d->rasterizer_discard_enable || 63737ec681f3Smrg G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl)) 63747ec681f3Smrg return ngg_cull_front_face | ngg_cull_back_face; 63757ec681f3Smrg 63767ec681f3Smrg uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl; 63777ec681f3Smrg uint32_t nggc_settings = ngg_cull_none; 63787ec681f3Smrg 63797ec681f3Smrg /* The culling code needs to know whether face is CW or CCW. */ 63807ec681f3Smrg bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE) 63817ec681f3Smrg ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE 63827ec681f3Smrg : G_028814_FACE(pa_su_sc_mode_cntl) == 0; 63837ec681f3Smrg 63847ec681f3Smrg /* Take inverted viewport into account. */ 63857ec681f3Smrg ccw ^= vp_y_inverted; 63867ec681f3Smrg 63877ec681f3Smrg if (ccw) 63887ec681f3Smrg nggc_settings |= ngg_cull_face_is_ccw; 63897ec681f3Smrg 63907ec681f3Smrg /* Face culling settings. */ 63917ec681f3Smrg if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) 63927ec681f3Smrg ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT) 63937ec681f3Smrg : G_028814_CULL_FRONT(pa_su_sc_mode_cntl)) 63947ec681f3Smrg nggc_settings |= ngg_cull_front_face; 63957ec681f3Smrg if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) 63967ec681f3Smrg ? 
(d->cull_mode & VK_CULL_MODE_BACK_BIT) 63977ec681f3Smrg : G_028814_CULL_BACK(pa_su_sc_mode_cntl)) 63987ec681f3Smrg nggc_settings |= ngg_cull_back_face; 63997ec681f3Smrg 64007ec681f3Smrg /* Small primitive culling is only valid when conservative overestimation is not used. */ 64017ec681f3Smrg if (!pipeline->graphics.uses_conservative_overestimate) { 64027ec681f3Smrg nggc_settings |= ngg_cull_small_primitives; 64037ec681f3Smrg 64047ec681f3Smrg /* small_prim_precision = num_samples / 2^subpixel_bits 64057ec681f3Smrg * num_samples is also always a power of two, so the small prim precision can only be 64067ec681f3Smrg * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent. 64077ec681f3Smrg */ 64087ec681f3Smrg unsigned subpixel_bits = 256; 64097ec681f3Smrg int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits); 64107ec681f3Smrg nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u); 64117ec681f3Smrg } 64127ec681f3Smrg 64137ec681f3Smrg return nggc_settings; 64147ec681f3Smrg} 64157ec681f3Smrg 64167ec681f3Smrgstatic void 64177ec681f3Smrgradv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 64187ec681f3Smrg{ 64197ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; 64207ec681f3Smrg const unsigned stage = pipeline->graphics.last_vgt_api_stage; 64217ec681f3Smrg const bool nggc_supported = pipeline->graphics.has_ngg_culling; 64227ec681f3Smrg 64237ec681f3Smrg if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) { 64247ec681f3Smrg /* Current shader doesn't support culling and culling was already disabled: 64257ec681f3Smrg * No further steps needed, just remember the SGPR's location is not set. 64267ec681f3Smrg */ 64277ec681f3Smrg cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 64287ec681f3Smrg return; 64297ec681f3Smrg } 64307ec681f3Smrg 64317ec681f3Smrg /* Check dirty flags: 64327ec681f3Smrg * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed). 64337ec681f3Smrg * - Dirty dynamic flags: culling settings may have changed. 64347ec681f3Smrg */ 64357ec681f3Smrg const bool dirty = 64367ec681f3Smrg cmd_buffer->state.dirty & 64377ec681f3Smrg (RADV_CMD_DIRTY_PIPELINE | 64387ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 64397ec681f3Smrg RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT); 64407ec681f3Smrg 64417ec681f3Smrg /* Check small draw status: 64427ec681f3Smrg * For small draw calls, we disable culling by setting the SGPR to 0. 64437ec681f3Smrg */ 64447ec681f3Smrg const bool skip = 64457ec681f3Smrg radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect); 64467ec681f3Smrg 64477ec681f3Smrg /* See if anything changed. */ 64487ec681f3Smrg if (!dirty && skip == cmd_buffer->state.last_nggc_skip) 64497ec681f3Smrg return; 64507ec681f3Smrg 64517ec681f3Smrg /* Remember small draw state. */ 64527ec681f3Smrg cmd_buffer->state.last_nggc_skip = skip; 64537ec681f3Smrg const struct radv_shader_variant *v = pipeline->shaders[stage]; 64547ec681f3Smrg assert(v->info.has_ngg_culling == nggc_supported); 64557ec681f3Smrg 64567ec681f3Smrg /* Find the user SGPR. 
*/ 64577ec681f3Smrg const uint32_t base_reg = pipeline->user_data_0[stage]; 64587ec681f3Smrg const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx; 64597ec681f3Smrg assert(!nggc_supported || nggc_sgpr_idx != -1); 64607ec681f3Smrg 64617ec681f3Smrg /* Get viewport transform. */ 64627ec681f3Smrg float vp_scale[2], vp_translate[2]; 64637ec681f3Smrg memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float)); 64647ec681f3Smrg memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float)); 64657ec681f3Smrg bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]); 64667ec681f3Smrg 64677ec681f3Smrg /* Get current culling settings. */ 64687ec681f3Smrg uint32_t nggc_settings = nggc_supported && !skip 64697ec681f3Smrg ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted) 64707ec681f3Smrg : ngg_cull_none; 64717ec681f3Smrg 64727ec681f3Smrg bool emit_viewport = nggc_settings && 64737ec681f3Smrg (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT || 64747ec681f3Smrg cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx || 64757ec681f3Smrg !cmd_buffer->state.last_nggc_settings); 64767ec681f3Smrg 64777ec681f3Smrg if (emit_viewport) { 64787ec681f3Smrg /* Correction for inverted Y */ 64797ec681f3Smrg if (vp_y_inverted) { 64807ec681f3Smrg vp_scale[1] = -vp_scale[1]; 64817ec681f3Smrg vp_translate[1] = -vp_translate[1]; 64827ec681f3Smrg } 64837ec681f3Smrg 64847ec681f3Smrg /* Correction for number of samples per pixel. */ 64857ec681f3Smrg for (unsigned i = 0; i < 2; ++i) { 64867ec681f3Smrg vp_scale[i] *= (float) pipeline->graphics.ms.num_samples; 64877ec681f3Smrg vp_translate[i] *= (float) pipeline->graphics.ms.num_samples; 64887ec681f3Smrg } 64897ec681f3Smrg 64907ec681f3Smrg uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])}; 64917ec681f3Smrg const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx; 64927ec681f3Smrg assert(vp_sgpr_idx != -1); 64937ec681f3Smrg radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4); 64947ec681f3Smrg radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4); 64957ec681f3Smrg } 64967ec681f3Smrg 64977ec681f3Smrg bool emit_settings = nggc_supported && 64987ec681f3Smrg (cmd_buffer->state.last_nggc_settings != nggc_settings || 64997ec681f3Smrg cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx); 65007ec681f3Smrg 65017ec681f3Smrg /* This needs to be emitted when culling is turned on 65027ec681f3Smrg * and when it's already on but some settings change. 65037ec681f3Smrg */ 65047ec681f3Smrg if (emit_settings) { 65057ec681f3Smrg assert(nggc_sgpr_idx >= 0); 65067ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings); 65077ec681f3Smrg } 65087ec681f3Smrg 65097ec681f3Smrg /* These only need to be emitted when culling is turned on or off, 65107ec681f3Smrg * but not when it stays on and just some settings change. 65117ec681f3Smrg */ 65127ec681f3Smrg if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) { 65137ec681f3Smrg uint32_t rsrc2 = v->config.rsrc2; 65147ec681f3Smrg 65157ec681f3Smrg if (!nggc_settings) { 65167ec681f3Smrg /* Allocate less LDS when culling is disabled. (But GS always needs it.) 
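          * The LDS_SIZE field of rsrc2 is rewritten to the shader's
          * num_lds_blocks_when_not_culling value for that case.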
*/ 65177ec681f3Smrg if (stage != MESA_SHADER_GEOMETRY) 65187ec681f3Smrg rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling); 65197ec681f3Smrg } 65207ec681f3Smrg 65217ec681f3Smrg /* When the pipeline is dirty and not yet emitted, don't write it here 65227ec681f3Smrg * because radv_emit_graphics_pipeline will overwrite this register. 65237ec681f3Smrg */ 65247ec681f3Smrg if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) || 65257ec681f3Smrg cmd_buffer->state.emitted_pipeline == pipeline) { 65267ec681f3Smrg radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); 65277ec681f3Smrg } 65287ec681f3Smrg } 65297ec681f3Smrg 65307ec681f3Smrg cmd_buffer->state.last_nggc_settings = nggc_settings; 65317ec681f3Smrg cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx; 65327ec681f3Smrg} 65337ec681f3Smrg 65347ec681f3Smrgstatic void 65357ec681f3Smrgradv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 65367ec681f3Smrg bool pipeline_is_dirty) 65377ec681f3Smrg{ 65387ec681f3Smrg bool late_scissor_emission; 65397ec681f3Smrg 65407ec681f3Smrg if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || 65417ec681f3Smrg cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) 65427ec681f3Smrg radv_emit_rbplus_state(cmd_buffer); 65437ec681f3Smrg 65447ec681f3Smrg if (cmd_buffer->device->physical_device->use_ngg_culling && 65457ec681f3Smrg cmd_buffer->state.pipeline->graphics.is_ngg) 65467ec681f3Smrg radv_emit_ngg_culling_state(cmd_buffer, info); 65477ec681f3Smrg 65487ec681f3Smrg if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) 65497ec681f3Smrg radv_emit_graphics_pipeline(cmd_buffer); 65507ec681f3Smrg 65517ec681f3Smrg /* This should be before the cmd_buffer->state.dirty is cleared 65527ec681f3Smrg * (excluding RADV_CMD_DIRTY_PIPELINE) and after 65537ec681f3Smrg * cmd_buffer->state.context_roll_without_scissor_emitted is set. */ 65547ec681f3Smrg late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info); 65557ec681f3Smrg 65567ec681f3Smrg if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 65577ec681f3Smrg radv_emit_framebuffer_state(cmd_buffer); 65587ec681f3Smrg 65597ec681f3Smrg if (info->indexed) { 65607ec681f3Smrg if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER) 65617ec681f3Smrg radv_emit_index_buffer(cmd_buffer, info->indirect); 65627ec681f3Smrg } else { 65637ec681f3Smrg /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, 65647ec681f3Smrg * so the state must be re-emitted before the next indexed 65657ec681f3Smrg * draw. 
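       * Resetting last_index_type and dirtying RADV_CMD_DIRTY_INDEX_BUFFER below
       * forces that re-emission.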
65667ec681f3Smrg */ 65677ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 65687ec681f3Smrg cmd_buffer->state.last_index_type = -1; 65697ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 65707ec681f3Smrg } 65717ec681f3Smrg } 65727ec681f3Smrg 65737ec681f3Smrg radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty); 65747ec681f3Smrg 65757ec681f3Smrg radv_emit_draw_registers(cmd_buffer, info); 65767ec681f3Smrg 65777ec681f3Smrg if (late_scissor_emission) 65787ec681f3Smrg radv_emit_scissor(cmd_buffer); 65797ec681f3Smrg} 65807ec681f3Smrg 65817ec681f3Smrg/* MUST inline this function to avoid massive perf loss in drawoverhead */ 65827ec681f3SmrgALWAYS_INLINE static bool 65837ec681f3Smrgradv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount) 65847ec681f3Smrg{ 65857ec681f3Smrg const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 65867ec681f3Smrg const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) && 65877ec681f3Smrg cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline; 65887ec681f3Smrg 65897ec681f3Smrg ASSERTED const unsigned cdw_max = 65907ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1)); 65917ec681f3Smrg 65927ec681f3Smrg if (likely(!info->indirect)) { 65937ec681f3Smrg /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is 65947ec681f3Smrg * no workaround for indirect draws, but we can at least skip 65957ec681f3Smrg * direct draws. 65967ec681f3Smrg */ 65977ec681f3Smrg if (unlikely(!info->instance_count)) 65987ec681f3Smrg return false; 65997ec681f3Smrg 66007ec681f3Smrg /* Handle count == 0. */ 66017ec681f3Smrg if (unlikely(!info->count && !info->strmout_buffer)) 66027ec681f3Smrg return false; 66037ec681f3Smrg } 66047ec681f3Smrg 66057ec681f3Smrg /* Need to apply this workaround early as it can set flush flags. */ 66067ec681f3Smrg if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 66077ec681f3Smrg radv_emit_fb_mip_change_flush(cmd_buffer); 66087ec681f3Smrg 66097ec681f3Smrg /* Use optimal packet order based on whether we need to sync the 66107ec681f3Smrg * pipeline. 66117ec681f3Smrg */ 66127ec681f3Smrg if (cmd_buffer->state.flush_bits & 66137ec681f3Smrg (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 66147ec681f3Smrg RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 66157ec681f3Smrg /* If we have to wait for idle, set all states first, so that 66167ec681f3Smrg * all SET packets are processed in parallel with previous draw 66177ec681f3Smrg * calls. Then upload descriptors, set shader pointers, and 66187ec681f3Smrg * draw, and prefetch at the end. This ensures that the time 66197ec681f3Smrg * the CUs are idle is very short. (there are only SET_SH 66207ec681f3Smrg * packets between the wait and the draw) 66217ec681f3Smrg */ 66227ec681f3Smrg radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 66237ec681f3Smrg si_emit_cache_flush(cmd_buffer); 66247ec681f3Smrg /* <-- CUs are idle here --> */ 66257ec681f3Smrg 66267ec681f3Smrg radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 66277ec681f3Smrg } else { 66287ec681f3Smrg /* If we don't wait for idle, start prefetches first, then set 66297ec681f3Smrg * states, and draw at the end. 
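       * Only the vertex shader and VBO descriptors are prefetched before the draw;
       * the remaining shaders are prefetched in radv_after_draw().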
66307ec681f3Smrg */ 66317ec681f3Smrg si_emit_cache_flush(cmd_buffer); 66327ec681f3Smrg 66337ec681f3Smrg if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 66347ec681f3Smrg /* Only prefetch the vertex shader and VBO descriptors 66357ec681f3Smrg * in order to start the draw as soon as possible. 66367ec681f3Smrg */ 66377ec681f3Smrg radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true); 66387ec681f3Smrg } 66397ec681f3Smrg 66407ec681f3Smrg radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 66417ec681f3Smrg 66427ec681f3Smrg radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 66437ec681f3Smrg } 66447ec681f3Smrg 66457ec681f3Smrg radv_describe_draw(cmd_buffer); 66467ec681f3Smrg if (likely(!info->indirect)) { 66477ec681f3Smrg struct radv_cmd_state *state = &cmd_buffer->state; 66487ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 66497ec681f3Smrg assert(state->pipeline->graphics.vtx_base_sgpr); 66507ec681f3Smrg if (state->last_num_instances != info->instance_count) { 66517ec681f3Smrg radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false)); 66527ec681f3Smrg radeon_emit(cs, info->instance_count); 66537ec681f3Smrg state->last_num_instances = info->instance_count; 66547ec681f3Smrg } 66557ec681f3Smrg } 66567ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 66577ec681f3Smrg 66587ec681f3Smrg return true; 66597ec681f3Smrg} 66607ec681f3Smrg 66617ec681f3Smrgstatic void 66627ec681f3Smrgradv_after_draw(struct radv_cmd_buffer *cmd_buffer) 66637ec681f3Smrg{ 66647ec681f3Smrg const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info; 66657ec681f3Smrg bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 66667ec681f3Smrg /* Start prefetches after the draw has been started. Both will 66677ec681f3Smrg * run in parallel, but starting the draw first is more 66687ec681f3Smrg * important. 66697ec681f3Smrg */ 66707ec681f3Smrg if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 66717ec681f3Smrg radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false); 66727ec681f3Smrg } 66737ec681f3Smrg 66747ec681f3Smrg /* Workaround for a VGT hang when streamout is enabled. 66757ec681f3Smrg * It must be done after drawing. 
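    * Only Hawaii, Tonga and Fiji are affected; the sync is requested by setting
    * RADV_CMD_FLAG_VGT_STREAMOUT_SYNC.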
66767ec681f3Smrg */ 66777ec681f3Smrg if (cmd_buffer->state.streamout.streamout_enabled && 66787ec681f3Smrg (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA || 66797ec681f3Smrg rad_info->family == CHIP_FIJI)) { 66807ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC; 66817ec681f3Smrg } 66827ec681f3Smrg 66837ec681f3Smrg radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH); 66847ec681f3Smrg} 66857ec681f3Smrg 66867ec681f3Smrgvoid 66877ec681f3Smrgradv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, 66887ec681f3Smrg uint32_t firstVertex, uint32_t firstInstance) 66897ec681f3Smrg{ 66907ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 66917ec681f3Smrg struct radv_draw_info info; 66927ec681f3Smrg 66937ec681f3Smrg info.count = vertexCount; 66947ec681f3Smrg info.instance_count = instanceCount; 66957ec681f3Smrg info.first_instance = firstInstance; 66967ec681f3Smrg info.strmout_buffer = NULL; 66977ec681f3Smrg info.indirect = NULL; 66987ec681f3Smrg info.indexed = false; 66997ec681f3Smrg 67007ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 67017ec681f3Smrg return; 67027ec681f3Smrg const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount }; 67037ec681f3Smrg radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0); 67047ec681f3Smrg radv_after_draw(cmd_buffer); 67057ec681f3Smrg} 67067ec681f3Smrg 67077ec681f3Smrgvoid 67087ec681f3Smrgradv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo, 67097ec681f3Smrg uint32_t instanceCount, uint32_t firstInstance, uint32_t stride) 67107ec681f3Smrg{ 67117ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 67127ec681f3Smrg struct radv_draw_info info; 67137ec681f3Smrg 67147ec681f3Smrg if (!drawCount) 67157ec681f3Smrg return; 67167ec681f3Smrg 67177ec681f3Smrg info.count = pVertexInfo->vertexCount; 67187ec681f3Smrg info.instance_count = instanceCount; 67197ec681f3Smrg info.first_instance = firstInstance; 67207ec681f3Smrg info.strmout_buffer = NULL; 67217ec681f3Smrg info.indirect = NULL; 67227ec681f3Smrg info.indexed = false; 67237ec681f3Smrg 67247ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, drawCount)) 67257ec681f3Smrg return; 67267ec681f3Smrg radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride); 67277ec681f3Smrg radv_after_draw(cmd_buffer); 67287ec681f3Smrg} 67297ec681f3Smrg 67307ec681f3Smrgvoid 67317ec681f3Smrgradv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, 67327ec681f3Smrg uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) 67337ec681f3Smrg{ 67347ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 67357ec681f3Smrg struct radv_draw_info info; 67367ec681f3Smrg 67377ec681f3Smrg info.indexed = true; 67387ec681f3Smrg info.count = indexCount; 67397ec681f3Smrg info.instance_count = instanceCount; 67407ec681f3Smrg info.first_instance = firstInstance; 67417ec681f3Smrg info.strmout_buffer = NULL; 67427ec681f3Smrg info.indirect = NULL; 67437ec681f3Smrg 67447ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 67457ec681f3Smrg return; 67467ec681f3Smrg const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset }; 67477ec681f3Smrg radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL); 67487ec681f3Smrg radv_after_draw(cmd_buffer); 67497ec681f3Smrg} 67507ec681f3Smrg 67517ec681f3Smrgvoid 
radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo, 67527ec681f3Smrg uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset) 67537ec681f3Smrg{ 67547ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 67557ec681f3Smrg struct radv_draw_info info; 67567ec681f3Smrg 67577ec681f3Smrg if (!drawCount) 67587ec681f3Smrg return; 67597ec681f3Smrg 67607ec681f3Smrg const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo; 67617ec681f3Smrg info.indexed = true; 67627ec681f3Smrg info.count = minfo->indexCount; 67637ec681f3Smrg info.instance_count = instanceCount; 67647ec681f3Smrg info.first_instance = firstInstance; 67657ec681f3Smrg info.strmout_buffer = NULL; 67667ec681f3Smrg info.indirect = NULL; 67677ec681f3Smrg 67687ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, drawCount)) 67697ec681f3Smrg return; 67707ec681f3Smrg radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset); 67717ec681f3Smrg radv_after_draw(cmd_buffer); 67727ec681f3Smrg} 67737ec681f3Smrg 67747ec681f3Smrgvoid 67757ec681f3Smrgradv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 67767ec681f3Smrg uint32_t drawCount, uint32_t stride) 67777ec681f3Smrg{ 67787ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 67797ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 67807ec681f3Smrg struct radv_draw_info info; 67817ec681f3Smrg 67827ec681f3Smrg info.count = drawCount; 67837ec681f3Smrg info.indirect = buffer; 67847ec681f3Smrg info.indirect_offset = offset; 67857ec681f3Smrg info.stride = stride; 67867ec681f3Smrg info.strmout_buffer = NULL; 67877ec681f3Smrg info.count_buffer = NULL; 67887ec681f3Smrg info.indexed = false; 67897ec681f3Smrg info.instance_count = 0; 67907ec681f3Smrg 67917ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 67927ec681f3Smrg return; 67937ec681f3Smrg radv_emit_indirect_draw_packets(cmd_buffer, &info); 67947ec681f3Smrg radv_after_draw(cmd_buffer); 67957ec681f3Smrg} 67967ec681f3Smrg 67977ec681f3Smrgvoid 67987ec681f3Smrgradv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 67997ec681f3Smrg uint32_t drawCount, uint32_t stride) 68007ec681f3Smrg{ 68017ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 68027ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 68037ec681f3Smrg struct radv_draw_info info; 68047ec681f3Smrg 68057ec681f3Smrg info.indexed = true; 68067ec681f3Smrg info.count = drawCount; 68077ec681f3Smrg info.indirect = buffer; 68087ec681f3Smrg info.indirect_offset = offset; 68097ec681f3Smrg info.stride = stride; 68107ec681f3Smrg info.count_buffer = NULL; 68117ec681f3Smrg info.strmout_buffer = NULL; 68127ec681f3Smrg info.instance_count = 0; 68137ec681f3Smrg 68147ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 68157ec681f3Smrg return; 68167ec681f3Smrg radv_emit_indirect_draw_packets(cmd_buffer, &info); 68177ec681f3Smrg radv_after_draw(cmd_buffer); 68187ec681f3Smrg} 68197ec681f3Smrg 68207ec681f3Smrgvoid 68217ec681f3Smrgradv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 68227ec681f3Smrg VkBuffer _countBuffer, VkDeviceSize countBufferOffset, 68237ec681f3Smrg uint32_t maxDrawCount, uint32_t stride) 68247ec681f3Smrg{ 68257ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 68267ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 
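   /* The GPU reads the actual draw count from the count buffer at execution time
    * (COUNT_INDIRECT_ENABLE in the indirect packet); maxDrawCount only caps it. */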
68277ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 68287ec681f3Smrg struct radv_draw_info info; 68297ec681f3Smrg 68307ec681f3Smrg info.count = maxDrawCount; 68317ec681f3Smrg info.indirect = buffer; 68327ec681f3Smrg info.indirect_offset = offset; 68337ec681f3Smrg info.count_buffer = count_buffer; 68347ec681f3Smrg info.count_buffer_offset = countBufferOffset; 68357ec681f3Smrg info.stride = stride; 68367ec681f3Smrg info.strmout_buffer = NULL; 68377ec681f3Smrg info.indexed = false; 68387ec681f3Smrg info.instance_count = 0; 68397ec681f3Smrg 68407ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 68417ec681f3Smrg return; 68427ec681f3Smrg radv_emit_indirect_draw_packets(cmd_buffer, &info); 68437ec681f3Smrg radv_after_draw(cmd_buffer); 68447ec681f3Smrg} 68457ec681f3Smrg 68467ec681f3Smrgvoid 68477ec681f3Smrgradv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, 68487ec681f3Smrg VkDeviceSize offset, VkBuffer _countBuffer, 68497ec681f3Smrg VkDeviceSize countBufferOffset, uint32_t maxDrawCount, 68507ec681f3Smrg uint32_t stride) 68517ec681f3Smrg{ 68527ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 68537ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 68547ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 68557ec681f3Smrg struct radv_draw_info info; 68567ec681f3Smrg 68577ec681f3Smrg info.indexed = true; 68587ec681f3Smrg info.count = maxDrawCount; 68597ec681f3Smrg info.indirect = buffer; 68607ec681f3Smrg info.indirect_offset = offset; 68617ec681f3Smrg info.count_buffer = count_buffer; 68627ec681f3Smrg info.count_buffer_offset = countBufferOffset; 68637ec681f3Smrg info.stride = stride; 68647ec681f3Smrg info.strmout_buffer = NULL; 68657ec681f3Smrg info.instance_count = 0; 68667ec681f3Smrg 68677ec681f3Smrg if (!radv_before_draw(cmd_buffer, &info, 1)) 68687ec681f3Smrg return; 68697ec681f3Smrg radv_emit_indirect_draw_packets(cmd_buffer, &info); 68707ec681f3Smrg radv_after_draw(cmd_buffer); 68717ec681f3Smrg} 68727ec681f3Smrg 68737ec681f3Smrgstruct radv_dispatch_info { 68747ec681f3Smrg /** 68757ec681f3Smrg * Determine the layout of the grid (in block units) to be used. 68767ec681f3Smrg */ 68777ec681f3Smrg uint32_t blocks[3]; 68787ec681f3Smrg 68797ec681f3Smrg /** 68807ec681f3Smrg * A starting offset for the grid. If unaligned is set, the offset 68817ec681f3Smrg * must still be aligned. 68827ec681f3Smrg */ 68837ec681f3Smrg uint32_t offsets[3]; 68847ec681f3Smrg /** 68857ec681f3Smrg * Whether it's an unaligned compute dispatch. 68867ec681f3Smrg */ 68877ec681f3Smrg bool unaligned; 68887ec681f3Smrg 68897ec681f3Smrg /** 68907ec681f3Smrg * Indirect compute parameters resource. 
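    * For indirect dispatches this is the BO holding the VkDispatchIndirectCommand;
    * "va" below is its GPU address, from which three consecutive dwords (x, y, z)
    * are read.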
68917ec681f3Smrg */ 68927ec681f3Smrg struct radeon_winsys_bo *indirect; 68937ec681f3Smrg uint64_t va; 68947ec681f3Smrg}; 68957ec681f3Smrg 68967ec681f3Smrgstatic void 68977ec681f3Smrgradv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, 68987ec681f3Smrg const struct radv_dispatch_info *info) 68997ec681f3Smrg{ 69007ec681f3Smrg struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; 69017ec681f3Smrg unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator; 69027ec681f3Smrg struct radeon_winsys *ws = cmd_buffer->device->ws; 69037ec681f3Smrg bool predicating = cmd_buffer->state.predicating; 69047ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 69057ec681f3Smrg struct radv_userdata_info *loc; 69067ec681f3Smrg 69077ec681f3Smrg radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]); 69087ec681f3Smrg 69097ec681f3Smrg loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); 69107ec681f3Smrg 69117ec681f3Smrg ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25); 69127ec681f3Smrg 69137ec681f3Smrg if (compute_shader->info.wave_size == 32) { 69147ec681f3Smrg assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); 69157ec681f3Smrg dispatch_initiator |= S_00B800_CS_W32_EN(1); 69167ec681f3Smrg } 69177ec681f3Smrg 69187ec681f3Smrg if (info->indirect) { 69197ec681f3Smrg radv_cs_add_buffer(ws, cs, info->indirect); 69207ec681f3Smrg 69217ec681f3Smrg if (loc->sgpr_idx != -1) { 69227ec681f3Smrg for (unsigned i = 0; i < 3; ++i) { 69237ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 69247ec681f3Smrg radeon_emit(cs, 69257ec681f3Smrg COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG)); 69267ec681f3Smrg radeon_emit(cs, (info->va + 4 * i)); 69277ec681f3Smrg radeon_emit(cs, (info->va + 4 * i) >> 32); 69287ec681f3Smrg radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); 69297ec681f3Smrg radeon_emit(cs, 0); 69307ec681f3Smrg } 69317ec681f3Smrg } 69327ec681f3Smrg 69337ec681f3Smrg if (radv_cmd_buffer_uses_mec(cmd_buffer)) { 69347ec681f3Smrg radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1)); 69357ec681f3Smrg radeon_emit(cs, info->va); 69367ec681f3Smrg radeon_emit(cs, info->va >> 32); 69377ec681f3Smrg radeon_emit(cs, dispatch_initiator); 69387ec681f3Smrg } else { 69397ec681f3Smrg radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); 69407ec681f3Smrg radeon_emit(cs, 1); 69417ec681f3Smrg radeon_emit(cs, info->va); 69427ec681f3Smrg radeon_emit(cs, info->va >> 32); 69437ec681f3Smrg 69447ec681f3Smrg radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1)); 69457ec681f3Smrg radeon_emit(cs, 0); 69467ec681f3Smrg radeon_emit(cs, dispatch_initiator); 69477ec681f3Smrg } 69487ec681f3Smrg } else { 69497ec681f3Smrg unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]}; 69507ec681f3Smrg unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]}; 69517ec681f3Smrg 69527ec681f3Smrg if (info->unaligned) { 69537ec681f3Smrg unsigned *cs_block_size = compute_shader->info.cs.block_size; 69547ec681f3Smrg unsigned remainder[3]; 69557ec681f3Smrg 69567ec681f3Smrg /* If aligned, these should be an entire block size, 69577ec681f3Smrg * not 0. 
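           * Worked example: blocks[0] = 65 with cs_block_size[0] = 64 gives
           * remainder[0] = 65 + 64 - 128 = 1 (a single-thread partial group), while
           * blocks[0] = 64 gives 64 + 64 - 64 = 64, i.e. a full block.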
69587ec681f3Smrg */ 69597ec681f3Smrg remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]); 69607ec681f3Smrg remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]); 69617ec681f3Smrg remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]); 69627ec681f3Smrg 69637ec681f3Smrg blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); 69647ec681f3Smrg blocks[1] = round_up_u32(blocks[1], cs_block_size[1]); 69657ec681f3Smrg blocks[2] = round_up_u32(blocks[2], cs_block_size[2]); 69667ec681f3Smrg 69677ec681f3Smrg for (unsigned i = 0; i < 3; ++i) { 69687ec681f3Smrg assert(offsets[i] % cs_block_size[i] == 0); 69697ec681f3Smrg offsets[i] /= cs_block_size[i]; 69707ec681f3Smrg } 69717ec681f3Smrg 69727ec681f3Smrg radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); 69737ec681f3Smrg radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | 69747ec681f3Smrg S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); 69757ec681f3Smrg radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | 69767ec681f3Smrg S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); 69777ec681f3Smrg radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | 69787ec681f3Smrg S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); 69797ec681f3Smrg 69807ec681f3Smrg dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); 69817ec681f3Smrg } 69827ec681f3Smrg 69837ec681f3Smrg if (loc->sgpr_idx != -1) { 69847ec681f3Smrg assert(loc->num_sgprs == 3); 69857ec681f3Smrg 69867ec681f3Smrg radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); 69877ec681f3Smrg radeon_emit(cs, blocks[0]); 69887ec681f3Smrg radeon_emit(cs, blocks[1]); 69897ec681f3Smrg radeon_emit(cs, blocks[2]); 69907ec681f3Smrg } 69917ec681f3Smrg 69927ec681f3Smrg if (offsets[0] || offsets[1] || offsets[2]) { 69937ec681f3Smrg radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); 69947ec681f3Smrg radeon_emit(cs, offsets[0]); 69957ec681f3Smrg radeon_emit(cs, offsets[1]); 69967ec681f3Smrg radeon_emit(cs, offsets[2]); 69977ec681f3Smrg 69987ec681f3Smrg /* The blocks in the packet are not counts but end values. */ 69997ec681f3Smrg for (unsigned i = 0; i < 3; ++i) 70007ec681f3Smrg blocks[i] += offsets[i]; 70017ec681f3Smrg } else { 70027ec681f3Smrg dispatch_initiator |= S_00B800_FORCE_START_AT_000(1); 70037ec681f3Smrg } 70047ec681f3Smrg 70057ec681f3Smrg radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1)); 70067ec681f3Smrg radeon_emit(cs, blocks[0]); 70077ec681f3Smrg radeon_emit(cs, blocks[1]); 70087ec681f3Smrg radeon_emit(cs, blocks[2]); 70097ec681f3Smrg radeon_emit(cs, dispatch_initiator); 70107ec681f3Smrg } 70117ec681f3Smrg 70127ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 70137ec681f3Smrg} 70147ec681f3Smrg 70157ec681f3Smrgstatic void 70167ec681f3Smrgradv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, 70177ec681f3Smrg struct radv_pipeline *pipeline, 70187ec681f3Smrg VkPipelineBindPoint bind_point) 70197ec681f3Smrg{ 70207ec681f3Smrg radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point); 70217ec681f3Smrg radv_flush_constants(cmd_buffer, 70227ec681f3Smrg bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 70237ec681f3Smrg ? 
RADV_RT_STAGE_BITS 70247ec681f3Smrg : VK_SHADER_STAGE_COMPUTE_BIT, 70257ec681f3Smrg pipeline, bind_point); 70267ec681f3Smrg} 70277ec681f3Smrg 70287ec681f3Smrgstatic void 70297ec681f3Smrgradv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info, 70307ec681f3Smrg struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 70317ec681f3Smrg{ 70327ec681f3Smrg bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7; 70337ec681f3Smrg bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline; 70347ec681f3Smrg bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug && 70357ec681f3Smrg info->blocks[0] * info->blocks[1] * info->blocks[2] > 256; 70367ec681f3Smrg 70377ec681f3Smrg if (cs_regalloc_hang) 70387ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 70397ec681f3Smrg RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 70407ec681f3Smrg 70417ec681f3Smrg if (cmd_buffer->state.flush_bits & 70427ec681f3Smrg (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 70437ec681f3Smrg RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 70447ec681f3Smrg /* If we have to wait for idle, set all states first, so that 70457ec681f3Smrg * all SET packets are processed in parallel with previous draw 70467ec681f3Smrg * calls. Then upload descriptors, set shader pointers, and 70477ec681f3Smrg * dispatch, and prefetch at the end. This ensures that the 70487ec681f3Smrg * time the CUs are idle is very short. (there are only SET_SH 70497ec681f3Smrg * packets between the wait and the draw) 70507ec681f3Smrg */ 70517ec681f3Smrg radv_emit_compute_pipeline(cmd_buffer, pipeline); 70527ec681f3Smrg si_emit_cache_flush(cmd_buffer); 70537ec681f3Smrg /* <-- CUs are idle here --> */ 70547ec681f3Smrg 70557ec681f3Smrg radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 70567ec681f3Smrg 70577ec681f3Smrg radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 70587ec681f3Smrg /* <-- CUs are busy here --> */ 70597ec681f3Smrg 70607ec681f3Smrg /* Start prefetches after the dispatch has been started. Both 70617ec681f3Smrg * will run in parallel, but starting the dispatch first is 70627ec681f3Smrg * more important. 70637ec681f3Smrg */ 70647ec681f3Smrg if (has_prefetch && pipeline_is_dirty) { 70657ec681f3Smrg radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]); 70667ec681f3Smrg } 70677ec681f3Smrg } else { 70687ec681f3Smrg /* If we don't wait for idle, start prefetches first, then set 70697ec681f3Smrg * states, and dispatch at the end. 70707ec681f3Smrg */ 70717ec681f3Smrg si_emit_cache_flush(cmd_buffer); 70727ec681f3Smrg 70737ec681f3Smrg if (has_prefetch && pipeline_is_dirty) { 70747ec681f3Smrg radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]); 70757ec681f3Smrg } 70767ec681f3Smrg 70777ec681f3Smrg radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 70787ec681f3Smrg 70797ec681f3Smrg radv_emit_compute_pipeline(cmd_buffer, pipeline); 70807ec681f3Smrg radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 70817ec681f3Smrg } 70827ec681f3Smrg 70837ec681f3Smrg if (pipeline_is_dirty) { 70847ec681f3Smrg /* Raytracing uses compute shaders but has separate bind points and pipelines. 70857ec681f3Smrg * So if we set compute userdata & shader registers we should dirty the raytracing 70867ec681f3Smrg * ones and the other way around. 
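       * (Both bind points are driven through the same hardware compute
       * user-data registers, so state emitted for one clobbers the other.)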
70877ec681f3Smrg * 70887ec681f3Smrg * We only need to do this when the pipeline is dirty because when we switch between 70897ec681f3Smrg * the two we always need to switch pipelines. 70907ec681f3Smrg */ 70917ec681f3Smrg radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE 70927ec681f3Smrg ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 70937ec681f3Smrg : VK_PIPELINE_BIND_POINT_COMPUTE); 70947ec681f3Smrg } 70957ec681f3Smrg 70967ec681f3Smrg if (cs_regalloc_hang) 70977ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 70987ec681f3Smrg 70997ec681f3Smrg radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH); 71007ec681f3Smrg} 71017ec681f3Smrg 71027ec681f3Smrgstatic void 71037ec681f3Smrgradv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) 71047ec681f3Smrg{ 71057ec681f3Smrg radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, 71067ec681f3Smrg VK_PIPELINE_BIND_POINT_COMPUTE); 71077ec681f3Smrg} 71087ec681f3Smrg 71097ec681f3Smrgvoid 71107ec681f3Smrgradv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, 71117ec681f3Smrg uint32_t base_z, uint32_t x, uint32_t y, uint32_t z) 71127ec681f3Smrg{ 71137ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 71147ec681f3Smrg struct radv_dispatch_info info = {0}; 71157ec681f3Smrg 71167ec681f3Smrg info.blocks[0] = x; 71177ec681f3Smrg info.blocks[1] = y; 71187ec681f3Smrg info.blocks[2] = z; 71197ec681f3Smrg 71207ec681f3Smrg info.offsets[0] = base_x; 71217ec681f3Smrg info.offsets[1] = base_y; 71227ec681f3Smrg info.offsets[2] = base_z; 71237ec681f3Smrg radv_compute_dispatch(cmd_buffer, &info); 71247ec681f3Smrg} 71257ec681f3Smrg 71267ec681f3Smrgvoid 71277ec681f3Smrgradv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) 71287ec681f3Smrg{ 71297ec681f3Smrg radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z); 71307ec681f3Smrg} 71317ec681f3Smrg 71327ec681f3Smrgvoid 71337ec681f3Smrgradv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset) 71347ec681f3Smrg{ 71357ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 71367ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 71377ec681f3Smrg struct radv_dispatch_info info = {0}; 71387ec681f3Smrg 71397ec681f3Smrg info.indirect = buffer->bo; 71407ec681f3Smrg info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 71417ec681f3Smrg 71427ec681f3Smrg radv_compute_dispatch(cmd_buffer, &info); 71437ec681f3Smrg} 71447ec681f3Smrg 71457ec681f3Smrgvoid 71467ec681f3Smrgradv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z) 71477ec681f3Smrg{ 71487ec681f3Smrg struct radv_dispatch_info info = {0}; 71497ec681f3Smrg 71507ec681f3Smrg info.blocks[0] = x; 71517ec681f3Smrg info.blocks[1] = y; 71527ec681f3Smrg info.blocks[2] = z; 71537ec681f3Smrg info.unaligned = 1; 71547ec681f3Smrg 71557ec681f3Smrg radv_compute_dispatch(cmd_buffer, &info); 71567ec681f3Smrg} 71577ec681f3Smrg 71587ec681f3Smrgvoid 71597ec681f3Smrgradv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va) 71607ec681f3Smrg{ 71617ec681f3Smrg struct radv_dispatch_info info = {0}; 71627ec681f3Smrg 71637ec681f3Smrg info.indirect = bo; 71647ec681f3Smrg info.va = va; 71657ec681f3Smrg 71667ec681f3Smrg radv_compute_dispatch(cmd_buffer, &info); 71677ec681f3Smrg} 71687ec681f3Smrg 71697ec681f3Smrgstatic void 
71707ec681f3Smrgradv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) 71717ec681f3Smrg{ 71727ec681f3Smrg radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline, 71737ec681f3Smrg VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); 71747ec681f3Smrg} 71757ec681f3Smrg 71767ec681f3Smrgstatic bool 71777ec681f3Smrgradv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer, 71787ec681f3Smrg const VkStridedDeviceAddressRegionKHR *tables) 71797ec681f3Smrg{ 71807ec681f3Smrg struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline; 71817ec681f3Smrg uint32_t base_reg; 71827ec681f3Smrg void *ptr; 71837ec681f3Smrg uint32_t *desc_ptr; 71847ec681f3Smrg uint32_t offset; 71857ec681f3Smrg 71867ec681f3Smrg if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr)) 71877ec681f3Smrg return false; 71887ec681f3Smrg 71897ec681f3Smrg desc_ptr = ptr; 71907ec681f3Smrg for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) { 71917ec681f3Smrg desc_ptr[0] = tables[i].deviceAddress; 71927ec681f3Smrg desc_ptr[1] = tables[i].deviceAddress >> 32; 71937ec681f3Smrg desc_ptr[2] = tables[i].stride; 71947ec681f3Smrg desc_ptr[3] = 0; 71957ec681f3Smrg } 71967ec681f3Smrg 71977ec681f3Smrg uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; 71987ec681f3Smrg struct radv_userdata_info *loc = 71997ec681f3Smrg radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS); 72007ec681f3Smrg if (loc->sgpr_idx == -1) 72017ec681f3Smrg return true; 72027ec681f3Smrg 72037ec681f3Smrg base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE]; 72047ec681f3Smrg radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, 72057ec681f3Smrg false); 72067ec681f3Smrg return true; 72077ec681f3Smrg} 72087ec681f3Smrg 72097ec681f3Smrgvoid 72107ec681f3Smrgradv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, 72117ec681f3Smrg const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, 72127ec681f3Smrg const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, 72137ec681f3Smrg const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, 72147ec681f3Smrg const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, 72157ec681f3Smrg uint32_t width, uint32_t height, uint32_t depth) 72167ec681f3Smrg{ 72177ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 72187ec681f3Smrg struct radv_dispatch_info info = {0}; 72197ec681f3Smrg 72207ec681f3Smrg info.blocks[0] = width; 72217ec681f3Smrg info.blocks[1] = height; 72227ec681f3Smrg info.blocks[2] = depth; 72237ec681f3Smrg info.unaligned = 1; 72247ec681f3Smrg 72257ec681f3Smrg const VkStridedDeviceAddressRegionKHR tables[] = { 72267ec681f3Smrg *pRaygenShaderBindingTable, 72277ec681f3Smrg *pMissShaderBindingTable, 72287ec681f3Smrg *pHitShaderBindingTable, 72297ec681f3Smrg *pCallableShaderBindingTable, 72307ec681f3Smrg }; 72317ec681f3Smrg 72327ec681f3Smrg if (!radv_rt_bind_tables(cmd_buffer, tables)) { 72337ec681f3Smrg return; 72347ec681f3Smrg } 72357ec681f3Smrg 72367ec681f3Smrg struct radv_userdata_info *loc = radv_lookup_user_sgpr( 72377ec681f3Smrg cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE); 72387ec681f3Smrg 72397ec681f3Smrg if (loc->sgpr_idx != -1) { 72407ec681f3Smrg assert(loc->num_sgprs == 3); 72417ec681f3Smrg 72427ec681f3Smrg radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); 72437ec681f3Smrg radeon_emit(cmd_buffer->cs, width); 72447ec681f3Smrg radeon_emit(cmd_buffer->cs, height); 72457ec681f3Smrg 
radeon_emit(cmd_buffer->cs, depth); 72467ec681f3Smrg } 72477ec681f3Smrg 72487ec681f3Smrg radv_rt_dispatch(cmd_buffer, &info); 72497ec681f3Smrg} 72507ec681f3Smrg 72517ec681f3Smrgstatic void 72527ec681f3Smrgradv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size) 72537ec681f3Smrg{ 72547ec681f3Smrg unsigned wave_size = 0; 72557ec681f3Smrg unsigned scratch_bytes_per_wave = 0; 72567ec681f3Smrg 72577ec681f3Smrg if (cmd_buffer->state.rt_pipeline) { 72587ec681f3Smrg scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave; 72597ec681f3Smrg wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size; 72607ec681f3Smrg } 72617ec681f3Smrg 72627ec681f3Smrg /* The hardware register is specified as a multiple of 256 DWORDS. */ 72637ec681f3Smrg scratch_bytes_per_wave += align(size * wave_size, 1024); 72647ec681f3Smrg 72657ec681f3Smrg cmd_buffer->compute_scratch_size_per_wave_needed = 72667ec681f3Smrg MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave); 72677ec681f3Smrg} 72687ec681f3Smrg 72697ec681f3Smrgvoid 72707ec681f3Smrgradv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size) 72717ec681f3Smrg{ 72727ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 72737ec681f3Smrg 72747ec681f3Smrg radv_set_rt_stack_size(cmd_buffer, size); 72757ec681f3Smrg cmd_buffer->state.rt_stack_size = size; 72767ec681f3Smrg} 72777ec681f3Smrg 72787ec681f3Smrgvoid 72797ec681f3Smrgradv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer) 72807ec681f3Smrg{ 72817ec681f3Smrg vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); 72827ec681f3Smrg vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs); 72837ec681f3Smrg 72847ec681f3Smrg cmd_buffer->state.pass = NULL; 72857ec681f3Smrg cmd_buffer->state.subpass = NULL; 72867ec681f3Smrg cmd_buffer->state.attachments = NULL; 72877ec681f3Smrg cmd_buffer->state.framebuffer = NULL; 72887ec681f3Smrg cmd_buffer->state.subpass_sample_locs = NULL; 72897ec681f3Smrg} 72907ec681f3Smrg 72917ec681f3Smrgvoid 72927ec681f3Smrgradv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo) 72937ec681f3Smrg{ 72947ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 72957ec681f3Smrg 72967ec681f3Smrg radv_mark_noncoherent_rb(cmd_buffer); 72977ec681f3Smrg 72987ec681f3Smrg radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); 72997ec681f3Smrg 73007ec681f3Smrg radv_cmd_buffer_end_subpass(cmd_buffer); 73017ec681f3Smrg 73027ec681f3Smrg radv_cmd_buffer_end_render_pass(cmd_buffer); 73037ec681f3Smrg} 73047ec681f3Smrg 73057ec681f3Smrg/* 73067ec681f3Smrg * For HTILE we have the following interesting clear words: 73077ec681f3Smrg * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE 73087ec681f3Smrg * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE. 
73097ec681f3Smrg * 0xfffffff0: Clear depth to 1.0
73107ec681f3Smrg * 0x00000000: Clear depth to 0.0
73117ec681f3Smrg */
73127ec681f3Smrgstatic void
73137ec681f3Smrgradv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
73147ec681f3Smrg                      const VkImageSubresourceRange *range)
73157ec681f3Smrg{
73167ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
73177ec681f3Smrg   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
73187ec681f3Smrg   VkClearDepthStencilValue value = {0};
73197ec681f3Smrg   struct radv_barrier_data barrier = {0};
73207ec681f3Smrg
73217ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
73227ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
73237ec681f3Smrg
73247ec681f3Smrg   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
73257ec681f3Smrg    * in considering previous rendering work for WAW hazards. */
73267ec681f3Smrg   state->flush_bits |=
73277ec681f3Smrg      radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
73287ec681f3Smrg
73297ec681f3Smrg   if (image->planes[0].surface.has_stencil &&
73307ec681f3Smrg       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
73317ec681f3Smrg      /* Flush caches before performing a separate aspect initialization because it's a
73327ec681f3Smrg       * read-modify-write operation.
73337ec681f3Smrg       */
73347ec681f3Smrg      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
73357ec681f3Smrg   }
73367ec681f3Smrg
73377ec681f3Smrg   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
73387ec681f3Smrg
73397ec681f3Smrg   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
73407ec681f3Smrg
73417ec681f3Smrg   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
73427ec681f3Smrg      /* Initialize the TC-compat metadata value to 0 because by
73437ec681f3Smrg       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
73447ec681f3Smrg       * have to conditionally update its value when performing
73457ec681f3Smrg       * a fast depth clear.
73467ec681f3Smrg */ 73477ec681f3Smrg radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0); 73487ec681f3Smrg } 73497ec681f3Smrg} 73507ec681f3Smrg 73517ec681f3Smrgstatic void 73527ec681f3Smrgradv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 73537ec681f3Smrg VkImageLayout src_layout, bool src_render_loop, 73547ec681f3Smrg VkImageLayout dst_layout, bool dst_render_loop, 73557ec681f3Smrg unsigned src_queue_mask, unsigned dst_queue_mask, 73567ec681f3Smrg const VkImageSubresourceRange *range, 73577ec681f3Smrg struct radv_sample_locations_state *sample_locs) 73587ec681f3Smrg{ 73597ec681f3Smrg struct radv_device *device = cmd_buffer->device; 73607ec681f3Smrg 73617ec681f3Smrg if (!radv_htile_enabled(image, range->baseMipLevel)) 73627ec681f3Smrg return; 73637ec681f3Smrg 73647ec681f3Smrg if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 73657ec681f3Smrg radv_initialize_htile(cmd_buffer, image, range); 73667ec681f3Smrg } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop, 73677ec681f3Smrg src_queue_mask) && 73687ec681f3Smrg radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop, 73697ec681f3Smrg dst_queue_mask)) { 73707ec681f3Smrg radv_initialize_htile(cmd_buffer, image, range); 73717ec681f3Smrg } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop, 73727ec681f3Smrg src_queue_mask) && 73737ec681f3Smrg !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop, 73747ec681f3Smrg dst_queue_mask)) { 73757ec681f3Smrg cmd_buffer->state.flush_bits |= 73767ec681f3Smrg RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 73777ec681f3Smrg 73787ec681f3Smrg radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs); 73797ec681f3Smrg 73807ec681f3Smrg cmd_buffer->state.flush_bits |= 73817ec681f3Smrg RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 73827ec681f3Smrg } 73837ec681f3Smrg} 73847ec681f3Smrg 73857ec681f3Smrgstatic uint32_t 73867ec681f3Smrgradv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 73877ec681f3Smrg const VkImageSubresourceRange *range, uint32_t value) 73887ec681f3Smrg{ 73897ec681f3Smrg struct radv_barrier_data barrier = {0}; 73907ec681f3Smrg 73917ec681f3Smrg barrier.layout_transitions.init_mask_ram = 1; 73927ec681f3Smrg radv_describe_layout_transition(cmd_buffer, &barrier); 73937ec681f3Smrg 73947ec681f3Smrg return radv_clear_cmask(cmd_buffer, image, range, value); 73957ec681f3Smrg} 73967ec681f3Smrg 73977ec681f3Smrguint32_t 73987ec681f3Smrgradv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 73997ec681f3Smrg const VkImageSubresourceRange *range) 74007ec681f3Smrg{ 74017ec681f3Smrg static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210}; 74027ec681f3Smrg uint32_t log2_samples = util_logbase2(image->info.samples); 74037ec681f3Smrg uint32_t value = fmask_clear_values[log2_samples]; 74047ec681f3Smrg struct radv_barrier_data barrier = {0}; 74057ec681f3Smrg 74067ec681f3Smrg barrier.layout_transitions.init_mask_ram = 1; 74077ec681f3Smrg radv_describe_layout_transition(cmd_buffer, &barrier); 74087ec681f3Smrg 74097ec681f3Smrg return radv_clear_fmask(cmd_buffer, image, range, value); 74107ec681f3Smrg} 74117ec681f3Smrg 74127ec681f3Smrguint32_t 74137ec681f3Smrgradv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 74147ec681f3Smrg const VkImageSubresourceRange *range, uint32_t value) 74157ec681f3Smrg{ 
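   /* 'value' is the initial DCC clear word: callers pass 0xffffffff for
    * the fully expanded (uncompressed) state and 0 when the layout allows
    * keeping DCC compressed, see radv_init_color_image_metadata below. */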
74167ec681f3Smrg   struct radv_barrier_data barrier = {0};
74177ec681f3Smrg   uint32_t flush_bits = 0;
74187ec681f3Smrg   unsigned size = 0;
74197ec681f3Smrg
74207ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
74217ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
74227ec681f3Smrg
74237ec681f3Smrg   flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
74247ec681f3Smrg
74257ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
74267ec681f3Smrg      /* When DCC is enabled with mipmaps, some levels might not
74277ec681f3Smrg       * support fast clears and we have to initialize them as "fully
74287ec681f3Smrg       * expanded".
74297ec681f3Smrg       */
74307ec681f3Smrg      /* Compute the size of all fast clearable DCC levels. */
74317ec681f3Smrg      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
74327ec681f3Smrg         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
74337ec681f3Smrg         unsigned dcc_fast_clear_size =
74347ec681f3Smrg            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
74357ec681f3Smrg
74367ec681f3Smrg         if (!dcc_fast_clear_size)
74377ec681f3Smrg            break;
74387ec681f3Smrg
74397ec681f3Smrg         size = dcc_level->dcc_offset + dcc_fast_clear_size;
74407ec681f3Smrg      }
74417ec681f3Smrg
74427ec681f3Smrg      /* Initialize the mipmap levels without DCC. */
74437ec681f3Smrg      if (size != image->planes[0].surface.meta_size) {
74447ec681f3Smrg         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
74457ec681f3Smrg                                        image->offset + image->planes[0].surface.meta_offset + size,
74467ec681f3Smrg                                        image->planes[0].surface.meta_size - size, 0xffffffff);
74477ec681f3Smrg      }
74487ec681f3Smrg   }
74497ec681f3Smrg
74507ec681f3Smrg   return flush_bits;
74517ec681f3Smrg}
74527ec681f3Smrg
74537ec681f3Smrg/**
74547ec681f3Smrg * Initialize DCC/FMASK/CMASK metadata for a color image.
74557ec681f3Smrg */
74567ec681f3Smrgstatic void
74577ec681f3Smrgradv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
74587ec681f3Smrg                               VkImageLayout src_layout, bool src_render_loop,
74597ec681f3Smrg                               VkImageLayout dst_layout, bool dst_render_loop,
74607ec681f3Smrg                               unsigned src_queue_mask, unsigned dst_queue_mask,
74617ec681f3Smrg                               const VkImageSubresourceRange *range)
74627ec681f3Smrg{
74637ec681f3Smrg   uint32_t flush_bits = 0;
74647ec681f3Smrg
74657ec681f3Smrg   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
74667ec681f3Smrg    * consistent in considering previous rendering work for WAW hazards.
74677ec681f3Smrg    */
74687ec681f3Smrg   cmd_buffer->state.flush_bits |=
74697ec681f3Smrg      radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);
74707ec681f3Smrg
74717ec681f3Smrg   if (radv_image_has_cmask(image)) {
74727ec681f3Smrg      uint32_t value;
74737ec681f3Smrg
74747ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
74757ec681f3Smrg         /* TODO: Fix clearing CMASK layers on GFX9.
*/ 74767ec681f3Smrg if (radv_image_is_tc_compat_cmask(image) || 74777ec681f3Smrg (radv_image_has_fmask(image) && 74787ec681f3Smrg radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout, 74797ec681f3Smrg dst_render_loop, dst_queue_mask))) { 74807ec681f3Smrg value = 0xccccccccu; 74817ec681f3Smrg } else { 74827ec681f3Smrg value = 0xffffffffu; 74837ec681f3Smrg } 74847ec681f3Smrg } else { 74857ec681f3Smrg static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff}; 74867ec681f3Smrg uint32_t log2_samples = util_logbase2(image->info.samples); 74877ec681f3Smrg 74887ec681f3Smrg value = cmask_clear_values[log2_samples]; 74897ec681f3Smrg } 74907ec681f3Smrg 74917ec681f3Smrg flush_bits |= radv_init_cmask(cmd_buffer, image, range, value); 74927ec681f3Smrg } 74937ec681f3Smrg 74947ec681f3Smrg if (radv_image_has_fmask(image)) { 74957ec681f3Smrg flush_bits |= radv_init_fmask(cmd_buffer, image, range); 74967ec681f3Smrg } 74977ec681f3Smrg 74987ec681f3Smrg if (radv_dcc_enabled(image, range->baseMipLevel)) { 74997ec681f3Smrg uint32_t value = 0xffffffffu; /* Fully expanded mode. */ 75007ec681f3Smrg 75017ec681f3Smrg if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 75027ec681f3Smrg dst_layout, dst_render_loop, dst_queue_mask)) { 75037ec681f3Smrg value = 0u; 75047ec681f3Smrg } 75057ec681f3Smrg 75067ec681f3Smrg flush_bits |= radv_init_dcc(cmd_buffer, image, range, value); 75077ec681f3Smrg } 75087ec681f3Smrg 75097ec681f3Smrg if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) { 75107ec681f3Smrg radv_update_fce_metadata(cmd_buffer, image, range, false); 75117ec681f3Smrg 75127ec681f3Smrg uint32_t color_values[2] = {0}; 75137ec681f3Smrg radv_set_color_clear_metadata(cmd_buffer, image, range, color_values); 75147ec681f3Smrg } 75157ec681f3Smrg 75167ec681f3Smrg cmd_buffer->state.flush_bits |= flush_bits; 75177ec681f3Smrg} 75187ec681f3Smrg 75197ec681f3Smrgstatic void 75207ec681f3Smrgradv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 75217ec681f3Smrg VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask) 75227ec681f3Smrg{ 75237ec681f3Smrg if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR && 75247ec681f3Smrg (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || 75257ec681f3Smrg (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN)))) 75267ec681f3Smrg radv_retile_dcc(cmd_buffer, image); 75277ec681f3Smrg} 75287ec681f3Smrg 75297ec681f3Smrgstatic bool 75307ec681f3Smrgradv_image_need_retile(const struct radv_image *image) 75317ec681f3Smrg{ 75327ec681f3Smrg return image->planes[0].surface.display_dcc_offset && 75337ec681f3Smrg image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset; 75347ec681f3Smrg} 75357ec681f3Smrg 75367ec681f3Smrg/** 75377ec681f3Smrg * Handle color image transitions for DCC/FMASK/CMASK. 
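 *
 * Depending on the source and destination layouts, this initializes the
 * metadata, decompresses DCC, eliminates fast clears and/or expands
 * FMASK, as implemented below.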
75387ec681f3Smrg */ 75397ec681f3Smrgstatic void 75407ec681f3Smrgradv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 75417ec681f3Smrg VkImageLayout src_layout, bool src_render_loop, 75427ec681f3Smrg VkImageLayout dst_layout, bool dst_render_loop, 75437ec681f3Smrg unsigned src_queue_mask, unsigned dst_queue_mask, 75447ec681f3Smrg const VkImageSubresourceRange *range) 75457ec681f3Smrg{ 75467ec681f3Smrg bool dcc_decompressed = false, fast_clear_flushed = false; 75477ec681f3Smrg 75487ec681f3Smrg if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && 75497ec681f3Smrg !radv_dcc_enabled(image, range->baseMipLevel)) 75507ec681f3Smrg return; 75517ec681f3Smrg 75527ec681f3Smrg if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 75537ec681f3Smrg radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 75547ec681f3Smrg dst_render_loop, src_queue_mask, dst_queue_mask, range); 75557ec681f3Smrg 75567ec681f3Smrg if (radv_image_need_retile(image)) 75577ec681f3Smrg radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 75587ec681f3Smrg return; 75597ec681f3Smrg } 75607ec681f3Smrg 75617ec681f3Smrg if (radv_dcc_enabled(image, range->baseMipLevel)) { 75627ec681f3Smrg if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { 75637ec681f3Smrg cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu); 75647ec681f3Smrg } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 75657ec681f3Smrg src_layout, src_render_loop, src_queue_mask) && 75667ec681f3Smrg !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 75677ec681f3Smrg dst_layout, dst_render_loop, dst_queue_mask)) { 75687ec681f3Smrg radv_decompress_dcc(cmd_buffer, image, range); 75697ec681f3Smrg dcc_decompressed = true; 75707ec681f3Smrg } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 75717ec681f3Smrg src_layout, src_render_loop, src_queue_mask) && 75727ec681f3Smrg !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 75737ec681f3Smrg dst_layout, dst_render_loop, dst_queue_mask)) { 75747ec681f3Smrg radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 75757ec681f3Smrg fast_clear_flushed = true; 75767ec681f3Smrg } 75777ec681f3Smrg 75787ec681f3Smrg if (radv_image_need_retile(image)) 75797ec681f3Smrg radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 75807ec681f3Smrg } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) { 75817ec681f3Smrg if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 75827ec681f3Smrg src_layout, src_render_loop, src_queue_mask) && 75837ec681f3Smrg !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 75847ec681f3Smrg dst_layout, dst_render_loop, dst_queue_mask)) { 75857ec681f3Smrg radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 75867ec681f3Smrg fast_clear_flushed = true; 75877ec681f3Smrg } 75887ec681f3Smrg } 75897ec681f3Smrg 75907ec681f3Smrg /* MSAA color decompress. 
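    * Expand FMASK when transitioning to a layout in which shader stores
    * or transfers may access the individual samples directly.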
*/ 75917ec681f3Smrg if (radv_image_has_fmask(image) && 75927ec681f3Smrg (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) && 75937ec681f3Smrg radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) && 75947ec681f3Smrg !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) { 75957ec681f3Smrg if (radv_dcc_enabled(image, range->baseMipLevel) && 75967ec681f3Smrg !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) { 75977ec681f3Smrg /* A DCC decompress is required before expanding FMASK 75987ec681f3Smrg * when DCC stores aren't supported to avoid being in 75997ec681f3Smrg * a state where DCC is compressed and the main 76007ec681f3Smrg * surface is uncompressed. 76017ec681f3Smrg */ 76027ec681f3Smrg radv_decompress_dcc(cmd_buffer, image, range); 76037ec681f3Smrg } else if (!fast_clear_flushed) { 76047ec681f3Smrg /* A FMASK decompress is required before expanding 76057ec681f3Smrg * FMASK. 76067ec681f3Smrg */ 76077ec681f3Smrg radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 76087ec681f3Smrg } 76097ec681f3Smrg 76107ec681f3Smrg struct radv_barrier_data barrier = {0}; 76117ec681f3Smrg barrier.layout_transitions.fmask_color_expand = 1; 76127ec681f3Smrg radv_describe_layout_transition(cmd_buffer, &barrier); 76137ec681f3Smrg 76147ec681f3Smrg radv_expand_fmask_image_inplace(cmd_buffer, image, range); 76157ec681f3Smrg } 76167ec681f3Smrg} 76177ec681f3Smrg 76187ec681f3Smrgstatic void 76197ec681f3Smrgradv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 76207ec681f3Smrg VkImageLayout src_layout, bool src_render_loop, 76217ec681f3Smrg VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family, 76227ec681f3Smrg uint32_t dst_family, const VkImageSubresourceRange *range, 76237ec681f3Smrg struct radv_sample_locations_state *sample_locs) 76247ec681f3Smrg{ 76257ec681f3Smrg if (image->exclusive && src_family != dst_family) { 76267ec681f3Smrg /* This is an acquire or a release operation and there will be 76277ec681f3Smrg * a corresponding release/acquire. Do the transition in the 76287ec681f3Smrg * most flexible queue. 
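       * (The early returns below make sure the transition is performed
       * only once, on the most capable of the two queues involved.)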
*/ 76297ec681f3Smrg 76307ec681f3Smrg assert(src_family == cmd_buffer->queue_family_index || 76317ec681f3Smrg dst_family == cmd_buffer->queue_family_index); 76327ec681f3Smrg 76337ec681f3Smrg if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT) 76347ec681f3Smrg return; 76357ec681f3Smrg 76367ec681f3Smrg if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) 76377ec681f3Smrg return; 76387ec681f3Smrg 76397ec681f3Smrg if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE && 76407ec681f3Smrg (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL)) 76417ec681f3Smrg return; 76427ec681f3Smrg } 76437ec681f3Smrg 76447ec681f3Smrg unsigned src_queue_mask = 76457ec681f3Smrg radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index); 76467ec681f3Smrg unsigned dst_queue_mask = 76477ec681f3Smrg radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index); 76487ec681f3Smrg 76497ec681f3Smrg if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask) 76507ec681f3Smrg return; 76517ec681f3Smrg 76527ec681f3Smrg if (vk_format_has_depth(image->vk_format)) { 76537ec681f3Smrg radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 76547ec681f3Smrg dst_render_loop, src_queue_mask, dst_queue_mask, range, 76557ec681f3Smrg sample_locs); 76567ec681f3Smrg } else { 76577ec681f3Smrg radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 76587ec681f3Smrg dst_render_loop, src_queue_mask, dst_queue_mask, range); 76597ec681f3Smrg } 76607ec681f3Smrg} 76617ec681f3Smrg 76627ec681f3Smrgstruct radv_barrier_info { 76637ec681f3Smrg enum rgp_barrier_reason reason; 76647ec681f3Smrg uint32_t eventCount; 76657ec681f3Smrg const VkEvent *pEvents; 76667ec681f3Smrg VkPipelineStageFlags srcStageMask; 76677ec681f3Smrg VkPipelineStageFlags dstStageMask; 76687ec681f3Smrg}; 76697ec681f3Smrg 76707ec681f3Smrgstatic void 76717ec681f3Smrgradv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount, 76727ec681f3Smrg const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount, 76737ec681f3Smrg const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount, 76747ec681f3Smrg const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info) 76757ec681f3Smrg{ 76767ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 76777ec681f3Smrg enum radv_cmd_flush_bits src_flush_bits = 0; 76787ec681f3Smrg enum radv_cmd_flush_bits dst_flush_bits = 0; 76797ec681f3Smrg 76807ec681f3Smrg if (cmd_buffer->state.subpass) 76817ec681f3Smrg radv_mark_noncoherent_rb(cmd_buffer); 76827ec681f3Smrg 76837ec681f3Smrg radv_describe_barrier_start(cmd_buffer, info->reason); 76847ec681f3Smrg 76857ec681f3Smrg for (unsigned i = 0; i < info->eventCount; ++i) { 76867ec681f3Smrg RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]); 76877ec681f3Smrg uint64_t va = radv_buffer_get_va(event->bo); 76887ec681f3Smrg 76897ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 76907ec681f3Smrg 76917ec681f3Smrg ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); 76927ec681f3Smrg 76937ec681f3Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff); 76947ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 76957ec681f3Smrg } 76967ec681f3Smrg 76977ec681f3Smrg for (uint32_t i = 0; i < memoryBarrierCount; i++) { 76987ec681f3Smrg src_flush_bits |= 
radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL); 76997ec681f3Smrg dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL); 77007ec681f3Smrg } 77017ec681f3Smrg 77027ec681f3Smrg for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) { 77037ec681f3Smrg src_flush_bits |= 77047ec681f3Smrg radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL); 77057ec681f3Smrg dst_flush_bits |= 77067ec681f3Smrg radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL); 77077ec681f3Smrg } 77087ec681f3Smrg 77097ec681f3Smrg for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { 77107ec681f3Smrg RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); 77117ec681f3Smrg 77127ec681f3Smrg src_flush_bits |= 77137ec681f3Smrg radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image); 77147ec681f3Smrg dst_flush_bits |= 77157ec681f3Smrg radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image); 77167ec681f3Smrg } 77177ec681f3Smrg 77187ec681f3Smrg /* The Vulkan spec 1.1.98 says: 77197ec681f3Smrg * 77207ec681f3Smrg * "An execution dependency with only 77217ec681f3Smrg * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask 77227ec681f3Smrg * will only prevent that stage from executing in subsequently 77237ec681f3Smrg * submitted commands. As this stage does not perform any actual 77247ec681f3Smrg * execution, this is not observable - in effect, it does not delay 77257ec681f3Smrg * processing of subsequent commands. Similarly an execution dependency 77267ec681f3Smrg * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask 77277ec681f3Smrg * will effectively not wait for any prior commands to complete." 
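    *
    * Hence the stage flush below is skipped when the destination stage
    * mask is only BOTTOM_OF_PIPE.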
77287ec681f3Smrg */ 77297ec681f3Smrg if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT) 77307ec681f3Smrg radv_stage_flush(cmd_buffer, info->srcStageMask); 77317ec681f3Smrg cmd_buffer->state.flush_bits |= src_flush_bits; 77327ec681f3Smrg 77337ec681f3Smrg for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { 77347ec681f3Smrg RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); 77357ec681f3Smrg 77367ec681f3Smrg const struct VkSampleLocationsInfoEXT *sample_locs_info = 77377ec681f3Smrg vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT); 77387ec681f3Smrg struct radv_sample_locations_state sample_locations = {0}; 77397ec681f3Smrg 77407ec681f3Smrg if (sample_locs_info) { 77417ec681f3Smrg assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT); 77427ec681f3Smrg sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel; 77437ec681f3Smrg sample_locations.grid_size = sample_locs_info->sampleLocationGridSize; 77447ec681f3Smrg sample_locations.count = sample_locs_info->sampleLocationsCount; 77457ec681f3Smrg typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations, 77467ec681f3Smrg sample_locs_info->sampleLocationsCount); 77477ec681f3Smrg } 77487ec681f3Smrg 77497ec681f3Smrg radv_handle_image_transition( 77507ec681f3Smrg cmd_buffer, image, pImageMemoryBarriers[i].oldLayout, 77517ec681f3Smrg false, /* Outside of a renderpass we are never in a renderloop */ 77527ec681f3Smrg pImageMemoryBarriers[i].newLayout, 77537ec681f3Smrg false, /* Outside of a renderpass we are never in a renderloop */ 77547ec681f3Smrg pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex, 77557ec681f3Smrg &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL); 77567ec681f3Smrg } 77577ec681f3Smrg 77587ec681f3Smrg /* Make sure CP DMA is idle because the driver might have performed a 77597ec681f3Smrg * DMA operation for copying or filling buffers/images. 
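    * CP DMA runs asynchronously to the rest of the pipeline and is not
    * covered by the regular cache flush logic, so it has to be waited
    * on explicitly.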
77607ec681f3Smrg */ 77617ec681f3Smrg if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)) 77627ec681f3Smrg si_cp_dma_wait_for_idle(cmd_buffer); 77637ec681f3Smrg 77647ec681f3Smrg cmd_buffer->state.flush_bits |= dst_flush_bits; 77657ec681f3Smrg 77667ec681f3Smrg radv_describe_barrier_end(cmd_buffer); 77677ec681f3Smrg} 77687ec681f3Smrg 77697ec681f3Smrgvoid 77707ec681f3Smrgradv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, 77717ec681f3Smrg VkPipelineStageFlags destStageMask, VkBool32 byRegion, 77727ec681f3Smrg uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, 77737ec681f3Smrg uint32_t bufferMemoryBarrierCount, 77747ec681f3Smrg const VkBufferMemoryBarrier *pBufferMemoryBarriers, 77757ec681f3Smrg uint32_t imageMemoryBarrierCount, 77767ec681f3Smrg const VkImageMemoryBarrier *pImageMemoryBarriers) 77777ec681f3Smrg{ 77787ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 77797ec681f3Smrg struct radv_barrier_info info; 77807ec681f3Smrg 77817ec681f3Smrg info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER; 77827ec681f3Smrg info.eventCount = 0; 77837ec681f3Smrg info.pEvents = NULL; 77847ec681f3Smrg info.srcStageMask = srcStageMask; 77857ec681f3Smrg info.dstStageMask = destStageMask; 77867ec681f3Smrg 77877ec681f3Smrg radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, 77887ec681f3Smrg pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info); 77897ec681f3Smrg} 77907ec681f3Smrg 77917ec681f3Smrgstatic void 77927ec681f3Smrgwrite_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, 77937ec681f3Smrg VkPipelineStageFlags stageMask, unsigned value) 77947ec681f3Smrg{ 77957ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 77967ec681f3Smrg uint64_t va = radv_buffer_get_va(event->bo); 77977ec681f3Smrg 77987ec681f3Smrg si_emit_cache_flush(cmd_buffer); 77997ec681f3Smrg 78007ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 78017ec681f3Smrg 78027ec681f3Smrg ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28); 78037ec681f3Smrg 78047ec681f3Smrg /* Flags that only require a top-of-pipe event. */ 78057ec681f3Smrg VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; 78067ec681f3Smrg 78077ec681f3Smrg /* Flags that only require a post-index-fetch event. */ 78087ec681f3Smrg VkPipelineStageFlags post_index_fetch_flags = 78097ec681f3Smrg top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; 78107ec681f3Smrg 78117ec681f3Smrg /* Flags that only require signaling post PS. */ 78127ec681f3Smrg VkPipelineStageFlags post_ps_flags = 78137ec681f3Smrg post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 78147ec681f3Smrg VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 78157ec681f3Smrg VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 78167ec681f3Smrg VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT | 78177ec681f3Smrg VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | 78187ec681f3Smrg VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; 78197ec681f3Smrg 78207ec681f3Smrg /* Flags that only require signaling post CS. 
*/ 78217ec681f3Smrg VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; 78227ec681f3Smrg 78237ec681f3Smrg /* Make sure CP DMA is idle because the driver might have performed a 78247ec681f3Smrg * DMA operation for copying or filling buffers/images. 78257ec681f3Smrg */ 78267ec681f3Smrg if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)) 78277ec681f3Smrg si_cp_dma_wait_for_idle(cmd_buffer); 78287ec681f3Smrg 78297ec681f3Smrg if (!(stageMask & ~top_of_pipe_flags)) { 78307ec681f3Smrg /* Just need to sync the PFP engine. */ 78317ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 78327ec681f3Smrg radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 78337ec681f3Smrg radeon_emit(cs, va); 78347ec681f3Smrg radeon_emit(cs, va >> 32); 78357ec681f3Smrg radeon_emit(cs, value); 78367ec681f3Smrg } else if (!(stageMask & ~post_index_fetch_flags)) { 78377ec681f3Smrg /* Sync ME because PFP reads index and indirect buffers. */ 78387ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 78397ec681f3Smrg radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); 78407ec681f3Smrg radeon_emit(cs, va); 78417ec681f3Smrg radeon_emit(cs, va >> 32); 78427ec681f3Smrg radeon_emit(cs, value); 78437ec681f3Smrg } else { 78447ec681f3Smrg unsigned event_type; 78457ec681f3Smrg 78467ec681f3Smrg if (!(stageMask & ~post_ps_flags)) { 78477ec681f3Smrg /* Sync previous fragment shaders. */ 78487ec681f3Smrg event_type = V_028A90_PS_DONE; 78497ec681f3Smrg } else if (!(stageMask & ~post_cs_flags)) { 78507ec681f3Smrg /* Sync previous compute shaders. */ 78517ec681f3Smrg event_type = V_028A90_CS_DONE; 78527ec681f3Smrg } else { 78537ec681f3Smrg /* Otherwise, sync all prior GPU work. 
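          * with a bottom-of-pipe timestamp event, the most conservative
          * option.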
*/ 78547ec681f3Smrg event_type = V_028A90_BOTTOM_OF_PIPE_TS; 78557ec681f3Smrg } 78567ec681f3Smrg 78577ec681f3Smrg si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class, 78587ec681f3Smrg radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0, 78597ec681f3Smrg EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value, 78607ec681f3Smrg cmd_buffer->gfx9_eop_bug_va); 78617ec681f3Smrg } 78627ec681f3Smrg 78637ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 78647ec681f3Smrg} 78657ec681f3Smrg 78667ec681f3Smrgvoid 78677ec681f3Smrgradv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask) 78687ec681f3Smrg{ 78697ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 78707ec681f3Smrg RADV_FROM_HANDLE(radv_event, event, _event); 78717ec681f3Smrg 78727ec681f3Smrg write_event(cmd_buffer, event, stageMask, 1); 78737ec681f3Smrg} 78747ec681f3Smrg 78757ec681f3Smrgvoid 78767ec681f3Smrgradv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask) 78777ec681f3Smrg{ 78787ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 78797ec681f3Smrg RADV_FROM_HANDLE(radv_event, event, _event); 78807ec681f3Smrg 78817ec681f3Smrg write_event(cmd_buffer, event, stageMask, 0); 78827ec681f3Smrg} 78837ec681f3Smrg 78847ec681f3Smrgvoid 78857ec681f3Smrgradv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents, 78867ec681f3Smrg VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, 78877ec681f3Smrg uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, 78887ec681f3Smrg uint32_t bufferMemoryBarrierCount, 78897ec681f3Smrg const VkBufferMemoryBarrier *pBufferMemoryBarriers, 78907ec681f3Smrg uint32_t imageMemoryBarrierCount, 78917ec681f3Smrg const VkImageMemoryBarrier *pImageMemoryBarriers) 78927ec681f3Smrg{ 78937ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 78947ec681f3Smrg struct radv_barrier_info info; 78957ec681f3Smrg 78967ec681f3Smrg info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS; 78977ec681f3Smrg info.eventCount = eventCount; 78987ec681f3Smrg info.pEvents = pEvents; 78997ec681f3Smrg info.srcStageMask = 0; 79007ec681f3Smrg 79017ec681f3Smrg radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount, 79027ec681f3Smrg pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info); 79037ec681f3Smrg} 79047ec681f3Smrg 79057ec681f3Smrgvoid 79067ec681f3Smrgradv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) 79077ec681f3Smrg{ 79087ec681f3Smrg /* No-op */ 79097ec681f3Smrg} 79107ec681f3Smrg 79117ec681f3Smrg/* VK_EXT_conditional_rendering */ 79127ec681f3Smrgvoid 79137ec681f3Smrgradv_CmdBeginConditionalRenderingEXT( 79147ec681f3Smrg VkCommandBuffer commandBuffer, 79157ec681f3Smrg const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) 79167ec681f3Smrg{ 79177ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 79187ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer); 79197ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 79207ec681f3Smrg unsigned pred_op = PREDICATION_OP_BOOL32; 79217ec681f3Smrg bool draw_visible = true; 79227ec681f3Smrg uint64_t va; 79237ec681f3Smrg 79247ec681f3Smrg va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset; 79257ec681f3Smrg 79267ec681f3Smrg /* By default, if the 32-bit value at offset in buffer memory is zero, 79277ec681f3Smrg 
    * then the rendering commands are discarded, otherwise they are
79287ec681f3Smrg    * executed as normal. If the inverted flag is set, all commands are
79297ec681f3Smrg    * discarded if the value is non-zero.
79307ec681f3Smrg    */
79317ec681f3Smrg   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
79327ec681f3Smrg      draw_visible = false;
79337ec681f3Smrg   }
79347ec681f3Smrg
79357ec681f3Smrg   si_emit_cache_flush(cmd_buffer);
79367ec681f3Smrg
79377ec681f3Smrg   if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL &&
79387ec681f3Smrg       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
79397ec681f3Smrg      uint64_t pred_value = 0, pred_va;
79407ec681f3Smrg      unsigned pred_offset;
79417ec681f3Smrg
79427ec681f3Smrg      /* From the Vulkan spec 1.1.107:
79437ec681f3Smrg       *
79447ec681f3Smrg       * "If the 32-bit value at offset in buffer memory is zero,
79457ec681f3Smrg       * then the rendering commands are discarded, otherwise they
79467ec681f3Smrg       * are executed as normal. If the value of the predicate in
79477ec681f3Smrg       * buffer memory changes while conditional rendering is
79487ec681f3Smrg       * active, the rendering commands may be discarded in an
79497ec681f3Smrg       * implementation-dependent way. Some implementations may
79507ec681f3Smrg       * latch the value of the predicate upon beginning conditional
79517ec681f3Smrg       * rendering while others may read it before every rendering
79527ec681f3Smrg       * command."
79537ec681f3Smrg       *
79547ec681f3Smrg       * But the AMD hardware treats the predicate as a 64-bit
79557ec681f3Smrg       * value, which means we need a workaround in the driver.
79567ec681f3Smrg       * Luckily, we are not required to support the case where the
79577ec681f3Smrg       * value changes while predication is active.
79587ec681f3Smrg       *
79597ec681f3Smrg       * The workaround is as follows:
79607ec681f3Smrg       * 1) allocate a 64-bit value in the upload BO and initialize it
79617ec681f3Smrg       *    to 0
79627ec681f3Smrg       * 2) copy the 32-bit predicate value to the upload BO
79637ec681f3Smrg       * 3) use the newly allocated VA address for predication
79647ec681f3Smrg       *
79657ec681f3Smrg       * Based on the conditionalrender demo, it's faster to do the
79667ec681f3Smrg       * COPY_DATA in ME (+ sync PFP) instead of PFP.
79677ec681f3Smrg       */
79687ec681f3Smrg      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
79697ec681f3Smrg
79707ec681f3Smrg      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
79717ec681f3Smrg
79727ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
79737ec681f3Smrg      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
79747ec681f3Smrg                         COPY_DATA_WR_CONFIRM);
79757ec681f3Smrg      radeon_emit(cs, va);
79767ec681f3Smrg      radeon_emit(cs, va >> 32);
79777ec681f3Smrg      radeon_emit(cs, pred_va);
79787ec681f3Smrg      radeon_emit(cs, pred_va >> 32);
79797ec681f3Smrg
79807ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
79817ec681f3Smrg      radeon_emit(cs, 0);
79827ec681f3Smrg
79837ec681f3Smrg      va = pred_va;
79847ec681f3Smrg      pred_op = PREDICATION_OP_BOOL64;
79857ec681f3Smrg   }
79867ec681f3Smrg
79877ec681f3Smrg   /* Enable predication for this command buffer. */
79887ec681f3Smrg   si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
79897ec681f3Smrg   cmd_buffer->state.predicating = true;
79907ec681f3Smrg
79917ec681f3Smrg   /* Store conditional rendering user info.
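    * (Saved so predication can later be re-enabled with the same
    * parameters, e.g. after executing secondary command buffers.)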
*/ 79927ec681f3Smrg cmd_buffer->state.predication_type = draw_visible; 79937ec681f3Smrg cmd_buffer->state.predication_op = pred_op; 79947ec681f3Smrg cmd_buffer->state.predication_va = va; 79957ec681f3Smrg} 79967ec681f3Smrg 79977ec681f3Smrgvoid 79987ec681f3Smrgradv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) 79997ec681f3Smrg{ 80007ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 80017ec681f3Smrg 80027ec681f3Smrg /* Disable predication for this command buffer. */ 80037ec681f3Smrg si_emit_set_predication_state(cmd_buffer, false, 0, 0); 80047ec681f3Smrg cmd_buffer->state.predicating = false; 80057ec681f3Smrg 80067ec681f3Smrg /* Reset conditional rendering user info. */ 80077ec681f3Smrg cmd_buffer->state.predication_type = -1; 80087ec681f3Smrg cmd_buffer->state.predication_op = 0; 80097ec681f3Smrg cmd_buffer->state.predication_va = 0; 80107ec681f3Smrg} 80117ec681f3Smrg 80127ec681f3Smrg/* VK_EXT_transform_feedback */ 80137ec681f3Smrgvoid 80147ec681f3Smrgradv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, 80157ec681f3Smrg uint32_t bindingCount, const VkBuffer *pBuffers, 80167ec681f3Smrg const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes) 80177ec681f3Smrg{ 80187ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 80197ec681f3Smrg struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 80207ec681f3Smrg uint8_t enabled_mask = 0; 80217ec681f3Smrg 80227ec681f3Smrg assert(firstBinding + bindingCount <= MAX_SO_BUFFERS); 80237ec681f3Smrg for (uint32_t i = 0; i < bindingCount; i++) { 80247ec681f3Smrg uint32_t idx = firstBinding + i; 80257ec681f3Smrg 80267ec681f3Smrg sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]); 80277ec681f3Smrg sb[idx].offset = pOffsets[i]; 80287ec681f3Smrg 80297ec681f3Smrg if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) { 80307ec681f3Smrg sb[idx].size = sb[idx].buffer->size - sb[idx].offset; 80317ec681f3Smrg } else { 80327ec681f3Smrg sb[idx].size = pSizes[i]; 80337ec681f3Smrg } 80347ec681f3Smrg 80357ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo); 80367ec681f3Smrg 80377ec681f3Smrg enabled_mask |= 1 << idx; 80387ec681f3Smrg } 80397ec681f3Smrg 80407ec681f3Smrg cmd_buffer->state.streamout.enabled_mask |= enabled_mask; 80417ec681f3Smrg 80427ec681f3Smrg cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; 80437ec681f3Smrg} 80447ec681f3Smrg 80457ec681f3Smrgstatic void 80467ec681f3Smrgradv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer) 80477ec681f3Smrg{ 80487ec681f3Smrg struct radv_streamout_state *so = &cmd_buffer->state.streamout; 80497ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 80507ec681f3Smrg 80517ec681f3Smrg radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2); 80527ec681f3Smrg radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) | 80537ec681f3Smrg S_028B94_STREAMOUT_1_EN(so->streamout_enabled) | 80547ec681f3Smrg S_028B94_STREAMOUT_2_EN(so->streamout_enabled) | 80557ec681f3Smrg S_028B94_STREAMOUT_3_EN(so->streamout_enabled)); 80567ec681f3Smrg radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask); 80577ec681f3Smrg 80587ec681f3Smrg cmd_buffer->state.context_roll_without_scissor_emitted = true; 80597ec681f3Smrg} 80607ec681f3Smrg 80617ec681f3Smrgstatic void 80627ec681f3Smrgradv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable) 80637ec681f3Smrg{ 80647ec681f3Smrg struct radv_streamout_state *so = 
&cmd_buffer->state.streamout; 80657ec681f3Smrg bool old_streamout_enabled = so->streamout_enabled; 80667ec681f3Smrg uint32_t old_hw_enabled_mask = so->hw_enabled_mask; 80677ec681f3Smrg 80687ec681f3Smrg so->streamout_enabled = enable; 80697ec681f3Smrg 80707ec681f3Smrg so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | 80717ec681f3Smrg (so->enabled_mask << 12); 80727ec681f3Smrg 80737ec681f3Smrg if (!cmd_buffer->device->physical_device->use_ngg_streamout && 80747ec681f3Smrg ((old_streamout_enabled != so->streamout_enabled) || 80757ec681f3Smrg (old_hw_enabled_mask != so->hw_enabled_mask))) 80767ec681f3Smrg radv_emit_streamout_enable(cmd_buffer); 80777ec681f3Smrg 80787ec681f3Smrg if (cmd_buffer->device->physical_device->use_ngg_streamout) { 80797ec681f3Smrg cmd_buffer->gds_needed = true; 80807ec681f3Smrg cmd_buffer->gds_oa_needed = true; 80817ec681f3Smrg } 80827ec681f3Smrg} 80837ec681f3Smrg 80847ec681f3Smrgstatic void 80857ec681f3Smrgradv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) 80867ec681f3Smrg{ 80877ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 80887ec681f3Smrg unsigned reg_strmout_cntl; 80897ec681f3Smrg 80907ec681f3Smrg /* The register is at different places on different ASICs. */ 80917ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 80927ec681f3Smrg reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 80937ec681f3Smrg radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); 80947ec681f3Smrg } else { 80957ec681f3Smrg reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; 80967ec681f3Smrg radeon_set_config_reg(cs, reg_strmout_cntl, 0); 80977ec681f3Smrg } 80987ec681f3Smrg 80997ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 81007ec681f3Smrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); 81017ec681f3Smrg 81027ec681f3Smrg radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); 81037ec681f3Smrg radeon_emit(cs, 81047ec681f3Smrg WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ 81057ec681f3Smrg radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ 81067ec681f3Smrg radeon_emit(cs, 0); 81077ec681f3Smrg radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ 81087ec681f3Smrg radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ 81097ec681f3Smrg radeon_emit(cs, 4); /* poll interval */ 81107ec681f3Smrg} 81117ec681f3Smrg 81127ec681f3Smrgstatic void 81137ec681f3Smrgradv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 81147ec681f3Smrg uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 81157ec681f3Smrg const VkDeviceSize *pCounterBufferOffsets) 81167ec681f3Smrg 81177ec681f3Smrg{ 81187ec681f3Smrg struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 81197ec681f3Smrg struct radv_streamout_state *so = &cmd_buffer->state.streamout; 81207ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 81217ec681f3Smrg 81227ec681f3Smrg radv_flush_vgt_streamout(cmd_buffer); 81237ec681f3Smrg 81247ec681f3Smrg assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 81257ec681f3Smrg u_foreach_bit(i, so->enabled_mask) 81267ec681f3Smrg { 81277ec681f3Smrg int32_t counter_buffer_idx = i - firstCounterBuffer; 81287ec681f3Smrg if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 81297ec681f3Smrg counter_buffer_idx = -1; 81307ec681f3Smrg 81317ec681f3Smrg /* AMD GCN binds streamout buffers as shader resources. 
static void
radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                          const VkDeviceSize *pCounterBufferOffsets)

{
   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader through
       * SGPRs what to do.
       */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */

      cmd_buffer->state.context_roll_without_scissor_emitted = true;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         /* Append */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |    /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM));  /* control */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, va);       /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |      /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
      }
   }

   radv_set_streamout_enable(cmd_buffer, true);
}
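
/* GFX10+ NGG streamout begin: buffer offsets are kept in GDS (4 bytes per
 * target, at offset 4 * i). Each offset is loaded with CP DMA_DATA, either
 * from the counter buffer (append) or as an immediate zero. Only the
 * transfer for the last enabled target sets CP_SYNC and waits for write
 * confirmation, so the copies are pipelined and completion is awaited once.
 */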
static void
gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                           const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   unsigned last_target = util_last_bit(so->enabled_mask) - 1;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   /* Sync because the next streamout operation will overwrite GDS and we
    * have to make sure it's idle.
    * TODO: Improve by tracking if there is a streamout operation in
    * flight.
    */
   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
   si_emit_cache_flush(cmd_buffer);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      bool append =
         counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
      uint64_t va = 0;

      if (append) {
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += radv_buffer_get_va(buffer->bo);
         va += buffer->offset + counter_buffer_offset;

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   radv_set_streamout_enable(cmd_buffer, true);
}

void
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                  const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
                                 pCounterBuffers, pCounterBufferOffsets);
   } else {
      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                                pCounterBufferOffsets);
   }
}
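
/* Legacy streamout end: optionally save the buffer-filled size back to the
 * counter buffer, so a later vkCmdBeginTransformFeedbackEXT can resume
 * appending and vkCmdDrawIndirectByteCountEXT can derive a vertex count
 * from it.
 */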
static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);       /* dst address lo */
         radeon_emit(cs, va >> 32); /* dst address hi */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if no buffer is bound. This ensures that the
       * primitives-emitted query won't increment.
       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}
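
/* GFX10+ NGG streamout end: flush each target's GDS counter to the counter
 * buffer with a PS_DONE end-of-pipe event; EOP_DATA_SEL_GDS selects the GDS
 * word that gfx10_emit_streamout_begin initialized as the payload.
 */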
static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

void
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                               pCounterBufferOffsets);
   } else {
      radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                              pCounterBufferOffsets);
   }
}
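
/* Draw with a vertex count taken from a streamout counter buffer rather
 * than from the CPU: S_0287F0_USE_OPAQUE(1) makes the draw packet derive
 * the vertex count from the buffer-filled size stored by the streamout end
 * packets, divided by the vertex stride.
 */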
void
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                 uint32_t firstInstance, VkBuffer _counterBuffer,
                                 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
                                 uint32_t vertexStride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
   struct radv_draw_info info;

   info.count = 0;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = counterBuffer;
   info.strmout_buffer_offset = counterBufferOffset;
   info.stride = vertexStride;
   info.indexed = false;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   struct VkMultiDrawInfoEXT minfo = { 0, 0 };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
   radv_after_draw(cmd_buffer);
}

/* VK_AMD_buffer_marker */
void
radv_CmdWriteBufferMarkerAMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage,
                             VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

   if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
      /* Top-of-pipe: write the marker immediately with an inline COPY_DATA. */
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      /* Any later stage: defer the write to a bottom-of-pipe event. */
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}