/* Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
2401e04c3fSmrg */ 2501e04c3fSmrg 2601e04c3fSmrg#include <assert.h> 277ec681f3Smrg#include <fcntl.h> 2801e04c3fSmrg#include <stdbool.h> 2901e04c3fSmrg#include <string.h> 3001e04c3fSmrg 3101e04c3fSmrg#include "nir/nir_builder.h" 327ec681f3Smrg#include "util/u_atomic.h" 337ec681f3Smrg#include "radv_acceleration_structure.h" 347ec681f3Smrg#include "radv_cs.h" 3501e04c3fSmrg#include "radv_meta.h" 3601e04c3fSmrg#include "radv_private.h" 3701e04c3fSmrg#include "sid.h" 3801e04c3fSmrg 3901e04c3fSmrg#define TIMESTAMP_NOT_READY UINT64_MAX 4001e04c3fSmrg 4101e04c3fSmrgstatic const int pipelinestat_block_size = 11 * 8; 4201e04c3fSmrgstatic const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10}; 4301e04c3fSmrg 447ec681f3Smrgstatic unsigned 457ec681f3Smrgradv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag) 4601e04c3fSmrg{ 477ec681f3Smrg int offset = ffs(flag) - 1; 487ec681f3Smrg assert(offset < ARRAY_SIZE(pipeline_statistics_indices)); 497ec681f3Smrg return pipeline_statistics_indices[offset]; 5001e04c3fSmrg} 5101e04c3fSmrg 527ec681f3Smrgstatic nir_ssa_def * 537ec681f3Smrgnir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag) 5401e04c3fSmrg{ 557ec681f3Smrg return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag))); 567ec681f3Smrg} 5701e04c3fSmrg 587ec681f3Smrgstatic void 597ec681f3Smrgradv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count) 607ec681f3Smrg{ 617ec681f3Smrg nir_ssa_def *counter = nir_load_var(b, var); 6201e04c3fSmrg 637ec681f3Smrg nir_push_if(b, nir_uge(b, counter, count)); 647ec681f3Smrg nir_jump(b, nir_jump_break); 657ec681f3Smrg nir_pop_if(b, NULL); 6601e04c3fSmrg 677ec681f3Smrg counter = nir_iadd(b, counter, nir_imm_int(b, 1)); 687ec681f3Smrg nir_store_var(b, var, counter, 0x1); 6901e04c3fSmrg} 7001e04c3fSmrg 717ec681f3Smrgstatic void 727ec681f3Smrgradv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf, 737ec681f3Smrg nir_ssa_def *offset, nir_ssa_def 
*value32) 7401e04c3fSmrg{ 757ec681f3Smrg nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); 7601e04c3fSmrg 777ec681f3Smrg nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_64_BIT)); 7801e04c3fSmrg 797ec681f3Smrg nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .write_mask = 0x3, 807ec681f3Smrg .align_mul = 8); 8101e04c3fSmrg 827ec681f3Smrg nir_push_else(b, NULL); 8301e04c3fSmrg 847ec681f3Smrg nir_store_ssbo(b, value32, dst_buf, offset, .write_mask = 0x1, .align_mul = 4); 8501e04c3fSmrg 867ec681f3Smrg nir_pop_if(b, NULL); 8701e04c3fSmrg 887ec681f3Smrg nir_pop_if(b, NULL); 897ec681f3Smrg} 9001e04c3fSmrg 917ec681f3Smrgstatic nir_shader * 927ec681f3Smrgbuild_occlusion_query_shader(struct radv_device *device) 937ec681f3Smrg{ 947ec681f3Smrg /* the shader this builds is roughly 957ec681f3Smrg * 967ec681f3Smrg * push constants { 977ec681f3Smrg * uint32_t flags; 987ec681f3Smrg * uint32_t dst_stride; 997ec681f3Smrg * }; 1007ec681f3Smrg * 1017ec681f3Smrg * uint32_t src_stride = 16 * db_count; 1027ec681f3Smrg * 1037ec681f3Smrg * location(binding = 0) buffer dst_buf; 1047ec681f3Smrg * location(binding = 1) buffer src_buf; 1057ec681f3Smrg * 1067ec681f3Smrg * void main() { 1077ec681f3Smrg * uint64_t result = 0; 1087ec681f3Smrg * uint64_t src_offset = src_stride * global_id.x; 1097ec681f3Smrg * uint64_t dst_offset = dst_stride * global_id.x; 1107ec681f3Smrg * bool available = true; 1117ec681f3Smrg * for (int i = 0; i < db_count; ++i) { 1127ec681f3Smrg * if (enabled_rb_mask & (1 << i)) { 1137ec681f3Smrg * uint64_t start = src_buf[src_offset + 16 * i]; 1147ec681f3Smrg * uint64_t end = src_buf[src_offset + 16 * i + 8]; 1157ec681f3Smrg * if ((start & (1ull << 63)) && (end & (1ull << 63))) 1167ec681f3Smrg * result += end - start; 1177ec681f3Smrg * else 1187ec681f3Smrg * available = false; 1197ec681f3Smrg * } 1207ec681f3Smrg * } 1217ec681f3Smrg * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 
8 : 4; 1227ec681f3Smrg * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 1237ec681f3Smrg * if (flags & VK_QUERY_RESULT_64_BIT) 1247ec681f3Smrg * dst_buf[dst_offset] = result; 1257ec681f3Smrg * else 1267ec681f3Smrg * dst_buf[dst_offset] = (uint32_t)result. 1277ec681f3Smrg * } 1287ec681f3Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 1297ec681f3Smrg * dst_buf[dst_offset + elem_size] = available; 1307ec681f3Smrg * } 1317ec681f3Smrg * } 1327ec681f3Smrg */ 1337ec681f3Smrg nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "occlusion_query"); 1347ec681f3Smrg b.shader->info.workgroup_size[0] = 64; 1357ec681f3Smrg b.shader->info.workgroup_size[1] = 1; 1367ec681f3Smrg b.shader->info.workgroup_size[2] = 1; 1377ec681f3Smrg 1387ec681f3Smrg nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result"); 1397ec681f3Smrg nir_variable *outer_counter = 1407ec681f3Smrg nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter"); 1417ec681f3Smrg nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start"); 1427ec681f3Smrg nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end"); 1437ec681f3Smrg nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 1447ec681f3Smrg unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask; 1457ec681f3Smrg unsigned db_count = device->physical_device->rad_info.max_render_backends; 1467ec681f3Smrg 1477ec681f3Smrg nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16); 1487ec681f3Smrg 1497ec681f3Smrg nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 1507ec681f3Smrg nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 1517ec681f3Smrg 1527ec681f3Smrg nir_ssa_def *global_id = get_global_ids(&b, 1); 1537ec681f3Smrg 1547ec681f3Smrg nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16); 1557ec681f3Smrg nir_ssa_def 
*input_base = nir_imul(&b, input_stride, global_id); 1567ec681f3Smrg nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16); 1577ec681f3Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 1587ec681f3Smrg 1597ec681f3Smrg nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1); 1607ec681f3Smrg nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1); 1617ec681f3Smrg nir_store_var(&b, available, nir_imm_true(&b), 0x1); 1627ec681f3Smrg 1637ec681f3Smrg nir_push_loop(&b); 1647ec681f3Smrg 1657ec681f3Smrg nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter); 1667ec681f3Smrg radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count)); 1677ec681f3Smrg 1687ec681f3Smrg nir_ssa_def *enabled_cond = nir_iand(&b, nir_imm_int(&b, enabled_rb_mask), 1697ec681f3Smrg nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count)); 1707ec681f3Smrg 1717ec681f3Smrg nir_push_if(&b, nir_i2b(&b, enabled_cond)); 1727ec681f3Smrg 1737ec681f3Smrg nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16)); 1747ec681f3Smrg load_offset = nir_iadd(&b, input_base, load_offset); 1757ec681f3Smrg 1767ec681f3Smrg nir_ssa_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16); 1777ec681f3Smrg 1787ec681f3Smrg nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1); 1797ec681f3Smrg nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1); 1807ec681f3Smrg 1817ec681f3Smrg nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0)); 1827ec681f3Smrg nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0)); 1837ec681f3Smrg 1847ec681f3Smrg nir_push_if(&b, nir_iand(&b, start_done, end_done)); 1857ec681f3Smrg 1867ec681f3Smrg nir_store_var(&b, result, 1877ec681f3Smrg nir_iadd(&b, nir_load_var(&b, result), 1887ec681f3Smrg nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))), 1897ec681f3Smrg 0x1); 1907ec681f3Smrg 1917ec681f3Smrg nir_push_else(&b, 
NULL); 1927ec681f3Smrg 1937ec681f3Smrg nir_store_var(&b, available, nir_imm_false(&b), 0x1); 1947ec681f3Smrg 1957ec681f3Smrg nir_pop_if(&b, NULL); 1967ec681f3Smrg nir_pop_if(&b, NULL); 1977ec681f3Smrg nir_pop_loop(&b, NULL); 1987ec681f3Smrg 1997ec681f3Smrg /* Store the result if complete or if partial results have been requested. */ 2007ec681f3Smrg 2017ec681f3Smrg nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 2027ec681f3Smrg nir_ssa_def *result_size = 2037ec681f3Smrg nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 2047ec681f3Smrg nir_push_if(&b, nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 2057ec681f3Smrg nir_load_var(&b, available))); 2067ec681f3Smrg 2077ec681f3Smrg nir_push_if(&b, result_is_64bit); 2087ec681f3Smrg 2097ec681f3Smrg nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .write_mask = 0x1, 2107ec681f3Smrg .align_mul = 8); 21101e04c3fSmrg 2127ec681f3Smrg nir_push_else(&b, NULL); 21301e04c3fSmrg 2147ec681f3Smrg nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, 2157ec681f3Smrg .write_mask = 0x1, .align_mul = 8); 21601e04c3fSmrg 2177ec681f3Smrg nir_pop_if(&b, NULL); 2187ec681f3Smrg nir_pop_if(&b, NULL); 21901e04c3fSmrg 2207ec681f3Smrg radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 2217ec681f3Smrg nir_b2i32(&b, nir_load_var(&b, available))); 22201e04c3fSmrg 2237ec681f3Smrg return b.shader; 2247ec681f3Smrg} 22501e04c3fSmrg 2267ec681f3Smrgstatic nir_shader * 2277ec681f3Smrgbuild_pipeline_statistics_query_shader(struct radv_device *device) 2287ec681f3Smrg{ 2297ec681f3Smrg /* the shader this builds is roughly 2307ec681f3Smrg * 2317ec681f3Smrg * push constants { 2327ec681f3Smrg * uint32_t flags; 2337ec681f3Smrg * uint32_t dst_stride; 2347ec681f3Smrg * uint32_t stats_mask; 2357ec681f3Smrg * uint32_t avail_offset; 2367ec681f3Smrg * }; 2377ec681f3Smrg * 2387ec681f3Smrg * uint32_t src_stride = 
pipelinestat_block_size * 2; 2397ec681f3Smrg * 2407ec681f3Smrg * location(binding = 0) buffer dst_buf; 2417ec681f3Smrg * location(binding = 1) buffer src_buf; 2427ec681f3Smrg * 2437ec681f3Smrg * void main() { 2447ec681f3Smrg * uint64_t src_offset = src_stride * global_id.x; 2457ec681f3Smrg * uint64_t dst_base = dst_stride * global_id.x; 2467ec681f3Smrg * uint64_t dst_offset = dst_base; 2477ec681f3Smrg * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4; 2487ec681f3Smrg * uint32_t elem_count = stats_mask >> 16; 2497ec681f3Smrg * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x]; 2507ec681f3Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 2517ec681f3Smrg * dst_buf[dst_offset + elem_count * elem_size] = available32; 2527ec681f3Smrg * } 2537ec681f3Smrg * if ((bool)available32) { 2547ec681f3Smrg * // repeat 11 times: 2557ec681f3Smrg * if (stats_mask & (1 << 0)) { 2567ec681f3Smrg * uint64_t start = src_buf[src_offset + 8 * indices[0]]; 2577ec681f3Smrg * uint64_t end = src_buf[src_offset + 8 * indices[0] + 2587ec681f3Smrg * pipelinestat_block_size]; uint64_t result = end - start; if (flags & VK_QUERY_RESULT_64_BIT) 2597ec681f3Smrg * dst_buf[dst_offset] = result; 2607ec681f3Smrg * else 2617ec681f3Smrg * dst_buf[dst_offset] = (uint32_t)result. 2627ec681f3Smrg * dst_offset += elem_size; 2637ec681f3Smrg * } 2647ec681f3Smrg * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { 2657ec681f3Smrg * // Set everything to 0 as we don't know what is valid. 
2667ec681f3Smrg * for (int i = 0; i < elem_count; ++i) 2677ec681f3Smrg * dst_buf[dst_base + elem_size * i] = 0; 2687ec681f3Smrg * } 2697ec681f3Smrg * } 2707ec681f3Smrg */ 2717ec681f3Smrg nir_builder b = 2727ec681f3Smrg nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "pipeline_statistics_query"); 2737ec681f3Smrg b.shader->info.workgroup_size[0] = 64; 2747ec681f3Smrg b.shader->info.workgroup_size[1] = 1; 2757ec681f3Smrg b.shader->info.workgroup_size[2] = 1; 2767ec681f3Smrg 2777ec681f3Smrg nir_variable *output_offset = 2787ec681f3Smrg nir_local_variable_create(b.impl, glsl_int_type(), "output_offset"); 2797ec681f3Smrg 2807ec681f3Smrg nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16); 2817ec681f3Smrg nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 16); 2827ec681f3Smrg nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16); 2837ec681f3Smrg 2847ec681f3Smrg nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 2857ec681f3Smrg nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 2867ec681f3Smrg 2877ec681f3Smrg nir_ssa_def *global_id = get_global_ids(&b, 1); 2887ec681f3Smrg 2897ec681f3Smrg nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2); 2907ec681f3Smrg nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 2917ec681f3Smrg nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16); 2927ec681f3Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 2937ec681f3Smrg 2947ec681f3Smrg avail_offset = nir_iadd(&b, avail_offset, nir_imul(&b, global_id, nir_imm_int(&b, 4))); 2957ec681f3Smrg 2967ec681f3Smrg nir_ssa_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset, .align_mul = 4); 2977ec681f3Smrg 2987ec681f3Smrg nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 2997ec681f3Smrg nir_ssa_def *elem_size = 
nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 3007ec681f3Smrg nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16)); 3017ec681f3Smrg 3027ec681f3Smrg radv_store_availability(&b, flags, dst_buf, 3037ec681f3Smrg nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)), 3047ec681f3Smrg available32); 3057ec681f3Smrg 3067ec681f3Smrg nir_push_if(&b, nir_i2b(&b, available32)); 3077ec681f3Smrg 3087ec681f3Smrg nir_store_var(&b, output_offset, output_base, 0x1); 3097ec681f3Smrg for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 3107ec681f3Smrg nir_push_if(&b, nir_test_flag(&b, stats_mask, 1u << i)); 3117ec681f3Smrg 3127ec681f3Smrg nir_ssa_def *start_offset = 3137ec681f3Smrg nir_iadd(&b, input_base, nir_imm_int(&b, pipeline_statistics_indices[i] * 8)); 3147ec681f3Smrg nir_ssa_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset, .align_mul = 8); 3157ec681f3Smrg 3167ec681f3Smrg nir_ssa_def *end_offset = 3177ec681f3Smrg nir_iadd(&b, input_base, 3187ec681f3Smrg nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)); 3197ec681f3Smrg nir_ssa_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset, .align_mul = 8); 3207ec681f3Smrg 3217ec681f3Smrg nir_ssa_def *result = nir_isub(&b, end, start); 3227ec681f3Smrg 3237ec681f3Smrg /* Store result */ 3247ec681f3Smrg nir_push_if(&b, result_is_64bit); 3257ec681f3Smrg 3267ec681f3Smrg nir_store_ssbo(&b, result, dst_buf, nir_load_var(&b, output_offset), .write_mask = 0x1, 3277ec681f3Smrg .align_mul = 8); 3287ec681f3Smrg 3297ec681f3Smrg nir_push_else(&b, NULL); 3307ec681f3Smrg 3317ec681f3Smrg nir_store_ssbo(&b, nir_u2u32(&b, result), dst_buf, nir_load_var(&b, output_offset), 3327ec681f3Smrg .write_mask = 0x1, .align_mul = 4); 3337ec681f3Smrg 3347ec681f3Smrg nir_pop_if(&b, NULL); 3357ec681f3Smrg 3367ec681f3Smrg nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 3377ec681f3Smrg 0x1); 33801e04c3fSmrg 
3397ec681f3Smrg nir_pop_if(&b, NULL); 3407ec681f3Smrg } 34101e04c3fSmrg 3427ec681f3Smrg nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */ 34301e04c3fSmrg 3447ec681f3Smrg nir_push_if(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT)); 34501e04c3fSmrg 3467ec681f3Smrg /* Stores zeros in all outputs. */ 34701e04c3fSmrg 3487ec681f3Smrg nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter"); 3497ec681f3Smrg nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1); 35001e04c3fSmrg 3517ec681f3Smrg nir_loop *loop = nir_push_loop(&b); 35201e04c3fSmrg 3537ec681f3Smrg nir_ssa_def *current_counter = nir_load_var(&b, counter); 3547ec681f3Smrg radv_break_on_count(&b, counter, elem_count); 35501e04c3fSmrg 3567ec681f3Smrg nir_ssa_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter)); 3577ec681f3Smrg nir_push_if(&b, result_is_64bit); 35801e04c3fSmrg 3597ec681f3Smrg nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem, .write_mask = 0x1, 3607ec681f3Smrg .align_mul = 8); 36101e04c3fSmrg 3627ec681f3Smrg nir_push_else(&b, NULL); 36301e04c3fSmrg 3647ec681f3Smrg nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem, .write_mask = 0x1, .align_mul = 4); 36501e04c3fSmrg 3667ec681f3Smrg nir_pop_if(&b, NULL); 36701e04c3fSmrg 3687ec681f3Smrg nir_pop_loop(&b, loop); 3697ec681f3Smrg nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */ 3707ec681f3Smrg nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */ 3717ec681f3Smrg return b.shader; 37201e04c3fSmrg} 37301e04c3fSmrg 37401e04c3fSmrgstatic nir_shader * 37501e04c3fSmrgbuild_tfb_query_shader(struct radv_device *device) 37601e04c3fSmrg{ 3777ec681f3Smrg /* the shader this builds is roughly 3787ec681f3Smrg * 3797ec681f3Smrg * uint32_t src_stride = 32; 3807ec681f3Smrg * 3817ec681f3Smrg * location(binding = 0) buffer dst_buf; 3827ec681f3Smrg * location(binding = 1) buffer src_buf; 3837ec681f3Smrg * 3847ec681f3Smrg * void main() { 3857ec681f3Smrg * uint64_t 
result[2] = {}; 3867ec681f3Smrg * bool available = false; 3877ec681f3Smrg * uint64_t src_offset = src_stride * global_id.x; 3887ec681f3Smrg * uint64_t dst_offset = dst_stride * global_id.x; 3897ec681f3Smrg * uint64_t *src_data = src_buf[src_offset]; 3907ec681f3Smrg * uint32_t avail = (src_data[0] >> 32) & 3917ec681f3Smrg * (src_data[1] >> 32) & 3927ec681f3Smrg * (src_data[2] >> 32) & 3937ec681f3Smrg * (src_data[3] >> 32); 3947ec681f3Smrg * if (avail & 0x80000000) { 3957ec681f3Smrg * result[0] = src_data[3] - src_data[1]; 3967ec681f3Smrg * result[1] = src_data[2] - src_data[0]; 3977ec681f3Smrg * available = true; 3987ec681f3Smrg * } 3997ec681f3Smrg * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8; 4007ec681f3Smrg * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 4017ec681f3Smrg * if (flags & VK_QUERY_RESULT_64_BIT) { 4027ec681f3Smrg * dst_buf[dst_offset] = result; 4037ec681f3Smrg * } else { 4047ec681f3Smrg * dst_buf[dst_offset] = (uint32_t)result; 4057ec681f3Smrg * } 4067ec681f3Smrg * } 4077ec681f3Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 4087ec681f3Smrg * dst_buf[dst_offset + result_size] = available; 4097ec681f3Smrg * } 4107ec681f3Smrg * } 4117ec681f3Smrg */ 4127ec681f3Smrg nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "tfb_query"); 4137ec681f3Smrg b.shader->info.workgroup_size[0] = 64; 4147ec681f3Smrg b.shader->info.workgroup_size[1] = 1; 4157ec681f3Smrg b.shader->info.workgroup_size[2] = 1; 4167ec681f3Smrg 4177ec681f3Smrg /* Create and initialize local variables. 
*/ 4187ec681f3Smrg nir_variable *result = 4197ec681f3Smrg nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result"); 4207ec681f3Smrg nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 4217ec681f3Smrg 4227ec681f3Smrg nir_store_var(&b, result, nir_vec2(&b, nir_imm_int64(&b, 0), nir_imm_int64(&b, 0)), 0x3); 4237ec681f3Smrg nir_store_var(&b, available, nir_imm_false(&b), 0x1); 4247ec681f3Smrg 4257ec681f3Smrg nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16); 4267ec681f3Smrg 4277ec681f3Smrg /* Load resources. */ 4287ec681f3Smrg nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 4297ec681f3Smrg nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 4307ec681f3Smrg 4317ec681f3Smrg /* Compute global ID. */ 4327ec681f3Smrg nir_ssa_def *global_id = get_global_ids(&b, 1); 4337ec681f3Smrg 4347ec681f3Smrg /* Compute src/dst strides. */ 4357ec681f3Smrg nir_ssa_def *input_stride = nir_imm_int(&b, 32); 4367ec681f3Smrg nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 4377ec681f3Smrg nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16); 4387ec681f3Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 4397ec681f3Smrg 4407ec681f3Smrg /* Load data from the query pool. */ 4417ec681f3Smrg nir_ssa_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32); 4427ec681f3Smrg nir_ssa_def *load2 = nir_load_ssbo( 4437ec681f3Smrg &b, 4, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16); 4447ec681f3Smrg 4457ec681f3Smrg /* Check if result is available. 
*/ 4467ec681f3Smrg nir_ssa_def *avails[2]; 4477ec681f3Smrg avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3)); 4487ec681f3Smrg avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3)); 4497ec681f3Smrg nir_ssa_def *result_is_available = 4507ec681f3Smrg nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000))); 4517ec681f3Smrg 4527ec681f3Smrg /* Only compute result if available. */ 4537ec681f3Smrg nir_push_if(&b, result_is_available); 4547ec681f3Smrg 4557ec681f3Smrg /* Pack values. */ 4567ec681f3Smrg nir_ssa_def *packed64[4]; 4577ec681f3Smrg packed64[0] = 4587ec681f3Smrg nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1))); 4597ec681f3Smrg packed64[1] = 4607ec681f3Smrg nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3))); 4617ec681f3Smrg packed64[2] = 4627ec681f3Smrg nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1))); 4637ec681f3Smrg packed64[3] = 4647ec681f3Smrg nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3))); 4657ec681f3Smrg 4667ec681f3Smrg /* Compute result. */ 4677ec681f3Smrg nir_ssa_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]); 4687ec681f3Smrg nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]); 4697ec681f3Smrg 4707ec681f3Smrg nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3); 4717ec681f3Smrg nir_store_var(&b, available, nir_imm_true(&b), 0x1); 4727ec681f3Smrg 4737ec681f3Smrg nir_pop_if(&b, NULL); 4747ec681f3Smrg 4757ec681f3Smrg /* Determine if result is 64 or 32 bit. 
*/ 4767ec681f3Smrg nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 4777ec681f3Smrg nir_ssa_def *result_size = 4787ec681f3Smrg nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8)); 4797ec681f3Smrg 4807ec681f3Smrg /* Store the result if complete or partial results have been requested. */ 4817ec681f3Smrg nir_push_if(&b, nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 4827ec681f3Smrg nir_load_var(&b, available))); 4837ec681f3Smrg 4847ec681f3Smrg /* Store result. */ 4857ec681f3Smrg nir_push_if(&b, result_is_64bit); 4867ec681f3Smrg 4877ec681f3Smrg nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .write_mask = 0x3, 4887ec681f3Smrg .align_mul = 8); 4897ec681f3Smrg 4907ec681f3Smrg nir_push_else(&b, NULL); 4917ec681f3Smrg 4927ec681f3Smrg nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, 4937ec681f3Smrg .write_mask = 0x3, .align_mul = 4); 4947ec681f3Smrg 4957ec681f3Smrg nir_pop_if(&b, NULL); 4967ec681f3Smrg nir_pop_if(&b, NULL); 4977ec681f3Smrg 4987ec681f3Smrg radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 4997ec681f3Smrg nir_b2i32(&b, nir_load_var(&b, available))); 5007ec681f3Smrg 5017ec681f3Smrg return b.shader; 50201e04c3fSmrg} 50301e04c3fSmrg 5047ec681f3Smrgstatic nir_shader * 5057ec681f3Smrgbuild_timestamp_query_shader(struct radv_device *device) 50601e04c3fSmrg{ 5077ec681f3Smrg /* the shader this builds is roughly 5087ec681f3Smrg * 5097ec681f3Smrg * uint32_t src_stride = 8; 5107ec681f3Smrg * 5117ec681f3Smrg * location(binding = 0) buffer dst_buf; 5127ec681f3Smrg * location(binding = 1) buffer src_buf; 5137ec681f3Smrg * 5147ec681f3Smrg * void main() { 5157ec681f3Smrg * uint64_t result = 0; 5167ec681f3Smrg * bool available = false; 5177ec681f3Smrg * uint64_t src_offset = src_stride * global_id.x; 5187ec681f3Smrg * uint64_t dst_offset = dst_stride * global_id.x; 5197ec681f3Smrg * uint64_t timestamp = 
src_buf[src_offset]; 5207ec681f3Smrg * if (timestamp != TIMESTAMP_NOT_READY) { 5217ec681f3Smrg * result = timestamp; 5227ec681f3Smrg * available = true; 5237ec681f3Smrg * } 5247ec681f3Smrg * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4; 5257ec681f3Smrg * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 5267ec681f3Smrg * if (flags & VK_QUERY_RESULT_64_BIT) { 5277ec681f3Smrg * dst_buf[dst_offset] = result; 5287ec681f3Smrg * } else { 5297ec681f3Smrg * dst_buf[dst_offset] = (uint32_t)result; 5307ec681f3Smrg * } 5317ec681f3Smrg * } 5327ec681f3Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 5337ec681f3Smrg * dst_buf[dst_offset + result_size] = available; 5347ec681f3Smrg * } 5357ec681f3Smrg * } 5367ec681f3Smrg */ 5377ec681f3Smrg nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "timestamp_query"); 5387ec681f3Smrg b.shader->info.workgroup_size[0] = 64; 5397ec681f3Smrg b.shader->info.workgroup_size[1] = 1; 5407ec681f3Smrg b.shader->info.workgroup_size[2] = 1; 5417ec681f3Smrg 5427ec681f3Smrg /* Create and initialize local variables. */ 5437ec681f3Smrg nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result"); 5447ec681f3Smrg nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 5457ec681f3Smrg 5467ec681f3Smrg nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1); 5477ec681f3Smrg nir_store_var(&b, available, nir_imm_false(&b), 0x1); 5487ec681f3Smrg 5497ec681f3Smrg nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16); 5507ec681f3Smrg 5517ec681f3Smrg /* Load resources. */ 5527ec681f3Smrg nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 5537ec681f3Smrg nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 5547ec681f3Smrg 5557ec681f3Smrg /* Compute global ID. */ 5567ec681f3Smrg nir_ssa_def *global_id = get_global_ids(&b, 1); 5577ec681f3Smrg 5587ec681f3Smrg /* Compute src/dst strides. 
*/ 5597ec681f3Smrg nir_ssa_def *input_stride = nir_imm_int(&b, 8); 5607ec681f3Smrg nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 5617ec681f3Smrg nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16); 5627ec681f3Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 5637ec681f3Smrg 5647ec681f3Smrg /* Load data from the query pool. */ 5657ec681f3Smrg nir_ssa_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8); 5667ec681f3Smrg 5677ec681f3Smrg /* Pack the timestamp. */ 5687ec681f3Smrg nir_ssa_def *timestamp; 5697ec681f3Smrg timestamp = 5707ec681f3Smrg nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load, 0), nir_channel(&b, load, 1))); 5717ec681f3Smrg 5727ec681f3Smrg /* Check if result is available. */ 5737ec681f3Smrg nir_ssa_def *result_is_available = 5747ec681f3Smrg nir_i2b(&b, nir_ine(&b, timestamp, nir_imm_int64(&b, TIMESTAMP_NOT_READY))); 5757ec681f3Smrg 5767ec681f3Smrg /* Only store result if available. */ 5777ec681f3Smrg nir_push_if(&b, result_is_available); 5787ec681f3Smrg 5797ec681f3Smrg nir_store_var(&b, result, timestamp, 0x1); 5807ec681f3Smrg nir_store_var(&b, available, nir_imm_true(&b), 0x1); 5817ec681f3Smrg 5827ec681f3Smrg nir_pop_if(&b, NULL); 5837ec681f3Smrg 5847ec681f3Smrg /* Determine if result is 64 or 32 bit. */ 5857ec681f3Smrg nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 5867ec681f3Smrg nir_ssa_def *result_size = 5877ec681f3Smrg nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 5887ec681f3Smrg 5897ec681f3Smrg /* Store the result if complete or partial results have been requested. */ 5907ec681f3Smrg nir_push_if(&b, nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 5917ec681f3Smrg nir_load_var(&b, available))); 5927ec681f3Smrg 5937ec681f3Smrg /* Store result. 
*/ 5947ec681f3Smrg nir_push_if(&b, result_is_64bit); 5957ec681f3Smrg 5967ec681f3Smrg nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .write_mask = 0x1, 5977ec681f3Smrg .align_mul = 8); 5987ec681f3Smrg 5997ec681f3Smrg nir_push_else(&b, NULL); 6007ec681f3Smrg 6017ec681f3Smrg nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, 6027ec681f3Smrg .write_mask = 0x1, .align_mul = 4); 6037ec681f3Smrg 6047ec681f3Smrg nir_pop_if(&b, NULL); 6057ec681f3Smrg 6067ec681f3Smrg nir_pop_if(&b, NULL); 6077ec681f3Smrg 6087ec681f3Smrg radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 6097ec681f3Smrg nir_b2i32(&b, nir_load_var(&b, available))); 6107ec681f3Smrg 6117ec681f3Smrg return b.shader; 61201e04c3fSmrg} 61301e04c3fSmrg 6147ec681f3Smrgstatic VkResult 6157ec681f3Smrgradv_device_init_meta_query_state_internal(struct radv_device *device) 61601e04c3fSmrg{ 6177ec681f3Smrg VkResult result; 6187ec681f3Smrg nir_shader *occlusion_cs = NULL; 6197ec681f3Smrg nir_shader *pipeline_statistics_cs = NULL; 6207ec681f3Smrg nir_shader *tfb_cs = NULL; 6217ec681f3Smrg nir_shader *timestamp_cs = NULL; 6227ec681f3Smrg 6237ec681f3Smrg mtx_lock(&device->meta_state.mtx); 6247ec681f3Smrg if (device->meta_state.query.pipeline_statistics_query_pipeline) { 6257ec681f3Smrg mtx_unlock(&device->meta_state.mtx); 6267ec681f3Smrg return VK_SUCCESS; 6277ec681f3Smrg } 6287ec681f3Smrg occlusion_cs = build_occlusion_query_shader(device); 6297ec681f3Smrg pipeline_statistics_cs = build_pipeline_statistics_query_shader(device); 6307ec681f3Smrg tfb_cs = build_tfb_query_shader(device); 6317ec681f3Smrg timestamp_cs = build_timestamp_query_shader(device); 6327ec681f3Smrg 6337ec681f3Smrg VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = { 6347ec681f3Smrg .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 6357ec681f3Smrg .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, 6367ec681f3Smrg .bindingCount = 2, 
6377ec681f3Smrg .pBindings = (VkDescriptorSetLayoutBinding[]){ 6387ec681f3Smrg {.binding = 0, 6397ec681f3Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 6407ec681f3Smrg .descriptorCount = 1, 6417ec681f3Smrg .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 6427ec681f3Smrg .pImmutableSamplers = NULL}, 6437ec681f3Smrg {.binding = 1, 6447ec681f3Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 6457ec681f3Smrg .descriptorCount = 1, 6467ec681f3Smrg .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 6477ec681f3Smrg .pImmutableSamplers = NULL}, 6487ec681f3Smrg }}; 6497ec681f3Smrg 6507ec681f3Smrg result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info, 6517ec681f3Smrg &device->meta_state.alloc, 6527ec681f3Smrg &device->meta_state.query.ds_layout); 6537ec681f3Smrg if (result != VK_SUCCESS) 6547ec681f3Smrg goto fail; 6557ec681f3Smrg 6567ec681f3Smrg VkPipelineLayoutCreateInfo occlusion_pl_create_info = { 6577ec681f3Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, 6587ec681f3Smrg .setLayoutCount = 1, 6597ec681f3Smrg .pSetLayouts = &device->meta_state.query.ds_layout, 6607ec681f3Smrg .pushConstantRangeCount = 1, 6617ec681f3Smrg .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16}, 6627ec681f3Smrg }; 6637ec681f3Smrg 6647ec681f3Smrg result = 6657ec681f3Smrg radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info, 6667ec681f3Smrg &device->meta_state.alloc, &device->meta_state.query.p_layout); 6677ec681f3Smrg if (result != VK_SUCCESS) 6687ec681f3Smrg goto fail; 6697ec681f3Smrg 6707ec681f3Smrg VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = { 6717ec681f3Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 6727ec681f3Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 6737ec681f3Smrg .module = vk_shader_module_handle_from_nir(occlusion_cs), 6747ec681f3Smrg .pName = "main", 6757ec681f3Smrg .pSpecializationInfo = NULL, 6767ec681f3Smrg }; 6777ec681f3Smrg 
6787ec681f3Smrg VkComputePipelineCreateInfo occlusion_vk_pipeline_info = { 6797ec681f3Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 6807ec681f3Smrg .stage = occlusion_pipeline_shader_stage, 6817ec681f3Smrg .flags = 0, 6827ec681f3Smrg .layout = device->meta_state.query.p_layout, 6837ec681f3Smrg }; 6847ec681f3Smrg 6857ec681f3Smrg result = radv_CreateComputePipelines( 6867ec681f3Smrg radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, 6877ec681f3Smrg &occlusion_vk_pipeline_info, NULL, &device->meta_state.query.occlusion_query_pipeline); 6887ec681f3Smrg if (result != VK_SUCCESS) 6897ec681f3Smrg goto fail; 6907ec681f3Smrg 6917ec681f3Smrg VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = { 6927ec681f3Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 6937ec681f3Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 6947ec681f3Smrg .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs), 6957ec681f3Smrg .pName = "main", 6967ec681f3Smrg .pSpecializationInfo = NULL, 6977ec681f3Smrg }; 6987ec681f3Smrg 6997ec681f3Smrg VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = { 7007ec681f3Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 7017ec681f3Smrg .stage = pipeline_statistics_pipeline_shader_stage, 7027ec681f3Smrg .flags = 0, 7037ec681f3Smrg .layout = device->meta_state.query.p_layout, 7047ec681f3Smrg }; 7057ec681f3Smrg 7067ec681f3Smrg result = radv_CreateComputePipelines( 7077ec681f3Smrg radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, 7087ec681f3Smrg &pipeline_statistics_vk_pipeline_info, NULL, 7097ec681f3Smrg &device->meta_state.query.pipeline_statistics_query_pipeline); 7107ec681f3Smrg if (result != VK_SUCCESS) 7117ec681f3Smrg goto fail; 7127ec681f3Smrg 7137ec681f3Smrg VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = { 7147ec681f3Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 
7157ec681f3Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 7167ec681f3Smrg .module = vk_shader_module_handle_from_nir(tfb_cs), 7177ec681f3Smrg .pName = "main", 7187ec681f3Smrg .pSpecializationInfo = NULL, 7197ec681f3Smrg }; 7207ec681f3Smrg 7217ec681f3Smrg VkComputePipelineCreateInfo tfb_pipeline_info = { 7227ec681f3Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 7237ec681f3Smrg .stage = tfb_pipeline_shader_stage, 7247ec681f3Smrg .flags = 0, 7257ec681f3Smrg .layout = device->meta_state.query.p_layout, 7267ec681f3Smrg }; 7277ec681f3Smrg 7287ec681f3Smrg result = radv_CreateComputePipelines( 7297ec681f3Smrg radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, 7307ec681f3Smrg &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline); 7317ec681f3Smrg if (result != VK_SUCCESS) 7327ec681f3Smrg goto fail; 7337ec681f3Smrg 7347ec681f3Smrg VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = { 7357ec681f3Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 7367ec681f3Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 7377ec681f3Smrg .module = vk_shader_module_handle_from_nir(timestamp_cs), 7387ec681f3Smrg .pName = "main", 7397ec681f3Smrg .pSpecializationInfo = NULL, 7407ec681f3Smrg }; 7417ec681f3Smrg 7427ec681f3Smrg VkComputePipelineCreateInfo timestamp_pipeline_info = { 7437ec681f3Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 7447ec681f3Smrg .stage = timestamp_pipeline_shader_stage, 7457ec681f3Smrg .flags = 0, 7467ec681f3Smrg .layout = device->meta_state.query.p_layout, 7477ec681f3Smrg }; 7487ec681f3Smrg 7497ec681f3Smrg result = radv_CreateComputePipelines( 7507ec681f3Smrg radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, 7517ec681f3Smrg ×tamp_pipeline_info, NULL, &device->meta_state.query.timestamp_query_pipeline); 75201e04c3fSmrg 7537ec681f3Smrgfail: 7547ec681f3Smrg if (result != VK_SUCCESS) 7557ec681f3Smrg 
radv_device_finish_meta_query_state(device); 7567ec681f3Smrg ralloc_free(occlusion_cs); 7577ec681f3Smrg ralloc_free(pipeline_statistics_cs); 7587ec681f3Smrg ralloc_free(tfb_cs); 7597ec681f3Smrg ralloc_free(timestamp_cs); 7607ec681f3Smrg mtx_unlock(&device->meta_state.mtx); 7617ec681f3Smrg return result; 76201e04c3fSmrg} 76301e04c3fSmrg 7647ec681f3SmrgVkResult 7657ec681f3Smrgradv_device_init_meta_query_state(struct radv_device *device, bool on_demand) 76601e04c3fSmrg{ 7677ec681f3Smrg if (on_demand) 7687ec681f3Smrg return VK_SUCCESS; 7697ec681f3Smrg 7707ec681f3Smrg return radv_device_init_meta_query_state_internal(device); 77101e04c3fSmrg} 77201e04c3fSmrg 7737ec681f3Smrgvoid 7747ec681f3Smrgradv_device_finish_meta_query_state(struct radv_device *device) 77501e04c3fSmrg{ 7767ec681f3Smrg if (device->meta_state.query.tfb_query_pipeline) 7777ec681f3Smrg radv_DestroyPipeline(radv_device_to_handle(device), 7787ec681f3Smrg device->meta_state.query.tfb_query_pipeline, &device->meta_state.alloc); 7797ec681f3Smrg 7807ec681f3Smrg if (device->meta_state.query.pipeline_statistics_query_pipeline) 7817ec681f3Smrg radv_DestroyPipeline(radv_device_to_handle(device), 7827ec681f3Smrg device->meta_state.query.pipeline_statistics_query_pipeline, 7837ec681f3Smrg &device->meta_state.alloc); 7847ec681f3Smrg 7857ec681f3Smrg if (device->meta_state.query.occlusion_query_pipeline) 7867ec681f3Smrg radv_DestroyPipeline(radv_device_to_handle(device), 7877ec681f3Smrg device->meta_state.query.occlusion_query_pipeline, 7887ec681f3Smrg &device->meta_state.alloc); 7897ec681f3Smrg 7907ec681f3Smrg if (device->meta_state.query.timestamp_query_pipeline) 7917ec681f3Smrg radv_DestroyPipeline(radv_device_to_handle(device), 7927ec681f3Smrg device->meta_state.query.timestamp_query_pipeline, 7937ec681f3Smrg &device->meta_state.alloc); 7947ec681f3Smrg 7957ec681f3Smrg if (device->meta_state.query.p_layout) 7967ec681f3Smrg radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout, 
7977ec681f3Smrg &device->meta_state.alloc); 7987ec681f3Smrg 7997ec681f3Smrg if (device->meta_state.query.ds_layout) 8007ec681f3Smrg radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), 8017ec681f3Smrg device->meta_state.query.ds_layout, 8027ec681f3Smrg &device->meta_state.alloc); 80301e04c3fSmrg} 80401e04c3fSmrg 8057ec681f3Smrgstatic void 8067ec681f3Smrgradv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, 8077ec681f3Smrg struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo, 8087ec681f3Smrg uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride, 8097ec681f3Smrg uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, 8107ec681f3Smrg uint32_t pipeline_stats_mask, uint32_t avail_offset) 81101e04c3fSmrg{ 8127ec681f3Smrg struct radv_device *device = cmd_buffer->device; 8137ec681f3Smrg struct radv_meta_saved_state saved_state; 8147ec681f3Smrg struct radv_buffer src_buffer, dst_buffer; 8157ec681f3Smrg bool old_predicating; 8167ec681f3Smrg 8177ec681f3Smrg if (!*pipeline) { 8187ec681f3Smrg VkResult ret = radv_device_init_meta_query_state_internal(device); 8197ec681f3Smrg if (ret != VK_SUCCESS) { 8207ec681f3Smrg cmd_buffer->record_result = ret; 8217ec681f3Smrg return; 8227ec681f3Smrg } 8237ec681f3Smrg } 8247ec681f3Smrg 8257ec681f3Smrg radv_meta_save( 8267ec681f3Smrg &saved_state, cmd_buffer, 8277ec681f3Smrg RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); 8287ec681f3Smrg 8297ec681f3Smrg /* VK_EXT_conditional_rendering says that copy commands should not be 8307ec681f3Smrg * affected by conditional rendering. 
8317ec681f3Smrg */ 8327ec681f3Smrg old_predicating = cmd_buffer->state.predicating; 8337ec681f3Smrg cmd_buffer->state.predicating = false; 8347ec681f3Smrg 8357ec681f3Smrg uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset); 8367ec681f3Smrg uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size; 8377ec681f3Smrg 8387ec681f3Smrg radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset); 8397ec681f3Smrg radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset); 8407ec681f3Smrg 8417ec681f3Smrg radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, 8427ec681f3Smrg *pipeline); 8437ec681f3Smrg 8447ec681f3Smrg radv_meta_push_descriptor_set( 8457ec681f3Smrg cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */ 8467ec681f3Smrg 2, /* descriptorWriteCount */ 8477ec681f3Smrg (VkWriteDescriptorSet[]){ 8487ec681f3Smrg {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 8497ec681f3Smrg .dstBinding = 0, 8507ec681f3Smrg .dstArrayElement = 0, 8517ec681f3Smrg .descriptorCount = 1, 8527ec681f3Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 8537ec681f3Smrg .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer), 8547ec681f3Smrg .offset = 0, 8557ec681f3Smrg .range = VK_WHOLE_SIZE}}, 8567ec681f3Smrg {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 8577ec681f3Smrg .dstBinding = 1, 8587ec681f3Smrg .dstArrayElement = 0, 8597ec681f3Smrg .descriptorCount = 1, 8607ec681f3Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 8617ec681f3Smrg .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer), 8627ec681f3Smrg .offset = 0, 8637ec681f3Smrg .range = VK_WHOLE_SIZE}}}); 8647ec681f3Smrg 8657ec681f3Smrg /* Encode the number of elements for easy access by the shader. 
*/ 8667ec681f3Smrg pipeline_stats_mask &= 0x7ff; 8677ec681f3Smrg pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16; 8687ec681f3Smrg 8697ec681f3Smrg avail_offset -= src_offset; 8707ec681f3Smrg 8717ec681f3Smrg struct { 8727ec681f3Smrg uint32_t flags; 8737ec681f3Smrg uint32_t dst_stride; 8747ec681f3Smrg uint32_t pipeline_stats_mask; 8757ec681f3Smrg uint32_t avail_offset; 8767ec681f3Smrg } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset}; 8777ec681f3Smrg 8787ec681f3Smrg radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout, 8797ec681f3Smrg VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); 8807ec681f3Smrg 8817ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE; 8827ec681f3Smrg 8837ec681f3Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) 8847ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER; 8857ec681f3Smrg 8867ec681f3Smrg radv_unaligned_dispatch(cmd_buffer, count, 1, 1); 8877ec681f3Smrg 8887ec681f3Smrg /* Restore conditional rendering. */ 8897ec681f3Smrg cmd_buffer->state.predicating = old_predicating; 8907ec681f3Smrg 8917ec681f3Smrg radv_buffer_finish(&src_buffer); 8927ec681f3Smrg radv_buffer_finish(&dst_buffer); 8937ec681f3Smrg 8947ec681f3Smrg radv_meta_restore(&saved_state, cmd_buffer); 89501e04c3fSmrg} 89601e04c3fSmrg 8977ec681f3Smrgstatic bool 8987ec681f3Smrgradv_query_pool_needs_gds(struct radv_device *device, struct radv_query_pool *pool) 89901e04c3fSmrg{ 9007ec681f3Smrg /* The number of primitives generated by geometry shader invocations is 9017ec681f3Smrg * only counted by the hardware if GS uses the legacy path. When NGG GS 9027ec681f3Smrg * is used, the hardware can't know the number of generated primitives 9037ec681f3Smrg * and we have to it manually inside the shader. To achieve that, the 9047ec681f3Smrg * driver does a plain GDS atomic to accumulate that value. 
9057ec681f3Smrg * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end 9067ec681f3Smrg * query. 9077ec681f3Smrg */ 9087ec681f3Smrg return device->physical_device->use_ngg && 9097ec681f3Smrg (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); 9107ec681f3Smrg} 91101e04c3fSmrg 9127ec681f3Smrgstatic void 9137ec681f3Smrgradv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator, 9147ec681f3Smrg struct radv_query_pool *pool) 9157ec681f3Smrg{ 9167ec681f3Smrg if (pool->bo) 9177ec681f3Smrg device->ws->buffer_destroy(device->ws, pool->bo); 9187ec681f3Smrg vk_object_base_finish(&pool->base); 9197ec681f3Smrg vk_free2(&device->vk.alloc, pAllocator, pool); 92001e04c3fSmrg} 92101e04c3fSmrg 9227ec681f3SmrgVkResult 9237ec681f3Smrgradv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, 9247ec681f3Smrg const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool) 92501e04c3fSmrg{ 9267ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 9277ec681f3Smrg struct radv_query_pool *pool = 9287ec681f3Smrg vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 9297ec681f3Smrg 9307ec681f3Smrg if (!pool) 9317ec681f3Smrg return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 9327ec681f3Smrg 9337ec681f3Smrg vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL); 9347ec681f3Smrg 9357ec681f3Smrg switch (pCreateInfo->queryType) { 9367ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: 9377ec681f3Smrg pool->stride = 16 * device->physical_device->rad_info.max_render_backends; 9387ec681f3Smrg break; 9397ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 9407ec681f3Smrg pool->stride = pipelinestat_block_size * 2; 9417ec681f3Smrg break; 9427ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: 9437ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 9447ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 
9457ec681f3Smrg pool->stride = 8; 9467ec681f3Smrg break; 9477ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 9487ec681f3Smrg pool->stride = 32; 9497ec681f3Smrg break; 9507ec681f3Smrg default: 9517ec681f3Smrg unreachable("creating unhandled query type"); 9527ec681f3Smrg } 9537ec681f3Smrg 9547ec681f3Smrg pool->type = pCreateInfo->queryType; 9557ec681f3Smrg pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics; 9567ec681f3Smrg pool->availability_offset = pool->stride * pCreateInfo->queryCount; 9577ec681f3Smrg pool->size = pool->availability_offset; 9587ec681f3Smrg if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS) 9597ec681f3Smrg pool->size += 4 * pCreateInfo->queryCount; 9607ec681f3Smrg 9617ec681f3Smrg VkResult result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT, 9627ec681f3Smrg RADEON_FLAG_NO_INTERPROCESS_SHARING, 9637ec681f3Smrg RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo); 9647ec681f3Smrg if (result != VK_SUCCESS) { 9657ec681f3Smrg radv_destroy_query_pool(device, pAllocator, pool); 9667ec681f3Smrg return vk_error(device, result); 9677ec681f3Smrg } 9687ec681f3Smrg 9697ec681f3Smrg pool->ptr = device->ws->buffer_map(pool->bo); 9707ec681f3Smrg if (!pool->ptr) { 9717ec681f3Smrg radv_destroy_query_pool(device, pAllocator, pool); 9727ec681f3Smrg return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); 9737ec681f3Smrg } 9747ec681f3Smrg 9757ec681f3Smrg *pQueryPool = radv_query_pool_to_handle(pool); 9767ec681f3Smrg return VK_SUCCESS; 97701e04c3fSmrg} 97801e04c3fSmrg 9797ec681f3Smrgvoid 9807ec681f3Smrgradv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator) 98101e04c3fSmrg{ 9827ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 9837ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, _pool); 9847ec681f3Smrg 9857ec681f3Smrg if (!pool) 9867ec681f3Smrg return; 98701e04c3fSmrg 9887ec681f3Smrg radv_destroy_query_pool(device, pAllocator, pool); 98901e04c3fSmrg} 
99001e04c3fSmrg 9917ec681f3SmrgVkResult 9927ec681f3Smrgradv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, 9937ec681f3Smrg uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, 9947ec681f3Smrg VkQueryResultFlags flags) 99501e04c3fSmrg{ 9967ec681f3Smrg RADV_FROM_HANDLE(radv_device, device, _device); 9977ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 9987ec681f3Smrg char *data = pData; 9997ec681f3Smrg VkResult result = VK_SUCCESS; 10007ec681f3Smrg 10017ec681f3Smrg if (radv_device_is_lost(device)) 10027ec681f3Smrg return VK_ERROR_DEVICE_LOST; 10037ec681f3Smrg 10047ec681f3Smrg for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) { 10057ec681f3Smrg char *dest = data; 10067ec681f3Smrg unsigned query = firstQuery + query_idx; 10077ec681f3Smrg char *src = pool->ptr + query * pool->stride; 10087ec681f3Smrg uint32_t available; 10097ec681f3Smrg 10107ec681f3Smrg switch (pool->type) { 10117ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: 10127ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 10137ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: { 10147ec681f3Smrg uint64_t const *src64 = (uint64_t const *)src; 10157ec681f3Smrg uint64_t value; 10167ec681f3Smrg 10177ec681f3Smrg do { 10187ec681f3Smrg value = p_atomic_read(src64); 10197ec681f3Smrg } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT)); 10207ec681f3Smrg 10217ec681f3Smrg available = value != TIMESTAMP_NOT_READY; 10227ec681f3Smrg 10237ec681f3Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10247ec681f3Smrg result = VK_NOT_READY; 10257ec681f3Smrg 10267ec681f3Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 10277ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10287ec681f3Smrg *(uint64_t *)dest = value; 10297ec681f3Smrg dest += 8; 10307ec681f3Smrg } else { 10317ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 
10327ec681f3Smrg *(uint32_t *)dest = (uint32_t)value; 10337ec681f3Smrg dest += 4; 10347ec681f3Smrg } 10357ec681f3Smrg break; 10367ec681f3Smrg } 10377ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: { 10387ec681f3Smrg uint64_t const *src64 = (uint64_t const *)src; 10397ec681f3Smrg uint32_t db_count = device->physical_device->rad_info.max_render_backends; 10407ec681f3Smrg uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask; 10417ec681f3Smrg uint64_t sample_count = 0; 10427ec681f3Smrg available = 1; 10437ec681f3Smrg 10447ec681f3Smrg for (int i = 0; i < db_count; ++i) { 10457ec681f3Smrg uint64_t start, end; 10467ec681f3Smrg 10477ec681f3Smrg if (!(enabled_rb_mask & (1 << i))) 10487ec681f3Smrg continue; 10497ec681f3Smrg 10507ec681f3Smrg do { 10517ec681f3Smrg start = p_atomic_read(src64 + 2 * i); 10527ec681f3Smrg end = p_atomic_read(src64 + 2 * i + 1); 10537ec681f3Smrg } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && 10547ec681f3Smrg (flags & VK_QUERY_RESULT_WAIT_BIT)); 10557ec681f3Smrg 10567ec681f3Smrg if (!(start & (1ull << 63)) || !(end & (1ull << 63))) 10577ec681f3Smrg available = 0; 10587ec681f3Smrg else { 10597ec681f3Smrg sample_count += end - start; 10607ec681f3Smrg } 10617ec681f3Smrg } 10627ec681f3Smrg 10637ec681f3Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10647ec681f3Smrg result = VK_NOT_READY; 10657ec681f3Smrg 10667ec681f3Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 10677ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10687ec681f3Smrg *(uint64_t *)dest = sample_count; 10697ec681f3Smrg dest += 8; 10707ec681f3Smrg } else { 10717ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10727ec681f3Smrg *(uint32_t *)dest = sample_count; 10737ec681f3Smrg dest += 4; 10747ec681f3Smrg } 10757ec681f3Smrg break; 10767ec681f3Smrg } 10777ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 10787ec681f3Smrg const uint32_t *avail_ptr = 10797ec681f3Smrg (const uint32_t *)(pool->ptr + 
pool->availability_offset + 4 * query); 10807ec681f3Smrg 10817ec681f3Smrg do { 10827ec681f3Smrg available = p_atomic_read(avail_ptr); 10837ec681f3Smrg } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)); 10847ec681f3Smrg 10857ec681f3Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10867ec681f3Smrg result = VK_NOT_READY; 10877ec681f3Smrg 10887ec681f3Smrg const uint64_t *start = (uint64_t *)src; 10897ec681f3Smrg const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size); 10907ec681f3Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 10917ec681f3Smrg uint64_t *dst = (uint64_t *)dest; 10927ec681f3Smrg dest += util_bitcount(pool->pipeline_stats_mask) * 8; 10937ec681f3Smrg for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 10947ec681f3Smrg if (pool->pipeline_stats_mask & (1u << i)) { 10957ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 10967ec681f3Smrg *dst = stop[pipeline_statistics_indices[i]] - 10977ec681f3Smrg start[pipeline_statistics_indices[i]]; 10987ec681f3Smrg dst++; 10997ec681f3Smrg } 11007ec681f3Smrg } 11017ec681f3Smrg 11027ec681f3Smrg } else { 11037ec681f3Smrg uint32_t *dst = (uint32_t *)dest; 11047ec681f3Smrg dest += util_bitcount(pool->pipeline_stats_mask) * 4; 11057ec681f3Smrg for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 11067ec681f3Smrg if (pool->pipeline_stats_mask & (1u << i)) { 11077ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11087ec681f3Smrg *dst = stop[pipeline_statistics_indices[i]] - 11097ec681f3Smrg start[pipeline_statistics_indices[i]]; 11107ec681f3Smrg dst++; 11117ec681f3Smrg } 11127ec681f3Smrg } 11137ec681f3Smrg } 11147ec681f3Smrg break; 11157ec681f3Smrg } 11167ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 11177ec681f3Smrg uint64_t const *src64 = (uint64_t const *)src; 11187ec681f3Smrg uint64_t num_primitives_written; 11197ec681f3Smrg uint64_t primitive_storage_needed; 11207ec681f3Smrg 11217ec681f3Smrg /* 
SAMPLE_STREAMOUTSTATS stores this structure: 11227ec681f3Smrg * { 11237ec681f3Smrg * u64 NumPrimitivesWritten; 11247ec681f3Smrg * u64 PrimitiveStorageNeeded; 11257ec681f3Smrg * } 11267ec681f3Smrg */ 11277ec681f3Smrg available = 1; 11287ec681f3Smrg for (int j = 0; j < 4; j++) { 11297ec681f3Smrg if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL)) 11307ec681f3Smrg available = 0; 11317ec681f3Smrg } 11327ec681f3Smrg 11337ec681f3Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11347ec681f3Smrg result = VK_NOT_READY; 11357ec681f3Smrg 11367ec681f3Smrg num_primitives_written = src64[3] - src64[1]; 11377ec681f3Smrg primitive_storage_needed = src64[2] - src64[0]; 11387ec681f3Smrg 11397ec681f3Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 11407ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11417ec681f3Smrg *(uint64_t *)dest = num_primitives_written; 11427ec681f3Smrg dest += 8; 11437ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11447ec681f3Smrg *(uint64_t *)dest = primitive_storage_needed; 11457ec681f3Smrg dest += 8; 11467ec681f3Smrg } else { 11477ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11487ec681f3Smrg *(uint32_t *)dest = num_primitives_written; 11497ec681f3Smrg dest += 4; 11507ec681f3Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 11517ec681f3Smrg *(uint32_t *)dest = primitive_storage_needed; 11527ec681f3Smrg dest += 4; 11537ec681f3Smrg } 11547ec681f3Smrg break; 11557ec681f3Smrg } 11567ec681f3Smrg default: 11577ec681f3Smrg unreachable("trying to get results of unhandled query type"); 11587ec681f3Smrg } 11597ec681f3Smrg 11607ec681f3Smrg if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 11617ec681f3Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 11627ec681f3Smrg *(uint64_t *)dest = available; 11637ec681f3Smrg } else { 11647ec681f3Smrg *(uint32_t *)dest = available; 11657ec681f3Smrg } 11667ec681f3Smrg } 11677ec681f3Smrg } 11687ec681f3Smrg 11697ec681f3Smrg return result; 
117001e04c3fSmrg} 117101e04c3fSmrg 11727ec681f3Smrgstatic void 11737ec681f3Smrgemit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool) 1174ed98bd31Smaya{ 11757ec681f3Smrg if (cmd_buffer->pending_reset_query) { 11767ec681f3Smrg if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) { 11777ec681f3Smrg /* Only need to flush caches if the query pool size is 11787ec681f3Smrg * large enough to be resetted using the compute shader 11797ec681f3Smrg * path. Small pools don't need any cache flushes 11807ec681f3Smrg * because we use a CP dma clear. 11817ec681f3Smrg */ 11827ec681f3Smrg si_emit_cache_flush(cmd_buffer); 11837ec681f3Smrg } 11847ec681f3Smrg } 11857ec681f3Smrg} 1186ed98bd31Smaya 11877ec681f3Smrgstatic size_t 11887ec681f3Smrgradv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags) 11897ec681f3Smrg{ 11907ec681f3Smrg unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0; 11917ec681f3Smrg switch (pool->type) { 11927ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: 11937ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 11947ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 11957ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: 11967ec681f3Smrg values += 1; 11977ec681f3Smrg break; 11987ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 11997ec681f3Smrg values += util_bitcount(pool->pipeline_stats_mask); 12007ec681f3Smrg break; 12017ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 12027ec681f3Smrg values += 2; 12037ec681f3Smrg break; 12047ec681f3Smrg default: 12057ec681f3Smrg unreachable("trying to get size of unhandled query type"); 12067ec681f3Smrg } 12077ec681f3Smrg return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 
8 : 4); 12087ec681f3Smrg} 1209ed98bd31Smaya 12107ec681f3Smrgvoid 12117ec681f3Smrgradv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, 12127ec681f3Smrg uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer, 12137ec681f3Smrg VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags) 12147ec681f3Smrg{ 12157ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 12167ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 12177ec681f3Smrg RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); 12187ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 12197ec681f3Smrg uint64_t va = radv_buffer_get_va(pool->bo); 12207ec681f3Smrg uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo); 12217ec681f3Smrg size_t dst_size = radv_query_result_size(pool, flags); 12227ec681f3Smrg dest_va += dst_buffer->offset + dstOffset; 12237ec681f3Smrg 12247ec681f3Smrg if (!queryCount) 12257ec681f3Smrg return; 12267ec681f3Smrg 12277ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo); 12287ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo); 12297ec681f3Smrg 12307ec681f3Smrg /* From the Vulkan spec 1.1.108: 12317ec681f3Smrg * 12327ec681f3Smrg * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of 12337ec681f3Smrg * previous uses of vkCmdResetQueryPool in the same queue, without any 12347ec681f3Smrg * additional synchronization." 12357ec681f3Smrg * 12367ec681f3Smrg * So, we have to flush the caches if the compute shader path was used. 
12377ec681f3Smrg */ 12387ec681f3Smrg emit_query_flush(cmd_buffer, pool); 12397ec681f3Smrg 12407ec681f3Smrg switch (pool->type) { 12417ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: 12427ec681f3Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 12437ec681f3Smrg unsigned enabled_rb_mask = cmd_buffer->device->physical_device->rad_info.enabled_rb_mask; 12447ec681f3Smrg uint32_t rb_avail_offset = 16 * util_last_bit(enabled_rb_mask) - 4; 12457ec681f3Smrg for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 12467ec681f3Smrg unsigned query = firstQuery + i; 12477ec681f3Smrg uint64_t src_va = va + query * pool->stride + rb_avail_offset; 12487ec681f3Smrg 12497ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 12507ec681f3Smrg 12517ec681f3Smrg /* Waits on the upper word of the last DB entry */ 12527ec681f3Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff); 12537ec681f3Smrg } 12547ec681f3Smrg } 12557ec681f3Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline, 12567ec681f3Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, 12577ec681f3Smrg dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, 12587ec681f3Smrg flags, 0, 0); 12597ec681f3Smrg break; 12607ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 12617ec681f3Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 12627ec681f3Smrg for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 12637ec681f3Smrg unsigned query = firstQuery + i; 12647ec681f3Smrg 12657ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 12667ec681f3Smrg 12677ec681f3Smrg uint64_t avail_va = va + pool->availability_offset + 4 * query; 12687ec681f3Smrg 12697ec681f3Smrg /* This waits on the ME. 
All copies below are done on the ME */ 12707ec681f3Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff); 12717ec681f3Smrg } 12727ec681f3Smrg } 12737ec681f3Smrg radv_query_shader( 12747ec681f3Smrg cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, 12757ec681f3Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, 12767ec681f3Smrg pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask, 12777ec681f3Smrg pool->availability_offset + 4 * firstQuery); 12787ec681f3Smrg break; 12797ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: 12807ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 12817ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 12827ec681f3Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 12837ec681f3Smrg for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 12847ec681f3Smrg unsigned query = firstQuery + i; 12857ec681f3Smrg uint64_t local_src_va = va + query * pool->stride; 12867ec681f3Smrg 12877ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 12887ec681f3Smrg 12897ec681f3Smrg /* Wait on the high 32 bits of the timestamp in 12907ec681f3Smrg * case the low part is 0xffffffff. 
12917ec681f3Smrg */ 12927ec681f3Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, 12937ec681f3Smrg TIMESTAMP_NOT_READY >> 32, 0xffffffff); 12947ec681f3Smrg } 12957ec681f3Smrg } 12967ec681f3Smrg 12977ec681f3Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline, 12987ec681f3Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, 12997ec681f3Smrg dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, 13007ec681f3Smrg flags, 0, 0); 13017ec681f3Smrg break; 13027ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 13037ec681f3Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 13047ec681f3Smrg for (unsigned i = 0; i < queryCount; i++) { 13057ec681f3Smrg unsigned query = firstQuery + i; 13067ec681f3Smrg uint64_t src_va = va + query * pool->stride; 13077ec681f3Smrg 13087ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4); 13097ec681f3Smrg 13107ec681f3Smrg /* Wait on the upper word of all results. 
*/ 13117ec681f3Smrg for (unsigned j = 0; j < 4; j++, src_va += 8) { 13127ec681f3Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 13137ec681f3Smrg 0xffffffff); 13147ec681f3Smrg } 13157ec681f3Smrg } 13167ec681f3Smrg } 13177ec681f3Smrg 13187ec681f3Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline, 13197ec681f3Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, 13207ec681f3Smrg dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, 13217ec681f3Smrg flags, 0, 0); 13227ec681f3Smrg break; 13237ec681f3Smrg default: 13247ec681f3Smrg unreachable("trying to get results of unhandled query type"); 13257ec681f3Smrg } 13267ec681f3Smrg} 1327ed98bd31Smaya 13287ec681f3Smrgstatic uint32_t 13297ec681f3Smrgquery_clear_value(VkQueryType type) 13307ec681f3Smrg{ 13317ec681f3Smrg switch (type) { 13327ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: 13337ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 13347ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 13357ec681f3Smrg return (uint32_t)TIMESTAMP_NOT_READY; 13367ec681f3Smrg default: 13377ec681f3Smrg return 0; 13387ec681f3Smrg } 1339ed98bd31Smaya} 1340ed98bd31Smaya 13417ec681f3Smrgvoid 13427ec681f3Smrgradv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, 13437ec681f3Smrg uint32_t queryCount) 134401e04c3fSmrg{ 13457ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 13467ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 13477ec681f3Smrg uint32_t value = query_clear_value(pool->type); 13487ec681f3Smrg uint32_t flush_bits = 0; 13497ec681f3Smrg 13507ec681f3Smrg /* Make sure to sync all previous work if the given command buffer has 13517ec681f3Smrg * pending active queries. Otherwise the GPU might write queries data 13527ec681f3Smrg * after the reset operation. 
13537ec681f3Smrg */ 13547ec681f3Smrg cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; 13557ec681f3Smrg 13567ec681f3Smrg flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, firstQuery * pool->stride, 13577ec681f3Smrg queryCount * pool->stride, value); 13587ec681f3Smrg 13597ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 13607ec681f3Smrg flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, 13617ec681f3Smrg pool->availability_offset + firstQuery * 4, queryCount * 4, 0); 13627ec681f3Smrg } 13637ec681f3Smrg 13647ec681f3Smrg if (flush_bits) { 13657ec681f3Smrg /* Only need to flush caches for the compute shader path. */ 13667ec681f3Smrg cmd_buffer->pending_reset_query = true; 13677ec681f3Smrg cmd_buffer->state.flush_bits |= flush_bits; 13687ec681f3Smrg } 136901e04c3fSmrg} 137001e04c3fSmrg 13717ec681f3Smrgvoid 13727ec681f3Smrgradv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, 13737ec681f3Smrg uint32_t queryCount) 137401e04c3fSmrg{ 13757ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 13767ec681f3Smrg 13777ec681f3Smrg uint32_t value = query_clear_value(pool->type); 13787ec681f3Smrg uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride); 13797ec681f3Smrg uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride); 13807ec681f3Smrg 13817ec681f3Smrg for (uint32_t *p = data; p != data_end; ++p) 13827ec681f3Smrg *p = value; 13837ec681f3Smrg 13847ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 13857ec681f3Smrg memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4); 13867ec681f3Smrg } 138701e04c3fSmrg} 138801e04c3fSmrg 13897ec681f3Smrgstatic unsigned 13907ec681f3Smrgevent_type_for_stream(unsigned stream) 139101e04c3fSmrg{ 13927ec681f3Smrg switch (stream) { 13937ec681f3Smrg default: 13947ec681f3Smrg case 0: 13957ec681f3Smrg return V_028A90_SAMPLE_STREAMOUTSTATS; 13967ec681f3Smrg case 1: 
13977ec681f3Smrg return V_028A90_SAMPLE_STREAMOUTSTATS1; 13987ec681f3Smrg case 2: 13997ec681f3Smrg return V_028A90_SAMPLE_STREAMOUTSTATS2; 14007ec681f3Smrg case 3: 14017ec681f3Smrg return V_028A90_SAMPLE_STREAMOUTSTATS3; 14027ec681f3Smrg } 14037ec681f3Smrg} 140401e04c3fSmrg 14057ec681f3Smrgstatic void 14067ec681f3Smrgemit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, 14077ec681f3Smrg VkQueryType query_type, VkQueryControlFlags flags, uint32_t index) 14087ec681f3Smrg{ 14097ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 14107ec681f3Smrg switch (query_type) { 14117ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: 14127ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 14137ec681f3Smrg 14147ec681f3Smrg ++cmd_buffer->state.active_occlusion_queries; 14157ec681f3Smrg if (cmd_buffer->state.active_occlusion_queries == 1) { 14167ec681f3Smrg if (flags & VK_QUERY_CONTROL_PRECISE_BIT) { 14177ec681f3Smrg /* This is the first occlusion query, enable 14187ec681f3Smrg * the hint if the precision bit is set. 14197ec681f3Smrg */ 14207ec681f3Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = true; 14217ec681f3Smrg } 14227ec681f3Smrg 14237ec681f3Smrg radv_set_db_count_control(cmd_buffer); 14247ec681f3Smrg } else { 14257ec681f3Smrg if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && 14267ec681f3Smrg !cmd_buffer->state.perfect_occlusion_queries_enabled) { 14277ec681f3Smrg /* This is not the first query, but this one 14287ec681f3Smrg * needs to enable precision, DB_COUNT_CONTROL 14297ec681f3Smrg * has to be updated accordingly. 
14307ec681f3Smrg */ 14317ec681f3Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = true; 14327ec681f3Smrg 14337ec681f3Smrg radv_set_db_count_control(cmd_buffer); 14347ec681f3Smrg } 14357ec681f3Smrg } 14367ec681f3Smrg 14377ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 14387ec681f3Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); 14397ec681f3Smrg radeon_emit(cs, va); 14407ec681f3Smrg radeon_emit(cs, va >> 32); 14417ec681f3Smrg break; 14427ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 14437ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 14447ec681f3Smrg 14457ec681f3Smrg ++cmd_buffer->state.active_pipeline_queries; 14467ec681f3Smrg if (cmd_buffer->state.active_pipeline_queries == 1) { 14477ec681f3Smrg cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS; 14487ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS; 14497ec681f3Smrg } 14507ec681f3Smrg 14517ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 14527ec681f3Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 14537ec681f3Smrg radeon_emit(cs, va); 14547ec681f3Smrg radeon_emit(cs, va >> 32); 14557ec681f3Smrg 14567ec681f3Smrg if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { 14577ec681f3Smrg int idx = radv_get_pipeline_statistics_index( 14587ec681f3Smrg VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); 14597ec681f3Smrg 14607ec681f3Smrg /* Make sure GDS is idle before copying the value. 
*/ 14617ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2; 14627ec681f3Smrg si_emit_cache_flush(cmd_buffer); 14637ec681f3Smrg 14647ec681f3Smrg va += 8 * idx; 14657ec681f3Smrg 14667ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 14677ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 14687ec681f3Smrg COPY_DATA_WR_CONFIRM); 14697ec681f3Smrg radeon_emit(cs, 0); 14707ec681f3Smrg radeon_emit(cs, 0); 14717ec681f3Smrg radeon_emit(cs, va); 14727ec681f3Smrg radeon_emit(cs, va >> 32); 14737ec681f3Smrg 14747ec681f3Smrg /* Record that the command buffer needs GDS. */ 14757ec681f3Smrg cmd_buffer->gds_needed = true; 14767ec681f3Smrg 14777ec681f3Smrg cmd_buffer->state.active_pipeline_gds_queries++; 14787ec681f3Smrg } 14797ec681f3Smrg break; 14807ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 14817ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 14827ec681f3Smrg 14837ec681f3Smrg assert(index < MAX_SO_STREAMS); 14847ec681f3Smrg 14857ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 14867ec681f3Smrg radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); 14877ec681f3Smrg radeon_emit(cs, va); 14887ec681f3Smrg radeon_emit(cs, va >> 32); 14897ec681f3Smrg break; 14907ec681f3Smrg default: 14917ec681f3Smrg unreachable("beginning unhandled query type"); 14927ec681f3Smrg } 149301e04c3fSmrg} 149401e04c3fSmrg 14957ec681f3Smrgstatic void 14967ec681f3Smrgemit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, 14977ec681f3Smrg uint64_t avail_va, VkQueryType query_type, uint32_t index) 149801e04c3fSmrg{ 14997ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 15007ec681f3Smrg switch (query_type) { 15017ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: 15027ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 14); 15037ec681f3Smrg 15047ec681f3Smrg cmd_buffer->state.active_occlusion_queries--; 
15057ec681f3Smrg if (cmd_buffer->state.active_occlusion_queries == 0) { 15067ec681f3Smrg radv_set_db_count_control(cmd_buffer); 15077ec681f3Smrg 15087ec681f3Smrg /* Reset the perfect occlusion queries hint now that no 15097ec681f3Smrg * queries are active. 15107ec681f3Smrg */ 15117ec681f3Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = false; 15127ec681f3Smrg } 15137ec681f3Smrg 15147ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 15157ec681f3Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); 15167ec681f3Smrg radeon_emit(cs, va + 8); 15177ec681f3Smrg radeon_emit(cs, (va + 8) >> 32); 15187ec681f3Smrg 15197ec681f3Smrg break; 15207ec681f3Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 15217ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 16); 15227ec681f3Smrg 15237ec681f3Smrg cmd_buffer->state.active_pipeline_queries--; 15247ec681f3Smrg if (cmd_buffer->state.active_pipeline_queries == 0) { 15257ec681f3Smrg cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS; 15267ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS; 15277ec681f3Smrg } 15287ec681f3Smrg va += pipelinestat_block_size; 15297ec681f3Smrg 15307ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 15317ec681f3Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 15327ec681f3Smrg radeon_emit(cs, va); 15337ec681f3Smrg radeon_emit(cs, va >> 32); 15347ec681f3Smrg 15357ec681f3Smrg si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class, 15367ec681f3Smrg radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 15377ec681f3Smrg 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, 15387ec681f3Smrg cmd_buffer->gfx9_eop_bug_va); 15397ec681f3Smrg 15407ec681f3Smrg if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { 15417ec681f3Smrg int idx = radv_get_pipeline_statistics_index( 15427ec681f3Smrg 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); 15437ec681f3Smrg 15447ec681f3Smrg /* Make sure GDS is idle before copying the value. */ 15457ec681f3Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2; 15467ec681f3Smrg si_emit_cache_flush(cmd_buffer); 15477ec681f3Smrg 15487ec681f3Smrg va += 8 * idx; 15497ec681f3Smrg 15507ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 15517ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 15527ec681f3Smrg COPY_DATA_WR_CONFIRM); 15537ec681f3Smrg radeon_emit(cs, 0); 15547ec681f3Smrg radeon_emit(cs, 0); 15557ec681f3Smrg radeon_emit(cs, va); 15567ec681f3Smrg radeon_emit(cs, va >> 32); 15577ec681f3Smrg 15587ec681f3Smrg cmd_buffer->state.active_pipeline_gds_queries--; 15597ec681f3Smrg } 15607ec681f3Smrg break; 15617ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 15627ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 15637ec681f3Smrg 15647ec681f3Smrg assert(index < MAX_SO_STREAMS); 15657ec681f3Smrg 15667ec681f3Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 15677ec681f3Smrg radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); 15687ec681f3Smrg radeon_emit(cs, (va + 16)); 15697ec681f3Smrg radeon_emit(cs, (va + 16) >> 32); 15707ec681f3Smrg break; 15717ec681f3Smrg default: 15727ec681f3Smrg unreachable("ending unhandled query type"); 15737ec681f3Smrg } 15747ec681f3Smrg 15757ec681f3Smrg cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 15767ec681f3Smrg RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | 15777ec681f3Smrg RADV_CMD_FLAG_INV_VCACHE; 15787ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 15797ec681f3Smrg cmd_buffer->active_query_flush_bits |= 15807ec681f3Smrg RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 15817ec681f3Smrg } 158201e04c3fSmrg} 158301e04c3fSmrg 15847ec681f3Smrgvoid 
15857ec681f3Smrgradv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, 15867ec681f3Smrg VkQueryControlFlags flags, uint32_t index) 158701e04c3fSmrg{ 15887ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 15897ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 15907ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 15917ec681f3Smrg uint64_t va = radv_buffer_get_va(pool->bo); 15927ec681f3Smrg 15937ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); 159401e04c3fSmrg 15957ec681f3Smrg emit_query_flush(cmd_buffer, pool); 159601e04c3fSmrg 15977ec681f3Smrg va += pool->stride * query; 159801e04c3fSmrg 15997ec681f3Smrg emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index); 16007ec681f3Smrg} 160101e04c3fSmrg 16027ec681f3Smrgvoid 16037ec681f3Smrgradv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, 16047ec681f3Smrg VkQueryControlFlags flags) 16057ec681f3Smrg{ 16067ec681f3Smrg radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0); 160701e04c3fSmrg} 160801e04c3fSmrg 16097ec681f3Smrgvoid 16107ec681f3Smrgradv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, 16117ec681f3Smrg uint32_t index) 161201e04c3fSmrg{ 16137ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 16147ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 16157ec681f3Smrg uint64_t va = radv_buffer_get_va(pool->bo); 16167ec681f3Smrg uint64_t avail_va = va + pool->availability_offset + 4 * query; 16177ec681f3Smrg va += pool->stride * query; 16187ec681f3Smrg 16197ec681f3Smrg /* Do not need to add the pool BO to the list because the query must 16207ec681f3Smrg * currently be active, which means the BO is already in the list. 
16217ec681f3Smrg */ 16227ec681f3Smrg emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index); 16237ec681f3Smrg 16247ec681f3Smrg /* 16257ec681f3Smrg * For multiview we have to emit a query for each bit in the mask, 16267ec681f3Smrg * however the first query we emit will get the totals for all the 16277ec681f3Smrg * operations, so we don't want to get a real value in the other 16287ec681f3Smrg * queries. This emits a fake begin/end sequence so the waiting 16297ec681f3Smrg * code gets a completed query value and doesn't hang, but the 16307ec681f3Smrg * query returns 0. 16317ec681f3Smrg */ 16327ec681f3Smrg if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { 16337ec681f3Smrg for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) { 16347ec681f3Smrg va += pool->stride; 16357ec681f3Smrg avail_va += 4; 16367ec681f3Smrg emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0); 16377ec681f3Smrg emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0); 16387ec681f3Smrg } 16397ec681f3Smrg } 164001e04c3fSmrg} 164101e04c3fSmrg 16427ec681f3Smrgvoid 16437ec681f3Smrgradv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query) 164401e04c3fSmrg{ 16457ec681f3Smrg radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0); 164601e04c3fSmrg} 164701e04c3fSmrg 16487ec681f3Smrgvoid 16497ec681f3Smrgradv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage, 16507ec681f3Smrg VkQueryPool queryPool, uint32_t query) 165101e04c3fSmrg{ 16527ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 16537ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 16547ec681f3Smrg bool mec = radv_cmd_buffer_uses_mec(cmd_buffer); 16557ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 16567ec681f3Smrg uint64_t va = radv_buffer_get_va(pool->bo); 16577ec681f3Smrg uint64_t query_va = va + pool->stride * query; 16587ec681f3Smrg 16597ec681f3Smrg 
radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); 16607ec681f3Smrg 16617ec681f3Smrg emit_query_flush(cmd_buffer, pool); 16627ec681f3Smrg 16637ec681f3Smrg int num_queries = 1; 16647ec681f3Smrg if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) 16657ec681f3Smrg num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask); 16667ec681f3Smrg 16677ec681f3Smrg ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries); 16687ec681f3Smrg 16697ec681f3Smrg for (unsigned i = 0; i < num_queries; i++) { 16707ec681f3Smrg switch (pipelineStage) { 16717ec681f3Smrg case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: 16727ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 16737ec681f3Smrg radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | 16747ec681f3Smrg COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | COPY_DATA_DST_SEL(V_370_MEM)); 16757ec681f3Smrg radeon_emit(cs, 0); 16767ec681f3Smrg radeon_emit(cs, 0); 16777ec681f3Smrg radeon_emit(cs, query_va); 16787ec681f3Smrg radeon_emit(cs, query_va >> 32); 16797ec681f3Smrg break; 16807ec681f3Smrg default: 16817ec681f3Smrg si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class, 16827ec681f3Smrg mec, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, 16837ec681f3Smrg EOP_DATA_SEL_TIMESTAMP, query_va, 0, 16847ec681f3Smrg cmd_buffer->gfx9_eop_bug_va); 16857ec681f3Smrg break; 16867ec681f3Smrg } 16877ec681f3Smrg query_va += pool->stride; 16887ec681f3Smrg } 16897ec681f3Smrg 16907ec681f3Smrg cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 16917ec681f3Smrg RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | 16927ec681f3Smrg RADV_CMD_FLAG_INV_VCACHE; 16937ec681f3Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 16947ec681f3Smrg cmd_buffer->active_query_flush_bits |= 16957ec681f3Smrg RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 16967ec681f3Smrg } 16977ec681f3Smrg 16987ec681f3Smrg 
assert(cmd_buffer->cs->cdw <= cdw_max); 169901e04c3fSmrg} 170001e04c3fSmrg 17017ec681f3Smrgvoid 17027ec681f3Smrgradv_CmdWriteAccelerationStructuresPropertiesKHR( 17037ec681f3Smrg VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount, 17047ec681f3Smrg const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType, 17057ec681f3Smrg VkQueryPool queryPool, uint32_t firstQuery) 170601e04c3fSmrg{ 17077ec681f3Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 17087ec681f3Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 17097ec681f3Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 17107ec681f3Smrg uint64_t pool_va = radv_buffer_get_va(pool->bo); 17117ec681f3Smrg uint64_t query_va = pool_va + pool->stride * firstQuery; 17127ec681f3Smrg 17137ec681f3Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); 17147ec681f3Smrg 17157ec681f3Smrg emit_query_flush(cmd_buffer, pool); 17167ec681f3Smrg 17177ec681f3Smrg ASSERTED unsigned cdw_max = 17187ec681f3Smrg radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount); 17197ec681f3Smrg 17207ec681f3Smrg for (uint32_t i = 0; i < accelerationStructureCount; ++i) { 17217ec681f3Smrg RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]); 17227ec681f3Smrg uint64_t va = radv_accel_struct_get_va(accel_struct); 17237ec681f3Smrg 17247ec681f3Smrg switch (queryType) { 17257ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 17267ec681f3Smrg va += offsetof(struct radv_accel_struct_header, compacted_size); 17277ec681f3Smrg break; 17287ec681f3Smrg case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 17297ec681f3Smrg va += offsetof(struct radv_accel_struct_header, serialization_size); 17307ec681f3Smrg break; 17317ec681f3Smrg default: 17327ec681f3Smrg unreachable("Unhandle accel struct query type."); 17337ec681f3Smrg } 17347ec681f3Smrg 17357ec681f3Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 
17367ec681f3Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 17377ec681f3Smrg COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM); 17387ec681f3Smrg radeon_emit(cs, va); 17397ec681f3Smrg radeon_emit(cs, va >> 32); 17407ec681f3Smrg radeon_emit(cs, query_va); 17417ec681f3Smrg radeon_emit(cs, query_va >> 32); 17427ec681f3Smrg 17437ec681f3Smrg query_va += pool->stride; 17447ec681f3Smrg } 17457ec681f3Smrg 17467ec681f3Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 174701e04c3fSmrg} 1748