/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "nir/nir_builder.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_cs.h"
#include "sid.h"

/* Sentinel value for a timestamp slot that the GPU has not written yet. */
#define TIMESTAMP_NOT_READY UINT64_MAX

/* 11 pipeline-statistics counters, 8 bytes each, per begin/end snapshot. */
static const int pipelinestat_block_size = 11 * 8;
/* Maps each Vulkan pipeline-statistic bit (by index) to the slot order in
 * which the hardware writes the corresponding counter. */
static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};

/* Emit NIR evaluating to true iff (flags & flag) != 0. */
static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
{
	return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
}

/* Emit "if (var >= count) break; var++;" at the current cursor.
 * Intended to be called with the cursor inside a nir_loop body; leaves the
 * cursor positioned after the inserted if. */
static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
{
	nir_ssa_def *counter = nir_load_var(b, var);

	nir_if *if_stmt = nir_if_create(b->shader);
	if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, count));
	nir_cf_node_insert(b->cursor, &if_stmt->cf_node);

	b->cursor = nir_after_cf_list(&if_stmt->then_list);

	nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
	nir_builder_instr_insert(b, &instr->instr);

	/* Continue building after the if and advance the loop counter. */
	b->cursor = nir_after_cf_node(&if_stmt->cf_node);
	counter = nir_iadd(b, counter, nir_imm_int(b, 1));
	nir_store_var(b, var, counter, 0x1);
}

/* Emit a 32-bit load from the push-constant block at byte `offset`.
 * The query shaders use a 16-byte push-constant range (base 0). */
static struct nir_ssa_def *
radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
{
	nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
	nir_intrinsic_set_base(flags, 0);
	nir_intrinsic_set_range(flags, 16);
	flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
	flags->num_components = 1;
	nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
	nir_builder_instr_insert(b, &flags->instr);
	return &flags->dest.ssa;
}

/* Build the compute shader that resolves occlusion query results. */
static nir_shader *
build_occlusion_query_shader(struct radv_device *device) {
	/* the shader this builds is roughly
	 *
	 * push constants {
	 * 	uint32_t flags;
	 * 	uint32_t dst_stride;
	 * };
	 *
	 * uint32_t src_stride = 16 * db_count;
	 *
	 * location(binding = 0) buffer dst_buf;
	 * location(binding = 1) buffer src_buf;
	 *
	 * void main() {
	 * 	uint64_t result = 0;
	 * 	uint64_t src_offset = src_stride * global_id.x;
	 * 	uint64_t dst_offset = dst_stride * global_id.x;
	 * 	bool available = true;
	 * 	for (int i = 0; i < db_count; ++i) {
	 * 		if (enabled_rb_mask & (1 << i)) {
	 * 			uint64_t start = src_buf[src_offset + 16 * i];
	 * 			uint64_t end = src_buf[src_offset + 16 * i + 8];
	 * 			if ((start & (1ull << 63)) && (end & (1ull << 63)))
	 * 				result += end - start;
	 * 			else
	 * 				available = false;
	 * 		}
	 * 	}
	 * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
	 * 		if (flags & VK_QUERY_RESULT_64_BIT)
	 * 			dst_buf[dst_offset] = result;
	 * 		else
	 * 			dst_buf[dst_offset] = (uint32_t)result;
	 * 	}
	 * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
	 * 		dst_buf[dst_offset + elem_size] = available;
	 * 	}
	 * }
	 */
	nir_builder b;
	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
	b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
	b.shader->info.cs.local_size[0] = 64;
	b.shader->info.cs.local_size[1] = 1;
	b.shader->info.cs.local_size[2] = 1;

	nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
	nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
	nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
	nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
	nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
	unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
	unsigned db_count = device->physical_device->rad_info.num_render_backends;

	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");

	/* Descriptor set 0, binding 0: destination buffer. */
	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, nir_intrinsic_vulkan_resource_index);
	dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	dst_buf->num_components = 1;
	nir_intrinsic_set_desc_set(dst_buf, 0);
	nir_intrinsic_set_binding(dst_buf, 0);
	nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &dst_buf->instr);

	/* Descriptor set 0, binding 1: source (query pool) buffer. */
	nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, nir_intrinsic_vulkan_resource_index);
	src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	src_buf->num_components = 1;
	nir_intrinsic_set_desc_set(src_buf, 0);
	nir_intrinsic_set_binding(src_buf, 1);
	nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &src_buf->instr);

	/* global_id = wg_id * block_size + invoc_id; one query per invocation. */
	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
	nir_ssa_def *wg_id = nir_load_work_group_id(&b);
	nir_ssa_def *block_size = nir_imm_ivec4(&b,
						b.shader->info.cs.local_size[0],
						b.shader->info.cs.local_size[1],
						b.shader->info.cs.local_size[2], 0);
	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
	global_id = nir_channel(&b, global_id, 0); // We only care about x here.

	nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);


	nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
	nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
	nir_store_var(&b, available, nir_imm_true(&b), 0x1);

	/* Loop over all render backends, accumulating begin/end pairs. */
	nir_loop *outer_loop = nir_loop_create(b.shader);
	nir_builder_cf_insert(&b, &outer_loop->cf_node);
	b.cursor = nir_after_cf_list(&outer_loop->body);

	nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
	radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));

	/* Skip render backends that are not enabled on this chip. */
	nir_ssa_def *enabled_cond =
		nir_iand(&b, nir_imm_int(&b, enabled_rb_mask),
			     nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count));

	nir_if *enabled_if = nir_if_create(b.shader);
	enabled_if->condition = nir_src_for_ssa(nir_i2b(&b, enabled_cond));
	nir_cf_node_insert(b.cursor, &enabled_if->cf_node);

	b.cursor = nir_after_cf_list(&enabled_if->then_list);

	nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
	load_offset = nir_iadd(&b, input_base, load_offset);

	/* Load the {start, end} 64-bit pair for this render backend. */
	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
	load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
	load->src[1] = nir_src_for_ssa(load_offset);
	nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
	load->num_components = 2;
	nir_builder_instr_insert(&b, &load->instr);

	nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
	nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);

	/* Bit 63 set (i.e. value negative as signed) means the counter was
	 * written by the GPU and is valid. */
	nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
	nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));

	nir_if *update_if = nir_if_create(b.shader);
	update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
	nir_cf_node_insert(b.cursor, &update_if->cf_node);

	b.cursor = nir_after_cf_list(&update_if->then_list);

	nir_store_var(&b, result,
		      nir_iadd(&b, nir_load_var(&b, result),
				   nir_isub(&b, nir_load_var(&b, end),
						nir_load_var(&b, start))), 0x1);

	b.cursor = nir_after_cf_list(&update_if->else_list);

	nir_store_var(&b, available, nir_imm_false(&b), 0x1);

	b.cursor = nir_after_cf_node(&outer_loop->cf_node);

	/* Store the result if complete or if partial results have been requested. */

	nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
	nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));

	nir_if *store_if = nir_if_create(b.shader);
	store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
	nir_cf_node_insert(b.cursor, &store_if->cf_node);

	b.cursor = nir_after_cf_list(&store_if->then_list);

	nir_if *store_64bit_if = nir_if_create(b.shader);
	store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
	nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);

	b.cursor = nir_after_cf_list(&store_64bit_if->then_list);

	/* 64-bit store of the accumulated result. */
	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_list(&store_64bit_if->else_list);

	/* 32-bit store: truncate the 64-bit result. */
	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_node(&store_if->cf_node);

	/* Store the availability bit if requested. */

	nir_if *availability_if = nir_if_create(b.shader);
	availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
	nir_cf_node_insert(b.cursor, &availability_if->cf_node);

	b.cursor = nir_after_cf_list(&availability_if->then_list);

	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	return b.shader;
}

/* Build the compute shader that resolves pipeline-statistics query results. */
static nir_shader *
build_pipeline_statistics_query_shader(struct radv_device *device) {
	/* the shader this builds is roughly
	 *
	 * push constants {
	 * 	uint32_t flags;
	 * 	uint32_t dst_stride;
	 * 	uint32_t stats_mask;
	 * 	uint32_t avail_offset;
	 * };
	 *
	 * uint32_t src_stride = pipelinestat_block_size * 2;
	 *
	 * location(binding = 0) buffer dst_buf;
	 * location(binding = 1) buffer src_buf;
	 *
	 * void main() {
	 * 	uint64_t src_offset = src_stride * global_id.x;
	 * 	uint64_t dst_base = dst_stride * global_id.x;
	 * 	uint64_t dst_offset = dst_base;
300b8e80941Smrg * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4; 301b8e80941Smrg * uint32_t elem_count = stats_mask >> 16; 302b8e80941Smrg * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x]; 303b8e80941Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 304b8e80941Smrg * dst_buf[dst_offset + elem_count * elem_size] = available32; 305b8e80941Smrg * } 306b8e80941Smrg * if ((bool)available32) { 307b8e80941Smrg * // repeat 11 times: 308b8e80941Smrg * if (stats_mask & (1 << 0)) { 309b8e80941Smrg * uint64_t start = src_buf[src_offset + 8 * indices[0]]; 310b8e80941Smrg * uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size]; 311b8e80941Smrg * uint64_t result = end - start; 312b8e80941Smrg * if (flags & VK_QUERY_RESULT_64_BIT) 313b8e80941Smrg * dst_buf[dst_offset] = result; 314b8e80941Smrg * else 315b8e80941Smrg * dst_buf[dst_offset] = (uint32_t)result. 316b8e80941Smrg * dst_offset += elem_size; 317b8e80941Smrg * } 318b8e80941Smrg * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { 319b8e80941Smrg * // Set everything to 0 as we don't know what is valid. 
320b8e80941Smrg * for (int i = 0; i < elem_count; ++i) 321b8e80941Smrg * dst_buf[dst_base + elem_size * i] = 0; 322b8e80941Smrg * } 323b8e80941Smrg * } 324b8e80941Smrg */ 325b8e80941Smrg nir_builder b; 326b8e80941Smrg nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); 327b8e80941Smrg b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query"); 328b8e80941Smrg b.shader->info.cs.local_size[0] = 64; 329b8e80941Smrg b.shader->info.cs.local_size[1] = 1; 330b8e80941Smrg b.shader->info.cs.local_size[2] = 1; 331b8e80941Smrg 332b8e80941Smrg nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset"); 333b8e80941Smrg 334b8e80941Smrg nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags"); 335b8e80941Smrg nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask"); 336b8e80941Smrg nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset"); 337b8e80941Smrg 338b8e80941Smrg nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, 339b8e80941Smrg nir_intrinsic_vulkan_resource_index); 340b8e80941Smrg dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 341b8e80941Smrg dst_buf->num_components = 1;; 342b8e80941Smrg nir_intrinsic_set_desc_set(dst_buf, 0); 343b8e80941Smrg nir_intrinsic_set_binding(dst_buf, 0); 344b8e80941Smrg nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL); 345b8e80941Smrg nir_builder_instr_insert(&b, &dst_buf->instr); 346b8e80941Smrg 347b8e80941Smrg nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, 348b8e80941Smrg nir_intrinsic_vulkan_resource_index); 349b8e80941Smrg src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 350b8e80941Smrg src_buf->num_components = 1; 351b8e80941Smrg nir_intrinsic_set_desc_set(src_buf, 0); 352b8e80941Smrg nir_intrinsic_set_binding(src_buf, 1); 353b8e80941Smrg nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL); 354b8e80941Smrg 
nir_builder_instr_insert(&b, &src_buf->instr); 355b8e80941Smrg 356b8e80941Smrg nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); 357b8e80941Smrg nir_ssa_def *wg_id = nir_load_work_group_id(&b); 358b8e80941Smrg nir_ssa_def *block_size = nir_imm_ivec4(&b, 359b8e80941Smrg b.shader->info.cs.local_size[0], 360b8e80941Smrg b.shader->info.cs.local_size[1], 361b8e80941Smrg b.shader->info.cs.local_size[2], 0); 362b8e80941Smrg nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); 363b8e80941Smrg global_id = nir_channel(&b, global_id, 0); // We only care about x here. 364b8e80941Smrg 365b8e80941Smrg nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2); 366b8e80941Smrg nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 367b8e80941Smrg nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride"); 368b8e80941Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 369b8e80941Smrg 370b8e80941Smrg 371b8e80941Smrg avail_offset = nir_iadd(&b, avail_offset, 372b8e80941Smrg nir_imul(&b, global_id, nir_imm_int(&b, 4))); 373b8e80941Smrg 374b8e80941Smrg nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); 375b8e80941Smrg load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); 376b8e80941Smrg load->src[1] = nir_src_for_ssa(avail_offset); 377b8e80941Smrg nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); 378b8e80941Smrg load->num_components = 1; 379b8e80941Smrg nir_builder_instr_insert(&b, &load->instr); 380b8e80941Smrg nir_ssa_def *available32 = &load->dest.ssa; 381b8e80941Smrg 382b8e80941Smrg nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 383b8e80941Smrg nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 384b8e80941Smrg nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16)); 385b8e80941Smrg 386b8e80941Smrg /* Store the availability bit if requested. 
*/ 387b8e80941Smrg 388b8e80941Smrg nir_if *availability_if = nir_if_create(b.shader); 389b8e80941Smrg availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); 390b8e80941Smrg nir_cf_node_insert(b.cursor, &availability_if->cf_node); 391b8e80941Smrg 392b8e80941Smrg b.cursor = nir_after_cf_list(&availability_if->then_list); 393b8e80941Smrg 394b8e80941Smrg nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 395b8e80941Smrg store->src[0] = nir_src_for_ssa(available32); 396b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 397b8e80941Smrg store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size))); 398b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 399b8e80941Smrg store->num_components = 1; 400b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 401b8e80941Smrg 402b8e80941Smrg b.cursor = nir_after_cf_node(&availability_if->cf_node); 403b8e80941Smrg 404b8e80941Smrg nir_if *available_if = nir_if_create(b.shader); 405b8e80941Smrg available_if->condition = nir_src_for_ssa(nir_i2b(&b, available32)); 406b8e80941Smrg nir_cf_node_insert(b.cursor, &available_if->cf_node); 407b8e80941Smrg 408b8e80941Smrg b.cursor = nir_after_cf_list(&available_if->then_list); 409b8e80941Smrg 410b8e80941Smrg nir_store_var(&b, output_offset, output_base, 0x1); 411b8e80941Smrg for (int i = 0; i < 11; ++i) { 412b8e80941Smrg nir_if *store_if = nir_if_create(b.shader); 413b8e80941Smrg store_if->condition = nir_src_for_ssa(nir_test_flag(&b, stats_mask, 1u << i)); 414b8e80941Smrg nir_cf_node_insert(b.cursor, &store_if->cf_node); 415b8e80941Smrg 416b8e80941Smrg b.cursor = nir_after_cf_list(&store_if->then_list); 417b8e80941Smrg 418b8e80941Smrg load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); 419b8e80941Smrg load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); 420b8e80941Smrg load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, 
421b8e80941Smrg nir_imm_int(&b, pipeline_statistics_indices[i] * 8))); 422b8e80941Smrg nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL); 423b8e80941Smrg load->num_components = 1; 424b8e80941Smrg nir_builder_instr_insert(&b, &load->instr); 425b8e80941Smrg nir_ssa_def *start = &load->dest.ssa; 426b8e80941Smrg 427b8e80941Smrg load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); 428b8e80941Smrg load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); 429b8e80941Smrg load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, 430b8e80941Smrg nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size))); 431b8e80941Smrg nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL); 432b8e80941Smrg load->num_components = 1; 433b8e80941Smrg nir_builder_instr_insert(&b, &load->instr); 434b8e80941Smrg nir_ssa_def *end = &load->dest.ssa; 435b8e80941Smrg 436b8e80941Smrg nir_ssa_def *result = nir_isub(&b, end, start); 437b8e80941Smrg 438b8e80941Smrg /* Store result */ 439b8e80941Smrg nir_if *store_64bit_if = nir_if_create(b.shader); 440b8e80941Smrg store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); 441b8e80941Smrg nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node); 442b8e80941Smrg 443b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->then_list); 444b8e80941Smrg 445b8e80941Smrg nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 446b8e80941Smrg store->src[0] = nir_src_for_ssa(result); 447b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 448b8e80941Smrg store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset)); 449b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 450b8e80941Smrg store->num_components = 1; 451b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 452b8e80941Smrg 453b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->else_list); 454b8e80941Smrg 455b8e80941Smrg store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 
456b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result)); 457b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 458b8e80941Smrg store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset)); 459b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 460b8e80941Smrg store->num_components = 1; 461b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 462b8e80941Smrg 463b8e80941Smrg b.cursor = nir_after_cf_node(&store_64bit_if->cf_node); 464b8e80941Smrg 465b8e80941Smrg nir_store_var(&b, output_offset, 466b8e80941Smrg nir_iadd(&b, nir_load_var(&b, output_offset), 467b8e80941Smrg elem_size), 0x1); 468b8e80941Smrg 469b8e80941Smrg b.cursor = nir_after_cf_node(&store_if->cf_node); 470b8e80941Smrg } 471b8e80941Smrg 472b8e80941Smrg b.cursor = nir_after_cf_list(&available_if->else_list); 473b8e80941Smrg 474b8e80941Smrg available_if = nir_if_create(b.shader); 475b8e80941Smrg available_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT)); 476b8e80941Smrg nir_cf_node_insert(b.cursor, &available_if->cf_node); 477b8e80941Smrg 478b8e80941Smrg b.cursor = nir_after_cf_list(&available_if->then_list); 479b8e80941Smrg 480b8e80941Smrg /* Stores zeros in all outputs. 
*/ 481b8e80941Smrg 482b8e80941Smrg nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter"); 483b8e80941Smrg nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1); 484b8e80941Smrg 485b8e80941Smrg nir_loop *loop = nir_loop_create(b.shader); 486b8e80941Smrg nir_builder_cf_insert(&b, &loop->cf_node); 487b8e80941Smrg b.cursor = nir_after_cf_list(&loop->body); 488b8e80941Smrg 489b8e80941Smrg nir_ssa_def *current_counter = nir_load_var(&b, counter); 490b8e80941Smrg radv_break_on_count(&b, counter, elem_count); 491b8e80941Smrg 492b8e80941Smrg nir_ssa_def *output_elem = nir_iadd(&b, output_base, 493b8e80941Smrg nir_imul(&b, elem_size, current_counter)); 494b8e80941Smrg 495b8e80941Smrg nir_if *store_64bit_if = nir_if_create(b.shader); 496b8e80941Smrg store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); 497b8e80941Smrg nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node); 498b8e80941Smrg 499b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->then_list); 500b8e80941Smrg 501b8e80941Smrg store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 502b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0)); 503b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 504b8e80941Smrg store->src[2] = nir_src_for_ssa(output_elem); 505b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 506b8e80941Smrg store->num_components = 1; 507b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 508b8e80941Smrg 509b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->else_list); 510b8e80941Smrg 511b8e80941Smrg store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 512b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 513b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 514b8e80941Smrg store->src[2] = nir_src_for_ssa(output_elem); 515b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 516b8e80941Smrg store->num_components = 1; 517b8e80941Smrg 
nir_builder_instr_insert(&b, &store->instr); 518b8e80941Smrg 519b8e80941Smrg b.cursor = nir_after_cf_node(&loop->cf_node); 520b8e80941Smrg return b.shader; 521b8e80941Smrg} 522b8e80941Smrg 523b8e80941Smrgstatic nir_shader * 524b8e80941Smrgbuild_tfb_query_shader(struct radv_device *device) 525b8e80941Smrg{ 526b8e80941Smrg /* the shader this builds is roughly 527b8e80941Smrg * 528b8e80941Smrg * uint32_t src_stride = 32; 529b8e80941Smrg * 530b8e80941Smrg * location(binding = 0) buffer dst_buf; 531b8e80941Smrg * location(binding = 1) buffer src_buf; 532b8e80941Smrg * 533b8e80941Smrg * void main() { 534b8e80941Smrg * uint64_t result[2] = {}; 535b8e80941Smrg * bool available = false; 536b8e80941Smrg * uint64_t src_offset = src_stride * global_id.x; 537b8e80941Smrg * uint64_t dst_offset = dst_stride * global_id.x; 538b8e80941Smrg * uint64_t *src_data = src_buf[src_offset]; 539b8e80941Smrg * uint32_t avail = (src_data[0] >> 32) & 540b8e80941Smrg * (src_data[1] >> 32) & 541b8e80941Smrg * (src_data[2] >> 32) & 542b8e80941Smrg * (src_data[3] >> 32); 543b8e80941Smrg * if (avail & 0x80000000) { 544b8e80941Smrg * result[0] = src_data[3] - src_data[1]; 545b8e80941Smrg * result[1] = src_data[2] - src_data[0]; 546b8e80941Smrg * available = true; 547b8e80941Smrg * } 548b8e80941Smrg * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 
16 : 8; 549b8e80941Smrg * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 550b8e80941Smrg * if (flags & VK_QUERY_RESULT_64_BIT) { 551b8e80941Smrg * dst_buf[dst_offset] = result; 552b8e80941Smrg * } else { 553b8e80941Smrg * dst_buf[dst_offset] = (uint32_t)result; 554b8e80941Smrg * } 555b8e80941Smrg * } 556b8e80941Smrg * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 557b8e80941Smrg * dst_buf[dst_offset + result_size] = available; 558b8e80941Smrg * } 559b8e80941Smrg * } 560b8e80941Smrg */ 561b8e80941Smrg nir_builder b; 562b8e80941Smrg nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); 563b8e80941Smrg b.shader->info.name = ralloc_strdup(b.shader, "tfb_query"); 564b8e80941Smrg b.shader->info.cs.local_size[0] = 64; 565b8e80941Smrg b.shader->info.cs.local_size[1] = 1; 566b8e80941Smrg b.shader->info.cs.local_size[2] = 1; 567b8e80941Smrg 568b8e80941Smrg /* Create and initialize local variables. */ 569b8e80941Smrg nir_variable *result = 570b8e80941Smrg nir_local_variable_create(b.impl, 571b8e80941Smrg glsl_vector_type(GLSL_TYPE_UINT64, 2), 572b8e80941Smrg "result"); 573b8e80941Smrg nir_variable *available = 574b8e80941Smrg nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 575b8e80941Smrg 576b8e80941Smrg nir_store_var(&b, result, 577b8e80941Smrg nir_vec2(&b, nir_imm_int64(&b, 0), 578b8e80941Smrg nir_imm_int64(&b, 0)), 0x3); 579b8e80941Smrg nir_store_var(&b, available, nir_imm_false(&b), 0x1); 580b8e80941Smrg 581b8e80941Smrg nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags"); 582b8e80941Smrg 583b8e80941Smrg /* Load resources. 
*/ 584b8e80941Smrg nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, 585b8e80941Smrg nir_intrinsic_vulkan_resource_index); 586b8e80941Smrg dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 587b8e80941Smrg dst_buf->num_components = 1; 588b8e80941Smrg nir_intrinsic_set_desc_set(dst_buf, 0); 589b8e80941Smrg nir_intrinsic_set_binding(dst_buf, 0); 590b8e80941Smrg nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL); 591b8e80941Smrg nir_builder_instr_insert(&b, &dst_buf->instr); 592b8e80941Smrg 593b8e80941Smrg nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, 594b8e80941Smrg nir_intrinsic_vulkan_resource_index); 595b8e80941Smrg src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 596b8e80941Smrg src_buf->num_components = 1; 597b8e80941Smrg nir_intrinsic_set_desc_set(src_buf, 0); 598b8e80941Smrg nir_intrinsic_set_binding(src_buf, 1); 599b8e80941Smrg nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL); 600b8e80941Smrg nir_builder_instr_insert(&b, &src_buf->instr); 601b8e80941Smrg 602b8e80941Smrg /* Compute global ID. */ 603b8e80941Smrg nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); 604b8e80941Smrg nir_ssa_def *wg_id = nir_load_work_group_id(&b); 605b8e80941Smrg nir_ssa_def *block_size = nir_imm_ivec4(&b, 606b8e80941Smrg b.shader->info.cs.local_size[0], 607b8e80941Smrg b.shader->info.cs.local_size[1], 608b8e80941Smrg b.shader->info.cs.local_size[2], 0); 609b8e80941Smrg nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); 610b8e80941Smrg global_id = nir_channel(&b, global_id, 0); // We only care about x here. 611b8e80941Smrg 612b8e80941Smrg /* Compute src/dst strides. 
*/ 613b8e80941Smrg nir_ssa_def *input_stride = nir_imm_int(&b, 32); 614b8e80941Smrg nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 615b8e80941Smrg nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride"); 616b8e80941Smrg nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 617b8e80941Smrg 618b8e80941Smrg /* Load data from the query pool. */ 619b8e80941Smrg nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); 620b8e80941Smrg load1->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); 621b8e80941Smrg load1->src[1] = nir_src_for_ssa(input_base); 622b8e80941Smrg nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL); 623b8e80941Smrg load1->num_components = 4; 624b8e80941Smrg nir_builder_instr_insert(&b, &load1->instr); 625b8e80941Smrg 626b8e80941Smrg nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); 627b8e80941Smrg load2->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); 628b8e80941Smrg load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16))); 629b8e80941Smrg nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL); 630b8e80941Smrg load2->num_components = 4; 631b8e80941Smrg nir_builder_instr_insert(&b, &load2->instr); 632b8e80941Smrg 633b8e80941Smrg /* Check if result is available. */ 634b8e80941Smrg nir_ssa_def *avails[2]; 635b8e80941Smrg avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1), 636b8e80941Smrg nir_channel(&b, &load1->dest.ssa, 3)); 637b8e80941Smrg avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1), 638b8e80941Smrg nir_channel(&b, &load2->dest.ssa, 3)); 639b8e80941Smrg nir_ssa_def *result_is_available = 640b8e80941Smrg nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), 641b8e80941Smrg nir_imm_int(&b, 0x80000000))); 642b8e80941Smrg 643b8e80941Smrg /* Only compute result if available. 
*/ 644b8e80941Smrg nir_if *available_if = nir_if_create(b.shader); 645b8e80941Smrg available_if->condition = nir_src_for_ssa(result_is_available); 646b8e80941Smrg nir_cf_node_insert(b.cursor, &available_if->cf_node); 647b8e80941Smrg 648b8e80941Smrg b.cursor = nir_after_cf_list(&available_if->then_list); 649b8e80941Smrg 650b8e80941Smrg /* Pack values. */ 651b8e80941Smrg nir_ssa_def *packed64[4]; 652b8e80941Smrg packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b, 653b8e80941Smrg nir_channel(&b, &load1->dest.ssa, 0), 654b8e80941Smrg nir_channel(&b, &load1->dest.ssa, 1))); 655b8e80941Smrg packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, 656b8e80941Smrg nir_channel(&b, &load1->dest.ssa, 2), 657b8e80941Smrg nir_channel(&b, &load1->dest.ssa, 3))); 658b8e80941Smrg packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b, 659b8e80941Smrg nir_channel(&b, &load2->dest.ssa, 0), 660b8e80941Smrg nir_channel(&b, &load2->dest.ssa, 1))); 661b8e80941Smrg packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, 662b8e80941Smrg nir_channel(&b, &load2->dest.ssa, 2), 663b8e80941Smrg nir_channel(&b, &load2->dest.ssa, 3))); 664b8e80941Smrg 665b8e80941Smrg /* Compute result. */ 666b8e80941Smrg nir_ssa_def *num_primitive_written = 667b8e80941Smrg nir_isub(&b, packed64[3], packed64[1]); 668b8e80941Smrg nir_ssa_def *primitive_storage_needed = 669b8e80941Smrg nir_isub(&b, packed64[2], packed64[0]); 670b8e80941Smrg 671b8e80941Smrg nir_store_var(&b, result, 672b8e80941Smrg nir_vec2(&b, num_primitive_written, 673b8e80941Smrg primitive_storage_needed), 0x3); 674b8e80941Smrg nir_store_var(&b, available, nir_imm_true(&b), 0x1); 675b8e80941Smrg 676b8e80941Smrg b.cursor = nir_after_cf_node(&available_if->cf_node); 677b8e80941Smrg 678b8e80941Smrg /* Determine if result is 64 or 32 bit. 
*/ 679b8e80941Smrg nir_ssa_def *result_is_64bit = 680b8e80941Smrg nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); 681b8e80941Smrg nir_ssa_def *result_size = 682b8e80941Smrg nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), 683b8e80941Smrg nir_imm_int(&b, 8)); 684b8e80941Smrg 685b8e80941Smrg /* Store the result if complete or partial results have been requested. */ 686b8e80941Smrg nir_if *store_if = nir_if_create(b.shader); 687b8e80941Smrg store_if->condition = 688b8e80941Smrg nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 689b8e80941Smrg nir_load_var(&b, available))); 690b8e80941Smrg nir_cf_node_insert(b.cursor, &store_if->cf_node); 691b8e80941Smrg 692b8e80941Smrg b.cursor = nir_after_cf_list(&store_if->then_list); 693b8e80941Smrg 694b8e80941Smrg /* Store result. */ 695b8e80941Smrg nir_if *store_64bit_if = nir_if_create(b.shader); 696b8e80941Smrg store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); 697b8e80941Smrg nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node); 698b8e80941Smrg 699b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->then_list); 700b8e80941Smrg 701b8e80941Smrg nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 702b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_load_var(&b, result)); 703b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 704b8e80941Smrg store->src[2] = nir_src_for_ssa(output_base); 705b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x3); 706b8e80941Smrg store->num_components = 2; 707b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 708b8e80941Smrg 709b8e80941Smrg b.cursor = nir_after_cf_list(&store_64bit_if->else_list); 710b8e80941Smrg 711b8e80941Smrg store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 712b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result))); 713b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 714b8e80941Smrg store->src[2] 
= nir_src_for_ssa(output_base); 715b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x3); 716b8e80941Smrg store->num_components = 2; 717b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 718b8e80941Smrg 719b8e80941Smrg b.cursor = nir_after_cf_node(&store_64bit_if->cf_node); 720b8e80941Smrg 721b8e80941Smrg b.cursor = nir_after_cf_node(&store_if->cf_node); 722b8e80941Smrg 723b8e80941Smrg /* Store the availability bit if requested. */ 724b8e80941Smrg nir_if *availability_if = nir_if_create(b.shader); 725b8e80941Smrg availability_if->condition = 726b8e80941Smrg nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); 727b8e80941Smrg nir_cf_node_insert(b.cursor, &availability_if->cf_node); 728b8e80941Smrg 729b8e80941Smrg b.cursor = nir_after_cf_list(&availability_if->then_list); 730b8e80941Smrg 731b8e80941Smrg store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); 732b8e80941Smrg store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available))); 733b8e80941Smrg store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); 734b8e80941Smrg store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base)); 735b8e80941Smrg nir_intrinsic_set_write_mask(store, 0x1); 736b8e80941Smrg store->num_components = 1; 737b8e80941Smrg nir_builder_instr_insert(&b, &store->instr); 738b8e80941Smrg 739b8e80941Smrg b.cursor = nir_after_cf_node(&availability_if->cf_node); 740b8e80941Smrg 741b8e80941Smrg return b.shader; 742b8e80941Smrg} 743b8e80941Smrg 744b8e80941Smrgstatic VkResult radv_device_init_meta_query_state_internal(struct radv_device *device) 745b8e80941Smrg{ 746b8e80941Smrg VkResult result; 747b8e80941Smrg struct radv_shader_module occlusion_cs = { .nir = NULL }; 748b8e80941Smrg struct radv_shader_module pipeline_statistics_cs = { .nir = NULL }; 749b8e80941Smrg struct radv_shader_module tfb_cs = { .nir = NULL }; 750b8e80941Smrg 751b8e80941Smrg mtx_lock(&device->meta_state.mtx); 752b8e80941Smrg if 
(device->meta_state.query.pipeline_statistics_query_pipeline) { 753b8e80941Smrg mtx_unlock(&device->meta_state.mtx); 754b8e80941Smrg return VK_SUCCESS; 755b8e80941Smrg } 756b8e80941Smrg occlusion_cs.nir = build_occlusion_query_shader(device); 757b8e80941Smrg pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device); 758b8e80941Smrg tfb_cs.nir = build_tfb_query_shader(device); 759b8e80941Smrg 760b8e80941Smrg VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = { 761b8e80941Smrg .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 762b8e80941Smrg .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, 763b8e80941Smrg .bindingCount = 2, 764b8e80941Smrg .pBindings = (VkDescriptorSetLayoutBinding[]) { 765b8e80941Smrg { 766b8e80941Smrg .binding = 0, 767b8e80941Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 768b8e80941Smrg .descriptorCount = 1, 769b8e80941Smrg .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 770b8e80941Smrg .pImmutableSamplers = NULL 771b8e80941Smrg }, 772b8e80941Smrg { 773b8e80941Smrg .binding = 1, 774b8e80941Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 775b8e80941Smrg .descriptorCount = 1, 776b8e80941Smrg .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 777b8e80941Smrg .pImmutableSamplers = NULL 778b8e80941Smrg }, 779b8e80941Smrg } 780b8e80941Smrg }; 781b8e80941Smrg 782b8e80941Smrg result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), 783b8e80941Smrg &occlusion_ds_create_info, 784b8e80941Smrg &device->meta_state.alloc, 785b8e80941Smrg &device->meta_state.query.ds_layout); 786b8e80941Smrg if (result != VK_SUCCESS) 787b8e80941Smrg goto fail; 788b8e80941Smrg 789b8e80941Smrg VkPipelineLayoutCreateInfo occlusion_pl_create_info = { 790b8e80941Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, 791b8e80941Smrg .setLayoutCount = 1, 792b8e80941Smrg .pSetLayouts = &device->meta_state.query.ds_layout, 793b8e80941Smrg .pushConstantRangeCount = 1, 794b8e80941Smrg .pPushConstantRanges = 
&(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16}, 795b8e80941Smrg }; 796b8e80941Smrg 797b8e80941Smrg result = radv_CreatePipelineLayout(radv_device_to_handle(device), 798b8e80941Smrg &occlusion_pl_create_info, 799b8e80941Smrg &device->meta_state.alloc, 800b8e80941Smrg &device->meta_state.query.p_layout); 801b8e80941Smrg if (result != VK_SUCCESS) 802b8e80941Smrg goto fail; 803b8e80941Smrg 804b8e80941Smrg VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = { 805b8e80941Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 806b8e80941Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 807b8e80941Smrg .module = radv_shader_module_to_handle(&occlusion_cs), 808b8e80941Smrg .pName = "main", 809b8e80941Smrg .pSpecializationInfo = NULL, 810b8e80941Smrg }; 811b8e80941Smrg 812b8e80941Smrg VkComputePipelineCreateInfo occlusion_vk_pipeline_info = { 813b8e80941Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 814b8e80941Smrg .stage = occlusion_pipeline_shader_stage, 815b8e80941Smrg .flags = 0, 816b8e80941Smrg .layout = device->meta_state.query.p_layout, 817b8e80941Smrg }; 818b8e80941Smrg 819b8e80941Smrg result = radv_CreateComputePipelines(radv_device_to_handle(device), 820b8e80941Smrg radv_pipeline_cache_to_handle(&device->meta_state.cache), 821b8e80941Smrg 1, &occlusion_vk_pipeline_info, NULL, 822b8e80941Smrg &device->meta_state.query.occlusion_query_pipeline); 823b8e80941Smrg if (result != VK_SUCCESS) 824b8e80941Smrg goto fail; 825b8e80941Smrg 826b8e80941Smrg VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = { 827b8e80941Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 828b8e80941Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 829b8e80941Smrg .module = radv_shader_module_to_handle(&pipeline_statistics_cs), 830b8e80941Smrg .pName = "main", 831b8e80941Smrg .pSpecializationInfo = NULL, 832b8e80941Smrg }; 833b8e80941Smrg 834b8e80941Smrg VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = 
{ 835b8e80941Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 836b8e80941Smrg .stage = pipeline_statistics_pipeline_shader_stage, 837b8e80941Smrg .flags = 0, 838b8e80941Smrg .layout = device->meta_state.query.p_layout, 839b8e80941Smrg }; 840b8e80941Smrg 841b8e80941Smrg result = radv_CreateComputePipelines(radv_device_to_handle(device), 842b8e80941Smrg radv_pipeline_cache_to_handle(&device->meta_state.cache), 843b8e80941Smrg 1, &pipeline_statistics_vk_pipeline_info, NULL, 844b8e80941Smrg &device->meta_state.query.pipeline_statistics_query_pipeline); 845b8e80941Smrg if (result != VK_SUCCESS) 846b8e80941Smrg goto fail; 847b8e80941Smrg 848b8e80941Smrg VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = { 849b8e80941Smrg .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 850b8e80941Smrg .stage = VK_SHADER_STAGE_COMPUTE_BIT, 851b8e80941Smrg .module = radv_shader_module_to_handle(&tfb_cs), 852b8e80941Smrg .pName = "main", 853b8e80941Smrg .pSpecializationInfo = NULL, 854b8e80941Smrg }; 855b8e80941Smrg 856b8e80941Smrg VkComputePipelineCreateInfo tfb_pipeline_info = { 857b8e80941Smrg .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 858b8e80941Smrg .stage = tfb_pipeline_shader_stage, 859b8e80941Smrg .flags = 0, 860b8e80941Smrg .layout = device->meta_state.query.p_layout, 861b8e80941Smrg }; 862b8e80941Smrg 863b8e80941Smrg result = radv_CreateComputePipelines(radv_device_to_handle(device), 864b8e80941Smrg radv_pipeline_cache_to_handle(&device->meta_state.cache), 865b8e80941Smrg 1, &tfb_pipeline_info, NULL, 866b8e80941Smrg &device->meta_state.query.tfb_query_pipeline); 867b8e80941Smrgfail: 868b8e80941Smrg if (result != VK_SUCCESS) 869b8e80941Smrg radv_device_finish_meta_query_state(device); 870b8e80941Smrg ralloc_free(occlusion_cs.nir); 871b8e80941Smrg ralloc_free(pipeline_statistics_cs.nir); 872b8e80941Smrg ralloc_free(tfb_cs.nir); 873b8e80941Smrg mtx_unlock(&device->meta_state.mtx); 874b8e80941Smrg return result; 875b8e80941Smrg} 
876b8e80941Smrg 877b8e80941SmrgVkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand) 878b8e80941Smrg{ 879b8e80941Smrg if (on_demand) 880b8e80941Smrg return VK_SUCCESS; 881b8e80941Smrg 882b8e80941Smrg return radv_device_init_meta_query_state_internal(device); 883b8e80941Smrg} 884b8e80941Smrg 885b8e80941Smrgvoid radv_device_finish_meta_query_state(struct radv_device *device) 886b8e80941Smrg{ 887b8e80941Smrg if (device->meta_state.query.tfb_query_pipeline) 888b8e80941Smrg radv_DestroyPipeline(radv_device_to_handle(device), 889b8e80941Smrg device->meta_state.query.tfb_query_pipeline, 890b8e80941Smrg &device->meta_state.alloc); 891b8e80941Smrg 892b8e80941Smrg if (device->meta_state.query.pipeline_statistics_query_pipeline) 893b8e80941Smrg radv_DestroyPipeline(radv_device_to_handle(device), 894b8e80941Smrg device->meta_state.query.pipeline_statistics_query_pipeline, 895b8e80941Smrg &device->meta_state.alloc); 896b8e80941Smrg 897b8e80941Smrg if (device->meta_state.query.occlusion_query_pipeline) 898b8e80941Smrg radv_DestroyPipeline(radv_device_to_handle(device), 899b8e80941Smrg device->meta_state.query.occlusion_query_pipeline, 900b8e80941Smrg &device->meta_state.alloc); 901b8e80941Smrg 902b8e80941Smrg if (device->meta_state.query.p_layout) 903b8e80941Smrg radv_DestroyPipelineLayout(radv_device_to_handle(device), 904b8e80941Smrg device->meta_state.query.p_layout, 905b8e80941Smrg &device->meta_state.alloc); 906b8e80941Smrg 907b8e80941Smrg if (device->meta_state.query.ds_layout) 908b8e80941Smrg radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), 909b8e80941Smrg device->meta_state.query.ds_layout, 910b8e80941Smrg &device->meta_state.alloc); 911b8e80941Smrg} 912b8e80941Smrg 913b8e80941Smrgstatic void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, 914b8e80941Smrg VkPipeline *pipeline, 915b8e80941Smrg struct radeon_winsys_bo *src_bo, 916b8e80941Smrg struct radeon_winsys_bo *dst_bo, 917b8e80941Smrg uint64_t src_offset, uint64_t 
dst_offset, 918b8e80941Smrg uint32_t src_stride, uint32_t dst_stride, 919b8e80941Smrg uint32_t count, uint32_t flags, 920b8e80941Smrg uint32_t pipeline_stats_mask, uint32_t avail_offset) 921b8e80941Smrg{ 922b8e80941Smrg struct radv_device *device = cmd_buffer->device; 923b8e80941Smrg struct radv_meta_saved_state saved_state; 924b8e80941Smrg bool old_predicating; 925b8e80941Smrg 926b8e80941Smrg if (!*pipeline) { 927b8e80941Smrg VkResult ret = radv_device_init_meta_query_state_internal(device); 928b8e80941Smrg if (ret != VK_SUCCESS) { 929b8e80941Smrg cmd_buffer->record_result = ret; 930b8e80941Smrg return; 931b8e80941Smrg } 932b8e80941Smrg } 933b8e80941Smrg 934b8e80941Smrg radv_meta_save(&saved_state, cmd_buffer, 935b8e80941Smrg RADV_META_SAVE_COMPUTE_PIPELINE | 936b8e80941Smrg RADV_META_SAVE_CONSTANTS | 937b8e80941Smrg RADV_META_SAVE_DESCRIPTORS); 938b8e80941Smrg 939b8e80941Smrg /* VK_EXT_conditional_rendering says that copy commands should not be 940b8e80941Smrg * affected by conditional rendering. 
941b8e80941Smrg */ 942b8e80941Smrg old_predicating = cmd_buffer->state.predicating; 943b8e80941Smrg cmd_buffer->state.predicating = false; 944b8e80941Smrg 945b8e80941Smrg struct radv_buffer dst_buffer = { 946b8e80941Smrg .bo = dst_bo, 947b8e80941Smrg .offset = dst_offset, 948b8e80941Smrg .size = dst_stride * count 949b8e80941Smrg }; 950b8e80941Smrg 951b8e80941Smrg struct radv_buffer src_buffer = { 952b8e80941Smrg .bo = src_bo, 953b8e80941Smrg .offset = src_offset, 954b8e80941Smrg .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset) 955b8e80941Smrg }; 956b8e80941Smrg 957b8e80941Smrg radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), 958b8e80941Smrg VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); 959b8e80941Smrg 960b8e80941Smrg radv_meta_push_descriptor_set(cmd_buffer, 961b8e80941Smrg VK_PIPELINE_BIND_POINT_COMPUTE, 962b8e80941Smrg device->meta_state.query.p_layout, 963b8e80941Smrg 0, /* set */ 964b8e80941Smrg 2, /* descriptorWriteCount */ 965b8e80941Smrg (VkWriteDescriptorSet[]) { 966b8e80941Smrg { 967b8e80941Smrg .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 968b8e80941Smrg .dstBinding = 0, 969b8e80941Smrg .dstArrayElement = 0, 970b8e80941Smrg .descriptorCount = 1, 971b8e80941Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 972b8e80941Smrg .pBufferInfo = &(VkDescriptorBufferInfo) { 973b8e80941Smrg .buffer = radv_buffer_to_handle(&dst_buffer), 974b8e80941Smrg .offset = 0, 975b8e80941Smrg .range = VK_WHOLE_SIZE 976b8e80941Smrg } 977b8e80941Smrg }, 978b8e80941Smrg { 979b8e80941Smrg .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 980b8e80941Smrg .dstBinding = 1, 981b8e80941Smrg .dstArrayElement = 0, 982b8e80941Smrg .descriptorCount = 1, 983b8e80941Smrg .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 984b8e80941Smrg .pBufferInfo = &(VkDescriptorBufferInfo) { 985b8e80941Smrg .buffer = radv_buffer_to_handle(&src_buffer), 986b8e80941Smrg .offset = 0, 987b8e80941Smrg .range = VK_WHOLE_SIZE 988b8e80941Smrg } 989b8e80941Smrg } 
990b8e80941Smrg }); 991b8e80941Smrg 992b8e80941Smrg /* Encode the number of elements for easy access by the shader. */ 993b8e80941Smrg pipeline_stats_mask &= 0x7ff; 994b8e80941Smrg pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16; 995b8e80941Smrg 996b8e80941Smrg avail_offset -= src_offset; 997b8e80941Smrg 998b8e80941Smrg struct { 999b8e80941Smrg uint32_t flags; 1000b8e80941Smrg uint32_t dst_stride; 1001b8e80941Smrg uint32_t pipeline_stats_mask; 1002b8e80941Smrg uint32_t avail_offset; 1003b8e80941Smrg } push_constants = { 1004b8e80941Smrg flags, 1005b8e80941Smrg dst_stride, 1006b8e80941Smrg pipeline_stats_mask, 1007b8e80941Smrg avail_offset 1008b8e80941Smrg }; 1009b8e80941Smrg 1010b8e80941Smrg radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), 1011b8e80941Smrg device->meta_state.query.p_layout, 1012b8e80941Smrg VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), 1013b8e80941Smrg &push_constants); 1014b8e80941Smrg 1015b8e80941Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 | 1016b8e80941Smrg RADV_CMD_FLAG_INV_VMEM_L1; 1017b8e80941Smrg 1018b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) 1019b8e80941Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER; 1020b8e80941Smrg 1021b8e80941Smrg radv_unaligned_dispatch(cmd_buffer, count, 1, 1); 1022b8e80941Smrg 1023b8e80941Smrg /* Restore conditional rendering. 
*/ 1024b8e80941Smrg cmd_buffer->state.predicating = old_predicating; 1025b8e80941Smrg 1026b8e80941Smrg radv_meta_restore(&saved_state, cmd_buffer); 1027b8e80941Smrg} 1028b8e80941Smrg 1029b8e80941SmrgVkResult radv_CreateQueryPool( 1030b8e80941Smrg VkDevice _device, 1031b8e80941Smrg const VkQueryPoolCreateInfo* pCreateInfo, 1032b8e80941Smrg const VkAllocationCallbacks* pAllocator, 1033b8e80941Smrg VkQueryPool* pQueryPool) 1034b8e80941Smrg{ 1035b8e80941Smrg RADV_FROM_HANDLE(radv_device, device, _device); 1036b8e80941Smrg struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator, 1037b8e80941Smrg sizeof(*pool), 8, 1038b8e80941Smrg VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 1039b8e80941Smrg uint32_t initial_value = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP 1040b8e80941Smrg ? TIMESTAMP_NOT_READY : 0; 1041b8e80941Smrg 1042b8e80941Smrg if (!pool) 1043b8e80941Smrg return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); 1044b8e80941Smrg 1045b8e80941Smrg 1046b8e80941Smrg switch(pCreateInfo->queryType) { 1047b8e80941Smrg case VK_QUERY_TYPE_OCCLUSION: 1048b8e80941Smrg pool->stride = 16 * device->physical_device->rad_info.num_render_backends; 1049b8e80941Smrg break; 1050b8e80941Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1051b8e80941Smrg pool->stride = pipelinestat_block_size * 2; 1052b8e80941Smrg break; 1053b8e80941Smrg case VK_QUERY_TYPE_TIMESTAMP: 1054b8e80941Smrg pool->stride = 8; 1055b8e80941Smrg break; 1056b8e80941Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1057b8e80941Smrg pool->stride = 32; 1058b8e80941Smrg break; 1059b8e80941Smrg default: 1060b8e80941Smrg unreachable("creating unhandled query type"); 1061b8e80941Smrg } 1062b8e80941Smrg 1063b8e80941Smrg pool->type = pCreateInfo->queryType; 1064b8e80941Smrg pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics; 1065b8e80941Smrg pool->availability_offset = pool->stride * pCreateInfo->queryCount; 1066b8e80941Smrg pool->size = pool->availability_offset; 1067b8e80941Smrg if 
(pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS) 1068b8e80941Smrg pool->size += 4 * pCreateInfo->queryCount; 1069b8e80941Smrg 1070b8e80941Smrg pool->bo = device->ws->buffer_create(device->ws, pool->size, 1071b8e80941Smrg 64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING, 1072b8e80941Smrg RADV_BO_PRIORITY_QUERY_POOL); 1073b8e80941Smrg 1074b8e80941Smrg if (!pool->bo) { 1075b8e80941Smrg vk_free2(&device->alloc, pAllocator, pool); 1076b8e80941Smrg return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); 1077b8e80941Smrg } 1078b8e80941Smrg 1079b8e80941Smrg pool->ptr = device->ws->buffer_map(pool->bo); 1080b8e80941Smrg 1081b8e80941Smrg if (!pool->ptr) { 1082b8e80941Smrg device->ws->buffer_destroy(pool->bo); 1083b8e80941Smrg vk_free2(&device->alloc, pAllocator, pool); 1084b8e80941Smrg return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); 1085b8e80941Smrg } 1086b8e80941Smrg memset(pool->ptr, initial_value, pool->size); 1087b8e80941Smrg 1088b8e80941Smrg *pQueryPool = radv_query_pool_to_handle(pool); 1089b8e80941Smrg return VK_SUCCESS; 1090b8e80941Smrg} 1091b8e80941Smrg 1092b8e80941Smrgvoid radv_DestroyQueryPool( 1093b8e80941Smrg VkDevice _device, 1094b8e80941Smrg VkQueryPool _pool, 1095b8e80941Smrg const VkAllocationCallbacks* pAllocator) 1096b8e80941Smrg{ 1097b8e80941Smrg RADV_FROM_HANDLE(radv_device, device, _device); 1098b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, _pool); 1099b8e80941Smrg 1100b8e80941Smrg if (!pool) 1101b8e80941Smrg return; 1102b8e80941Smrg 1103b8e80941Smrg device->ws->buffer_destroy(pool->bo); 1104b8e80941Smrg vk_free2(&device->alloc, pAllocator, pool); 1105b8e80941Smrg} 1106b8e80941Smrg 1107b8e80941SmrgVkResult radv_GetQueryPoolResults( 1108b8e80941Smrg VkDevice _device, 1109b8e80941Smrg VkQueryPool queryPool, 1110b8e80941Smrg uint32_t firstQuery, 1111b8e80941Smrg uint32_t queryCount, 1112b8e80941Smrg size_t dataSize, 1113b8e80941Smrg void* pData, 1114b8e80941Smrg VkDeviceSize stride, 1115b8e80941Smrg 
VkQueryResultFlags flags) 1116b8e80941Smrg{ 1117b8e80941Smrg RADV_FROM_HANDLE(radv_device, device, _device); 1118b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1119b8e80941Smrg char *data = pData; 1120b8e80941Smrg VkResult result = VK_SUCCESS; 1121b8e80941Smrg 1122b8e80941Smrg for(unsigned i = 0; i < queryCount; ++i, data += stride) { 1123b8e80941Smrg char *dest = data; 1124b8e80941Smrg unsigned query = firstQuery + i; 1125b8e80941Smrg char *src = pool->ptr + query * pool->stride; 1126b8e80941Smrg uint32_t available; 1127b8e80941Smrg 1128b8e80941Smrg if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 1129b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) 1130b8e80941Smrg while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query)) 1131b8e80941Smrg ; 1132b8e80941Smrg available = *(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query); 1133b8e80941Smrg } 1134b8e80941Smrg 1135b8e80941Smrg switch (pool->type) { 1136b8e80941Smrg case VK_QUERY_TYPE_TIMESTAMP: { 1137b8e80941Smrg volatile uint64_t const *src64 = (volatile uint64_t const *)src; 1138b8e80941Smrg available = *src64 != TIMESTAMP_NOT_READY; 1139b8e80941Smrg 1140b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1141b8e80941Smrg while (*src64 == TIMESTAMP_NOT_READY) 1142b8e80941Smrg ; 1143b8e80941Smrg available = true; 1144b8e80941Smrg } 1145b8e80941Smrg 1146b8e80941Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1147b8e80941Smrg result = VK_NOT_READY; 1148b8e80941Smrg 1149b8e80941Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 1150b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1151b8e80941Smrg *(uint64_t*)dest = *src64; 1152b8e80941Smrg dest += 8; 1153b8e80941Smrg } else { 1154b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1155b8e80941Smrg *(uint32_t*)dest = *(volatile uint32_t*)src; 1156b8e80941Smrg dest += 4; 1157b8e80941Smrg } 1158b8e80941Smrg break; 1159b8e80941Smrg } 1160b8e80941Smrg case 
VK_QUERY_TYPE_OCCLUSION: { 1161b8e80941Smrg volatile uint64_t const *src64 = (volatile uint64_t const *)src; 1162b8e80941Smrg uint32_t db_count = device->physical_device->rad_info.num_render_backends; 1163b8e80941Smrg uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask; 1164b8e80941Smrg uint64_t sample_count = 0; 1165b8e80941Smrg available = 1; 1166b8e80941Smrg 1167b8e80941Smrg for (int i = 0; i < db_count; ++i) { 1168b8e80941Smrg uint64_t start, end; 1169b8e80941Smrg 1170b8e80941Smrg if (!(enabled_rb_mask & (1 << i))) 1171b8e80941Smrg continue; 1172b8e80941Smrg 1173b8e80941Smrg do { 1174b8e80941Smrg start = src64[2 * i]; 1175b8e80941Smrg end = src64[2 * i + 1]; 1176b8e80941Smrg } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT)); 1177b8e80941Smrg 1178b8e80941Smrg if (!(start & (1ull << 63)) || !(end & (1ull << 63))) 1179b8e80941Smrg available = 0; 1180b8e80941Smrg else { 1181b8e80941Smrg sample_count += end - start; 1182b8e80941Smrg } 1183b8e80941Smrg } 1184b8e80941Smrg 1185b8e80941Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1186b8e80941Smrg result = VK_NOT_READY; 1187b8e80941Smrg 1188b8e80941Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 1189b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1190b8e80941Smrg *(uint64_t*)dest = sample_count; 1191b8e80941Smrg dest += 8; 1192b8e80941Smrg } else { 1193b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1194b8e80941Smrg *(uint32_t*)dest = sample_count; 1195b8e80941Smrg dest += 4; 1196b8e80941Smrg } 1197b8e80941Smrg break; 1198b8e80941Smrg } 1199b8e80941Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 1200b8e80941Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1201b8e80941Smrg result = VK_NOT_READY; 1202b8e80941Smrg 1203b8e80941Smrg const volatile uint64_t *start = (uint64_t*)src; 1204b8e80941Smrg const volatile uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size); 
1205b8e80941Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 1206b8e80941Smrg uint64_t *dst = (uint64_t*)dest; 1207b8e80941Smrg dest += util_bitcount(pool->pipeline_stats_mask) * 8; 1208b8e80941Smrg for(int i = 0; i < 11; ++i) { 1209b8e80941Smrg if(pool->pipeline_stats_mask & (1u << i)) { 1210b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1211b8e80941Smrg *dst = stop[pipeline_statistics_indices[i]] - 1212b8e80941Smrg start[pipeline_statistics_indices[i]]; 1213b8e80941Smrg dst++; 1214b8e80941Smrg } 1215b8e80941Smrg } 1216b8e80941Smrg 1217b8e80941Smrg } else { 1218b8e80941Smrg uint32_t *dst = (uint32_t*)dest; 1219b8e80941Smrg dest += util_bitcount(pool->pipeline_stats_mask) * 4; 1220b8e80941Smrg for(int i = 0; i < 11; ++i) { 1221b8e80941Smrg if(pool->pipeline_stats_mask & (1u << i)) { 1222b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1223b8e80941Smrg *dst = stop[pipeline_statistics_indices[i]] - 1224b8e80941Smrg start[pipeline_statistics_indices[i]]; 1225b8e80941Smrg dst++; 1226b8e80941Smrg } 1227b8e80941Smrg } 1228b8e80941Smrg } 1229b8e80941Smrg break; 1230b8e80941Smrg } 1231b8e80941Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 1232b8e80941Smrg volatile uint64_t const *src64 = (volatile uint64_t const *)src; 1233b8e80941Smrg uint64_t num_primitives_written; 1234b8e80941Smrg uint64_t primitive_storage_needed; 1235b8e80941Smrg 1236b8e80941Smrg /* SAMPLE_STREAMOUTSTATS stores this structure: 1237b8e80941Smrg * { 1238b8e80941Smrg * u64 NumPrimitivesWritten; 1239b8e80941Smrg * u64 PrimitiveStorageNeeded; 1240b8e80941Smrg * } 1241b8e80941Smrg */ 1242b8e80941Smrg available = 1; 1243b8e80941Smrg for (int j = 0; j < 4; j++) { 1244b8e80941Smrg if (!(src64[j] & 0x8000000000000000UL)) 1245b8e80941Smrg available = 0; 1246b8e80941Smrg } 1247b8e80941Smrg 1248b8e80941Smrg if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1249b8e80941Smrg result = VK_NOT_READY; 1250b8e80941Smrg 1251b8e80941Smrg num_primitives_written = 
src64[3] - src64[1]; 1252b8e80941Smrg primitive_storage_needed = src64[2] - src64[0]; 1253b8e80941Smrg 1254b8e80941Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 1255b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1256b8e80941Smrg *(uint64_t *)dest = num_primitives_written; 1257b8e80941Smrg dest += 8; 1258b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1259b8e80941Smrg *(uint64_t *)dest = primitive_storage_needed; 1260b8e80941Smrg dest += 8; 1261b8e80941Smrg } else { 1262b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1263b8e80941Smrg *(uint32_t *)dest = num_primitives_written; 1264b8e80941Smrg dest += 4; 1265b8e80941Smrg if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1266b8e80941Smrg *(uint32_t *)dest = primitive_storage_needed; 1267b8e80941Smrg dest += 4; 1268b8e80941Smrg } 1269b8e80941Smrg break; 1270b8e80941Smrg } 1271b8e80941Smrg default: 1272b8e80941Smrg unreachable("trying to get results of unhandled query type"); 1273b8e80941Smrg } 1274b8e80941Smrg 1275b8e80941Smrg if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 1276b8e80941Smrg if (flags & VK_QUERY_RESULT_64_BIT) { 1277b8e80941Smrg *(uint64_t*)dest = available; 1278b8e80941Smrg } else { 1279b8e80941Smrg *(uint32_t*)dest = available; 1280b8e80941Smrg } 1281b8e80941Smrg } 1282b8e80941Smrg } 1283b8e80941Smrg 1284b8e80941Smrg return result; 1285b8e80941Smrg} 1286b8e80941Smrg 1287b8e80941Smrgvoid radv_CmdCopyQueryPoolResults( 1288b8e80941Smrg VkCommandBuffer commandBuffer, 1289b8e80941Smrg VkQueryPool queryPool, 1290b8e80941Smrg uint32_t firstQuery, 1291b8e80941Smrg uint32_t queryCount, 1292b8e80941Smrg VkBuffer dstBuffer, 1293b8e80941Smrg VkDeviceSize dstOffset, 1294b8e80941Smrg VkDeviceSize stride, 1295b8e80941Smrg VkQueryResultFlags flags) 1296b8e80941Smrg{ 1297b8e80941Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1298b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1299b8e80941Smrg 
RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); 1300b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1301b8e80941Smrg unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4; 1302b8e80941Smrg uint64_t va = radv_buffer_get_va(pool->bo); 1303b8e80941Smrg uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo); 1304b8e80941Smrg dest_va += dst_buffer->offset + dstOffset; 1305b8e80941Smrg 1306b8e80941Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo); 1307b8e80941Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo); 1308b8e80941Smrg 1309b8e80941Smrg switch (pool->type) { 1310b8e80941Smrg case VK_QUERY_TYPE_OCCLUSION: 1311b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1312b8e80941Smrg for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1313b8e80941Smrg unsigned query = firstQuery + i; 1314b8e80941Smrg uint64_t src_va = va + query * pool->stride + pool->stride - 4; 1315b8e80941Smrg 1316b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 1317b8e80941Smrg 1318b8e80941Smrg /* Waits on the upper word of the last DB entry */ 1319b8e80941Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, 1320b8e80941Smrg src_va, 0x80000000, 0xffffffff); 1321b8e80941Smrg } 1322b8e80941Smrg } 1323b8e80941Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline, 1324b8e80941Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, 1325b8e80941Smrg dst_buffer->offset + dstOffset, 1326b8e80941Smrg pool->stride, stride, 1327b8e80941Smrg queryCount, flags, 0, 0); 1328b8e80941Smrg break; 1329b8e80941Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1330b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1331b8e80941Smrg for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1332b8e80941Smrg unsigned query = firstQuery + i; 1333b8e80941Smrg 1334b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 1335b8e80941Smrg 1336b8e80941Smrg uint64_t avail_va = 
va + pool->availability_offset + 4 * query; 1337b8e80941Smrg 1338b8e80941Smrg /* This waits on the ME. All copies below are done on the ME */ 1339b8e80941Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, 1340b8e80941Smrg avail_va, 1, 0xffffffff); 1341b8e80941Smrg } 1342b8e80941Smrg } 1343b8e80941Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, 1344b8e80941Smrg pool->bo, dst_buffer->bo, firstQuery * pool->stride, 1345b8e80941Smrg dst_buffer->offset + dstOffset, 1346b8e80941Smrg pool->stride, stride, queryCount, flags, 1347b8e80941Smrg pool->pipeline_stats_mask, 1348b8e80941Smrg pool->availability_offset + 4 * firstQuery); 1349b8e80941Smrg break; 1350b8e80941Smrg case VK_QUERY_TYPE_TIMESTAMP: 1351b8e80941Smrg for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1352b8e80941Smrg unsigned query = firstQuery + i; 1353b8e80941Smrg uint64_t local_src_va = va + query * pool->stride; 1354b8e80941Smrg 1355b8e80941Smrg MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 19); 1356b8e80941Smrg 1357b8e80941Smrg 1358b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1359b8e80941Smrg /* Wait on the high 32 bits of the timestamp in 1360b8e80941Smrg * case the low part is 0xffffffff. 
1361b8e80941Smrg */ 1362b8e80941Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, 1363b8e80941Smrg local_src_va + 4, 1364b8e80941Smrg TIMESTAMP_NOT_READY >> 32, 1365b8e80941Smrg 0xffffffff); 1366b8e80941Smrg } 1367b8e80941Smrg if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 1368b8e80941Smrg uint64_t avail_dest_va = dest_va + elem_size; 1369b8e80941Smrg 1370b8e80941Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 1371b8e80941Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | 1372b8e80941Smrg COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM)); 1373b8e80941Smrg radeon_emit(cs, local_src_va); 1374b8e80941Smrg radeon_emit(cs, local_src_va >> 32); 1375b8e80941Smrg radeon_emit(cs, avail_dest_va); 1376b8e80941Smrg radeon_emit(cs, avail_dest_va >> 32); 1377b8e80941Smrg } 1378b8e80941Smrg 1379b8e80941Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 1380b8e80941Smrg radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | 1381b8e80941Smrg COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) | 1382b8e80941Smrg ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0)); 1383b8e80941Smrg radeon_emit(cs, local_src_va); 1384b8e80941Smrg radeon_emit(cs, local_src_va >> 32); 1385b8e80941Smrg radeon_emit(cs, dest_va); 1386b8e80941Smrg radeon_emit(cs, dest_va >> 32); 1387b8e80941Smrg 1388b8e80941Smrg 1389b8e80941Smrg assert(cs->cdw <= cdw_max); 1390b8e80941Smrg } 1391b8e80941Smrg break; 1392b8e80941Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1393b8e80941Smrg if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1394b8e80941Smrg for(unsigned i = 0; i < queryCount; i++) { 1395b8e80941Smrg unsigned query = firstQuery + i; 1396b8e80941Smrg uint64_t src_va = va + query * pool->stride; 1397b8e80941Smrg 1398b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4); 1399b8e80941Smrg 1400b8e80941Smrg /* Wait on the upper word of all results. 
*/ 1401b8e80941Smrg for (unsigned j = 0; j < 4; j++, src_va += 8) { 1402b8e80941Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, 1403b8e80941Smrg src_va + 4, 0x80000000, 1404b8e80941Smrg 0xffffffff); 1405b8e80941Smrg } 1406b8e80941Smrg } 1407b8e80941Smrg } 1408b8e80941Smrg 1409b8e80941Smrg radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline, 1410b8e80941Smrg pool->bo, dst_buffer->bo, 1411b8e80941Smrg firstQuery * pool->stride, 1412b8e80941Smrg dst_buffer->offset + dstOffset, 1413b8e80941Smrg pool->stride, stride, 1414b8e80941Smrg queryCount, flags, 0, 0); 1415b8e80941Smrg break; 1416b8e80941Smrg default: 1417b8e80941Smrg unreachable("trying to get results of unhandled query type"); 1418b8e80941Smrg } 1419b8e80941Smrg 1420b8e80941Smrg} 1421b8e80941Smrg 1422b8e80941Smrgvoid radv_CmdResetQueryPool( 1423b8e80941Smrg VkCommandBuffer commandBuffer, 1424b8e80941Smrg VkQueryPool queryPool, 1425b8e80941Smrg uint32_t firstQuery, 1426b8e80941Smrg uint32_t queryCount) 1427b8e80941Smrg{ 1428b8e80941Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1429b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1430b8e80941Smrg uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP 1431b8e80941Smrg ? TIMESTAMP_NOT_READY : 0; 1432b8e80941Smrg uint32_t flush_bits = 0; 1433b8e80941Smrg 1434b8e80941Smrg flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo, 1435b8e80941Smrg firstQuery * pool->stride, 1436b8e80941Smrg queryCount * pool->stride, value); 1437b8e80941Smrg 1438b8e80941Smrg if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 1439b8e80941Smrg flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo, 1440b8e80941Smrg pool->availability_offset + firstQuery * 4, 1441b8e80941Smrg queryCount * 4, 0); 1442b8e80941Smrg } 1443b8e80941Smrg 1444b8e80941Smrg if (flush_bits) { 1445b8e80941Smrg /* Only need to flush caches for the compute shader path. 
*/ 1446b8e80941Smrg cmd_buffer->pending_reset_query = true; 1447b8e80941Smrg cmd_buffer->state.flush_bits |= flush_bits; 1448b8e80941Smrg } 1449b8e80941Smrg} 1450b8e80941Smrg 1451b8e80941Smrgvoid radv_ResetQueryPoolEXT( 1452b8e80941Smrg VkDevice _device, 1453b8e80941Smrg VkQueryPool queryPool, 1454b8e80941Smrg uint32_t firstQuery, 1455b8e80941Smrg uint32_t queryCount) 1456b8e80941Smrg{ 1457b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1458b8e80941Smrg 1459b8e80941Smrg uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP 1460b8e80941Smrg ? TIMESTAMP_NOT_READY : 0; 1461b8e80941Smrg uint32_t *data = (uint32_t*)(pool->ptr + firstQuery * pool->stride); 1462b8e80941Smrg uint32_t *data_end = (uint32_t*)(pool->ptr + (firstQuery + queryCount) * pool->stride); 1463b8e80941Smrg 1464b8e80941Smrg for(uint32_t *p = data; p != data_end; ++p) 1465b8e80941Smrg *p = value; 1466b8e80941Smrg 1467b8e80941Smrg if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { 1468b8e80941Smrg memset(pool->ptr + pool->availability_offset + firstQuery * 4, 1469b8e80941Smrg 0, queryCount * 4); 1470b8e80941Smrg } 1471b8e80941Smrg} 1472b8e80941Smrg 1473b8e80941Smrgstatic unsigned event_type_for_stream(unsigned stream) 1474b8e80941Smrg{ 1475b8e80941Smrg switch (stream) { 1476b8e80941Smrg default: 1477b8e80941Smrg case 0: return V_028A90_SAMPLE_STREAMOUTSTATS; 1478b8e80941Smrg case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1; 1479b8e80941Smrg case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2; 1480b8e80941Smrg case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3; 1481b8e80941Smrg } 1482b8e80941Smrg} 1483b8e80941Smrg 1484b8e80941Smrgstatic void emit_query_flush(struct radv_cmd_buffer *cmd_buffer, 1485b8e80941Smrg struct radv_query_pool *pool) 1486b8e80941Smrg{ 1487b8e80941Smrg if (cmd_buffer->pending_reset_query) { 1488b8e80941Smrg if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) { 1489b8e80941Smrg /* Only need to flush caches if the query pool size is 1490b8e80941Smrg * large enough to be resetted 
using the compute shader 1491b8e80941Smrg * path. Small pools don't need any cache flushes 1492b8e80941Smrg * because we use a CP dma clear. 1493b8e80941Smrg */ 1494b8e80941Smrg si_emit_cache_flush(cmd_buffer); 1495b8e80941Smrg } 1496b8e80941Smrg } 1497b8e80941Smrg} 1498b8e80941Smrg 1499b8e80941Smrgstatic void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, 1500b8e80941Smrg uint64_t va, 1501b8e80941Smrg VkQueryType query_type, 1502b8e80941Smrg VkQueryControlFlags flags, 1503b8e80941Smrg uint32_t index) 1504b8e80941Smrg{ 1505b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1506b8e80941Smrg switch (query_type) { 1507b8e80941Smrg case VK_QUERY_TYPE_OCCLUSION: 1508b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 7); 1509b8e80941Smrg 1510b8e80941Smrg ++cmd_buffer->state.active_occlusion_queries; 1511b8e80941Smrg if (cmd_buffer->state.active_occlusion_queries == 1) { 1512b8e80941Smrg if (flags & VK_QUERY_CONTROL_PRECISE_BIT) { 1513b8e80941Smrg /* This is the first occlusion query, enable 1514b8e80941Smrg * the hint if the precision bit is set. 1515b8e80941Smrg */ 1516b8e80941Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = true; 1517b8e80941Smrg } 1518b8e80941Smrg 1519b8e80941Smrg radv_set_db_count_control(cmd_buffer); 1520b8e80941Smrg } else { 1521b8e80941Smrg if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && 1522b8e80941Smrg !cmd_buffer->state.perfect_occlusion_queries_enabled) { 1523b8e80941Smrg /* This is not the first query, but this one 1524b8e80941Smrg * needs to enable precision, DB_COUNT_CONTROL 1525b8e80941Smrg * has to be updated accordingly. 
1526b8e80941Smrg */ 1527b8e80941Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = true; 1528b8e80941Smrg 1529b8e80941Smrg radv_set_db_count_control(cmd_buffer); 1530b8e80941Smrg } 1531b8e80941Smrg } 1532b8e80941Smrg 1533b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1534b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); 1535b8e80941Smrg radeon_emit(cs, va); 1536b8e80941Smrg radeon_emit(cs, va >> 32); 1537b8e80941Smrg break; 1538b8e80941Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1539b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 1540b8e80941Smrg 1541b8e80941Smrg ++cmd_buffer->state.active_pipeline_queries; 1542b8e80941Smrg if (cmd_buffer->state.active_pipeline_queries == 1) { 1543b8e80941Smrg cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS; 1544b8e80941Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS; 1545b8e80941Smrg } 1546b8e80941Smrg 1547b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1548b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 1549b8e80941Smrg radeon_emit(cs, va); 1550b8e80941Smrg radeon_emit(cs, va >> 32); 1551b8e80941Smrg break; 1552b8e80941Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1553b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 1554b8e80941Smrg 1555b8e80941Smrg assert(index < MAX_SO_STREAMS); 1556b8e80941Smrg 1557b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1558b8e80941Smrg radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); 1559b8e80941Smrg radeon_emit(cs, va); 1560b8e80941Smrg radeon_emit(cs, va >> 32); 1561b8e80941Smrg break; 1562b8e80941Smrg default: 1563b8e80941Smrg unreachable("beginning unhandled query type"); 1564b8e80941Smrg } 1565b8e80941Smrg 1566b8e80941Smrg} 1567b8e80941Smrg 1568b8e80941Smrgstatic void emit_end_query(struct radv_cmd_buffer *cmd_buffer, 1569b8e80941Smrg uint64_t va, uint64_t avail_va, 
1570b8e80941Smrg VkQueryType query_type, uint32_t index) 1571b8e80941Smrg{ 1572b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1573b8e80941Smrg switch (query_type) { 1574b8e80941Smrg case VK_QUERY_TYPE_OCCLUSION: 1575b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 14); 1576b8e80941Smrg 1577b8e80941Smrg cmd_buffer->state.active_occlusion_queries--; 1578b8e80941Smrg if (cmd_buffer->state.active_occlusion_queries == 0) { 1579b8e80941Smrg radv_set_db_count_control(cmd_buffer); 1580b8e80941Smrg 1581b8e80941Smrg /* Reset the perfect occlusion queries hint now that no 1582b8e80941Smrg * queries are active. 1583b8e80941Smrg */ 1584b8e80941Smrg cmd_buffer->state.perfect_occlusion_queries_enabled = false; 1585b8e80941Smrg } 1586b8e80941Smrg 1587b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1588b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); 1589b8e80941Smrg radeon_emit(cs, va + 8); 1590b8e80941Smrg radeon_emit(cs, (va + 8) >> 32); 1591b8e80941Smrg 1592b8e80941Smrg break; 1593b8e80941Smrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1594b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 16); 1595b8e80941Smrg 1596b8e80941Smrg cmd_buffer->state.active_pipeline_queries--; 1597b8e80941Smrg if (cmd_buffer->state.active_pipeline_queries == 0) { 1598b8e80941Smrg cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS; 1599b8e80941Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS; 1600b8e80941Smrg } 1601b8e80941Smrg va += pipelinestat_block_size; 1602b8e80941Smrg 1603b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1604b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 1605b8e80941Smrg radeon_emit(cs, va); 1606b8e80941Smrg radeon_emit(cs, va >> 32); 1607b8e80941Smrg 1608b8e80941Smrg si_cs_emit_write_event_eop(cs, 1609b8e80941Smrg cmd_buffer->device->physical_device->rad_info.chip_class, 1610b8e80941Smrg 
radv_cmd_buffer_uses_mec(cmd_buffer), 1611b8e80941Smrg V_028A90_BOTTOM_OF_PIPE_TS, 0, 1612b8e80941Smrg EOP_DATA_SEL_VALUE_32BIT, 1613b8e80941Smrg avail_va, 1, 1614b8e80941Smrg cmd_buffer->gfx9_eop_bug_va); 1615b8e80941Smrg break; 1616b8e80941Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1617b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cs, 4); 1618b8e80941Smrg 1619b8e80941Smrg assert(index < MAX_SO_STREAMS); 1620b8e80941Smrg 1621b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 1622b8e80941Smrg radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); 1623b8e80941Smrg radeon_emit(cs, (va + 16)); 1624b8e80941Smrg radeon_emit(cs, (va + 16) >> 32); 1625b8e80941Smrg break; 1626b8e80941Smrg default: 1627b8e80941Smrg unreachable("ending unhandled query type"); 1628b8e80941Smrg } 1629b8e80941Smrg} 1630b8e80941Smrg 1631b8e80941Smrgvoid radv_CmdBeginQueryIndexedEXT( 1632b8e80941Smrg VkCommandBuffer commandBuffer, 1633b8e80941Smrg VkQueryPool queryPool, 1634b8e80941Smrg uint32_t query, 1635b8e80941Smrg VkQueryControlFlags flags, 1636b8e80941Smrg uint32_t index) 1637b8e80941Smrg{ 1638b8e80941Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1639b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1640b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1641b8e80941Smrg uint64_t va = radv_buffer_get_va(pool->bo); 1642b8e80941Smrg 1643b8e80941Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); 1644b8e80941Smrg 1645b8e80941Smrg emit_query_flush(cmd_buffer, pool); 1646b8e80941Smrg 1647b8e80941Smrg va += pool->stride * query; 1648b8e80941Smrg 1649b8e80941Smrg emit_begin_query(cmd_buffer, va, pool->type, flags, index); 1650b8e80941Smrg} 1651b8e80941Smrg 1652b8e80941Smrgvoid radv_CmdBeginQuery( 1653b8e80941Smrg VkCommandBuffer commandBuffer, 1654b8e80941Smrg VkQueryPool queryPool, 1655b8e80941Smrg uint32_t query, 1656b8e80941Smrg VkQueryControlFlags flags) 1657b8e80941Smrg{ 1658b8e80941Smrg 
radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0); 1659b8e80941Smrg} 1660b8e80941Smrg 1661b8e80941Smrgvoid radv_CmdEndQueryIndexedEXT( 1662b8e80941Smrg VkCommandBuffer commandBuffer, 1663b8e80941Smrg VkQueryPool queryPool, 1664b8e80941Smrg uint32_t query, 1665b8e80941Smrg uint32_t index) 1666b8e80941Smrg{ 1667b8e80941Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1668b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1669b8e80941Smrg uint64_t va = radv_buffer_get_va(pool->bo); 1670b8e80941Smrg uint64_t avail_va = va + pool->availability_offset + 4 * query; 1671b8e80941Smrg va += pool->stride * query; 1672b8e80941Smrg 1673b8e80941Smrg /* Do not need to add the pool BO to the list because the query must 1674b8e80941Smrg * currently be active, which means the BO is already in the list. 1675b8e80941Smrg */ 1676b8e80941Smrg emit_end_query(cmd_buffer, va, avail_va, pool->type, index); 1677b8e80941Smrg 1678b8e80941Smrg /* 1679b8e80941Smrg * For multiview we have to emit a query for each bit in the mask, 1680b8e80941Smrg * however the first query we emit will get the totals for all the 1681b8e80941Smrg * operations, so we don't want to get a real value in the other 1682b8e80941Smrg * queries. This emits a fake begin/end sequence so the waiting 1683b8e80941Smrg * code gets a completed query value and doesn't hang, but the 1684b8e80941Smrg * query returns 0. 
1685b8e80941Smrg */ 1686b8e80941Smrg if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { 1687b8e80941Smrg uint64_t avail_va = va + pool->availability_offset + 4 * query; 1688b8e80941Smrg 1689b8e80941Smrg 1690b8e80941Smrg for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) { 1691b8e80941Smrg va += pool->stride; 1692b8e80941Smrg avail_va += 4; 1693b8e80941Smrg emit_begin_query(cmd_buffer, va, pool->type, 0, 0); 1694b8e80941Smrg emit_end_query(cmd_buffer, va, avail_va, pool->type, 0); 1695b8e80941Smrg } 1696b8e80941Smrg } 1697b8e80941Smrg} 1698b8e80941Smrg 1699b8e80941Smrgvoid radv_CmdEndQuery( 1700b8e80941Smrg VkCommandBuffer commandBuffer, 1701b8e80941Smrg VkQueryPool queryPool, 1702b8e80941Smrg uint32_t query) 1703b8e80941Smrg{ 1704b8e80941Smrg radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0); 1705b8e80941Smrg} 1706b8e80941Smrg 1707b8e80941Smrgvoid radv_CmdWriteTimestamp( 1708b8e80941Smrg VkCommandBuffer commandBuffer, 1709b8e80941Smrg VkPipelineStageFlagBits pipelineStage, 1710b8e80941Smrg VkQueryPool queryPool, 1711b8e80941Smrg uint32_t query) 1712b8e80941Smrg{ 1713b8e80941Smrg RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1714b8e80941Smrg RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1715b8e80941Smrg bool mec = radv_cmd_buffer_uses_mec(cmd_buffer); 1716b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1717b8e80941Smrg uint64_t va = radv_buffer_get_va(pool->bo); 1718b8e80941Smrg uint64_t query_va = va + pool->stride * query; 1719b8e80941Smrg 1720b8e80941Smrg radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); 1721b8e80941Smrg 1722b8e80941Smrg emit_query_flush(cmd_buffer, pool); 1723b8e80941Smrg 1724b8e80941Smrg int num_queries = 1; 1725b8e80941Smrg if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) 1726b8e80941Smrg num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask); 1727b8e80941Smrg 1728b8e80941Smrg MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries); 1729b8e80941Smrg 1730b8e80941Smrg for (unsigned i = 0; i < num_queries; i++) { 1731b8e80941Smrg switch(pipelineStage) { 1732b8e80941Smrg case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: 1733b8e80941Smrg radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 1734b8e80941Smrg radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | 1735b8e80941Smrg COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | 1736b8e80941Smrg COPY_DATA_DST_SEL(V_370_MEM)); 1737b8e80941Smrg radeon_emit(cs, 0); 1738b8e80941Smrg radeon_emit(cs, 0); 1739b8e80941Smrg radeon_emit(cs, query_va); 1740b8e80941Smrg radeon_emit(cs, query_va >> 32); 1741b8e80941Smrg break; 1742b8e80941Smrg default: 1743b8e80941Smrg si_cs_emit_write_event_eop(cs, 1744b8e80941Smrg cmd_buffer->device->physical_device->rad_info.chip_class, 1745b8e80941Smrg mec, 1746b8e80941Smrg V_028A90_BOTTOM_OF_PIPE_TS, 0, 1747b8e80941Smrg EOP_DATA_SEL_TIMESTAMP, 1748b8e80941Smrg query_va, 0, 1749b8e80941Smrg cmd_buffer->gfx9_eop_bug_va); 1750b8e80941Smrg break; 1751b8e80941Smrg } 1752b8e80941Smrg query_va += pool->stride; 1753b8e80941Smrg } 1754b8e80941Smrg assert(cmd_buffer->cs->cdw <= cdw_max); 1755b8e80941Smrg} 1756