/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg#include <assert.h> 2501e04c3fSmrg#include <stdbool.h> 2601e04c3fSmrg#include <string.h> 2701e04c3fSmrg#include <unistd.h> 2801e04c3fSmrg#include <fcntl.h> 2901e04c3fSmrg 3001e04c3fSmrg#include "anv_private.h" 3101e04c3fSmrg 3201e04c3fSmrg#include "genxml/gen_macros.h" 3301e04c3fSmrg#include "genxml/genX_pack.h" 3401e04c3fSmrg 357ec681f3Smrg/* We reserve : 367ec681f3Smrg * - GPR 14 for perf queries 377ec681f3Smrg * - GPR 15 for conditional rendering 387ec681f3Smrg */ 397ec681f3Smrg#define MI_BUILDER_NUM_ALLOC_GPRS 14 407ec681f3Smrg#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8 419f464c52Smaya#define __gen_get_batch_dwords anv_batch_emit_dwords 429f464c52Smaya#define __gen_address_offset anv_address_add 437ec681f3Smrg#define __gen_get_batch_address(b, a) anv_batch_address(b, a) 447ec681f3Smrg#include "common/mi_builder.h" 457ec681f3Smrg#include "perf/intel_perf.h" 467ec681f3Smrg#include "perf/intel_perf_mdapi.h" 477ec681f3Smrg#include "perf/intel_perf_regs.h" 487ec681f3Smrg 497ec681f3Smrg#include "vk_util.h" 507ec681f3Smrg 517ec681f3Smrgstatic struct anv_address 527ec681f3Smrganv_query_address(struct anv_query_pool *pool, uint32_t query) 537ec681f3Smrg{ 547ec681f3Smrg return (struct anv_address) { 557ec681f3Smrg .bo = pool->bo, 567ec681f3Smrg .offset = query * pool->stride, 577ec681f3Smrg }; 587ec681f3Smrg} 599f464c52Smaya 6001e04c3fSmrgVkResult genX(CreateQueryPool)( 6101e04c3fSmrg VkDevice _device, 6201e04c3fSmrg const VkQueryPoolCreateInfo* pCreateInfo, 6301e04c3fSmrg const VkAllocationCallbacks* pAllocator, 6401e04c3fSmrg VkQueryPool* pQueryPool) 6501e04c3fSmrg{ 6601e04c3fSmrg ANV_FROM_HANDLE(anv_device, device, _device); 677ec681f3Smrg const struct anv_physical_device *pdevice = device->physical; 687ec681f3Smrg#if GFX_VER >= 8 697ec681f3Smrg const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; 707ec681f3Smrg struct intel_perf_counter_pass *counter_pass; 717ec681f3Smrg struct intel_perf_query_info 
**pass_query; 727ec681f3Smrg uint32_t n_passes = 0; 737ec681f3Smrg#endif 747ec681f3Smrg uint32_t data_offset = 0; 757ec681f3Smrg VK_MULTIALLOC(ma); 7601e04c3fSmrg VkResult result; 7701e04c3fSmrg 7801e04c3fSmrg assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); 7901e04c3fSmrg 8001e04c3fSmrg /* Query pool slots are made up of some number of 64-bit values packed 817ec681f3Smrg * tightly together. For most query types have the first 64-bit value is 827ec681f3Smrg * the "available" bit which is 0 when the query is unavailable and 1 when 837ec681f3Smrg * it is available. The 64-bit values that follow are determined by the 847ec681f3Smrg * type of query. 857ec681f3Smrg * 867ec681f3Smrg * For performance queries, we have a requirement to align OA reports at 877ec681f3Smrg * 64bytes so we put those first and have the "available" bit behind 887ec681f3Smrg * together with some other counters. 8901e04c3fSmrg */ 907ec681f3Smrg uint32_t uint64s_per_slot = 0; 917ec681f3Smrg 927ec681f3Smrg VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1); 9301e04c3fSmrg 9401e04c3fSmrg VkQueryPipelineStatisticFlags pipeline_statistics = 0; 9501e04c3fSmrg switch (pCreateInfo->queryType) { 9601e04c3fSmrg case VK_QUERY_TYPE_OCCLUSION: 9701e04c3fSmrg /* Occlusion queries have two values: begin and end. */ 987ec681f3Smrg uint64s_per_slot = 1 + 2; 9901e04c3fSmrg break; 10001e04c3fSmrg case VK_QUERY_TYPE_TIMESTAMP: 10101e04c3fSmrg /* Timestamps just have the one timestamp value */ 1027ec681f3Smrg uint64s_per_slot = 1 + 1; 10301e04c3fSmrg break; 10401e04c3fSmrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: 10501e04c3fSmrg pipeline_statistics = pCreateInfo->pipelineStatistics; 10601e04c3fSmrg /* We're going to trust this field implicitly so we need to ensure that 10701e04c3fSmrg * no unhandled extension bits leak in. 
10801e04c3fSmrg */ 10901e04c3fSmrg pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; 11001e04c3fSmrg 11101e04c3fSmrg /* Statistics queries have a min and max for every statistic */ 1127ec681f3Smrg uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics); 11301e04c3fSmrg break; 1149f464c52Smaya case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1159f464c52Smaya /* Transform feedback queries are 4 values, begin/end for 1169f464c52Smaya * written/available. 1179f464c52Smaya */ 1187ec681f3Smrg uint64s_per_slot = 1 + 4; 1197ec681f3Smrg break; 1207ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 1217ec681f3Smrg const struct intel_perf_query_field_layout *layout = 1227ec681f3Smrg &pdevice->perf->query_layout; 1237ec681f3Smrg 1247ec681f3Smrg uint64s_per_slot = 2; /* availability + marker */ 1257ec681f3Smrg /* Align to the requirement of the layout */ 1267ec681f3Smrg uint64s_per_slot = align_u32(uint64s_per_slot, 1277ec681f3Smrg DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); 1287ec681f3Smrg data_offset = uint64s_per_slot * sizeof(uint64_t); 1297ec681f3Smrg /* Add the query data for begin & end commands */ 1307ec681f3Smrg uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); 1317ec681f3Smrg break; 1327ec681f3Smrg } 1337ec681f3Smrg#if GFX_VER >= 8 1347ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 1357ec681f3Smrg const struct intel_perf_query_field_layout *layout = 1367ec681f3Smrg &pdevice->perf->query_layout; 1377ec681f3Smrg 1387ec681f3Smrg perf_query_info = vk_find_struct_const(pCreateInfo->pNext, 1397ec681f3Smrg QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); 1407ec681f3Smrg n_passes = intel_perf_get_n_passes(pdevice->perf, 1417ec681f3Smrg perf_query_info->pCounterIndices, 1427ec681f3Smrg perf_query_info->counterIndexCount, 1437ec681f3Smrg NULL); 1447ec681f3Smrg vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass, 1457ec681f3Smrg perf_query_info->counterIndexCount); 1467ec681f3Smrg vk_multialloc_add(&ma, &pass_query, 
struct intel_perf_query_info *, 1477ec681f3Smrg n_passes); 1487ec681f3Smrg uint64s_per_slot = 4 /* availability + small batch */; 1497ec681f3Smrg /* Align to the requirement of the layout */ 1507ec681f3Smrg uint64s_per_slot = align_u32(uint64s_per_slot, 1517ec681f3Smrg DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); 1527ec681f3Smrg data_offset = uint64s_per_slot * sizeof(uint64_t); 1537ec681f3Smrg /* Add the query data for begin & end commands */ 1547ec681f3Smrg uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); 1557ec681f3Smrg /* Multiply by the number of passes */ 1567ec681f3Smrg uint64s_per_slot *= n_passes; 1579f464c52Smaya break; 1587ec681f3Smrg } 1597ec681f3Smrg#endif 16001e04c3fSmrg default: 16101e04c3fSmrg assert(!"Invalid query type"); 16201e04c3fSmrg } 16301e04c3fSmrg 1647ec681f3Smrg if (!vk_object_multialloc(&device->vk, &ma, pAllocator, 1657ec681f3Smrg VK_OBJECT_TYPE_QUERY_POOL)) 1667ec681f3Smrg return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 16701e04c3fSmrg 16801e04c3fSmrg pool->type = pCreateInfo->queryType; 16901e04c3fSmrg pool->pipeline_statistics = pipeline_statistics; 17001e04c3fSmrg pool->stride = uint64s_per_slot * sizeof(uint64_t); 17101e04c3fSmrg pool->slots = pCreateInfo->queryCount; 17201e04c3fSmrg 1737ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { 1747ec681f3Smrg pool->data_offset = data_offset; 1757ec681f3Smrg pool->snapshot_size = (pool->stride - data_offset) / 2; 1767ec681f3Smrg } 1777ec681f3Smrg#if GFX_VER >= 8 1787ec681f3Smrg else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 1797ec681f3Smrg pool->pass_size = pool->stride / n_passes; 1807ec681f3Smrg pool->data_offset = data_offset; 1817ec681f3Smrg pool->snapshot_size = (pool->pass_size - data_offset) / 2; 1827ec681f3Smrg pool->n_counters = perf_query_info->counterIndexCount; 1837ec681f3Smrg pool->counter_pass = counter_pass; 1847ec681f3Smrg intel_perf_get_counters_passes(pdevice->perf, 1857ec681f3Smrg 
perf_query_info->pCounterIndices, 1867ec681f3Smrg perf_query_info->counterIndexCount, 1877ec681f3Smrg pool->counter_pass); 1887ec681f3Smrg pool->n_passes = n_passes; 1897ec681f3Smrg pool->pass_query = pass_query; 1907ec681f3Smrg intel_perf_get_n_passes(pdevice->perf, 1917ec681f3Smrg perf_query_info->pCounterIndices, 1927ec681f3Smrg perf_query_info->counterIndexCount, 1937ec681f3Smrg pool->pass_query); 1947ec681f3Smrg } 1957ec681f3Smrg#endif 1967ec681f3Smrg 1977ec681f3Smrg uint64_t size = pool->slots * (uint64_t)pool->stride; 1987ec681f3Smrg result = anv_device_alloc_bo(device, "query-pool", size, 1997ec681f3Smrg ANV_BO_ALLOC_MAPPED | 2007ec681f3Smrg ANV_BO_ALLOC_SNOOPED, 2017ec681f3Smrg 0 /* explicit_address */, 2027ec681f3Smrg &pool->bo); 20301e04c3fSmrg if (result != VK_SUCCESS) 20401e04c3fSmrg goto fail; 20501e04c3fSmrg 2067ec681f3Smrg#if GFX_VER >= 8 2077ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 2087ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 2097ec681f3Smrg struct mi_builder b; 2107ec681f3Smrg struct anv_batch batch = { 2117ec681f3Smrg .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p), 2127ec681f3Smrg .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset, 2137ec681f3Smrg }; 2147ec681f3Smrg batch.next = batch.start; 2157ec681f3Smrg 2167ec681f3Smrg mi_builder_init(&b, &device->info, &batch); 2177ec681f3Smrg mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG), 2187ec681f3Smrg mi_imm(p * (uint64_t)pool->pass_size)); 2197ec681f3Smrg anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); 2207ec681f3Smrg } 2217ec681f3Smrg } 2227ec681f3Smrg#endif 22301e04c3fSmrg 22401e04c3fSmrg *pQueryPool = anv_query_pool_to_handle(pool); 22501e04c3fSmrg 22601e04c3fSmrg return VK_SUCCESS; 22701e04c3fSmrg 22801e04c3fSmrg fail: 2297ec681f3Smrg vk_free2(&device->vk.alloc, pAllocator, pool); 23001e04c3fSmrg 23101e04c3fSmrg return result; 23201e04c3fSmrg} 23301e04c3fSmrg 23401e04c3fSmrgvoid 
genX(DestroyQueryPool)( 23501e04c3fSmrg VkDevice _device, 23601e04c3fSmrg VkQueryPool _pool, 23701e04c3fSmrg const VkAllocationCallbacks* pAllocator) 23801e04c3fSmrg{ 23901e04c3fSmrg ANV_FROM_HANDLE(anv_device, device, _device); 24001e04c3fSmrg ANV_FROM_HANDLE(anv_query_pool, pool, _pool); 24101e04c3fSmrg 24201e04c3fSmrg if (!pool) 24301e04c3fSmrg return; 24401e04c3fSmrg 2457ec681f3Smrg anv_device_release_bo(device, pool->bo); 2467ec681f3Smrg vk_object_free(&device->vk, pAllocator, pool); 2477ec681f3Smrg} 2487ec681f3Smrg 2497ec681f3Smrg#if GFX_VER >= 8 2507ec681f3Smrg/** 2517ec681f3Smrg * VK_KHR_performance_query layout : 2527ec681f3Smrg * 2537ec681f3Smrg * -------------------------------------------- 2547ec681f3Smrg * | availability (8b) | | | 2557ec681f3Smrg * |-------------------------------| | | 2567ec681f3Smrg * | Small batch loading | | | 2577ec681f3Smrg * | ANV_PERF_QUERY_OFFSET_REG | | | 2587ec681f3Smrg * | (24b) | | Pass 0 | 2597ec681f3Smrg * |-------------------------------| | | 2607ec681f3Smrg * | some padding (see | | | 2617ec681f3Smrg * | query_field_layout:alignment) | | | 2627ec681f3Smrg * |-------------------------------| | | 2637ec681f3Smrg * | query data | | | 2647ec681f3Smrg * | (2 * query_field_layout:size) | | | 2657ec681f3Smrg * |-------------------------------|-- | Query 0 2667ec681f3Smrg * | availability (8b) | | | 2677ec681f3Smrg * |-------------------------------| | | 2687ec681f3Smrg * | Small batch loading | | | 2697ec681f3Smrg * | ANV_PERF_QUERY_OFFSET_REG | | | 2707ec681f3Smrg * | (24b) | | Pass 1 | 2717ec681f3Smrg * |-------------------------------| | | 2727ec681f3Smrg * | some padding (see | | | 2737ec681f3Smrg * | query_field_layout:alignment) | | | 2747ec681f3Smrg * |-------------------------------| | | 2757ec681f3Smrg * | query data | | | 2767ec681f3Smrg * | (2 * query_field_layout:size) | | | 2777ec681f3Smrg * |-------------------------------|----------- 2787ec681f3Smrg * | availability (8b) | | | 2797ec681f3Smrg * 
|-------------------------------| | | 2807ec681f3Smrg * | Small batch loading | | | 2817ec681f3Smrg * | ANV_PERF_QUERY_OFFSET_REG | | | 2827ec681f3Smrg * | (24b) | | Pass 0 | 2837ec681f3Smrg * |-------------------------------| | | 2847ec681f3Smrg * | some padding (see | | | 2857ec681f3Smrg * | query_field_layout:alignment) | | | 2867ec681f3Smrg * |-------------------------------| | | 2877ec681f3Smrg * | query data | | | 2887ec681f3Smrg * | (2 * query_field_layout:size) | | | 2897ec681f3Smrg * |-------------------------------|-- | Query 1 2907ec681f3Smrg * | ... | | | 2917ec681f3Smrg * -------------------------------------------- 2927ec681f3Smrg */ 2937ec681f3Smrg 2947ec681f3Smrgstatic uint64_t 2957ec681f3Smrgkhr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass) 2967ec681f3Smrg{ 2977ec681f3Smrg return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size; 2987ec681f3Smrg} 2997ec681f3Smrg 3007ec681f3Smrgstatic uint64_t 3017ec681f3Smrgkhr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) 3027ec681f3Smrg{ 3037ec681f3Smrg return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size + 3047ec681f3Smrg pool->data_offset + (end ? 
pool->snapshot_size : 0); 30501e04c3fSmrg} 30601e04c3fSmrg 30701e04c3fSmrgstatic struct anv_address 3087ec681f3Smrgkhr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass) 30901e04c3fSmrg{ 3107ec681f3Smrg return anv_address_add( 3117ec681f3Smrg (struct anv_address) { .bo = pool->bo, }, 3127ec681f3Smrg khr_perf_query_availability_offset(pool, query, pass)); 3137ec681f3Smrg} 3147ec681f3Smrg 3157ec681f3Smrgstatic struct anv_address 3167ec681f3Smrgkhr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) 3177ec681f3Smrg{ 3187ec681f3Smrg return anv_address_add( 3197ec681f3Smrg (struct anv_address) { .bo = pool->bo, }, 3207ec681f3Smrg khr_perf_query_data_offset(pool, query, pass, end)); 3217ec681f3Smrg} 3227ec681f3Smrg 3237ec681f3Smrgstatic bool 3247ec681f3Smrgkhr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) 3257ec681f3Smrg{ 3267ec681f3Smrg if (anv_batch_has_error(&cmd_buffer->batch)) 3277ec681f3Smrg return false; 3287ec681f3Smrg 3297ec681f3Smrg if (cmd_buffer->self_mod_locations) 3307ec681f3Smrg return true; 3317ec681f3Smrg 3327ec681f3Smrg struct anv_device *device = cmd_buffer->device; 3337ec681f3Smrg const struct anv_physical_device *pdevice = device->physical; 3347ec681f3Smrg 3357ec681f3Smrg cmd_buffer->self_mod_locations = 3367ec681f3Smrg vk_alloc(&cmd_buffer->pool->alloc, 3377ec681f3Smrg pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8, 3387ec681f3Smrg VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 3397ec681f3Smrg 3407ec681f3Smrg if (!cmd_buffer->self_mod_locations) { 3417ec681f3Smrg anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); 3427ec681f3Smrg return false; 3437ec681f3Smrg } 3447ec681f3Smrg 3457ec681f3Smrg return true; 3467ec681f3Smrg} 3477ec681f3Smrg#endif 3487ec681f3Smrg 3497ec681f3Smrg/** 3507ec681f3Smrg * VK_INTEL_performance_query layout : 3517ec681f3Smrg * 3527ec681f3Smrg * --------------------------------- 3537ec681f3Smrg * | 
availability (8b) | 3547ec681f3Smrg * |-------------------------------| 3557ec681f3Smrg * | marker (8b) | 3567ec681f3Smrg * |-------------------------------| 3577ec681f3Smrg * | some padding (see | 3587ec681f3Smrg * | query_field_layout:alignment) | 3597ec681f3Smrg * |-------------------------------| 3607ec681f3Smrg * | query data | 3617ec681f3Smrg * | (2 * query_field_layout:size) | 3627ec681f3Smrg * --------------------------------- 3637ec681f3Smrg */ 3647ec681f3Smrg 3657ec681f3Smrgstatic uint32_t 3667ec681f3Smrgintel_perf_marker_offset(void) 3677ec681f3Smrg{ 3687ec681f3Smrg return 8; 3697ec681f3Smrg} 3707ec681f3Smrg 3717ec681f3Smrgstatic uint32_t 3727ec681f3Smrgintel_perf_query_data_offset(struct anv_query_pool *pool, bool end) 3737ec681f3Smrg{ 3747ec681f3Smrg return pool->data_offset + (end ? pool->snapshot_size : 0); 37501e04c3fSmrg} 37601e04c3fSmrg 37701e04c3fSmrgstatic void 37801e04c3fSmrgcpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, 37901e04c3fSmrg uint32_t value_index, uint64_t result) 38001e04c3fSmrg{ 38101e04c3fSmrg if (flags & VK_QUERY_RESULT_64_BIT) { 38201e04c3fSmrg uint64_t *dst64 = dst_slot; 38301e04c3fSmrg dst64[value_index] = result; 38401e04c3fSmrg } else { 38501e04c3fSmrg uint32_t *dst32 = dst_slot; 38601e04c3fSmrg dst32[value_index] = result; 38701e04c3fSmrg } 38801e04c3fSmrg} 38901e04c3fSmrg 3907ec681f3Smrgstatic void * 3917ec681f3Smrgquery_slot(struct anv_query_pool *pool, uint32_t query) 3927ec681f3Smrg{ 3937ec681f3Smrg return pool->bo->map + query * pool->stride; 3947ec681f3Smrg} 3957ec681f3Smrg 39601e04c3fSmrgstatic bool 3977ec681f3Smrgquery_is_available(struct anv_query_pool *pool, uint32_t query) 39801e04c3fSmrg{ 3997ec681f3Smrg#if GFX_VER >= 8 4007ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 4017ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 4027ec681f3Smrg volatile uint64_t *slot = 4037ec681f3Smrg pool->bo->map + khr_perf_query_availability_offset(pool, query, p); 4047ec681f3Smrg 
if (!slot[0]) 4057ec681f3Smrg return false; 4067ec681f3Smrg } 4077ec681f3Smrg return true; 4087ec681f3Smrg } 4097ec681f3Smrg#endif 4107ec681f3Smrg 4117ec681f3Smrg return *(volatile uint64_t *)query_slot(pool, query); 41201e04c3fSmrg} 41301e04c3fSmrg 41401e04c3fSmrgstatic VkResult 41501e04c3fSmrgwait_for_available(struct anv_device *device, 4167ec681f3Smrg struct anv_query_pool *pool, uint32_t query) 41701e04c3fSmrg{ 4187ec681f3Smrg uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC); 41901e04c3fSmrg 4207ec681f3Smrg while (anv_gettime_ns() < abs_timeout) { 4217ec681f3Smrg if (query_is_available(pool, query)) 4227ec681f3Smrg return VK_SUCCESS; 4237ec681f3Smrg VkResult status = anv_device_query_status(device); 4247ec681f3Smrg if (status != VK_SUCCESS) 4257ec681f3Smrg return status; 42601e04c3fSmrg } 4277ec681f3Smrg 4287ec681f3Smrg return anv_device_set_lost(device, "query timeout"); 42901e04c3fSmrg} 43001e04c3fSmrg 43101e04c3fSmrgVkResult genX(GetQueryPoolResults)( 43201e04c3fSmrg VkDevice _device, 43301e04c3fSmrg VkQueryPool queryPool, 43401e04c3fSmrg uint32_t firstQuery, 43501e04c3fSmrg uint32_t queryCount, 43601e04c3fSmrg size_t dataSize, 43701e04c3fSmrg void* pData, 43801e04c3fSmrg VkDeviceSize stride, 43901e04c3fSmrg VkQueryResultFlags flags) 44001e04c3fSmrg{ 44101e04c3fSmrg ANV_FROM_HANDLE(anv_device, device, _device); 44201e04c3fSmrg ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 44301e04c3fSmrg 44401e04c3fSmrg assert(pool->type == VK_QUERY_TYPE_OCCLUSION || 44501e04c3fSmrg pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || 4469f464c52Smaya pool->type == VK_QUERY_TYPE_TIMESTAMP || 4477ec681f3Smrg pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || 4487ec681f3Smrg pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || 4497ec681f3Smrg pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); 45001e04c3fSmrg 45101e04c3fSmrg if (anv_device_is_lost(device)) 45201e04c3fSmrg return VK_ERROR_DEVICE_LOST; 45301e04c3fSmrg 45401e04c3fSmrg if (pData == 
NULL) 45501e04c3fSmrg return VK_SUCCESS; 45601e04c3fSmrg 45701e04c3fSmrg void *data_end = pData + dataSize; 45801e04c3fSmrg 45901e04c3fSmrg VkResult status = VK_SUCCESS; 46001e04c3fSmrg for (uint32_t i = 0; i < queryCount; i++) { 4617ec681f3Smrg bool available = query_is_available(pool, firstQuery + i); 46201e04c3fSmrg 46301e04c3fSmrg if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { 4647ec681f3Smrg status = wait_for_available(device, pool, firstQuery + i); 4657ec681f3Smrg if (status != VK_SUCCESS) { 46601e04c3fSmrg return status; 4677ec681f3Smrg } 46801e04c3fSmrg 46901e04c3fSmrg available = true; 47001e04c3fSmrg } 47101e04c3fSmrg 47201e04c3fSmrg /* From the Vulkan 1.0.42 spec: 47301e04c3fSmrg * 47401e04c3fSmrg * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are 47501e04c3fSmrg * both not set then no result values are written to pData for 47601e04c3fSmrg * queries that are in the unavailable state at the time of the call, 47701e04c3fSmrg * and vkGetQueryPoolResults returns VK_NOT_READY. However, 47801e04c3fSmrg * availability state is still written to pData for those queries if 47901e04c3fSmrg * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 
4807ec681f3Smrg * 4817ec681f3Smrg * From VK_KHR_performance_query : 4827ec681f3Smrg * 4837ec681f3Smrg * "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies 4847ec681f3Smrg * that the result should contain the number of counters that were recorded 4857ec681f3Smrg * into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR" 48601e04c3fSmrg */ 48701e04c3fSmrg bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); 48801e04c3fSmrg 48901e04c3fSmrg uint32_t idx = 0; 49001e04c3fSmrg switch (pool->type) { 4917ec681f3Smrg case VK_QUERY_TYPE_OCCLUSION: { 4927ec681f3Smrg uint64_t *slot = query_slot(pool, firstQuery + i); 4937ec681f3Smrg if (write_results) { 4947ec681f3Smrg /* From the Vulkan 1.2.132 spec: 4957ec681f3Smrg * 4967ec681f3Smrg * "If VK_QUERY_RESULT_PARTIAL_BIT is set, 4977ec681f3Smrg * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status 4987ec681f3Smrg * is unavailable, an intermediate result value between zero and 4997ec681f3Smrg * the final result value is written to pData for that query." 5007ec681f3Smrg */ 5017ec681f3Smrg uint64_t result = available ? 
slot[2] - slot[1] : 0; 5027ec681f3Smrg cpu_write_query_result(pData, flags, idx, result); 5037ec681f3Smrg } 50401e04c3fSmrg idx++; 50501e04c3fSmrg break; 5067ec681f3Smrg } 50701e04c3fSmrg 50801e04c3fSmrg case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 5097ec681f3Smrg uint64_t *slot = query_slot(pool, firstQuery + i); 51001e04c3fSmrg uint32_t statistics = pool->pipeline_statistics; 51101e04c3fSmrg while (statistics) { 51201e04c3fSmrg uint32_t stat = u_bit_scan(&statistics); 51301e04c3fSmrg if (write_results) { 51401e04c3fSmrg uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; 51501e04c3fSmrg 51601e04c3fSmrg /* WaDividePSInvocationCountBy4:HSW,BDW */ 5177ec681f3Smrg if ((device->info.ver == 8 || device->info.is_haswell) && 51801e04c3fSmrg (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) 51901e04c3fSmrg result >>= 2; 52001e04c3fSmrg 52101e04c3fSmrg cpu_write_query_result(pData, flags, idx, result); 52201e04c3fSmrg } 52301e04c3fSmrg idx++; 52401e04c3fSmrg } 52501e04c3fSmrg assert(idx == util_bitcount(pool->pipeline_statistics)); 52601e04c3fSmrg break; 52701e04c3fSmrg } 52801e04c3fSmrg 5297ec681f3Smrg case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 5307ec681f3Smrg uint64_t *slot = query_slot(pool, firstQuery + i); 5319f464c52Smaya if (write_results) 5329f464c52Smaya cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); 5339f464c52Smaya idx++; 5349f464c52Smaya if (write_results) 5359f464c52Smaya cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); 5369f464c52Smaya idx++; 5379f464c52Smaya break; 5387ec681f3Smrg } 5399f464c52Smaya 5407ec681f3Smrg case VK_QUERY_TYPE_TIMESTAMP: { 5417ec681f3Smrg uint64_t *slot = query_slot(pool, firstQuery + i); 54201e04c3fSmrg if (write_results) 54301e04c3fSmrg cpu_write_query_result(pData, flags, idx, slot[1]); 54401e04c3fSmrg idx++; 54501e04c3fSmrg break; 5467ec681f3Smrg } 5477ec681f3Smrg 5487ec681f3Smrg#if GFX_VER >= 8 5497ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 
5507ec681f3Smrg const struct anv_physical_device *pdevice = device->physical; 5517ec681f3Smrg assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | 5527ec681f3Smrg VK_QUERY_RESULT_PARTIAL_BIT)) == 0); 5537ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 5547ec681f3Smrg const struct intel_perf_query_info *query = pool->pass_query[p]; 5557ec681f3Smrg struct intel_perf_query_result result; 5567ec681f3Smrg intel_perf_query_result_clear(&result); 5577ec681f3Smrg intel_perf_query_result_accumulate_fields(&result, query, &device->info, 5587ec681f3Smrg pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false), 5597ec681f3Smrg pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true), 5607ec681f3Smrg false /* no_oa_accumulate */); 5617ec681f3Smrg anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); 5627ec681f3Smrg } 5637ec681f3Smrg break; 5647ec681f3Smrg } 5657ec681f3Smrg#endif 5667ec681f3Smrg 5677ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 5687ec681f3Smrg if (!write_results) 5697ec681f3Smrg break; 5707ec681f3Smrg const void *query_data = query_slot(pool, firstQuery + i); 5717ec681f3Smrg const struct intel_perf_query_info *query = &device->physical->perf->queries[0]; 5727ec681f3Smrg struct intel_perf_query_result result; 5737ec681f3Smrg intel_perf_query_result_clear(&result); 5747ec681f3Smrg intel_perf_query_result_accumulate_fields(&result, query, &device->info, 5757ec681f3Smrg query_data + intel_perf_query_data_offset(pool, false), 5767ec681f3Smrg query_data + intel_perf_query_data_offset(pool, true), 5777ec681f3Smrg false /* no_oa_accumulate */); 5787ec681f3Smrg intel_perf_query_result_write_mdapi(pData, stride, 5797ec681f3Smrg &device->info, 5807ec681f3Smrg query, &result); 5817ec681f3Smrg const uint64_t *marker = query_data + intel_perf_marker_offset(); 5827ec681f3Smrg intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); 5837ec681f3Smrg break; 5847ec681f3Smrg } 
58501e04c3fSmrg 58601e04c3fSmrg default: 58701e04c3fSmrg unreachable("invalid pool type"); 58801e04c3fSmrg } 58901e04c3fSmrg 59001e04c3fSmrg if (!write_results) 59101e04c3fSmrg status = VK_NOT_READY; 59201e04c3fSmrg 59301e04c3fSmrg if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) 59401e04c3fSmrg cpu_write_query_result(pData, flags, idx, available); 59501e04c3fSmrg 59601e04c3fSmrg pData += stride; 59701e04c3fSmrg if (pData >= data_end) 59801e04c3fSmrg break; 59901e04c3fSmrg } 60001e04c3fSmrg 60101e04c3fSmrg return status; 60201e04c3fSmrg} 60301e04c3fSmrg 60401e04c3fSmrgstatic void 60501e04c3fSmrgemit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, 60601e04c3fSmrg struct anv_address addr) 60701e04c3fSmrg{ 6087ec681f3Smrg cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 6097ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 6107ec681f3Smrg 61101e04c3fSmrg anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 61201e04c3fSmrg pc.DestinationAddressType = DAT_PPGTT; 61301e04c3fSmrg pc.PostSyncOperation = WritePSDepthCount; 61401e04c3fSmrg pc.DepthStallEnable = true; 61501e04c3fSmrg pc.Address = addr; 61601e04c3fSmrg 6177ec681f3Smrg if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4) 61801e04c3fSmrg pc.CommandStreamerStallEnable = true; 61901e04c3fSmrg } 62001e04c3fSmrg} 62101e04c3fSmrg 62201e04c3fSmrgstatic void 6237ec681f3Smrgemit_query_mi_availability(struct mi_builder *b, 6249f464c52Smaya struct anv_address addr, 6259f464c52Smaya bool available) 6269f464c52Smaya{ 6277ec681f3Smrg mi_store(b, mi_mem64(addr), mi_imm(available)); 6289f464c52Smaya} 6299f464c52Smaya 6309f464c52Smayastatic void 6319f464c52Smayaemit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, 6329f464c52Smaya struct anv_address addr, 6339f464c52Smaya bool available) 63401e04c3fSmrg{ 6357ec681f3Smrg cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; 6367ec681f3Smrg genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 6377ec681f3Smrg 63801e04c3fSmrg 
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 63901e04c3fSmrg pc.DestinationAddressType = DAT_PPGTT; 64001e04c3fSmrg pc.PostSyncOperation = WriteImmediateData; 64101e04c3fSmrg pc.Address = addr; 6429f464c52Smaya pc.ImmediateData = available; 64301e04c3fSmrg } 64401e04c3fSmrg} 64501e04c3fSmrg 64601e04c3fSmrg/** 64701e04c3fSmrg * Goes through a series of consecutive query indices in the given pool 64801e04c3fSmrg * setting all element values to 0 and emitting them as available. 64901e04c3fSmrg */ 65001e04c3fSmrgstatic void 65101e04c3fSmrgemit_zero_queries(struct anv_cmd_buffer *cmd_buffer, 6527ec681f3Smrg struct mi_builder *b, struct anv_query_pool *pool, 65301e04c3fSmrg uint32_t first_index, uint32_t num_queries) 65401e04c3fSmrg{ 6559f464c52Smaya switch (pool->type) { 6569f464c52Smaya case VK_QUERY_TYPE_OCCLUSION: 6579f464c52Smaya case VK_QUERY_TYPE_TIMESTAMP: 6589f464c52Smaya /* These queries are written with a PIPE_CONTROL so clear them using the 6599f464c52Smaya * PIPE_CONTROL as well so we don't have to synchronize between 2 types 6609f464c52Smaya * of operations. 
6619f464c52Smaya */ 6629f464c52Smaya assert((pool->stride % 8) == 0); 6639f464c52Smaya for (uint32_t i = 0; i < num_queries; i++) { 6649f464c52Smaya struct anv_address slot_addr = 6659f464c52Smaya anv_query_address(pool, first_index + i); 6669f464c52Smaya 6679f464c52Smaya for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { 6689f464c52Smaya emit_query_pc_availability(cmd_buffer, 6699f464c52Smaya anv_address_add(slot_addr, qword * 8), 6709f464c52Smaya false); 6719f464c52Smaya } 6729f464c52Smaya emit_query_pc_availability(cmd_buffer, slot_addr, true); 6739f464c52Smaya } 6749f464c52Smaya break; 6759f464c52Smaya 6769f464c52Smaya case VK_QUERY_TYPE_PIPELINE_STATISTICS: 6779f464c52Smaya case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 6789f464c52Smaya for (uint32_t i = 0; i < num_queries; i++) { 6799f464c52Smaya struct anv_address slot_addr = 6809f464c52Smaya anv_query_address(pool, first_index + i); 6817ec681f3Smrg mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); 6827ec681f3Smrg emit_query_mi_availability(b, slot_addr, true); 6837ec681f3Smrg } 6847ec681f3Smrg break; 6857ec681f3Smrg 6867ec681f3Smrg#if GFX_VER >= 8 6877ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 6887ec681f3Smrg for (uint32_t i = 0; i < num_queries; i++) { 6897ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 6907ec681f3Smrg mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false), 6917ec681f3Smrg 0, 2 * pool->snapshot_size); 6927ec681f3Smrg emit_query_mi_availability(b, 6937ec681f3Smrg khr_perf_query_availability_address(pool, first_index + i, p), 6947ec681f3Smrg true); 6957ec681f3Smrg } 6967ec681f3Smrg } 6977ec681f3Smrg break; 6987ec681f3Smrg } 6997ec681f3Smrg#endif 7007ec681f3Smrg 7017ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: 7027ec681f3Smrg for (uint32_t i = 0; i < num_queries; i++) { 7037ec681f3Smrg struct anv_address slot_addr = 7047ec681f3Smrg anv_query_address(pool, first_index + i); 7057ec681f3Smrg mi_memset(b, 
anv_address_add(slot_addr, 8), 0, pool->stride - 8); 7069f464c52Smaya emit_query_mi_availability(b, slot_addr, true); 7079f464c52Smaya } 7089f464c52Smaya break; 7099f464c52Smaya 7109f464c52Smaya default: 7119f464c52Smaya unreachable("Unsupported query type"); 71201e04c3fSmrg } 71301e04c3fSmrg} 71401e04c3fSmrg 71501e04c3fSmrgvoid genX(CmdResetQueryPool)( 71601e04c3fSmrg VkCommandBuffer commandBuffer, 71701e04c3fSmrg VkQueryPool queryPool, 71801e04c3fSmrg uint32_t firstQuery, 71901e04c3fSmrg uint32_t queryCount) 72001e04c3fSmrg{ 72101e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 72201e04c3fSmrg ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 72301e04c3fSmrg 7249f464c52Smaya switch (pool->type) { 7259f464c52Smaya case VK_QUERY_TYPE_OCCLUSION: 7269f464c52Smaya case VK_QUERY_TYPE_TIMESTAMP: 7279f464c52Smaya for (uint32_t i = 0; i < queryCount; i++) { 7289f464c52Smaya emit_query_pc_availability(cmd_buffer, 7299f464c52Smaya anv_query_address(pool, firstQuery + i), 7309f464c52Smaya false); 73101e04c3fSmrg } 7329f464c52Smaya break; 7339f464c52Smaya 7349f464c52Smaya case VK_QUERY_TYPE_PIPELINE_STATISTICS: 7359f464c52Smaya case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 7367ec681f3Smrg struct mi_builder b; 7377ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 7387ec681f3Smrg 7397ec681f3Smrg for (uint32_t i = 0; i < queryCount; i++) 7407ec681f3Smrg emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); 7417ec681f3Smrg break; 7427ec681f3Smrg } 7437ec681f3Smrg 7447ec681f3Smrg#if GFX_VER >= 8 7457ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 7467ec681f3Smrg struct mi_builder b; 7477ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 7487ec681f3Smrg 7497ec681f3Smrg for (uint32_t i = 0; i < queryCount; i++) { 7507ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 7517ec681f3Smrg emit_query_mi_availability( 7527ec681f3Smrg &b, 7537ec681f3Smrg 
khr_perf_query_availability_address(pool, firstQuery + i, p), 7547ec681f3Smrg false); 7557ec681f3Smrg } 7567ec681f3Smrg } 7577ec681f3Smrg break; 7587ec681f3Smrg } 7597ec681f3Smrg#endif 7607ec681f3Smrg 7617ec681f3Smrg case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { 7627ec681f3Smrg struct mi_builder b; 7637ec681f3Smrg mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 7649f464c52Smaya 7659f464c52Smaya for (uint32_t i = 0; i < queryCount; i++) 7669f464c52Smaya emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); 7679f464c52Smaya break; 7689f464c52Smaya } 7699f464c52Smaya 7709f464c52Smaya default: 7719f464c52Smaya unreachable("Unsupported query type"); 7729f464c52Smaya } 7739f464c52Smaya} 7749f464c52Smaya 7757ec681f3Smrgvoid genX(ResetQueryPool)( 7769f464c52Smaya VkDevice _device, 7779f464c52Smaya VkQueryPool queryPool, 7789f464c52Smaya uint32_t firstQuery, 7799f464c52Smaya uint32_t queryCount) 7809f464c52Smaya{ 7819f464c52Smaya ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 7829f464c52Smaya 7839f464c52Smaya for (uint32_t i = 0; i < queryCount; i++) { 7847ec681f3Smrg if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 7857ec681f3Smrg#if GFX_VER >= 8 7867ec681f3Smrg for (uint32_t p = 0; p < pool->n_passes; p++) { 7877ec681f3Smrg uint64_t *pass_slot = pool->bo->map + 7887ec681f3Smrg khr_perf_query_availability_offset(pool, firstQuery + i, p); 7897ec681f3Smrg *pass_slot = 0; 7907ec681f3Smrg } 7917ec681f3Smrg#endif 7927ec681f3Smrg } else { 7937ec681f3Smrg uint64_t *slot = query_slot(pool, firstQuery + i); 7947ec681f3Smrg *slot = 0; 7957ec681f3Smrg } 79601e04c3fSmrg } 79701e04c3fSmrg} 79801e04c3fSmrg 79901e04c3fSmrgstatic const uint32_t vk_pipeline_stat_to_reg[] = { 80001e04c3fSmrg GENX(IA_VERTICES_COUNT_num), 80101e04c3fSmrg GENX(IA_PRIMITIVES_COUNT_num), 80201e04c3fSmrg GENX(VS_INVOCATION_COUNT_num), 80301e04c3fSmrg GENX(GS_INVOCATION_COUNT_num), 80401e04c3fSmrg GENX(GS_PRIMITIVES_COUNT_num), 80501e04c3fSmrg 
GENX(CL_INVOCATION_COUNT_num), 80601e04c3fSmrg GENX(CL_PRIMITIVES_COUNT_num), 80701e04c3fSmrg GENX(PS_INVOCATION_COUNT_num), 80801e04c3fSmrg GENX(HS_INVOCATION_COUNT_num), 80901e04c3fSmrg GENX(DS_INVOCATION_COUNT_num), 81001e04c3fSmrg GENX(CS_INVOCATION_COUNT_num), 81101e04c3fSmrg}; 81201e04c3fSmrg 81301e04c3fSmrgstatic void 8147ec681f3Smrgemit_pipeline_stat(struct mi_builder *b, uint32_t stat, 81501e04c3fSmrg struct anv_address addr) 81601e04c3fSmrg{ 81701e04c3fSmrg STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK == 81801e04c3fSmrg (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1); 81901e04c3fSmrg 82001e04c3fSmrg assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg)); 8217ec681f3Smrg mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat])); 8229f464c52Smaya} 8239f464c52Smaya 8249f464c52Smayastatic void 8257ec681f3Smrgemit_xfb_query(struct mi_builder *b, uint32_t stream, 8269f464c52Smaya struct anv_address addr) 8279f464c52Smaya{ 8289f464c52Smaya assert(stream < MAX_XFB_STREAMS); 8299f464c52Smaya 8307ec681f3Smrg mi_store(b, mi_mem64(anv_address_add(addr, 0)), 8317ec681f3Smrg mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8)); 8327ec681f3Smrg mi_store(b, mi_mem64(anv_address_add(addr, 16)), 8337ec681f3Smrg mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8)); 8347ec681f3Smrg} 8357ec681f3Smrg 8367ec681f3Smrgstatic void 8377ec681f3Smrgemit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, 8387ec681f3Smrg struct anv_query_pool *pool, 8397ec681f3Smrg struct mi_builder *b, 8407ec681f3Smrg struct anv_address query_addr, 8417ec681f3Smrg bool end) 8427ec681f3Smrg{ 8437ec681f3Smrg const struct intel_perf_query_field_layout *layout = 8447ec681f3Smrg &cmd_buffer->device->physical->perf->query_layout; 8457ec681f3Smrg struct anv_address data_addr = 8467ec681f3Smrg anv_address_add(query_addr, intel_perf_query_data_offset(pool, end)); 8477ec681f3Smrg 8487ec681f3Smrg for (uint32_t f = 0; f < layout->n_fields; f++) { 8497ec681f3Smrg const struct intel_perf_query_field 
*field = 8507ec681f3Smrg &layout->fields[end ? f : (layout->n_fields - 1 - f)]; 8517ec681f3Smrg 8527ec681f3Smrg switch (field->type) { 8537ec681f3Smrg case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 8547ec681f3Smrg anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { 8557ec681f3Smrg rpc.MemoryAddress = anv_address_add(data_addr, field->location); 8567ec681f3Smrg } 8577ec681f3Smrg break; 8587ec681f3Smrg 8597ec681f3Smrg case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 8607ec681f3Smrg case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: 8617ec681f3Smrg case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 8627ec681f3Smrg case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { 8637ec681f3Smrg struct anv_address addr = anv_address_add(data_addr, field->location); 8647ec681f3Smrg struct mi_value src = field->size == 8 ? 8657ec681f3Smrg mi_reg64(field->mmio_offset) : 8667ec681f3Smrg mi_reg32(field->mmio_offset); 8677ec681f3Smrg struct mi_value dst = field->size == 8 ? 8687ec681f3Smrg mi_mem64(addr) : mi_mem32(addr); 8697ec681f3Smrg mi_store(b, dst, src); 8707ec681f3Smrg break; 8717ec681f3Smrg } 8727ec681f3Smrg 8737ec681f3Smrg default: 8747ec681f3Smrg unreachable("Invalid query field"); 8757ec681f3Smrg break; 8767ec681f3Smrg } 8777ec681f3Smrg } 87801e04c3fSmrg} 87901e04c3fSmrg 88001e04c3fSmrgvoid genX(CmdBeginQuery)( 88101e04c3fSmrg VkCommandBuffer commandBuffer, 88201e04c3fSmrg VkQueryPool queryPool, 88301e04c3fSmrg uint32_t query, 88401e04c3fSmrg VkQueryControlFlags flags) 8859f464c52Smaya{ 8869f464c52Smaya genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); 8879f464c52Smaya} 8889f464c52Smaya 8899f464c52Smayavoid genX(CmdBeginQueryIndexedEXT)( 8909f464c52Smaya VkCommandBuffer commandBuffer, 8919f464c52Smaya VkQueryPool queryPool, 8929f464c52Smaya uint32_t query, 8939f464c52Smaya VkQueryControlFlags flags, 8949f464c52Smaya uint32_t index) 89501e04c3fSmrg{ 89601e04c3fSmrg ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 89701e04c3fSmrg 
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Begin snapshot of the depth count goes in the slot's second qword;
       * the first qword is the availability marker.
       */
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      /* One 16-byte begin/end pair per enabled statistic, starting after
       * the availability qword.
       */
      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* The pass being executed is only known at submit time, so the final
       * destination of every counter write is computed on the GPU as
       * (canonical BO address for pass 0) + ANV_PERF_QUERY_OFFSET_REG.  Each
       * computed address is stored back into the batch at a recorded
       * self-modification location so the snapshot commands emitted below can
       * be patched before the command streamer parses them.
       */
      uint32_t reloc_idx = 0;
      for (uint32_t end = 0; end < 2; end++) {
         for (uint32_t r = 0; r < layout->n_fields; r++) {
            const struct intel_perf_query_field *field =
               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
            struct mi_value reg_addr =
               mi_iadd(
                  &b,
                  mi_imm(intel_canonical_address(pool->bo->offset +
                                                 khr_perf_query_data_offset(pool, query, 0, end) +
                                                 field->location)),
                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);

            /* 64-bit register snapshots are written as two 32-bit SRMs, so
             * they need a second patched address for the high dword.
             */
            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
                field->size == 8) {
               reg_addr =
                  mi_iadd(
                     &b,
                     mi_imm(intel_canonical_address(pool->bo->offset +
                                                    khr_perf_query_data_offset(pool, query, 0, end) +
                                                    field->location + 4)),
                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
            }
         }
      }

      /* Also patch the address of the availability write emitted at
       * vkCmdEndQuery time.
       */
      struct mi_value availability_write_offset =
         mi_iadd(
            &b,
            mi_imm(
               intel_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      cmd_buffer->self_mod_locations[reloc_idx++] =
         mi_store_address(&b, availability_write_offset);

      assert(reloc_idx == pdevice->n_perf_query_commands);

      /* Make sure the patched batch contents are visible to the command
       * streamer before it parses the commands below.
       */
      mi_self_mod_barrier(&b);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      /* Emit the begin snapshot (reverse field order, matching the address
       * patching loop above); every MemoryAddress written here is a
       * placeholder that the MI stores above overwrite.
       */
      cmd_buffer->perf_reloc_idx = 0;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field =
            &layout->fields[layout->n_fields - 1 - r];
         void *dws;

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               /* High dword of a 64-bit register. */
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
      break;
   }

   default:
      unreachable("");
   }
}

/* vkCmdEndQuery is just the non-indexed entry point. */
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct
mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* End snapshot goes at offset 16 (begin is at 8); then mark the slot
       * available via the same PIPE_CONTROL mechanism.
       */
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      /* End values interleave with the begin values written in
       * CmdBeginQuery: begin at offset 8 + 16*i, end at 16 + 16*i.
       */
      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* Emit the end snapshot in forward field order.  Like the begin
       * snapshot, every MemoryAddress here is a placeholder patched at
       * runtime through the self-modification locations recorded in
       * CmdBeginQuery (perf_reloc_idx continues from where begin left off).
       */
      void *dws;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field = &layout->fields[r];

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               /* High dword of a 64-bit register. */
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }

      /* Availability write — address patched the same way. */
      dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _mi_resolve_address_token(&b,
                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                dws +
                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);

      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
               mi_imm(cmd_buffer->intel_perf_marker));
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

/* MMIO offset of the command streamer TIMESTAMP register. */
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags2KHR                    stage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Top-of-pipe timestamps can be captured immediately with a plain
    * register read; anything later requires a PIPE_CONTROL post-sync write.
    */
   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) {
      mi_store(&b,
 mi_mem64(anv_address_add(query_addr, 8)),
               mi_reg64(TIMESTAMP));
   } else {
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         /* NOTE(review): extra CS stall looks like a GFX9 GT4 hardware
          * workaround — confirm against the workaround database.
          */
         if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GFX_VERx10 >= 75

/* MMIO offsets of the MI_PREDICATE source and result registers. */
#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

/**
 * Writes the results of a query to dst_addr if the value at poll_addr is equal
 * to the reference value.
 *
 * Uses MI_PREDICATE to make the store conditional: SRC0 is loaded from
 * poll_addr, SRC1 with ref_value, and the predicated store only executes
 * when they compare equal.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct mi_value query_result)
{
   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* Result element stride follows the requested result width. */
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store_if(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store_if(b, mi_mem32(res_addr), query_result);
   }
}

/* Unconditionally write one query result element at dst_addr +
 * value_index * (4 or 8), using the width requested in @flags.
 */
static void
gpu_write_query_result(struct mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store(b, mi_mem32(res_addr), query_result);
   }
}

/* Compute (end - begin) from a begin/end qword pair laid out at
 * addr + 0 / addr + 8, entirely with MI math on the GPU.
 */
static struct mi_value
compute_query_result(struct mi_builder *b, struct anv_address addr)
{
   return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
                  mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value result;

   /* If render target
 writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "CopyQueryPoolResults");
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *     previous uses of vkCmdResetQueryPool in the same queue, without
        *     any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "CopyQueryPoolResults");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      /* idx counts result elements written for this query; it also selects
       * the slot for the trailing availability value below.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                     1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                        0 /* unavailable */, flags, idx, mi_imm(0));
         }
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            /* Begin/end pairs live at 16-byte stride after the availability
             * qword (see CmdBeginQuery/CmdEndQuery).
             */
            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.ver == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         /* Two values per query: primitives written, then primitives needed
          * (slot layout matches emit_xfb_query()).
          */
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamps are absolute values, not begin/end deltas. */
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;
#endif

      default:
         unreachable("unhandled query type");
      }

      /* Optional availability value goes after the result elements. */
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
/* Pre-Haswell stub: MI math (and thus GPU-side result copies) is not
 * available, so this entry point is not implemented yet.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif