101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2015 Intel Corporation
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2101e04c3fSmrg * IN THE SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#include <assert.h>
2501e04c3fSmrg#include <stdbool.h>
2601e04c3fSmrg#include <string.h>
2701e04c3fSmrg#include <unistd.h>
2801e04c3fSmrg#include <fcntl.h>
2901e04c3fSmrg
3001e04c3fSmrg#include "anv_private.h"
3101e04c3fSmrg
3201e04c3fSmrg#include "genxml/gen_macros.h"
3301e04c3fSmrg#include "genxml/genX_pack.h"
3401e04c3fSmrg
357ec681f3Smrg/* We reserve :
367ec681f3Smrg *    - GPR 14 for perf queries
377ec681f3Smrg *    - GPR 15 for conditional rendering
387ec681f3Smrg */
397ec681f3Smrg#define MI_BUILDER_NUM_ALLOC_GPRS 14
407ec681f3Smrg#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
419f464c52Smaya#define __gen_get_batch_dwords anv_batch_emit_dwords
429f464c52Smaya#define __gen_address_offset anv_address_add
437ec681f3Smrg#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
447ec681f3Smrg#include "common/mi_builder.h"
457ec681f3Smrg#include "perf/intel_perf.h"
467ec681f3Smrg#include "perf/intel_perf_mdapi.h"
477ec681f3Smrg#include "perf/intel_perf_regs.h"
487ec681f3Smrg
497ec681f3Smrg#include "vk_util.h"
507ec681f3Smrg
517ec681f3Smrgstatic struct anv_address
527ec681f3Smrganv_query_address(struct anv_query_pool *pool, uint32_t query)
537ec681f3Smrg{
547ec681f3Smrg   return (struct anv_address) {
557ec681f3Smrg      .bo = pool->bo,
567ec681f3Smrg      .offset = query * pool->stride,
577ec681f3Smrg   };
587ec681f3Smrg}
599f464c52Smaya
/**
 * Implements vkCreateQueryPool.
 *
 * The pool is backed by a single mapped, snooped BO; each query occupies one
 * slot of pool->stride bytes.  The per-slot layout depends on the query type
 * (see the uint64s_per_slot computation below and the layout diagrams later
 * in this file for the two performance-query types).
 */
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
#if GFX_VER >= 8
   /* Only used for VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR pools. */
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct intel_perf_counter_pass *counter_pass;
   struct intel_perf_query_info **pass_query;
   uint32_t n_passes = 0;
#endif
   uint32_t data_offset = 0;
   VK_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together.  For most query types, the first 64-bit value is
    * the "available" bit which is 0 when the query is unavailable and 1 when
    * it is available.  The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, we have a requirement to align OA reports at
    * 64bytes so we put those first and have the "available" bit behind
    * together with some other counters.
    */
   uint32_t uint64s_per_slot = 0;

   VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end
       * (plus the leading availability word).
       */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * written/available.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      uint64s_per_slot = 2; /* availability + marker */
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      break;
   }
#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      /* Figure out how many HW passes are needed to record the requested
       * counters, then reserve the per-pass side arrays in the same
       * multialloc as the pool object.
       */
      n_passes = intel_perf_get_n_passes(pdevice->perf,
                                         perf_query_info->pCounterIndices,
                                         perf_query_info->counterIndexCount,
                                         NULL);
      vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
                             perf_query_info->counterIndexCount);
      vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
                             n_passes);
      uint64s_per_slot = 4 /* availability + small batch */;
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      /* Multiply by the number of passes */
      uint64s_per_slot *= n_passes;
      break;
   }
#endif
   default:
      assert(!"Invalid query type");
   }

   /* Allocates the pool object plus (for KHR perf queries) the side arrays
    * registered above, in a single allocation.
    */
   if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
                             VK_OBJECT_TYPE_QUERY_POOL))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      pool->data_offset = data_offset;
      /* Half of the data region holds the begin snapshot, half the end. */
      pool->snapshot_size = (pool->stride - data_offset) / 2;
   }
#if GFX_VER >= 8
   else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->pass_size = pool->stride / n_passes;
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->pass_size - data_offset) / 2;
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      intel_perf_get_counters_passes(pdevice->perf,
                                     perf_query_info->pCounterIndices,
                                     perf_query_info->counterIndexCount,
                                     pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      intel_perf_get_n_passes(pdevice->perf,
                              perf_query_info->pCounterIndices,
                              perf_query_info->counterIndexCount,
                              pool->pass_query);
   }
#endif

   uint64_t size = pool->slots * (uint64_t)pool->stride;
   result = anv_device_alloc_bo(device, "query-pool", size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      /* Pre-bake a small batch at the head of every pass that loads that
       * pass's byte offset into ANV_PERF_QUERY_OFFSET_REG (see the
       * VK_KHR_performance_query layout diagram below).
       */
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
            .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
         };
         batch.next = batch.start;

         mi_builder_init(&b, &device->info, &batch);
         mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                      mi_imm(p * (uint64_t)pool->pass_size));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      }
   }
#endif

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}
23301e04c3fSmrg
23401e04c3fSmrgvoid genX(DestroyQueryPool)(
23501e04c3fSmrg    VkDevice                                    _device,
23601e04c3fSmrg    VkQueryPool                                 _pool,
23701e04c3fSmrg    const VkAllocationCallbacks*                pAllocator)
23801e04c3fSmrg{
23901e04c3fSmrg   ANV_FROM_HANDLE(anv_device, device, _device);
24001e04c3fSmrg   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
24101e04c3fSmrg
24201e04c3fSmrg   if (!pool)
24301e04c3fSmrg      return;
24401e04c3fSmrg
2457ec681f3Smrg   anv_device_release_bo(device, pool->bo);
2467ec681f3Smrg   vk_object_free(&device->vk, pAllocator, pool);
2477ec681f3Smrg}
2487ec681f3Smrg
2497ec681f3Smrg#if GFX_VER >= 8
2507ec681f3Smrg/**
2517ec681f3Smrg * VK_KHR_performance_query layout  :
2527ec681f3Smrg *
2537ec681f3Smrg * --------------------------------------------
2547ec681f3Smrg * |       availability (8b)       | |        |
2557ec681f3Smrg * |-------------------------------| |        |
2567ec681f3Smrg * |      Small batch loading      | |        |
2577ec681f3Smrg * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
2587ec681f3Smrg * |            (24b)              | | Pass 0 |
2597ec681f3Smrg * |-------------------------------| |        |
2607ec681f3Smrg * |       some padding (see       | |        |
2617ec681f3Smrg * | query_field_layout:alignment) | |        |
2627ec681f3Smrg * |-------------------------------| |        |
2637ec681f3Smrg * |           query data          | |        |
2647ec681f3Smrg * | (2 * query_field_layout:size) | |        |
2657ec681f3Smrg * |-------------------------------|--        | Query 0
2667ec681f3Smrg * |       availability (8b)       | |        |
2677ec681f3Smrg * |-------------------------------| |        |
2687ec681f3Smrg * |      Small batch loading      | |        |
2697ec681f3Smrg * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
2707ec681f3Smrg * |            (24b)              | | Pass 1 |
2717ec681f3Smrg * |-------------------------------| |        |
2727ec681f3Smrg * |       some padding (see       | |        |
2737ec681f3Smrg * | query_field_layout:alignment) | |        |
2747ec681f3Smrg * |-------------------------------| |        |
2757ec681f3Smrg * |           query data          | |        |
2767ec681f3Smrg * | (2 * query_field_layout:size) | |        |
2777ec681f3Smrg * |-------------------------------|-----------
2787ec681f3Smrg * |       availability (8b)       | |        |
2797ec681f3Smrg * |-------------------------------| |        |
2807ec681f3Smrg * |      Small batch loading      | |        |
2817ec681f3Smrg * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
2827ec681f3Smrg * |            (24b)              | | Pass 0 |
2837ec681f3Smrg * |-------------------------------| |        |
2847ec681f3Smrg * |       some padding (see       | |        |
2857ec681f3Smrg * | query_field_layout:alignment) | |        |
2867ec681f3Smrg * |-------------------------------| |        |
2877ec681f3Smrg * |           query data          | |        |
2887ec681f3Smrg * | (2 * query_field_layout:size) | |        |
2897ec681f3Smrg * |-------------------------------|--        | Query 1
2907ec681f3Smrg * |               ...             | |        |
2917ec681f3Smrg * --------------------------------------------
2927ec681f3Smrg */
2937ec681f3Smrg
2947ec681f3Smrgstatic uint64_t
2957ec681f3Smrgkhr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
2967ec681f3Smrg{
2977ec681f3Smrg   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
2987ec681f3Smrg}
2997ec681f3Smrg
3007ec681f3Smrgstatic uint64_t
3017ec681f3Smrgkhr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
3027ec681f3Smrg{
3037ec681f3Smrg   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
3047ec681f3Smrg      pool->data_offset + (end ? pool->snapshot_size : 0);
30501e04c3fSmrg}
30601e04c3fSmrg
30701e04c3fSmrgstatic struct anv_address
3087ec681f3Smrgkhr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
30901e04c3fSmrg{
3107ec681f3Smrg   return anv_address_add(
3117ec681f3Smrg      (struct anv_address) { .bo = pool->bo, },
3127ec681f3Smrg      khr_perf_query_availability_offset(pool, query, pass));
3137ec681f3Smrg}
3147ec681f3Smrg
3157ec681f3Smrgstatic struct anv_address
3167ec681f3Smrgkhr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
3177ec681f3Smrg{
3187ec681f3Smrg   return anv_address_add(
3197ec681f3Smrg      (struct anv_address) { .bo = pool->bo, },
3207ec681f3Smrg      khr_perf_query_data_offset(pool, query, pass, end));
3217ec681f3Smrg}
3227ec681f3Smrg
/* Lazily allocate the command buffer's self_mod_locations array, which
 * records batch locations that get patched later (presumably at submit
 * time, with the selected pass — NOTE(review): confirm against the
 * submission path, which is outside this file).
 *
 * Returns true when the array is ready for use.  Returns false if the
 * batch already carries an error or if allocation fails, in which case the
 * batch is marked with VK_ERROR_OUT_OF_HOST_MEMORY.
 */
static bool
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return false;

   /* Already allocated by an earlier perf-query command on this buffer. */
   if (cmd_buffer->self_mod_locations)
      return true;

   struct anv_device *device = cmd_buffer->device;
   const struct anv_physical_device *pdevice = device->physical;

   /* One entry per perf-query command the physical device can emit. */
   cmd_buffer->self_mod_locations =
      vk_alloc(&cmd_buffer->pool->alloc,
               pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd_buffer->self_mod_locations) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return false;
   }

   return true;
}
3477ec681f3Smrg#endif
3487ec681f3Smrg
3497ec681f3Smrg/**
3507ec681f3Smrg * VK_INTEL_performance_query layout :
3517ec681f3Smrg *
3527ec681f3Smrg * ---------------------------------
3537ec681f3Smrg * |       availability (8b)       |
3547ec681f3Smrg * |-------------------------------|
3557ec681f3Smrg * |          marker (8b)          |
3567ec681f3Smrg * |-------------------------------|
3577ec681f3Smrg * |       some padding (see       |
3587ec681f3Smrg * | query_field_layout:alignment) |
3597ec681f3Smrg * |-------------------------------|
3607ec681f3Smrg * |           query data          |
3617ec681f3Smrg * | (2 * query_field_layout:size) |
3627ec681f3Smrg * ---------------------------------
3637ec681f3Smrg */
3647ec681f3Smrg
/* Byte offset of the 64-bit marker within an INTEL perf query slot: it sits
 * immediately after the 8-byte availability word (see the layout diagram
 * above).
 */
static uint32_t
intel_perf_marker_offset(void)
{
   return sizeof(uint64_t);
}
3707ec681f3Smrg
3717ec681f3Smrgstatic uint32_t
3727ec681f3Smrgintel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
3737ec681f3Smrg{
3747ec681f3Smrg   return pool->data_offset + (end ? pool->snapshot_size : 0);
37501e04c3fSmrg}
37601e04c3fSmrg
37701e04c3fSmrgstatic void
37801e04c3fSmrgcpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
37901e04c3fSmrg                       uint32_t value_index, uint64_t result)
38001e04c3fSmrg{
38101e04c3fSmrg   if (flags & VK_QUERY_RESULT_64_BIT) {
38201e04c3fSmrg      uint64_t *dst64 = dst_slot;
38301e04c3fSmrg      dst64[value_index] = result;
38401e04c3fSmrg   } else {
38501e04c3fSmrg      uint32_t *dst32 = dst_slot;
38601e04c3fSmrg      dst32[value_index] = result;
38701e04c3fSmrg   }
38801e04c3fSmrg}
38901e04c3fSmrg
3907ec681f3Smrgstatic void *
3917ec681f3Smrgquery_slot(struct anv_query_pool *pool, uint32_t query)
3927ec681f3Smrg{
3937ec681f3Smrg   return pool->bo->map + query * pool->stride;
3947ec681f3Smrg}
3957ec681f3Smrg
/* Read a query's availability from the CPU-mapped BO.
 *
 * KHR performance queries are only available once the availability word of
 * every pass has been written; all other query types keep a single
 * availability word at the start of the slot.  The reads are volatile
 * because the GPU writes these words asynchronously.
 */
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }
#endif

   return *(volatile uint64_t *)query_slot(pool, query);
}
41301e04c3fSmrg
41401e04c3fSmrgstatic VkResult
41501e04c3fSmrgwait_for_available(struct anv_device *device,
4167ec681f3Smrg                   struct anv_query_pool *pool, uint32_t query)
41701e04c3fSmrg{
4187ec681f3Smrg   uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);
41901e04c3fSmrg
4207ec681f3Smrg   while (anv_gettime_ns() < abs_timeout) {
4217ec681f3Smrg      if (query_is_available(pool, query))
4227ec681f3Smrg         return VK_SUCCESS;
4237ec681f3Smrg      VkResult status = anv_device_query_status(device);
4247ec681f3Smrg      if (status != VK_SUCCESS)
4257ec681f3Smrg         return status;
42601e04c3fSmrg   }
4277ec681f3Smrg
4287ec681f3Smrg   return anv_device_set_lost(device, "query timeout");
42901e04c3fSmrg}
43001e04c3fSmrg
/**
 * Implements vkGetQueryPoolResults: CPU readback of query results from the
 * pool's mapped BO.
 *
 * For each query in [firstQuery, firstQuery + queryCount): optionally wait
 * for availability (VK_QUERY_RESULT_WAIT_BIT), write the type-specific
 * values to pData, and append the availability word when
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.  Returns VK_NOT_READY when
 * an unavailable query was skipped without WAIT/PARTIAL semantics.
 */
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS) {
            return status;
         }

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query :
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      /* idx counts the values written for this query so the availability
       * word (if requested) lands right after them.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         /* Slot layout: [0] availability, [1] begin count, [2] end count. */
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         /* Slot layout: [0] availability, then a begin/end pair per
          * enabled statistic bit, in bit order.
          */
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.ver == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         /* Slot layout: [0] availability, [1]/[2] begin/end primitives
          * written, [3]/[4] begin/end primitives needed.
          */
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         /* Slot layout: [0] availability, [1] timestamp. */
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         /* These flags are not supported for KHR perf queries (see the
          * quote from VK_KHR_performance_query above).
          */
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         /* Accumulate the begin/end snapshots of each pass and write the
          * counter values for that pass to pData.
          */
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const struct intel_perf_query_info *query = pool->pass_query[p];
            struct intel_perf_query_result result;
            intel_perf_query_result_clear(&result);
            intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
                                                      false /* no_oa_accumulate */);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
#endif

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
         struct intel_perf_query_result result;
         intel_perf_query_result_clear(&result);
         intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                   query_data + intel_perf_query_data_offset(pool, false),
                                                   query_data + intel_perf_query_data_offset(pool, true),
                                                   false /* no_oa_accumulate */);
         /* Results go out in the MDAPI layout, followed by the marker. */
         intel_perf_query_result_write_mdapi(pData, stride,
                                             &device->info,
                                             query, &result);
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      /* Stop once the next slot would start at or past the end of the
       * caller's buffer.
       */
      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
60301e04c3fSmrg
/* Emit a PIPE_CONTROL that writes the current PS depth count (the occlusion
 * counter) to @addr, with a depth stall so the value reflects all prior
 * rendering.
 */
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   /* We are about to emit a post-sync operation; flush whatever pipe
    * controls the driver has queued up first.
    */
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      /* Extra CS stall on Gfx9 GT4 parts — presumably a hardware
       * workaround; NOTE(review): confirm the exact Wa name against bspec.
       */
      if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
62101e04c3fSmrg
62201e04c3fSmrgstatic void
6237ec681f3Smrgemit_query_mi_availability(struct mi_builder *b,
6249f464c52Smaya                           struct anv_address addr,
6259f464c52Smaya                           bool available)
6269f464c52Smaya{
6277ec681f3Smrg   mi_store(b, mi_mem64(addr), mi_imm(available));
6289f464c52Smaya}
6299f464c52Smaya
6309f464c52Smayastatic void
6319f464c52Smayaemit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
6329f464c52Smaya                           struct anv_address addr,
6339f464c52Smaya                           bool available)
63401e04c3fSmrg{
6357ec681f3Smrg   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6367ec681f3Smrg   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6377ec681f3Smrg
63801e04c3fSmrg   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
63901e04c3fSmrg      pc.DestinationAddressType  = DAT_PPGTT;
64001e04c3fSmrg      pc.PostSyncOperation       = WriteImmediateData;
64101e04c3fSmrg      pc.Address                 = addr;
6429f464c52Smaya      pc.ImmediateData           = available;
64301e04c3fSmrg   }
64401e04c3fSmrg}
64501e04c3fSmrg
64601e04c3fSmrg/**
64701e04c3fSmrg * Goes through a series of consecutive query indices in the given pool
64801e04c3fSmrg * setting all element values to 0 and emitting them as available.
64901e04c3fSmrg */
65001e04c3fSmrgstatic void
65101e04c3fSmrgemit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
6527ec681f3Smrg                  struct mi_builder *b, struct anv_query_pool *pool,
65301e04c3fSmrg                  uint32_t first_index, uint32_t num_queries)
65401e04c3fSmrg{
6559f464c52Smaya   switch (pool->type) {
6569f464c52Smaya   case VK_QUERY_TYPE_OCCLUSION:
6579f464c52Smaya   case VK_QUERY_TYPE_TIMESTAMP:
6589f464c52Smaya      /* These queries are written with a PIPE_CONTROL so clear them using the
6599f464c52Smaya       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
6609f464c52Smaya       * of operations.
6619f464c52Smaya       */
6629f464c52Smaya      assert((pool->stride % 8) == 0);
6639f464c52Smaya      for (uint32_t i = 0; i < num_queries; i++) {
6649f464c52Smaya         struct anv_address slot_addr =
6659f464c52Smaya            anv_query_address(pool, first_index + i);
6669f464c52Smaya
6679f464c52Smaya         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
6689f464c52Smaya            emit_query_pc_availability(cmd_buffer,
6699f464c52Smaya                                       anv_address_add(slot_addr, qword * 8),
6709f464c52Smaya                                       false);
6719f464c52Smaya         }
6729f464c52Smaya         emit_query_pc_availability(cmd_buffer, slot_addr, true);
6739f464c52Smaya      }
6749f464c52Smaya      break;
6759f464c52Smaya
6769f464c52Smaya   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
6779f464c52Smaya   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
6789f464c52Smaya      for (uint32_t i = 0; i < num_queries; i++) {
6799f464c52Smaya         struct anv_address slot_addr =
6809f464c52Smaya            anv_query_address(pool, first_index + i);
6817ec681f3Smrg         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
6827ec681f3Smrg         emit_query_mi_availability(b, slot_addr, true);
6837ec681f3Smrg      }
6847ec681f3Smrg      break;
6857ec681f3Smrg
6867ec681f3Smrg#if GFX_VER >= 8
6877ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
6887ec681f3Smrg      for (uint32_t i = 0; i < num_queries; i++) {
6897ec681f3Smrg         for (uint32_t p = 0; p < pool->n_passes; p++) {
6907ec681f3Smrg            mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
6917ec681f3Smrg                         0, 2 * pool->snapshot_size);
6927ec681f3Smrg            emit_query_mi_availability(b,
6937ec681f3Smrg                                       khr_perf_query_availability_address(pool, first_index + i, p),
6947ec681f3Smrg                                       true);
6957ec681f3Smrg         }
6967ec681f3Smrg      }
6977ec681f3Smrg      break;
6987ec681f3Smrg   }
6997ec681f3Smrg#endif
7007ec681f3Smrg
7017ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
7027ec681f3Smrg      for (uint32_t i = 0; i < num_queries; i++) {
7037ec681f3Smrg         struct anv_address slot_addr =
7047ec681f3Smrg            anv_query_address(pool, first_index + i);
7057ec681f3Smrg         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
7069f464c52Smaya         emit_query_mi_availability(b, slot_addr, true);
7079f464c52Smaya      }
7089f464c52Smaya      break;
7099f464c52Smaya
7109f464c52Smaya   default:
7119f464c52Smaya      unreachable("Unsupported query type");
71201e04c3fSmrg   }
71301e04c3fSmrg}
71401e04c3fSmrg
71501e04c3fSmrgvoid genX(CmdResetQueryPool)(
71601e04c3fSmrg    VkCommandBuffer                             commandBuffer,
71701e04c3fSmrg    VkQueryPool                                 queryPool,
71801e04c3fSmrg    uint32_t                                    firstQuery,
71901e04c3fSmrg    uint32_t                                    queryCount)
72001e04c3fSmrg{
72101e04c3fSmrg   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
72201e04c3fSmrg   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
72301e04c3fSmrg
7249f464c52Smaya   switch (pool->type) {
7259f464c52Smaya   case VK_QUERY_TYPE_OCCLUSION:
7269f464c52Smaya   case VK_QUERY_TYPE_TIMESTAMP:
7279f464c52Smaya      for (uint32_t i = 0; i < queryCount; i++) {
7289f464c52Smaya         emit_query_pc_availability(cmd_buffer,
7299f464c52Smaya                                    anv_query_address(pool, firstQuery + i),
7309f464c52Smaya                                    false);
73101e04c3fSmrg      }
7329f464c52Smaya      break;
7339f464c52Smaya
7349f464c52Smaya   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
7359f464c52Smaya   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
7367ec681f3Smrg      struct mi_builder b;
7377ec681f3Smrg      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7387ec681f3Smrg
7397ec681f3Smrg      for (uint32_t i = 0; i < queryCount; i++)
7407ec681f3Smrg         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
7417ec681f3Smrg      break;
7427ec681f3Smrg   }
7437ec681f3Smrg
7447ec681f3Smrg#if GFX_VER >= 8
7457ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
7467ec681f3Smrg      struct mi_builder b;
7477ec681f3Smrg      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7487ec681f3Smrg
7497ec681f3Smrg      for (uint32_t i = 0; i < queryCount; i++) {
7507ec681f3Smrg         for (uint32_t p = 0; p < pool->n_passes; p++) {
7517ec681f3Smrg            emit_query_mi_availability(
7527ec681f3Smrg               &b,
7537ec681f3Smrg               khr_perf_query_availability_address(pool, firstQuery + i, p),
7547ec681f3Smrg               false);
7557ec681f3Smrg         }
7567ec681f3Smrg      }
7577ec681f3Smrg      break;
7587ec681f3Smrg   }
7597ec681f3Smrg#endif
7607ec681f3Smrg
7617ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
7627ec681f3Smrg      struct mi_builder b;
7637ec681f3Smrg      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7649f464c52Smaya
7659f464c52Smaya      for (uint32_t i = 0; i < queryCount; i++)
7669f464c52Smaya         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
7679f464c52Smaya      break;
7689f464c52Smaya   }
7699f464c52Smaya
7709f464c52Smaya   default:
7719f464c52Smaya      unreachable("Unsupported query type");
7729f464c52Smaya   }
7739f464c52Smaya}
7749f464c52Smaya
7757ec681f3Smrgvoid genX(ResetQueryPool)(
7769f464c52Smaya    VkDevice                                    _device,
7779f464c52Smaya    VkQueryPool                                 queryPool,
7789f464c52Smaya    uint32_t                                    firstQuery,
7799f464c52Smaya    uint32_t                                    queryCount)
7809f464c52Smaya{
7819f464c52Smaya   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
7829f464c52Smaya
7839f464c52Smaya   for (uint32_t i = 0; i < queryCount; i++) {
7847ec681f3Smrg      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
7857ec681f3Smrg#if GFX_VER >= 8
7867ec681f3Smrg         for (uint32_t p = 0; p < pool->n_passes; p++) {
7877ec681f3Smrg            uint64_t *pass_slot = pool->bo->map +
7887ec681f3Smrg               khr_perf_query_availability_offset(pool, firstQuery + i, p);
7897ec681f3Smrg            *pass_slot = 0;
7907ec681f3Smrg         }
7917ec681f3Smrg#endif
7927ec681f3Smrg      } else {
7937ec681f3Smrg         uint64_t *slot = query_slot(pool, firstQuery + i);
7947ec681f3Smrg         *slot = 0;
7957ec681f3Smrg      }
79601e04c3fSmrg   }
79701e04c3fSmrg}
79801e04c3fSmrg
79901e04c3fSmrgstatic const uint32_t vk_pipeline_stat_to_reg[] = {
80001e04c3fSmrg   GENX(IA_VERTICES_COUNT_num),
80101e04c3fSmrg   GENX(IA_PRIMITIVES_COUNT_num),
80201e04c3fSmrg   GENX(VS_INVOCATION_COUNT_num),
80301e04c3fSmrg   GENX(GS_INVOCATION_COUNT_num),
80401e04c3fSmrg   GENX(GS_PRIMITIVES_COUNT_num),
80501e04c3fSmrg   GENX(CL_INVOCATION_COUNT_num),
80601e04c3fSmrg   GENX(CL_PRIMITIVES_COUNT_num),
80701e04c3fSmrg   GENX(PS_INVOCATION_COUNT_num),
80801e04c3fSmrg   GENX(HS_INVOCATION_COUNT_num),
80901e04c3fSmrg   GENX(DS_INVOCATION_COUNT_num),
81001e04c3fSmrg   GENX(CS_INVOCATION_COUNT_num),
81101e04c3fSmrg};
81201e04c3fSmrg
81301e04c3fSmrgstatic void
8147ec681f3Smrgemit_pipeline_stat(struct mi_builder *b, uint32_t stat,
81501e04c3fSmrg                   struct anv_address addr)
81601e04c3fSmrg{
81701e04c3fSmrg   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
81801e04c3fSmrg                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
81901e04c3fSmrg
82001e04c3fSmrg   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
8217ec681f3Smrg   mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
8229f464c52Smaya}
8239f464c52Smaya
8249f464c52Smayastatic void
8257ec681f3Smrgemit_xfb_query(struct mi_builder *b, uint32_t stream,
8269f464c52Smaya               struct anv_address addr)
8279f464c52Smaya{
8289f464c52Smaya   assert(stream < MAX_XFB_STREAMS);
8299f464c52Smaya
8307ec681f3Smrg   mi_store(b, mi_mem64(anv_address_add(addr, 0)),
8317ec681f3Smrg               mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
8327ec681f3Smrg   mi_store(b, mi_mem64(anv_address_add(addr, 16)),
8337ec681f3Smrg               mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
8347ec681f3Smrg}
8357ec681f3Smrg
8367ec681f3Smrgstatic void
8377ec681f3Smrgemit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
8387ec681f3Smrg                      struct anv_query_pool *pool,
8397ec681f3Smrg                      struct mi_builder *b,
8407ec681f3Smrg                      struct anv_address query_addr,
8417ec681f3Smrg                      bool end)
8427ec681f3Smrg{
8437ec681f3Smrg   const struct intel_perf_query_field_layout *layout =
8447ec681f3Smrg      &cmd_buffer->device->physical->perf->query_layout;
8457ec681f3Smrg   struct anv_address data_addr =
8467ec681f3Smrg      anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));
8477ec681f3Smrg
8487ec681f3Smrg   for (uint32_t f = 0; f < layout->n_fields; f++) {
8497ec681f3Smrg      const struct intel_perf_query_field *field =
8507ec681f3Smrg         &layout->fields[end ? f : (layout->n_fields - 1 - f)];
8517ec681f3Smrg
8527ec681f3Smrg      switch (field->type) {
8537ec681f3Smrg      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
8547ec681f3Smrg         anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
8557ec681f3Smrg            rpc.MemoryAddress = anv_address_add(data_addr, field->location);
8567ec681f3Smrg         }
8577ec681f3Smrg         break;
8587ec681f3Smrg
8597ec681f3Smrg      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
8607ec681f3Smrg      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
8617ec681f3Smrg      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
8627ec681f3Smrg      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
8637ec681f3Smrg         struct anv_address addr = anv_address_add(data_addr, field->location);
8647ec681f3Smrg         struct mi_value src = field->size == 8 ?
8657ec681f3Smrg            mi_reg64(field->mmio_offset) :
8667ec681f3Smrg            mi_reg32(field->mmio_offset);
8677ec681f3Smrg         struct mi_value dst = field->size == 8 ?
8687ec681f3Smrg            mi_mem64(addr) : mi_mem32(addr);
8697ec681f3Smrg         mi_store(b, dst, src);
8707ec681f3Smrg         break;
8717ec681f3Smrg      }
8727ec681f3Smrg
8737ec681f3Smrg      default:
8747ec681f3Smrg         unreachable("Invalid query field");
8757ec681f3Smrg         break;
8767ec681f3Smrg      }
8777ec681f3Smrg   }
87801e04c3fSmrg}
87901e04c3fSmrg
88001e04c3fSmrgvoid genX(CmdBeginQuery)(
88101e04c3fSmrg    VkCommandBuffer                             commandBuffer,
88201e04c3fSmrg    VkQueryPool                                 queryPool,
88301e04c3fSmrg    uint32_t                                    query,
88401e04c3fSmrg    VkQueryControlFlags                         flags)
8859f464c52Smaya{
8869f464c52Smaya   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
8879f464c52Smaya}
8889f464c52Smaya
8899f464c52Smayavoid genX(CmdBeginQueryIndexedEXT)(
8909f464c52Smaya    VkCommandBuffer                             commandBuffer,
8919f464c52Smaya    VkQueryPool                                 queryPool,
8929f464c52Smaya    uint32_t                                    query,
8939f464c52Smaya    VkQueryControlFlags                         flags,
8949f464c52Smaya    uint32_t                                    index)
89501e04c3fSmrg{
89601e04c3fSmrg   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
89701e04c3fSmrg   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
89801e04c3fSmrg   struct anv_address query_addr = anv_query_address(pool, query);
89901e04c3fSmrg
9007ec681f3Smrg   struct mi_builder b;
9017ec681f3Smrg   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
9029f464c52Smaya
90301e04c3fSmrg   switch (pool->type) {
90401e04c3fSmrg   case VK_QUERY_TYPE_OCCLUSION:
90501e04c3fSmrg      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
90601e04c3fSmrg      break;
90701e04c3fSmrg
90801e04c3fSmrg   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
90901e04c3fSmrg      /* TODO: This might only be necessary for certain stats */
91001e04c3fSmrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
91101e04c3fSmrg         pc.CommandStreamerStallEnable = true;
91201e04c3fSmrg         pc.StallAtPixelScoreboard = true;
91301e04c3fSmrg      }
91401e04c3fSmrg
91501e04c3fSmrg      uint32_t statistics = pool->pipeline_statistics;
91601e04c3fSmrg      uint32_t offset = 8;
91701e04c3fSmrg      while (statistics) {
91801e04c3fSmrg         uint32_t stat = u_bit_scan(&statistics);
9199f464c52Smaya         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
92001e04c3fSmrg         offset += 16;
92101e04c3fSmrg      }
92201e04c3fSmrg      break;
92301e04c3fSmrg   }
92401e04c3fSmrg
9259f464c52Smaya   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
9269f464c52Smaya      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
9279f464c52Smaya         pc.CommandStreamerStallEnable = true;
9289f464c52Smaya         pc.StallAtPixelScoreboard = true;
9299f464c52Smaya      }
9309f464c52Smaya      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
9319f464c52Smaya      break;
9329f464c52Smaya
9337ec681f3Smrg#if GFX_VER >= 8
9347ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
9357ec681f3Smrg      if (!khr_perf_query_ensure_relocs(cmd_buffer))
9367ec681f3Smrg         return;
9377ec681f3Smrg
9387ec681f3Smrg      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
9397ec681f3Smrg      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
9407ec681f3Smrg
9417ec681f3Smrg      uint32_t reloc_idx = 0;
9427ec681f3Smrg      for (uint32_t end = 0; end < 2; end++) {
9437ec681f3Smrg         for (uint32_t r = 0; r < layout->n_fields; r++) {
9447ec681f3Smrg            const struct intel_perf_query_field *field =
9457ec681f3Smrg               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
9467ec681f3Smrg            struct mi_value reg_addr =
9477ec681f3Smrg               mi_iadd(
9487ec681f3Smrg                  &b,
9497ec681f3Smrg                  mi_imm(intel_canonical_address(pool->bo->offset +
9507ec681f3Smrg                                                 khr_perf_query_data_offset(pool, query, 0, end) +
9517ec681f3Smrg                                                 field->location)),
9527ec681f3Smrg                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
9537ec681f3Smrg            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
9547ec681f3Smrg
9557ec681f3Smrg            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
9567ec681f3Smrg                field->size == 8) {
9577ec681f3Smrg               reg_addr =
9587ec681f3Smrg                  mi_iadd(
9597ec681f3Smrg                     &b,
9607ec681f3Smrg                     mi_imm(intel_canonical_address(pool->bo->offset +
9617ec681f3Smrg                                                    khr_perf_query_data_offset(pool, query, 0, end) +
9627ec681f3Smrg                                                    field->location + 4)),
9637ec681f3Smrg                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
9647ec681f3Smrg               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
9657ec681f3Smrg            }
9667ec681f3Smrg         }
9677ec681f3Smrg      }
9687ec681f3Smrg
9697ec681f3Smrg      struct mi_value availability_write_offset =
9707ec681f3Smrg         mi_iadd(
9717ec681f3Smrg            &b,
9727ec681f3Smrg            mi_imm(
9737ec681f3Smrg               intel_canonical_address(
9747ec681f3Smrg                  pool->bo->offset +
9757ec681f3Smrg                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
9767ec681f3Smrg            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
9777ec681f3Smrg      cmd_buffer->self_mod_locations[reloc_idx++] =
9787ec681f3Smrg         mi_store_address(&b, availability_write_offset);
9797ec681f3Smrg
9807ec681f3Smrg      assert(reloc_idx == pdevice->n_perf_query_commands);
9817ec681f3Smrg
9827ec681f3Smrg      mi_self_mod_barrier(&b);
9837ec681f3Smrg
9847ec681f3Smrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
9857ec681f3Smrg         pc.CommandStreamerStallEnable = true;
9867ec681f3Smrg         pc.StallAtPixelScoreboard = true;
9877ec681f3Smrg      }
9887ec681f3Smrg      cmd_buffer->perf_query_pool = pool;
9897ec681f3Smrg
9907ec681f3Smrg      cmd_buffer->perf_reloc_idx = 0;
9917ec681f3Smrg      for (uint32_t r = 0; r < layout->n_fields; r++) {
9927ec681f3Smrg         const struct intel_perf_query_field *field =
9937ec681f3Smrg            &layout->fields[layout->n_fields - 1 - r];
9947ec681f3Smrg         void *dws;
9957ec681f3Smrg
9967ec681f3Smrg         switch (field->type) {
9977ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
9987ec681f3Smrg            dws = anv_batch_emitn(&cmd_buffer->batch,
9997ec681f3Smrg                                  GENX(MI_REPORT_PERF_COUNT_length),
10007ec681f3Smrg                                  GENX(MI_REPORT_PERF_COUNT),
10017ec681f3Smrg                                  .MemoryAddress = query_addr /* Will be overwritten */);
10027ec681f3Smrg            _mi_resolve_address_token(&b,
10037ec681f3Smrg                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
10047ec681f3Smrg                                      dws +
10057ec681f3Smrg                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
10067ec681f3Smrg            break;
10077ec681f3Smrg
10087ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
10097ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
10107ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
10117ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
10127ec681f3Smrg            dws =
10137ec681f3Smrg               anv_batch_emitn(&cmd_buffer->batch,
10147ec681f3Smrg                               GENX(MI_STORE_REGISTER_MEM_length),
10157ec681f3Smrg                               GENX(MI_STORE_REGISTER_MEM),
10167ec681f3Smrg                               .RegisterAddress = field->mmio_offset,
10177ec681f3Smrg                               .MemoryAddress = query_addr /* Will be overwritten */ );
10187ec681f3Smrg            _mi_resolve_address_token(&b,
10197ec681f3Smrg                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
10207ec681f3Smrg                                      dws +
10217ec681f3Smrg                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
10227ec681f3Smrg            if (field->size == 8) {
10237ec681f3Smrg               dws =
10247ec681f3Smrg                  anv_batch_emitn(&cmd_buffer->batch,
10257ec681f3Smrg                                  GENX(MI_STORE_REGISTER_MEM_length),
10267ec681f3Smrg                                  GENX(MI_STORE_REGISTER_MEM),
10277ec681f3Smrg                                  .RegisterAddress = field->mmio_offset + 4,
10287ec681f3Smrg                                  .MemoryAddress = query_addr /* Will be overwritten */ );
10297ec681f3Smrg               _mi_resolve_address_token(&b,
10307ec681f3Smrg                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
10317ec681f3Smrg                                         dws +
10327ec681f3Smrg                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
10337ec681f3Smrg            }
10347ec681f3Smrg            break;
10357ec681f3Smrg
10367ec681f3Smrg         default:
10377ec681f3Smrg            unreachable("Invalid query field");
10387ec681f3Smrg            break;
10397ec681f3Smrg         }
10407ec681f3Smrg      }
10417ec681f3Smrg      break;
10427ec681f3Smrg   }
10437ec681f3Smrg#endif
10447ec681f3Smrg
10457ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
10467ec681f3Smrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
10477ec681f3Smrg         pc.CommandStreamerStallEnable = true;
10487ec681f3Smrg         pc.StallAtPixelScoreboard = true;
10497ec681f3Smrg      }
10507ec681f3Smrg      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
10517ec681f3Smrg      break;
10527ec681f3Smrg   }
10537ec681f3Smrg
105401e04c3fSmrg   default:
105501e04c3fSmrg      unreachable("");
105601e04c3fSmrg   }
105701e04c3fSmrg}
105801e04c3fSmrg
105901e04c3fSmrgvoid genX(CmdEndQuery)(
106001e04c3fSmrg    VkCommandBuffer                             commandBuffer,
106101e04c3fSmrg    VkQueryPool                                 queryPool,
106201e04c3fSmrg    uint32_t                                    query)
10639f464c52Smaya{
10649f464c52Smaya   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
10659f464c52Smaya}
10669f464c52Smaya
10679f464c52Smayavoid genX(CmdEndQueryIndexedEXT)(
10689f464c52Smaya    VkCommandBuffer                             commandBuffer,
10699f464c52Smaya    VkQueryPool                                 queryPool,
10709f464c52Smaya    uint32_t                                    query,
10719f464c52Smaya    uint32_t                                    index)
107201e04c3fSmrg{
107301e04c3fSmrg   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
107401e04c3fSmrg   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
107501e04c3fSmrg   struct anv_address query_addr = anv_query_address(pool, query);
107601e04c3fSmrg
10777ec681f3Smrg   struct mi_builder b;
10787ec681f3Smrg   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
10799f464c52Smaya
108001e04c3fSmrg   switch (pool->type) {
108101e04c3fSmrg   case VK_QUERY_TYPE_OCCLUSION:
108201e04c3fSmrg      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
10839f464c52Smaya      emit_query_pc_availability(cmd_buffer, query_addr, true);
108401e04c3fSmrg      break;
108501e04c3fSmrg
108601e04c3fSmrg   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
108701e04c3fSmrg      /* TODO: This might only be necessary for certain stats */
108801e04c3fSmrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
108901e04c3fSmrg         pc.CommandStreamerStallEnable = true;
109001e04c3fSmrg         pc.StallAtPixelScoreboard = true;
109101e04c3fSmrg      }
109201e04c3fSmrg
109301e04c3fSmrg      uint32_t statistics = pool->pipeline_statistics;
109401e04c3fSmrg      uint32_t offset = 16;
109501e04c3fSmrg      while (statistics) {
109601e04c3fSmrg         uint32_t stat = u_bit_scan(&statistics);
10979f464c52Smaya         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
109801e04c3fSmrg         offset += 16;
109901e04c3fSmrg      }
110001e04c3fSmrg
11019f464c52Smaya      emit_query_mi_availability(&b, query_addr, true);
110201e04c3fSmrg      break;
110301e04c3fSmrg   }
110401e04c3fSmrg
11059f464c52Smaya   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
11069f464c52Smaya      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
11079f464c52Smaya         pc.CommandStreamerStallEnable = true;
11089f464c52Smaya         pc.StallAtPixelScoreboard = true;
11099f464c52Smaya      }
11109f464c52Smaya
11119f464c52Smaya      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
11129f464c52Smaya      emit_query_mi_availability(&b, query_addr, true);
11139f464c52Smaya      break;
11149f464c52Smaya
11157ec681f3Smrg#if GFX_VER >= 8
11167ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
11177ec681f3Smrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
11187ec681f3Smrg         pc.CommandStreamerStallEnable = true;
11197ec681f3Smrg         pc.StallAtPixelScoreboard = true;
11207ec681f3Smrg      }
11217ec681f3Smrg      cmd_buffer->perf_query_pool = pool;
11227ec681f3Smrg
11237ec681f3Smrg      if (!khr_perf_query_ensure_relocs(cmd_buffer))
11247ec681f3Smrg         return;
11257ec681f3Smrg
11267ec681f3Smrg      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
11277ec681f3Smrg      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
11287ec681f3Smrg
11297ec681f3Smrg      void *dws;
11307ec681f3Smrg      for (uint32_t r = 0; r < layout->n_fields; r++) {
11317ec681f3Smrg         const struct intel_perf_query_field *field = &layout->fields[r];
11327ec681f3Smrg
11337ec681f3Smrg         switch (field->type) {
11347ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
11357ec681f3Smrg            dws = anv_batch_emitn(&cmd_buffer->batch,
11367ec681f3Smrg                                  GENX(MI_REPORT_PERF_COUNT_length),
11377ec681f3Smrg                                  GENX(MI_REPORT_PERF_COUNT),
11387ec681f3Smrg                                  .MemoryAddress = query_addr /* Will be overwritten */);
11397ec681f3Smrg            _mi_resolve_address_token(&b,
11407ec681f3Smrg                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
11417ec681f3Smrg                                      dws +
11427ec681f3Smrg                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
11437ec681f3Smrg            break;
11447ec681f3Smrg
11457ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
11467ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
11477ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
11487ec681f3Smrg         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
11497ec681f3Smrg            dws =
11507ec681f3Smrg               anv_batch_emitn(&cmd_buffer->batch,
11517ec681f3Smrg                               GENX(MI_STORE_REGISTER_MEM_length),
11527ec681f3Smrg                               GENX(MI_STORE_REGISTER_MEM),
11537ec681f3Smrg                               .RegisterAddress = field->mmio_offset,
11547ec681f3Smrg                               .MemoryAddress = query_addr /* Will be overwritten */ );
11557ec681f3Smrg            _mi_resolve_address_token(&b,
11567ec681f3Smrg                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
11577ec681f3Smrg                                      dws +
11587ec681f3Smrg                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
11597ec681f3Smrg            if (field->size == 8) {
11607ec681f3Smrg               dws =
11617ec681f3Smrg                  anv_batch_emitn(&cmd_buffer->batch,
11627ec681f3Smrg                                  GENX(MI_STORE_REGISTER_MEM_length),
11637ec681f3Smrg                                  GENX(MI_STORE_REGISTER_MEM),
11647ec681f3Smrg                                  .RegisterAddress = field->mmio_offset + 4,
11657ec681f3Smrg                                  .MemoryAddress = query_addr /* Will be overwritten */ );
11667ec681f3Smrg               _mi_resolve_address_token(&b,
11677ec681f3Smrg                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
11687ec681f3Smrg                                         dws +
11697ec681f3Smrg                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
11707ec681f3Smrg            }
11717ec681f3Smrg            break;
11727ec681f3Smrg
11737ec681f3Smrg         default:
11747ec681f3Smrg            unreachable("Invalid query field");
11757ec681f3Smrg            break;
11767ec681f3Smrg         }
11777ec681f3Smrg      }
11787ec681f3Smrg
11797ec681f3Smrg      dws =
11807ec681f3Smrg         anv_batch_emitn(&cmd_buffer->batch,
11817ec681f3Smrg                         GENX(MI_STORE_DATA_IMM_length),
11827ec681f3Smrg                         GENX(MI_STORE_DATA_IMM),
11837ec681f3Smrg                         .ImmediateData = true);
11847ec681f3Smrg      _mi_resolve_address_token(&b,
11857ec681f3Smrg                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
11867ec681f3Smrg                                dws +
11877ec681f3Smrg                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);
11887ec681f3Smrg
11897ec681f3Smrg      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
11907ec681f3Smrg      break;
11917ec681f3Smrg   }
11927ec681f3Smrg#endif
11937ec681f3Smrg
11947ec681f3Smrg   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
11957ec681f3Smrg      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
11967ec681f3Smrg         pc.CommandStreamerStallEnable = true;
11977ec681f3Smrg         pc.StallAtPixelScoreboard = true;
11987ec681f3Smrg      }
11997ec681f3Smrg      uint32_t marker_offset = intel_perf_marker_offset();
12007ec681f3Smrg      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
12017ec681f3Smrg                   mi_imm(cmd_buffer->intel_perf_marker));
12027ec681f3Smrg      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
12037ec681f3Smrg      emit_query_mi_availability(&b, query_addr, true);
12047ec681f3Smrg      break;
12057ec681f3Smrg   }
12067ec681f3Smrg
120701e04c3fSmrg   default:
120801e04c3fSmrg      unreachable("");
120901e04c3fSmrg   }
121001e04c3fSmrg
121101e04c3fSmrg   /* When multiview is active the spec requires that N consecutive query
121201e04c3fSmrg    * indices are used, where N is the number of active views in the subpass.
121301e04c3fSmrg    * The spec allows that we only write the results to one of the queries
121401e04c3fSmrg    * but we still need to manage result availability for all the query indices.
121501e04c3fSmrg    * Since we only emit a single query for all active views in the
121601e04c3fSmrg    * first index, mark the other query indices as being already available
121701e04c3fSmrg    * with result 0.
121801e04c3fSmrg    */
121901e04c3fSmrg   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
122001e04c3fSmrg      const uint32_t num_queries =
122101e04c3fSmrg         util_bitcount(cmd_buffer->state.subpass->view_mask);
122201e04c3fSmrg      if (num_queries > 1)
12239f464c52Smaya         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
122401e04c3fSmrg   }
122501e04c3fSmrg}
122601e04c3fSmrg
122701e04c3fSmrg#define TIMESTAMP 0x2358
122801e04c3fSmrg
/**
 * Implementation of vkCmdWriteTimestamp2KHR().
 *
 * Writes a 64-bit GPU timestamp into the query slot (qword at offset 8 from
 * the slot base) and then marks the query available.  A top-of-pipe request
 * is satisfied by copying the TIMESTAMP register (0x2358) with an MI store;
 * every other stage is treated as bottom-of-pipe and uses a PIPE_CONTROL
 * post-sync timestamp write.
 */
void genX(CmdWriteTimestamp2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags2KHR                    stage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) {
      /* Top-of-pipe: snapshot the command streamer TIMESTAMP register
       * directly; no pipeline flush is needed since nothing has to drain.
       */
      mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
                   mi_reg64(TIMESTAMP));
   } else {
      /* Everything else is bottom-of-pipe */
      /* Flush any pending work first so the post-sync write observes it. */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         /* NOTE(review): extra CS stall only on gfx9 GT4 — presumably a
          * hardware workaround for post-sync writes on that SKU; confirm
          * against the workaround database.
          */
         if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
   }

   /* Mark the query slot available. */
   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
127901e04c3fSmrg
12807ec681f3Smrg#if GFX_VERx10 >= 75
12817ec681f3Smrg
12827ec681f3Smrg#define MI_PREDICATE_SRC0    0x2400
12837ec681f3Smrg#define MI_PREDICATE_SRC1    0x2408
12847ec681f3Smrg#define MI_PREDICATE_RESULT  0x2418
12857ec681f3Smrg
12867ec681f3Smrg/**
12877ec681f3Smrg * Writes the results of a query to dst_addr is the value at poll_addr is equal
12887ec681f3Smrg * to the reference value.
12897ec681f3Smrg */
12907ec681f3Smrgstatic void
12917ec681f3Smrggpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
12927ec681f3Smrg                            struct mi_builder *b,
12937ec681f3Smrg                            struct anv_address poll_addr,
12947ec681f3Smrg                            struct anv_address dst_addr,
12957ec681f3Smrg                            uint64_t ref_value,
12967ec681f3Smrg                            VkQueryResultFlags flags,
12977ec681f3Smrg                            uint32_t value_index,
12987ec681f3Smrg                            struct mi_value query_result)
12997ec681f3Smrg{
13007ec681f3Smrg   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
13017ec681f3Smrg   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
13027ec681f3Smrg   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
13037ec681f3Smrg      mip.LoadOperation    = LOAD_LOAD;
13047ec681f3Smrg      mip.CombineOperation = COMBINE_SET;
13057ec681f3Smrg      mip.CompareOperation = COMPARE_SRCS_EQUAL;
13067ec681f3Smrg   }
13077ec681f3Smrg
13087ec681f3Smrg   if (flags & VK_QUERY_RESULT_64_BIT) {
13097ec681f3Smrg      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
13107ec681f3Smrg      mi_store_if(b, mi_mem64(res_addr), query_result);
13117ec681f3Smrg   } else {
13127ec681f3Smrg      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
13137ec681f3Smrg      mi_store_if(b, mi_mem32(res_addr), query_result);
13147ec681f3Smrg   }
13157ec681f3Smrg}
131601e04c3fSmrg
131701e04c3fSmrgstatic void
13187ec681f3Smrggpu_write_query_result(struct mi_builder *b,
131901e04c3fSmrg                       struct anv_address dst_addr,
132001e04c3fSmrg                       VkQueryResultFlags flags,
13219f464c52Smaya                       uint32_t value_index,
13227ec681f3Smrg                       struct mi_value query_result)
132301e04c3fSmrg{
132401e04c3fSmrg   if (flags & VK_QUERY_RESULT_64_BIT) {
13259f464c52Smaya      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
13267ec681f3Smrg      mi_store(b, mi_mem64(res_addr), query_result);
132701e04c3fSmrg   } else {
13289f464c52Smaya      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
13297ec681f3Smrg      mi_store(b, mi_mem32(res_addr), query_result);
133001e04c3fSmrg   }
133101e04c3fSmrg}
133201e04c3fSmrg
13337ec681f3Smrgstatic struct mi_value
13347ec681f3Smrgcompute_query_result(struct mi_builder *b, struct anv_address addr)
133501e04c3fSmrg{
13367ec681f3Smrg   return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
13377ec681f3Smrg                     mi_mem64(anv_address_add(addr, 0)));
133801e04c3fSmrg}
133901e04c3fSmrg
/**
 * Implementation of vkCmdCopyQueryPoolResults() (gfx7.5+ path).
 *
 * Copies queryCount results starting at firstQuery into destBuffer on the
 * GPU timeline using MI commands.  For occlusion queries the stores are
 * predicated on the slot's availability qword so that unavailable queries
 * follow the PARTIAL_BIT rules; other query types are copied unconditionally.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "CopyQueryPoolResults");
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        *  From the vulkan spec:
        *
        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *     previous uses of vkCmdResetQueryPool in the same queue, without
        *     any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "CopyQueryPoolResults");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      /* idx counts the 32/64-bit value slots written for this query so far;
       * the optional availability value lands after all result values.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         /* Begin/end snapshots live at offsets 8 and 16 of the slot. */
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
               1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                  0 /* unavailable */, flags, idx, mi_imm(0));
         }
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            /* Each enabled statistic is a 16-byte begin/end pair starting
             * at slot offset 8.
             */
            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.ver == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         /* Two begin/end pairs per slot, at offsets 8 and 24. */
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamps are a single raw value at slot offset 8. */
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;
#endif

      default:
         unreachable("unhandled query type");
      }

      /* The availability qword sits at slot offset 0. */
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}
146101e04c3fSmrg
146201e04c3fSmrg#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   /* Fallback compiled when GFX_VERx10 < 75: the GPU-side copy path is not
    * implemented for these platforms, so just log the missing feature.
    */
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
147501e04c3fSmrg#endif
1476