genX_query.c revision 9f464c52
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together.  The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
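   /* For example, an occlusion query slot ends up holding three uint64_t
    * values, so pool->stride works out to 3 * 8 = 24 bytes:
    *
    *    slot[0]: availability (0 = unavailable, 1 = available)
    *    slot[1]: PS_DEPTH_COUNT snapshot taken at vkCmdBeginQuery
    *    slot[2]: PS_DEPTH_COUNT snapshot taken at vkCmdEndQuery
    *
    * and the occlusion result reported is slot[2] - slot[1].
    */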
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries store a begin and an end value for every
       * requested statistic.
       */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end for primitives
       * written and begin/end for primitives needed.
       */
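      /* Slot layout, matching the +8/+16 offsets passed to emit_xfb_query
       * at begin/end and the subtractions in GetQueryPoolResults:
       *
       *    slot[0]: availability
       *    slot[1], slot[2]: SO_NUM_PRIMS_WRITTEN at begin/end
       *    slot[3], slot[4]: SO_PRIM_STORAGE_NEEDED at begin/end
       */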
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED.  On LLC
    * platforms, this does nothing.  On non-LLC platforms, this means snooping
    * which comes at a slight cost.  However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

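/* The availability qword is written asynchronously by the GPU, so read it
 * through a volatile pointer to make sure every poll fetches fresh memory.
 */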
static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen due to a client error, where the client calls
             * GetQueryPoolResults on a query that was never submitted to the
             * GPU.  The spec allows us to do anything in this case, but
             * returning VK_SUCCESS doesn't seem right and we shouldn't just
             * keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
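
/* As a purely illustrative example (application code, not part of the
 * driver), one 64-bit occlusion result plus its availability word would be
 * read back through the entry point above with:
 *
 *    uint64_t results[2];
 *    vkGetQueryPoolResults(device, pool, 0, 1, sizeof(results), results,
 *                          sizeof(results),
 *                          VK_QUERY_RESULT_64_BIT |
 *                          VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 *
 * after which results[0] holds the occlusion count and results[1] the
 * availability value.
 */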

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

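/* Availability is written through one of two mechanisms, matching however
 * the query values themselves are written: an MI_STORE_DATA_IMM from the
 * command streamer (via the mi_builder) for pipeline-statistics and
 * transform-feedback queries, or a PIPE_CONTROL post-sync write for
 * occlusion and timestamp queries.  Using the same mechanism for the
 * availability write as for the value writes avoids having to synchronize
 * between the two (see also emit_zero_queries below).
 */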
static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * zeroing all the query values and marking each query as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

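/* Host-side reset for VK_EXT_host_query_reset.  Only the availability qword
 * needs clearing here: the value qwords are rewritten by the GPU the next
 * time the query is used.
 */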
void genX(ResetQueryPoolEXT)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
      *slot = 0;
   }
}

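/* Indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag, so the order must match the bit
 * order defined by the Vulkan spec; the STATIC_ASSERT in emit_pipeline_stat
 * checks that the array covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */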
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

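/* Snapshots both transform feedback counters for the given stream.  The
 * per-stream counter registers are 64 bits wide and laid out back to back,
 * hence the stream * 8 register offset.  Callers pass query_addr + 8 at
 * begin and query_addr + 16 at end, so the +0/+16 destination offsets here
 * produce the interleaved begin/end layout described in CreateQueryPool.
 */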
static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write results to only one of the queries, but we
    * still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as already available with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

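/* MMIO offset of the command streamer TIMESTAMP register, read directly
 * below for top-of-pipe timestamps.
 */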
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                       gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write results to only one of the queries, but we
    * still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as already available with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

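/* The GPU-side copy path below relies on MI_MATH arithmetic (gen_mi_isub,
 * gen_mi_ushr32_imm), which is only available on Haswell and Gen8+; earlier
 * gens fall through to the stub at the end of the file.
 */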
#if GEN_GEN > 7 || GEN_IS_HASWELL

static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

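/* Returns end - begin for a begin/end pair of 64-bit counters, where addr
 * points at the begin value and the end value lives 8 bytes above it.  For
 * example, passing query_addr + 8 for an occlusion slot yields
 * slot[2] - slot[1].
 */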
static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                         gen_mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy the values using MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        *  From the Vulkan spec:
        *
        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *     previous uses of vkCmdResetQueryPool in the same queue, without
        *     any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif