genX_query.c revision 01e04c3f
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together.  The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
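   /* Concretely, given the switch below, the slot layouts are:
    *
    *    occlusion:  { available, begin, end }
    *    timestamp:  { available, timestamp }
    *    statistics: { available, then one (begin, end) pair per enabled
    *                  statistic bit }
    */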
53   uint32_t uint64s_per_slot = 1;
54
55   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
56   switch (pCreateInfo->queryType) {
57   case VK_QUERY_TYPE_OCCLUSION:
58      /* Occlusion queries have two values: begin and end. */
59      uint64s_per_slot += 2;
60      break;
61   case VK_QUERY_TYPE_TIMESTAMP:
62      /* Timestamps just have the one timestamp value */
63      uint64s_per_slot += 1;
64      break;
65   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
66      pipeline_statistics = pCreateInfo->pipelineStatistics;
67      /* We're going to trust this field implicitly so we need to ensure that
68       * no unhandled extension bits leak in.
69       */
70      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
71
      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED.  On LLC
    * platforms, this does nothing.  On non-LLC platforms, this means snooping
    * which comes at a slight cost.  However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

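/* Spin until the query's availability value becomes non-zero.  anv_gem_busy()
 * tells us when the GPU is done with the pool's BO; if the query still isn't
 * available at that point it was never ended/submitted, so we return
 * VK_NOT_READY instead of spinning forever.
 */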
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet.  The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
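      /* Decode the slot according to the layout chosen in CreateQueryPool:
       * slot[0] is availability, occlusion and statistics results are
       * end - begin deltas, and timestamps are read directly from slot[1].
       */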
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_srm32(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.MemoryAddress    = addr;
      srm.RegisterAddress  = reg;
   }
}

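/* A 64-bit register is stored with two 32-bit MI_STORE_REGISTER_MEM writes:
 * the low DWord from reg and the high DWord from reg + 4.
 */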
static void
emit_srm64(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   emit_srm32(batch, anv_address_add(addr, 0), reg + 0);
   emit_srm32(batch, anv_address_add(addr, 4), reg + 4);
}

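/* Capture the pipeline's PS_DEPTH_COUNT statistic (the basis of occlusion
 * queries) to memory.  The depth stall is there so the count reflects all
 * previously submitted rendering before the post-sync write happens.
 */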
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

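/* Mark a query as available by writing 1 to the first value of its slot as a
 * PIPE_CONTROL post-sync immediate-data write.
 */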
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all slot values to 0 and marking each query as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   for (uint32_t i = 0; i < num_queries; i++) {
      struct anv_address slot_addr =
         anv_query_address(pool, first_index + i);
      genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
                                 0, pool->stride - 8);
      emit_query_availability(cmd_buffer, slot_addr);
   }
}

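/* Resetting a query only clears the availability value at the start of the
 * slot; the result values are simply rewritten the next time the query is
 * used.
 */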
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = anv_query_address(pool, firstQuery + i);
         sdm.ImmediateData = 0;
      }
   }
}

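/* Maps each VkQueryPipelineStatisticFlagBits bit position to the MMIO counter
 * register backing it; emit_pipeline_stat() indexes this table directly by
 * bit position, so the order here must match the Vulkan flag bit order.
 */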
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   emit_srm64(&cmd_buffer->batch, addr, vk_pipeline_stat_to_reg[stat]);
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

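   /* "Begin" counter values are written starting at offset 8 in the slot;
    * the matching "end" values are written by CmdEndQuery 8 bytes further
    * on, so GetQueryPoolResults can compute each result as end - begin.
    */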
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_availability(cmd_buffer, query_addr);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active, the spec requires that N consecutive query
    * indices be used, where N is the number of active views in the subpass.
    * The spec allows us to write results to only one of the queries, but we
    * still need to manage result availability for all N query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the remaining query indices as already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

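/* MMIO offset of the command streamer's timestamp register. */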
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      emit_srm64(&cmd_buffer->batch, anv_address_add(query_addr, 8), TIMESTAMP);
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, query_addr);

   /* When multiview is active, the spec requires that N consecutive query
    * indices be used, where N is the number of active views in the subpass.
    * The spec allows us to write results to only one of the queries, but we
    * still need to manage result availability for all N query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the remaining query indices as already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL
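/* The GPU-side result copy below relies on MI_MATH, which isn't available
 * before Haswell; earlier gens fall through to the stub implementation at
 * the bottom of the file.
 */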

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

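/* The command streamer's general purpose registers (GPRs) are 64 bits wide
 * and live at MMIO offset 0x2600 + 8 * n.
 */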
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_address addr)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg;
      lrm.MemoryAddress    = anv_address_add(addr, 0);
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = anv_address_add(addr, 4);
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress      = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
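   /* The MI_MATH ALU has no shift opcode on these gens, so each 1-bit shift
    * is done as GPR0 = GPR0 + GPR0.
    */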
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
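/* This works by masking GPR0 down to 34 bits, shifting it left by 30 so the
 * desired result lands in the upper DWord, and then moving that DWord into
 * the lower half while zeroing the upper half.
 */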
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      emit_srm64(batch, anv_address_add(dst_addr, value_index * 8), reg);
   } else {
      emit_srm32(batch, anv_address_add(dst_addr, value_index * 4), reg);
   }
}

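/* Load the (begin, end) pair at addr and leave end - begin in dst_reg, using
 * GPR0 and GPR1 as scratch for the MI_MATH subtraction.
 */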
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_address addr)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), anv_address_add(addr, 0));
   emit_load_alu_reg_u64(batch, CS_GPR(1), anv_address_add(addr, 8));

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 anv_address_add(query_addr, idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                   flags, idx++, CS_GPR(0));
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), query_addr);
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx, CS_GPR(0));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif