iris_query.c revision 9f464c52
1/*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23/**
24 * @file iris_query.c
25 *
26 * Query object support.  This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30#include <stdio.h>
31#include <errno.h>
32#include "pipe/p_defines.h"
33#include "pipe/p_state.h"
34#include "pipe/p_context.h"
35#include "pipe/p_screen.h"
36#include "util/fast_idiv_by_const.h"
37#include "util/u_inlines.h"
38#include "util/u_upload_mgr.h"
39#include "iris_context.h"
40#include "iris_defines.h"
41#include "iris_fence.h"
42#include "iris_resource.h"
43#include "iris_screen.h"
44#include "vulkan/util/vk_util.h"
45
46#define IA_VERTICES_COUNT          0x2310
47#define IA_PRIMITIVES_COUNT        0x2318
48#define VS_INVOCATION_COUNT        0x2320
49#define HS_INVOCATION_COUNT        0x2300
50#define DS_INVOCATION_COUNT        0x2308
51#define GS_INVOCATION_COUNT        0x2328
52#define GS_PRIMITIVES_COUNT        0x2330
53#define CL_INVOCATION_COUNT        0x2338
54#define CL_PRIMITIVES_COUNT        0x2340
55#define PS_INVOCATION_COUNT        0x2348
56#define CS_INVOCATION_COUNT        0x2290
57#define PS_DEPTH_COUNT             0x2350
58
59#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)
60
61#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)
62
63#define MI_MATH (0x1a << 23)
64
65#define MI_ALU_LOAD      0x080
66#define MI_ALU_LOADINV   0x480
67#define MI_ALU_LOAD0     0x081
68#define MI_ALU_LOAD1     0x481
69#define MI_ALU_ADD       0x100
70#define MI_ALU_SUB       0x101
71#define MI_ALU_AND       0x102
72#define MI_ALU_OR        0x103
73#define MI_ALU_XOR       0x104
74#define MI_ALU_STORE     0x180
75#define MI_ALU_STOREINV  0x580
76
77#define MI_ALU_R0        0x00
78#define MI_ALU_R1        0x01
79#define MI_ALU_R2        0x02
80#define MI_ALU_R3        0x03
81#define MI_ALU_R4        0x04
82#define MI_ALU_SRCA      0x20
83#define MI_ALU_SRCB      0x21
84#define MI_ALU_ACCU      0x31
85#define MI_ALU_ZF        0x32
86#define MI_ALU_CF        0x33
87
88#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))
89
90#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
91#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
92#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
93
94#define MI_ALU0(op)        _MI_ALU0(op)
95#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
96#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
97
98#define emit_lri32 ice->vtbl.load_register_imm32
99#define emit_lri64 ice->vtbl.load_register_imm64
100#define emit_lrr32 ice->vtbl.load_register_reg32
101
struct iris_query {
   /** PIPE_QUERY_* type of this query. */
   enum pipe_query_type type;

   /** Sub-index: pipeline statistic counter or vertex stream number. */
   int index;

   /** Has the result been computed (and stored in \c result)? */
   bool ready;

   /** Has a stalling flush already been emitted for this query?
    *  (Set by write_value() for non-pipelined queries.)
    */
   bool stalled;

   /** Final result value; only valid once \c ready is set. */
   uint64_t result;

   /** Backing storage for the snapshots (sub-allocated buffer + offset). */
   struct iris_state_ref query_state_ref;

   /** CPU mapping of the snapshot storage. */
   struct iris_query_snapshots *map;

   /** Fence for the batch that writes the snapshots (see iris_end_query). */
   struct iris_syncpt *syncpt;

   /** Which batch writes this query: IRIS_BATCH_RENDER or _COMPUTE. */
   int batch_idx;
};
118
struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
   uint64_t predicate_result;

   /** Have the start/end snapshots landed?  Written after the snapshots
    *  (see mark_available), so nonzero implies start/end are valid.
    */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};
130
struct iris_query_so_overflow {
   /* The first two fields must line up with iris_query_snapshots:
    * mark_available() writes snapshots_landed through that struct's
    * offsets regardless of query type.
    */
   uint64_t predicate_result;
   uint64_t snapshots_landed;

   /* Per-stream begin ([0]) and end ([1]) snapshots of the
    * storage-needed and primitives-written SO counters.
    */
   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
140
141/**
142 * Is this type of query written by PIPE_CONTROL?
143 */
144static bool
145iris_is_query_pipelined(struct iris_query *q)
146{
147   switch (q->type) {
148   case PIPE_QUERY_OCCLUSION_COUNTER:
149   case PIPE_QUERY_OCCLUSION_PREDICATE:
150   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
151   case PIPE_QUERY_TIMESTAMP:
152   case PIPE_QUERY_TIMESTAMP_DISJOINT:
153   case PIPE_QUERY_TIME_ELAPSED:
154      return true;
155
156   default:
157      return false;
158   }
159}
160
161static void
162mark_available(struct iris_context *ice, struct iris_query *q)
163{
164   struct iris_batch *batch = &ice->batches[q->batch_idx];
165   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
166   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
167   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
168   offset += q->query_state_ref.offset;
169
170   if (!iris_is_query_pipelined(q)) {
171      ice->vtbl.store_data_imm64(batch, bo, offset, true);
172   } else {
173      /* Order available *after* the query results. */
174      flags |= PIPE_CONTROL_FLUSH_ENABLE;
175      iris_emit_pipe_control_write(batch, flags, bo, offset, true);
176   }
177}
178
179/**
180 * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
181 */
182static void
183iris_pipelined_write(struct iris_batch *batch,
184                     struct iris_query *q,
185                     enum pipe_control_flags flags,
186                     unsigned offset)
187{
188   const struct gen_device_info *devinfo = &batch->screen->devinfo;
189   const unsigned optional_cs_stall =
190      devinfo->gen == 9 && devinfo->gt == 4 ?  PIPE_CONTROL_CS_STALL : 0;
191   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
192
193   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
194                                bo, offset, 0ull);
195}
196
/**
 * Record a counter snapshot for query \p q at byte \p offset within its
 * query buffer, using whichever mechanism the query type requires
 * (PIPE_CONTROL post-sync write vs. MI register store).
 */
static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!iris_is_query_pipelined(q)) {
      /* MI register reads aren't ordered against in-flight rendering, so
       * stall first.  Remember that we stalled so result readers know the
       * value will already be coherent.
       */
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      /* Stream 0 snapshots the clipper invocation count; nonzero streams
       * snapshot SO_PRIM_STORAGE_NEEDED for that stream.
       */
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      /* Indexed by q->index, which callers supply as a PIPE_STAT_QUERY_*
       * value (see the PS/CS checks elsewhere in this file), so this table
       * must stay in that enum's order.
       */
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}
268
/**
 * Snapshot the streamout counters needed for SO overflow predicates.
 *
 * For each stream covered by the query (one for SO_OVERFLOW_PREDICATE,
 * all four for SO_OVERFLOW_ANY_PREDICATE), record SO_NUM_PRIMS_WRITTEN
 * and SO_PRIM_STORAGE_NEEDED into the begin (end == false) or end
 * (end == true) slots of the overflow snapshot struct.
 */
static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   /* MI register reads aren't ordered against rendering; stall first. */
   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offset + offsetof(struct iris_query_so_overflow,
                           stream[s].num_prims[end]);
      int w_idx = offset + offsetof(struct iris_query_so_overflow,
                           stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     bo, w_idx, false);
   }
}
292
293uint64_t
294iris_timebase_scale(const struct gen_device_info *devinfo,
295                    uint64_t gpu_timestamp)
296{
297   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
298}
299
300static uint64_t
301iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
302{
303   if (time0 > time1) {
304      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
305   } else {
306      return time1 - time0;
307   }
308}
309
310static bool
311stream_overflowed(struct iris_query_so_overflow *so, int s)
312{
313   return (so->stream[s].prim_storage_needed[1] -
314           so->stream[s].prim_storage_needed[0]) !=
315          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
316}
317
/**
 * Compute q->result on the CPU from the mapped snapshots and mark the
 * query ready.  Callers must ensure the snapshots have landed first.
 */
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      /* Predicate form: did any samples pass at all? */
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      /* Subtract raw ticks (handling counter wrap), then scale to ns. */
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* For overflow queries, q->map actually holds an
       * iris_query_so_overflow layout.
       */
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      /* Most queries are a simple end - start counter delta. */
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}
363
364static void
365emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
366             unsigned reg_a, unsigned reg_b)
367{
368   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
369
370   math[0] = MI_MATH | (5 - 2);
371   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
372   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
373   math[3] = _MI_ALU0(ADD);
374   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
375}
376
377static void
378emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
379             unsigned src_reg, unsigned shift)
380{
381   assert(shift > 0);
382
383   int dwords = 1 + 4 * shift;
384
385   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
386
387   math[0] = MI_MATH | ((1 + 4 * shift) - 2);
388
389   for (unsigned i = 0; i < shift; i++) {
390      unsigned add_src = (i == 0) ? src_reg : dst_reg;
391      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
392      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
393      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
394      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
395   }
396}
397
/* Emit ALU dwords to multiply GPR0 by N using shift-and-add, with GPR1
 * as scratch.  When \p dw is NULL, vk_outarray only counts, so this
 * returns the required dword count in *dw_count without writing anything.
 */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   /* Binary multiplication: walk N's bits from the top, doubling each
    * step and adding the original value for every set bit below the top.
    */
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}
434
435static void
436emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
437{
438   uint32_t num_dwords;
439   build_alu_multiply_gpr0(NULL, &num_dwords, N);
440
441   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
442   math[0] = MI_MATH | (num_dwords - 2);
443   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
444}
445
/**
 * Emit commands computing GPR0 = GPR0 / D (unsigned division by the
 * constant D): shifts for powers of two, otherwise a fast-division
 * multiply/shift sequence from util_compute_fast_udiv_info().
 */
void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 so the 64-bit ALU sees a clean value. */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* Division by zero is invalid, but we should do something, so we
       * set GPR0 to 0.
       */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
       * the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* The fast-division "increment" means computing
          * multiplier * (x + 1) == x * multiplier + multiplier, so add
          * the multiplier (staged in GPR1) to the product.
          */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by L by left-shifting by 32 - l and taking the top
          * 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}
505
/**
 * GPR0 = GPR0 + x.
 *
 * NOTE(review): only the low dword of GPR1 is written here before the
 * 64-bit ALU add — this assumes GPR1's upper dword is zero (or that
 * callers only care about the low 32 bits); confirm against callers.
 */
void
iris_math_add32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t x)
{
   emit_lri32(batch, CS_GPR(1), x);
   emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
}
514
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* R1 holds the constant 1 used to normalize the result below. */
   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      /* R0 + 0 sets the ALU zero flag (ZF) iff R0 == 0. */
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      /* R0 = ~ZF, then AND with 1 (in R1) to get exactly 0 or 1. */
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
538
/**
 * Load stream \p idx's four overflow snapshots into CS GPRs 1-4 so
 * calc_overflow_for_stream() can operate on them.
 */
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   /* GPR1/GPR2: begin/end snapshots of primitives needing storage. */
   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   /* GPR3/GPR4: begin/end snapshots of primitives actually written. */
   ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}
562
/*
 * Using the GPRs filled by load_overflow_data_to_cs_gprs(), compute:
 *
 * R3 = R4 - R3;    (delta of primitives written)
 * R1 = R2 - R1;    (delta of primitives needing storage)
 * R1 = R3 - R1;    (nonzero iff the stream overflowed)
 * R0 = R0 | R1;    (accumulate "overflowed" across streams)
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}
595
596static void
597overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
598{
599   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
600
601   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
602
603   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
604      load_overflow_data_to_cs_gprs(ice, q, q->index);
605      calc_overflow_for_stream(ice);
606   } else {
607      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
608         load_overflow_data_to_cs_gprs(ice, q, i);
609         calc_overflow_for_stream(ice);
610      }
611   }
612
613   gpr0_to_bool(ice);
614}
615
/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 *
 * Callers pass n < 64 (currently 34 and 36); n == 64 would be an
 * out-of-range shift.
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* Load the mask into R1, then R0 &= R1 via one MI_MATH sequence. */
   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}
634
/*
 * GPR0 = GPR0 << 30;
 *
 * Implemented as 30 successive doublings (R0 += R0), batched as five
 * MI_MATH commands of six doubling sequences each.
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   /* One doubling: R0 = R0 + R0 (four ALU dwords). */
   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   /* 5 commands x 6 doublings each = a 30-bit shift. */
   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      /* MI_MATH's length field is the total dword count minus two. */
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}
666
667/*
668 * GPR0 = GPR0 >> 2;
669 *
670 * Note that the upper 30 bits of GPR0 are lost!
671 */
672static void
673shr_gpr0_by_2_bits(struct iris_context *ice)
674{
675   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
676   shl_gpr0_by_30_bits(ice);
677   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
678   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
679}
680
681/**
682 * Calculate the result and store it to CS_GPR0.
683 */
684static void
685calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
686{
687   struct iris_batch *batch = &ice->batches[q->batch_idx];
688   struct iris_screen *screen = (void *) ice->ctx.screen;
689   const struct gen_device_info *devinfo = &batch->screen->devinfo;
690   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
691   uint32_t offset = q->query_state_ref.offset;
692
693   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
694       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
695      overflow_result_to_gpr0(ice, q);
696      return;
697   }
698
699   if (q->type == PIPE_QUERY_TIMESTAMP) {
700      ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
701                                    offset +
702                                    offsetof(struct iris_query_snapshots, start));
703      /* TODO: This discards any fractional bits of the timebase scale.
704       * We would need to do a bit of fixed point math on the CS ALU, or
705       * launch an actual shader to calculate this with full precision.
706       */
707      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
708      keep_gpr0_lower_n_bits(ice, 36);
709      return;
710   }
711
712   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
713                                 offset +
714                                 offsetof(struct iris_query_snapshots, start));
715   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
716                                 offset +
717                                 offsetof(struct iris_query_snapshots, end));
718
719   static const uint32_t math[] = {
720      MI_MATH | (5 - 2),
721      MI_ALU2(LOAD, SRCA, R2),
722      MI_ALU2(LOAD, SRCB, R1),
723      MI_ALU0(SUB),
724      MI_ALU2(STORE, R0, ACCU),
725   };
726   iris_batch_emit(batch, math, sizeof(math));
727
728   /* WaDividePSInvocationCountBy4:HSW,BDW */
729   if (devinfo->gen == 8 &&
730       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
731       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
732      shr_gpr0_by_2_bits(ice);
733
734   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
735       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
736      gpr0_to_bool(ice);
737
738   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
739      /* TODO: This discards fractional bits (see above). */
740      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
741   }
742}
743
744static struct pipe_query *
745iris_create_query(struct pipe_context *ctx,
746                  unsigned query_type,
747                  unsigned index)
748{
749   struct iris_query *q = calloc(1, sizeof(struct iris_query));
750
751   q->type = query_type;
752   q->index = index;
753
754   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
755       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
756      q->batch_idx = IRIS_BATCH_COMPUTE;
757   else
758      q->batch_idx = IRIS_BATCH_RENDER;
759   return (struct pipe_query *) q;
760}
761
762static void
763iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
764{
765   struct iris_query *query = (void *) p_query;
766   struct iris_screen *screen = (void *) ctx->screen;
767   iris_syncpt_reference(screen, &query->syncpt, NULL);
768   free(query);
769}
770
771
/**
 * pipe_context::begin_query implementation.  Allocates and maps snapshot
 * storage for the query, then records the "start" snapshot.  Returns
 * false if the snapshot buffer couldn't be allocated or mapped.
 */
static boolean
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   void *ptr = NULL;
   uint32_t size;

   /* SO overflow queries need per-stream counter pairs; everything else
    * fits in the common snapshots struct.
    */
   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      size = sizeof(struct iris_query_so_overflow);
   else
      size = sizeof(struct iris_query_snapshots);

   u_upload_alloc(ice->query_buffer_uploader, 0,
                  size, size, &q->query_state_ref.offset,
                  &q->query_state_ref.res, &ptr);

   if (!iris_resource_bo(q->query_state_ref.res))
      return false;

   q->map = ptr;
   if (!q->map)
      return false;

   /* Reset CPU-side state and clear the availability flag. */
   q->result = 0ull;
   q->ready = false;
   WRITE_ONCE(q->map->snapshots_landed, false);

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      /* Flag the query active and re-emit the affected state.
       * NOTE(review): the exact effect of dirtying STREAMOUT/CLIP is
       * determined by the state upload code — confirm there.
       */
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, start));

   return true;
}
816
/**
 * pipe_context::end_query implementation.  Records the "end" snapshot
 * and marks the query's results available once they land.
 */
static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      /* Timestamps have no begin/end pair; take the single snapshot now.
       * (iris_begin_query allocates storage and writes "start".)
       */
      iris_begin_query(ctx, query);
      iris_batch_reference_signal_syncpt(batch, &q->syncpt);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, end));

   /* Hold the batch's fence so result readers can wait on it, then write
    * the availability flag (ordered after the snapshots).
    */
   iris_batch_reference_signal_syncpt(batch, &q->syncpt);
   mark_available(ice, q);

   return true;
}
849
850/**
851 * See if the snapshots have landed for a query, and if so, compute the
852 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
853 */
854static void
855iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
856{
857   struct iris_screen *screen = (void *) ice->ctx.screen;
858   const struct gen_device_info *devinfo = &screen->devinfo;
859
860   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
861      calculate_result_on_cpu(devinfo, q);
862   }
863}
864
/**
 * pipe_context::get_query_result implementation.  Flushes the relevant
 * batch if it still references the snapshot buffer, then either waits
 * for the snapshots (wait == true) or returns false if they haven't
 * landed yet.
 */
static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!q->ready) {
      /* If the commands producing the result haven't been submitted,
       * submit them now so progress can happen.
       */
      if (iris_batch_references(&ice->batches[q->batch_idx], bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      while (!READ_ONCE(q->map->snapshots_landed)) {
         if (wait)
            iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
         else
            return false;
      }

      assert(READ_ONCE(q->map->snapshots_landed));
      calculate_result_on_cpu(devinfo, q);
   }

   assert(q->ready);

   result->u64 = q->result;

   return true;
}
898
static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   /* The pipe_context::get_query_result_resource() hook (wired up in
    * iris_init_query_functions).  Writes a query's result value — or, if
    * index == -1, its availability — into the buffer object backing p_res
    * at the given byte offset.  If the result is already known on the CPU
    * it is stored directly; otherwise GPU commands are emitted to compute
    * it into CS_GPR0 and copy it out, optionally predicated on the
    * snapshots having landed when the caller doesn't want to wait.
    */
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   /* Byte offset of the snapshots_landed flag within the query state BO. */
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, bo))
         iris_batch_flush(batch);

      /* Availability is 4 or 8 bytes depending on the requested type. */
      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   /* If the caller won't wait and we haven't already stalled on the
    * snapshots, predicate the store so it only happens once they land.
    */
   bool predicated = !wait && !q->stalled;

   if (predicated) {
      /* LOADINV of snapshots_landed compared against 0: the predicate is
       * satisfied only when snapshots_landed is still zero (not landed).
       */
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   /* Copy the computed result from CS_GPR0 into the destination BO. */
   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}
985
986static void
987iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
988{
989   struct iris_context *ice = (void *) ctx;
990
991   if (ice->state.statistics_counters_enabled == enable)
992      return;
993
994   // XXX: most packets aren't paying attention to this yet, because it'd
995   // have to be done dynamically at draw time, which is a pain
996   ice->state.statistics_counters_enabled = enable;
997   ice->state.dirty |= IRIS_DIRTY_CLIP |
998                       IRIS_DIRTY_GS |
999                       IRIS_DIRTY_RASTER |
1000                       IRIS_DIRTY_STREAMOUT |
1001                       IRIS_DIRTY_TCS |
1002                       IRIS_DIRTY_TES |
1003                       IRIS_DIRTY_VS |
1004                       IRIS_DIRTY_WM;
1005}
1006
1007static void
1008set_predicate_enable(struct iris_context *ice, bool value)
1009{
1010   if (value)
1011      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1012   else
1013      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1014}
1015
static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   /* Program hardware predication for conditional rendering when the
    * query result is not yet available on the CPU.  Emits MI register
    * loads and an MI_PREDICATE on the render batch, then saves the
    * predicate result to memory for later use by compute dispatches.
    */
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      /* Compute the overflow result into CS_GPR0, then compare it
       * against zero via the predicate source registers.
       */
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_*: compare the start and end counter
       * snapshots directly; SRCS_EQUAL fires when they match (i.e. the
       * counter did not advance between begin and end).
       */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
         offsetof(struct iris_query_snapshots, start) +
         q->query_state_ref.offset);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
         offsetof(struct iris_query_snapshots, end) +
         q->query_state_ref.offset);
      break;
   }

   /* LOADOP_LOAD vs LOADOP_LOADINV flips the sense of the comparison,
    * implementing the `inverted` flag.
    */
   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_RESULT register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = q->query_state_ref.offset +
                     offsetof(struct iris_query_snapshots, predicate_result);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
                                  bo, offset, false);
   ice->state.compute_predicate = bo;
}
1069
1070static void
1071iris_render_condition(struct pipe_context *ctx,
1072                      struct pipe_query *query,
1073                      boolean condition,
1074                      enum pipe_render_cond_flag mode)
1075{
1076   struct iris_context *ice = (void *) ctx;
1077   struct iris_query *q = (void *) query;
1078
1079   /* The old condition isn't relevant; we'll update it if necessary */
1080   ice->state.compute_predicate = NULL;
1081   ice->condition.query = q;
1082   ice->condition.condition = condition;
1083
1084   if (!q) {
1085      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1086      return;
1087   }
1088
1089   iris_check_query_no_flush(ice, q);
1090
1091   if (q->result || q->ready) {
1092      set_predicate_enable(ice, (q->result != 0) ^ condition);
1093   } else {
1094      if (mode == PIPE_RENDER_COND_NO_WAIT ||
1095          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1096         perf_debug(&ice->dbg, "Conditional rendering demoted from "
1097                    "\"no wait\" to \"wait\".");
1098      }
1099      set_predicate_for_result(ice, q, condition);
1100   }
1101}
1102
1103void
1104iris_resolve_conditional_render(struct iris_context *ice)
1105{
1106   struct pipe_context *ctx = (void *) ice;
1107   struct iris_query *q = ice->condition.query;
1108   struct pipe_query *query = (void *) q;
1109   union pipe_query_result result;
1110
1111   if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
1112      return;
1113
1114   assert(q);
1115
1116   iris_get_query_result(ctx, query, true, &result);
1117   set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
1118}
1119
1120void
1121iris_init_query_functions(struct pipe_context *ctx)
1122{
1123   ctx->create_query = iris_create_query;
1124   ctx->destroy_query = iris_destroy_query;
1125   ctx->begin_query = iris_begin_query;
1126   ctx->end_query = iris_end_query;
1127   ctx->get_query_result = iris_get_query_result;
1128   ctx->get_query_result_resource = iris_get_query_result_resource;
1129   ctx->set_active_query_state = iris_set_active_query_state;
1130   ctx->render_condition = iris_render_condition;
1131}
1132