si_query.c revision 7ec681f3
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4 * Copyright 2018 Advanced Micro Devices, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27#include "si_query.h"
28#include "si_build_pm4.h"
29
30#include "amd/common/sid.h"
31#include "si_pipe.h"
32#include "util/os_time.h"
33#include "util/u_memory.h"
34#include "util/u_suballoc.h"
35#include "util/u_upload_mgr.h"
36
37static const struct si_query_ops query_hw_ops;
38
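/* Byte offsets describing where the begin/end values and the completion fence
 * live within one result slot of a hardware query buffer; pair_stride and
 * pair_count describe per-RB or per-stream begin/end pairs. Filled by
 * si_get_hw_query_params for the GPU-side result-resolve path. */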
39struct si_hw_query_params {
40   unsigned start_offset;
41   unsigned end_offset;
42   unsigned fence_offset;
43   unsigned pair_stride;
44   unsigned pair_count;
45};
46
47/* Queries without buffer handling or suspend/resume. */
48struct si_query_sw {
49   struct si_query b;
50
51   uint64_t begin_result;
52   uint64_t end_result;
53
54   uint64_t begin_time;
55   uint64_t end_time;
56
57   /* Fence for GPU_FINISHED. */
58   struct pipe_fence_handle *fence;
59};
60
61static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
62{
63   struct si_query_sw *query = (struct si_query_sw *)squery;
64
65   sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
66   FREE(query);
67}
68
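/* Map a driver-specific software query type to the winsys counter that backs
 * it. Only valid for types that are sampled via ws->query_value(). */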
69static enum radeon_value_id winsys_id_from_type(unsigned type)
70{
71   switch (type) {
72   case SI_QUERY_REQUESTED_VRAM:
73      return RADEON_REQUESTED_VRAM_MEMORY;
74   case SI_QUERY_REQUESTED_GTT:
75      return RADEON_REQUESTED_GTT_MEMORY;
76   case SI_QUERY_MAPPED_VRAM:
77      return RADEON_MAPPED_VRAM;
78   case SI_QUERY_MAPPED_GTT:
79      return RADEON_MAPPED_GTT;
80   case SI_QUERY_SLAB_WASTED_VRAM:
81      return RADEON_SLAB_WASTED_VRAM;
82   case SI_QUERY_SLAB_WASTED_GTT:
83      return RADEON_SLAB_WASTED_GTT;
84   case SI_QUERY_BUFFER_WAIT_TIME:
85      return RADEON_BUFFER_WAIT_TIME_NS;
86   case SI_QUERY_NUM_MAPPED_BUFFERS:
87      return RADEON_NUM_MAPPED_BUFFERS;
88   case SI_QUERY_NUM_GFX_IBS:
89      return RADEON_NUM_GFX_IBS;
90   case SI_QUERY_GFX_BO_LIST_SIZE:
91      return RADEON_GFX_BO_LIST_COUNTER;
92   case SI_QUERY_GFX_IB_SIZE:
93      return RADEON_GFX_IB_SIZE_COUNTER;
94   case SI_QUERY_NUM_BYTES_MOVED:
95      return RADEON_NUM_BYTES_MOVED;
96   case SI_QUERY_NUM_EVICTIONS:
97      return RADEON_NUM_EVICTIONS;
98   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
99      return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
100   case SI_QUERY_VRAM_USAGE:
101      return RADEON_VRAM_USAGE;
102   case SI_QUERY_VRAM_VIS_USAGE:
103      return RADEON_VRAM_VIS_USAGE;
104   case SI_QUERY_GTT_USAGE:
105      return RADEON_GTT_USAGE;
106   case SI_QUERY_GPU_TEMPERATURE:
107      return RADEON_GPU_TEMPERATURE;
108   case SI_QUERY_CURRENT_GPU_SCLK:
109      return RADEON_CURRENT_SCLK;
110   case SI_QUERY_CURRENT_GPU_MCLK:
111      return RADEON_CURRENT_MCLK;
112   case SI_QUERY_CS_THREAD_BUSY:
113      return RADEON_CS_THREAD_TIME;
114   default:
115      unreachable("query type does not correspond to winsys id");
116   }
117}
118
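/* Software queries snapshot a CPU-side counter (or a winsys value) at begin
 * and end; si_query_sw_get_result reports the difference, or a rate when a
 * matching begin_time/end_time pair was recorded. */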
119static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
120{
121   struct si_query_sw *query = (struct si_query_sw *)squery;
122   enum radeon_value_id ws_id;
123
124   switch (query->b.type) {
125   case PIPE_QUERY_TIMESTAMP_DISJOINT:
126   case PIPE_QUERY_GPU_FINISHED:
127      break;
128   case SI_QUERY_DRAW_CALLS:
129      query->begin_result = sctx->num_draw_calls;
130      break;
131   case SI_QUERY_DECOMPRESS_CALLS:
132      query->begin_result = sctx->num_decompress_calls;
133      break;
134   case SI_QUERY_PRIM_RESTART_CALLS:
135      query->begin_result = sctx->num_prim_restart_calls;
136      break;
137   case SI_QUERY_COMPUTE_CALLS:
138      query->begin_result = sctx->num_compute_calls;
139      break;
140   case SI_QUERY_CP_DMA_CALLS:
141      query->begin_result = sctx->num_cp_dma_calls;
142      break;
143   case SI_QUERY_NUM_VS_FLUSHES:
144      query->begin_result = sctx->num_vs_flushes;
145      break;
146   case SI_QUERY_NUM_PS_FLUSHES:
147      query->begin_result = sctx->num_ps_flushes;
148      break;
149   case SI_QUERY_NUM_CS_FLUSHES:
150      query->begin_result = sctx->num_cs_flushes;
151      break;
152   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
153      query->begin_result = sctx->num_cb_cache_flushes;
154      break;
155   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
156      query->begin_result = sctx->num_db_cache_flushes;
157      break;
158   case SI_QUERY_NUM_L2_INVALIDATES:
159      query->begin_result = sctx->num_L2_invalidates;
160      break;
161   case SI_QUERY_NUM_L2_WRITEBACKS:
162      query->begin_result = sctx->num_L2_writebacks;
163      break;
164   case SI_QUERY_NUM_RESIDENT_HANDLES:
165      query->begin_result = sctx->num_resident_handles;
166      break;
167   case SI_QUERY_TC_OFFLOADED_SLOTS:
168      query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
169      break;
170   case SI_QUERY_TC_DIRECT_SLOTS:
171      query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
172      break;
173   case SI_QUERY_TC_NUM_SYNCS:
174      query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
175      break;
176   case SI_QUERY_REQUESTED_VRAM:
177   case SI_QUERY_REQUESTED_GTT:
178   case SI_QUERY_MAPPED_VRAM:
179   case SI_QUERY_MAPPED_GTT:
180   case SI_QUERY_SLAB_WASTED_VRAM:
181   case SI_QUERY_SLAB_WASTED_GTT:
182   case SI_QUERY_VRAM_USAGE:
183   case SI_QUERY_VRAM_VIS_USAGE:
184   case SI_QUERY_GTT_USAGE:
185   case SI_QUERY_GPU_TEMPERATURE:
186   case SI_QUERY_CURRENT_GPU_SCLK:
187   case SI_QUERY_CURRENT_GPU_MCLK:
188   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
189   case SI_QUERY_NUM_MAPPED_BUFFERS:
190      query->begin_result = 0;
191      break;
192   case SI_QUERY_BUFFER_WAIT_TIME:
193   case SI_QUERY_GFX_IB_SIZE:
194   case SI_QUERY_NUM_GFX_IBS:
195   case SI_QUERY_NUM_BYTES_MOVED:
196   case SI_QUERY_NUM_EVICTIONS:
197   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
198      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
199      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
200      break;
201   }
202   case SI_QUERY_GFX_BO_LIST_SIZE:
203      ws_id = winsys_id_from_type(query->b.type);
204      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
205      query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
206      break;
207   case SI_QUERY_CS_THREAD_BUSY:
208      ws_id = winsys_id_from_type(query->b.type);
209      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
210      query->begin_time = os_time_get_nano();
211      break;
212   case SI_QUERY_GALLIUM_THREAD_BUSY:
213      query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
214      query->begin_time = os_time_get_nano();
215      break;
216   case SI_QUERY_GPU_LOAD:
217   case SI_QUERY_GPU_SHADERS_BUSY:
218   case SI_QUERY_GPU_TA_BUSY:
219   case SI_QUERY_GPU_GDS_BUSY:
220   case SI_QUERY_GPU_VGT_BUSY:
221   case SI_QUERY_GPU_IA_BUSY:
222   case SI_QUERY_GPU_SX_BUSY:
223   case SI_QUERY_GPU_WD_BUSY:
224   case SI_QUERY_GPU_BCI_BUSY:
225   case SI_QUERY_GPU_SC_BUSY:
226   case SI_QUERY_GPU_PA_BUSY:
227   case SI_QUERY_GPU_DB_BUSY:
228   case SI_QUERY_GPU_CP_BUSY:
229   case SI_QUERY_GPU_CB_BUSY:
230   case SI_QUERY_GPU_SDMA_BUSY:
231   case SI_QUERY_GPU_PFP_BUSY:
232   case SI_QUERY_GPU_MEQ_BUSY:
233   case SI_QUERY_GPU_ME_BUSY:
234   case SI_QUERY_GPU_SURF_SYNC_BUSY:
235   case SI_QUERY_GPU_CP_DMA_BUSY:
236   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
237      query->begin_result = si_begin_counter(sctx->screen, query->b.type);
238      break;
239   case SI_QUERY_NUM_COMPILATIONS:
240      query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
241      break;
242   case SI_QUERY_NUM_SHADERS_CREATED:
243      query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
244      break;
245   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
246      query->begin_result = sctx->screen->live_shader_cache.hits;
247      break;
248   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
249      query->begin_result = sctx->screen->live_shader_cache.misses;
250      break;
251   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
252      query->begin_result = sctx->screen->num_memory_shader_cache_hits;
253      break;
254   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
255      query->begin_result = sctx->screen->num_memory_shader_cache_misses;
256      break;
257   case SI_QUERY_DISK_SHADER_CACHE_HITS:
258      query->begin_result = sctx->screen->num_disk_shader_cache_hits;
259      break;
260   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
261      query->begin_result = sctx->screen->num_disk_shader_cache_misses;
262      break;
263   case SI_QUERY_GPIN_ASIC_ID:
264   case SI_QUERY_GPIN_NUM_SIMD:
265   case SI_QUERY_GPIN_NUM_RB:
266   case SI_QUERY_GPIN_NUM_SPI:
267   case SI_QUERY_GPIN_NUM_SE:
268      break;
269   default:
270      unreachable("si_query_sw_begin: bad query type");
271   }
272
273   return true;
274}
275
276static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
277{
278   struct si_query_sw *query = (struct si_query_sw *)squery;
279   enum radeon_value_id ws_id;
280
281   switch (query->b.type) {
282   case PIPE_QUERY_TIMESTAMP_DISJOINT:
283      break;
284   case PIPE_QUERY_GPU_FINISHED:
285      sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
286      break;
287   case SI_QUERY_DRAW_CALLS:
288      query->end_result = sctx->num_draw_calls;
289      break;
290   case SI_QUERY_DECOMPRESS_CALLS:
291      query->end_result = sctx->num_decompress_calls;
292      break;
293   case SI_QUERY_PRIM_RESTART_CALLS:
294      query->end_result = sctx->num_prim_restart_calls;
295      break;
296   case SI_QUERY_COMPUTE_CALLS:
297      query->end_result = sctx->num_compute_calls;
298      break;
299   case SI_QUERY_CP_DMA_CALLS:
300      query->end_result = sctx->num_cp_dma_calls;
301      break;
302   case SI_QUERY_NUM_VS_FLUSHES:
303      query->end_result = sctx->num_vs_flushes;
304      break;
305   case SI_QUERY_NUM_PS_FLUSHES:
306      query->end_result = sctx->num_ps_flushes;
307      break;
308   case SI_QUERY_NUM_CS_FLUSHES:
309      query->end_result = sctx->num_cs_flushes;
310      break;
311   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
312      query->end_result = sctx->num_cb_cache_flushes;
313      break;
314   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
315      query->end_result = sctx->num_db_cache_flushes;
316      break;
317   case SI_QUERY_NUM_L2_INVALIDATES:
318      query->end_result = sctx->num_L2_invalidates;
319      break;
320   case SI_QUERY_NUM_L2_WRITEBACKS:
321      query->end_result = sctx->num_L2_writebacks;
322      break;
323   case SI_QUERY_NUM_RESIDENT_HANDLES:
324      query->end_result = sctx->num_resident_handles;
325      break;
326   case SI_QUERY_TC_OFFLOADED_SLOTS:
327      query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
328      break;
329   case SI_QUERY_TC_DIRECT_SLOTS:
330      query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
331      break;
332   case SI_QUERY_TC_NUM_SYNCS:
333      query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
334      break;
335   case SI_QUERY_REQUESTED_VRAM:
336   case SI_QUERY_REQUESTED_GTT:
337   case SI_QUERY_MAPPED_VRAM:
338   case SI_QUERY_MAPPED_GTT:
339   case SI_QUERY_SLAB_WASTED_VRAM:
340   case SI_QUERY_SLAB_WASTED_GTT:
341   case SI_QUERY_VRAM_USAGE:
342   case SI_QUERY_VRAM_VIS_USAGE:
343   case SI_QUERY_GTT_USAGE:
344   case SI_QUERY_GPU_TEMPERATURE:
345   case SI_QUERY_CURRENT_GPU_SCLK:
346   case SI_QUERY_CURRENT_GPU_MCLK:
347   case SI_QUERY_BUFFER_WAIT_TIME:
348   case SI_QUERY_GFX_IB_SIZE:
349   case SI_QUERY_NUM_MAPPED_BUFFERS:
350   case SI_QUERY_NUM_GFX_IBS:
351   case SI_QUERY_NUM_BYTES_MOVED:
352   case SI_QUERY_NUM_EVICTIONS:
353   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
354      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
355      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
356      break;
357   }
358   case SI_QUERY_GFX_BO_LIST_SIZE:
359      ws_id = winsys_id_from_type(query->b.type);
360      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
361      query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
362      break;
363   case SI_QUERY_CS_THREAD_BUSY:
364      ws_id = winsys_id_from_type(query->b.type);
365      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
366      query->end_time = os_time_get_nano();
367      break;
368   case SI_QUERY_GALLIUM_THREAD_BUSY:
369      query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
370      query->end_time = os_time_get_nano();
371      break;
372   case SI_QUERY_GPU_LOAD:
373   case SI_QUERY_GPU_SHADERS_BUSY:
374   case SI_QUERY_GPU_TA_BUSY:
375   case SI_QUERY_GPU_GDS_BUSY:
376   case SI_QUERY_GPU_VGT_BUSY:
377   case SI_QUERY_GPU_IA_BUSY:
378   case SI_QUERY_GPU_SX_BUSY:
379   case SI_QUERY_GPU_WD_BUSY:
380   case SI_QUERY_GPU_BCI_BUSY:
381   case SI_QUERY_GPU_SC_BUSY:
382   case SI_QUERY_GPU_PA_BUSY:
383   case SI_QUERY_GPU_DB_BUSY:
384   case SI_QUERY_GPU_CP_BUSY:
385   case SI_QUERY_GPU_CB_BUSY:
386   case SI_QUERY_GPU_SDMA_BUSY:
387   case SI_QUERY_GPU_PFP_BUSY:
388   case SI_QUERY_GPU_MEQ_BUSY:
389   case SI_QUERY_GPU_ME_BUSY:
390   case SI_QUERY_GPU_SURF_SYNC_BUSY:
391   case SI_QUERY_GPU_CP_DMA_BUSY:
392   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
393      query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
394      query->begin_result = 0;
395      break;
396   case SI_QUERY_NUM_COMPILATIONS:
397      query->end_result = p_atomic_read(&sctx->screen->num_compilations);
398      break;
399   case SI_QUERY_NUM_SHADERS_CREATED:
400      query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
401      break;
402   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
403      query->end_result = sctx->last_tex_ps_draw_ratio;
404      break;
405   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
406      query->end_result = sctx->screen->live_shader_cache.hits;
407      break;
408   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
409      query->end_result = sctx->screen->live_shader_cache.misses;
410      break;
411   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
412      query->end_result = sctx->screen->num_memory_shader_cache_hits;
413      break;
414   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
415      query->end_result = sctx->screen->num_memory_shader_cache_misses;
416      break;
417   case SI_QUERY_DISK_SHADER_CACHE_HITS:
418      query->end_result = sctx->screen->num_disk_shader_cache_hits;
419      break;
420   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
421      query->end_result = sctx->screen->num_disk_shader_cache_misses;
422      break;
423   case SI_QUERY_GPIN_ASIC_ID:
424   case SI_QUERY_GPIN_NUM_SIMD:
425   case SI_QUERY_GPIN_NUM_RB:
426   case SI_QUERY_GPIN_NUM_SPI:
427   case SI_QUERY_GPIN_NUM_SE:
428      break;
429   default:
430      unreachable("si_query_sw_end: bad query type");
431   }
432
433   return true;
434}
435
436static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
437                                   union pipe_query_result *result)
438{
439   struct si_query_sw *query = (struct si_query_sw *)squery;
440
441   switch (query->b.type) {
442   case PIPE_QUERY_TIMESTAMP_DISJOINT:
443      /* Convert from cycles per millisecond to cycles per second (Hz). */
444      result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
445      result->timestamp_disjoint.disjoint = false;
446      return true;
447   case PIPE_QUERY_GPU_FINISHED: {
448      struct pipe_screen *screen = sctx->b.screen;
449      struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
450
451      result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
452      return result->b;
453   }
454
455   case SI_QUERY_GFX_BO_LIST_SIZE:
456      result->u64 =
457         (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
458      return true;
459   case SI_QUERY_CS_THREAD_BUSY:
460   case SI_QUERY_GALLIUM_THREAD_BUSY:
461      result->u64 =
462         (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
463      return true;
464   case SI_QUERY_GPIN_ASIC_ID:
465      result->u32 = 0;
466      return true;
467   case SI_QUERY_GPIN_NUM_SIMD:
468      result->u32 = sctx->screen->info.num_good_compute_units;
469      return true;
470   case SI_QUERY_GPIN_NUM_RB:
471      result->u32 = sctx->screen->info.max_render_backends;
472      return true;
473   case SI_QUERY_GPIN_NUM_SPI:
474      result->u32 = 1; /* all supported chips have one SPI per SE */
475      return true;
476   case SI_QUERY_GPIN_NUM_SE:
477      result->u32 = sctx->screen->info.max_se;
478      return true;
479   }
480
481   result->u64 = query->end_result - query->begin_result;
482
483   switch (query->b.type) {
484   case SI_QUERY_BUFFER_WAIT_TIME:
485   case SI_QUERY_GPU_TEMPERATURE:
486      result->u64 /= 1000;
487      break;
488   case SI_QUERY_CURRENT_GPU_SCLK:
489   case SI_QUERY_CURRENT_GPU_MCLK:
490      result->u64 *= 1000000;
491      break;
492   }
493
494   return true;
495}
496
497static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
498                                                 .begin = si_query_sw_begin,
499                                                 .end = si_query_sw_end,
500                                                 .get_result = si_query_sw_get_result,
501                                                 .get_result_resource = NULL};
502
503static struct pipe_query *si_query_sw_create(unsigned query_type)
504{
505   struct si_query_sw *query;
506
507   query = CALLOC_STRUCT(si_query_sw);
508   if (!query)
509      return NULL;
510
511   query->b.type = query_type;
512   query->b.ops = &sw_query_ops;
513
514   return (struct pipe_query *)query;
515}
516
517void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
518{
519   struct si_query_buffer *prev = buffer->previous;
520
521   /* Release all query buffers. */
522   while (prev) {
523      struct si_query_buffer *qbuf = prev;
524      prev = prev->previous;
525      si_resource_reference(&qbuf->buf, NULL);
526      FREE(qbuf);
527   }
528
529   si_resource_reference(&buffer->buf, NULL);
530}
531
532void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
533{
534   /* Discard all query buffers except for the oldest. */
535   while (buffer->previous) {
536      struct si_query_buffer *qbuf = buffer->previous;
537      buffer->previous = qbuf->previous;
538
539      si_resource_reference(&buffer->buf, NULL);
540      buffer->buf = qbuf->buf; /* move ownership */
541      FREE(qbuf);
542   }
543   buffer->results_end = 0;
544
545   if (!buffer->buf)
546      return;
547
548   /* Discard even the oldest buffer if it can't be mapped without a stall. */
549   if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
550       !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
551      si_resource_reference(&buffer->buf, NULL);
552   } else {
553      buffer->unprepared = true;
554   }
555}
556
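/* Reserve 'size' bytes at buffer->results_end. If the current buffer is full,
 * it is pushed onto the 'previous' chain and a new staging buffer is created;
 * prepare_buffer (if non-NULL) runs once for every buffer that has not been
 * prepared yet. Returns false on allocation or preparation failure. */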
557bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
558                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
559                           unsigned size)
560{
561   bool unprepared = buffer->unprepared;
562   buffer->unprepared = false;
563
564   if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
565      if (buffer->buf) {
566         struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
567         memcpy(qbuf, buffer, sizeof(*qbuf));
568         buffer->previous = qbuf;
569      }
570      buffer->results_end = 0;
571
572      /* Queries are normally read by the CPU after
573       * being written by the GPU, so a staging buffer is
574       * a good usage pattern.
575       */
576      struct si_screen *screen = sctx->screen;
577      unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
578      buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
579      if (unlikely(!buffer->buf))
580         return false;
581      unprepared = true;
582   }
583
584   if (unprepared && prepare_buffer) {
585      if (unlikely(!prepare_buffer(sctx, buffer))) {
586         si_resource_reference(&buffer->buf, NULL);
587         return false;
588      }
589   }
590
591   return true;
592}
593
594void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
595{
596   struct si_query_hw *query = (struct si_query_hw *)squery;
597
598   si_query_buffer_destroy(sctx->screen, &query->buffer);
599   si_resource_reference(&query->workaround_buf, NULL);
600   FREE(squery);
601}
602
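/* Clear a newly allocated query buffer. For occlusion queries, also set the
 * top bit of the begin/end words belonging to disabled render backends, so
 * those slots pass the validity test in si_query_read_result and contribute
 * zero to the result. */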
603static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
604{
605   struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
606   struct si_screen *screen = sctx->screen;
607
608   /* The caller ensures that the buffer is currently unused by the GPU. */
609   uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
610                                              PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
611   if (!results)
612      return false;
613
614   memset(results, 0, qbuf->buf->b.b.width0);
615
616   if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
617       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
618       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
619      unsigned max_rbs = screen->info.max_render_backends;
620      unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
621      unsigned num_results;
622      unsigned i, j;
623
624      /* Set top bits for unused backends. */
625      num_results = qbuf->buf->b.b.width0 / query->result_size;
626      for (j = 0; j < num_results; j++) {
627         for (i = 0; i < max_rbs; i++) {
628            if (!(enabled_rb_mask & (1 << i))) {
629               results[(i * 4) + 1] = 0x80000000;
630               results[(i * 4) + 3] = 0x80000000;
631            }
632         }
633         results += 4 * max_rbs;
634      }
635   }
636
637   return true;
638}
639
640static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
641                                            bool wait, enum pipe_query_value_type result_type,
642                                            int index, struct pipe_resource *resource,
643                                            unsigned offset);
644
645static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
646                                      struct si_resource *buffer, uint64_t va);
647static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
648                                     struct si_resource *buffer, uint64_t va);
649static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
650                                   union pipe_query_result *result);
651static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
652
653static struct si_query_hw_ops query_hw_default_hw_ops = {
654   .prepare_buffer = si_query_hw_prepare_buffer,
655   .emit_start = si_query_hw_do_emit_start,
656   .emit_stop = si_query_hw_do_emit_stop,
657   .clear_result = si_query_hw_clear_result,
658   .add_result = si_query_hw_add_result,
659};
660
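/* Create a hardware query object. result_size is the footprint of one
 * begin/end sample in the query buffer. For occlusion queries the layout per
 * sample is roughly (a sketch derived from the sizes below and from
 * si_get_hw_query_params):
 *
 *   [RB0 begin (8B) | RB0 end (8B)] ... [RBn-1 begin | RBn-1 end] [fence + pad (16B)]
 *
 * Other types store begin/end counters or timestamps, usually followed by a
 * 32-bit completion fence written with EOP_DATA_SEL_VALUE_32BIT. */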
661static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
662                                             unsigned index)
663{
664   struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
665   if (!query)
666      return NULL;
667
668   query->b.type = query_type;
669   query->b.ops = &query_hw_ops;
670   query->ops = &query_hw_default_hw_ops;
671
672   switch (query_type) {
673   case PIPE_QUERY_OCCLUSION_COUNTER:
674   case PIPE_QUERY_OCCLUSION_PREDICATE:
675   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
676      query->result_size = 16 * sscreen->info.max_render_backends;
677      query->result_size += 16; /* for the fence + alignment */
678      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
679      break;
680   case PIPE_QUERY_TIME_ELAPSED:
681      query->result_size = 24;
682      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
683      break;
684   case PIPE_QUERY_TIMESTAMP:
685      query->result_size = 16;
686      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
687      query->flags = SI_QUERY_HW_FLAG_NO_START;
688      break;
689   case PIPE_QUERY_PRIMITIVES_EMITTED:
690   case PIPE_QUERY_PRIMITIVES_GENERATED:
691   case PIPE_QUERY_SO_STATISTICS:
692   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
693      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
694      query->result_size = 32;
695      query->b.num_cs_dw_suspend = 6;
696      query->stream = index;
697      break;
698   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
699      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
700      query->result_size = 32 * SI_MAX_STREAMS;
701      query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
702      break;
703   case PIPE_QUERY_PIPELINE_STATISTICS:
704      /* 11 values on GCN. */
705      query->result_size = 11 * 16;
706      query->result_size += 8; /* for the fence + alignment */
707      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
708      break;
709   default:
710      assert(0);
711      FREE(query);
712      return NULL;
713   }
714
715   return (struct pipe_query *)query;
716}
717
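/* Keep per-context counts of active occlusion queries (and of "perfect",
 * i.e. non-conservative, ones) and call si_set_occlusion_query_state whenever
 * either count toggles between zero and non-zero. */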
718static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
719{
720   if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
721       type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
722      bool old_enable = sctx->num_occlusion_queries != 0;
723      bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
724      bool enable, perfect_enable;
725
726      sctx->num_occlusion_queries += diff;
727      assert(sctx->num_occlusion_queries >= 0);
728
729      if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
730         sctx->num_perfect_occlusion_queries += diff;
731         assert(sctx->num_perfect_occlusion_queries >= 0);
732      }
733
734      enable = sctx->num_occlusion_queries != 0;
735      perfect_enable = sctx->num_perfect_occlusion_queries != 0;
736
737      if (enable != old_enable || perfect_enable != old_perfect_enable) {
738         si_set_occlusion_query_state(sctx, old_perfect_enable);
739      }
740   }
741}
742
743static unsigned event_type_for_stream(unsigned stream)
744{
745   switch (stream) {
746   default:
747   case 0:
748      return V_028A90_SAMPLE_STREAMOUTSTATS;
749   case 1:
750      return V_028A90_SAMPLE_STREAMOUTSTATS1;
751   case 2:
752      return V_028A90_SAMPLE_STREAMOUTSTATS2;
753   case 3:
754      return V_028A90_SAMPLE_STREAMOUTSTATS3;
755   }
756}
757
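/* Emit EVENT_WRITE with the SAMPLE_STREAMOUTSTATS* event for the given
 * stream; the CP writes that stream's streamout statistics to 'va'. */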
758static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
759{
760   radeon_begin(cs);
761   radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
762   radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
763   radeon_emit(va);
764   radeon_emit(va >> 32);
765   radeon_end();
766}
767
768static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
769                                      struct si_resource *buffer, uint64_t va)
770{
771   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
772
773   switch (query->b.type) {
774   case PIPE_QUERY_OCCLUSION_COUNTER:
775   case PIPE_QUERY_OCCLUSION_PREDICATE:
776   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
777      radeon_begin(cs);
778      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
779      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
780      radeon_emit(va);
781      radeon_emit(va >> 32);
782      radeon_end();
783      break;
784   }
785   case PIPE_QUERY_PRIMITIVES_EMITTED:
786   case PIPE_QUERY_PRIMITIVES_GENERATED:
787   case PIPE_QUERY_SO_STATISTICS:
788   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
789      emit_sample_streamout(cs, va, query->stream);
790      break;
791   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
792      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
793         emit_sample_streamout(cs, va + 32 * stream, stream);
794      break;
795   case PIPE_QUERY_TIME_ELAPSED:
796      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
797                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
798      break;
799   case PIPE_QUERY_PIPELINE_STATISTICS: {
800      radeon_begin(cs);
801      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
802      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
803      radeon_emit(va);
804      radeon_emit(va >> 32);
805      radeon_end();
806      break;
807   }
808   default:
809      assert(0);
810   }
811   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
812                             RADEON_PRIO_QUERY);
813}
814
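/* Allocate space for the begin sample, update dependent state (occlusion and
 * prims-generated query counts, pipeline-stat query count), reserve CS space,
 * and emit the start event at the buffer's current write offset. */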
815static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
816{
817   uint64_t va;
818
819   if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
820      return;
821
822   si_update_occlusion_query_state(sctx, query->b.type, 1);
823   si_update_prims_generated_query_state(sctx, query->b.type, 1);
824
825   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
826      sctx->num_pipeline_stat_queries++;
827
828   si_need_gfx_cs_space(sctx, 0);
829
830   va = query->buffer.buf->gpu_address + query->buffer.results_end;
831   query->ops->emit_start(sctx, query, query->buffer.buf, va);
832}
833
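/* Emit the end-of-query sample right after the begin sample and, for query
 * types that use one, a RELEASE_MEM that writes 0x80000000 as a completion
 * fence once the results have landed in memory. */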
834static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
835                                     struct si_resource *buffer, uint64_t va)
836{
837   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
838   uint64_t fence_va = 0;
839
840   switch (query->b.type) {
841   case PIPE_QUERY_OCCLUSION_COUNTER:
842   case PIPE_QUERY_OCCLUSION_PREDICATE:
843   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
844      va += 8;
845      radeon_begin(cs);
846      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
847      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
848      radeon_emit(va);
849      radeon_emit(va >> 32);
850      radeon_end();
851
852      fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
853      break;
854   }
855   case PIPE_QUERY_PRIMITIVES_EMITTED:
856   case PIPE_QUERY_PRIMITIVES_GENERATED:
857   case PIPE_QUERY_SO_STATISTICS:
858   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
859      va += 16;
860      emit_sample_streamout(cs, va, query->stream);
861      break;
862   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
863      va += 16;
864      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
865         emit_sample_streamout(cs, va + 32 * stream, stream);
866      break;
867   case PIPE_QUERY_TIME_ELAPSED:
868      va += 8;
869      FALLTHROUGH;
870   case PIPE_QUERY_TIMESTAMP:
871      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
872                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
873      fence_va = va + 8;
874      break;
875   case PIPE_QUERY_PIPELINE_STATISTICS: {
876      unsigned sample_size = (query->result_size - 8) / 2;
877
878      va += sample_size;
879      radeon_begin(cs);
880      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
881      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
882      radeon_emit(va);
883      radeon_emit(va >> 32);
884      radeon_end();
885
886      fence_va = va + sample_size;
887      break;
888   }
889   default:
890      assert(0);
891   }
892   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
893                             RADEON_PRIO_QUERY);
894
895   if (fence_va) {
896      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
897                        EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
898                        query->b.type);
899   }
900}
901
902static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
903{
904   uint64_t va;
905
906   /* Queries that have a begin already allocated their buffer in begin_query; NO_START queries allocate it here. */
907   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
908      si_need_gfx_cs_space(sctx, 0);
909      if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
910                                 query->result_size))
911         return;
912   }
913
914   if (!query->buffer.buf)
915      return; // previous buffer allocation failure
916
917   /* emit end query */
918   va = query->buffer.buf->gpu_address + query->buffer.results_end;
919
920   query->ops->emit_stop(sctx, query, query->buffer.buf, va);
921
922   query->buffer.results_end += query->result_size;
923
924   si_update_occlusion_query_state(sctx, query->b.type, -1);
925   si_update_prims_generated_query_state(sctx, query->b.type, -1);
926
927   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
928      sctx->num_pipeline_stat_queries--;
929}
930
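/* Emit SET_PREDICATION pointing at 'va'. GFX9+ uses the three-dword form with
 * a separate operation dword; older chips fold the operation and the high
 * address bits into the second dword. */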
931static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
932                               uint32_t op)
933{
934   struct radeon_cmdbuf *cs = &ctx->gfx_cs;
935
936   radeon_begin(cs);
937
938   if (ctx->chip_class >= GFX9) {
939      radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
940      radeon_emit(op);
941      radeon_emit(va);
942      radeon_emit(va >> 32);
943   } else {
944      radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
945      radeon_emit(va);
946      radeon_emit(op | ((va >> 32) & 0xFF));
947   }
948   radeon_end();
949
950   radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
951}
952
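/* Emit the render-condition predication packets for the bound query: one
 * SET_PREDICATION per result slot (and per stream for SO_OVERFLOW_ANY),
 * chained with PREDICATION_CONTINUE, using either the NGG streamout query
 * layout or the legacy query buffer chain. */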
953static void si_emit_query_predication(struct si_context *ctx)
954{
955   uint32_t op;
956   bool flag_wait, invert;
957
958   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
959   if (!query)
960      return;
961
962   invert = ctx->render_cond_invert;
963   flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
964               ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
965
966   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
967                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
968      struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
969      struct gfx10_sh_query_buffer *qbuf, *first, *last;
970
971      op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
972
973      /* If render_cond_invert is set, invert the predication sense; see GL_ARB_conditional_render_inverted. */
974      if (!invert)
975         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
976      else
977         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
978
979      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
980
981      first = gfx10_query->first;
982      last = gfx10_query->last;
983
984      while (first) {
985         qbuf = first;
986         if (first != last)
987            first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
988         else
989            first = NULL;
990
991         unsigned results_base = gfx10_query->first_begin;
992         uint64_t va_base = qbuf->buf->gpu_address;
993         uint64_t va = va_base + results_base;
994
995         unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
996         unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
997
998         unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
999         do {
1000            if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1001               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1002                  emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1003
1004                  /* set CONTINUE bit for all packets except the first */
1005                  op |= PREDICATION_CONTINUE;
1006               }
1007            } else {
1008               emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1009               op |= PREDICATION_CONTINUE;
1010            }
1011
1012            results_base += sizeof(struct gfx10_sh_query_buffer_mem);
1013         } while (count--);
1014      }
1015   } else {
1016      struct si_query_buffer *qbuf;
1017
1018      if (query->workaround_buf) {
1019         op = PRED_OP(PREDICATION_OP_BOOL64);
1020      } else {
1021         switch (query->b.type) {
1022         case PIPE_QUERY_OCCLUSION_COUNTER:
1023         case PIPE_QUERY_OCCLUSION_PREDICATE:
1024         case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1025            op = PRED_OP(PREDICATION_OP_ZPASS);
1026            break;
1027         case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1028         case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1029            op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1030            invert = !invert;
1031            break;
1032         default:
1033            assert(0);
1034            return;
1035         }
1036      }
1037
1038      /* If render_cond_invert is set, invert the predication sense; see GL_ARB_conditional_render_inverted. */
1039      if (invert)
1040         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1041      else
1042         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1043
1044      /* Use the value written by the compute shader as a workaround. Note that
1045       * the wait flag does not apply in this predication mode.
1046       *
1047       * The shader outputs the result value to L2. Workarounds only affect GFX8
1048       * and later, where the CP reads data from L2, so we don't need an
1049       * additional flush.
1050       */
1051      if (query->workaround_buf) {
1052         uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1053         emit_set_predicate(ctx, query->workaround_buf, va, op);
1054         return;
1055      }
1056
1057      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1058
1059      /* emit predicate packets for all data blocks */
1060      for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1061         unsigned results_base = 0;
1062         uint64_t va_base = qbuf->buf->gpu_address;
1063
1064         while (results_base < qbuf->results_end) {
1065            uint64_t va = va_base + results_base;
1066
1067            if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1068               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1069                  emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1070
1071                  /* set CONTINUE bit for all packets except the first */
1072                  op |= PREDICATION_CONTINUE;
1073               }
1074            } else {
1075               emit_set_predicate(ctx, qbuf->buf, va, op);
1076               op |= PREDICATION_CONTINUE;
1077            }
1078
1079            results_base += query->result_size;
1080         }
1081      }
1082   }
1083}
1084
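/* Query factory: purely CPU-side and driver-specific types become software
 * queries, streamout-related types use the NGG shader-query path when NGG
 * streamout is enabled, and everything else is a regular hardware query. */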
1085static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1086                                          unsigned index)
1087{
1088   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1089
1090   if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1091       (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1092      return si_query_sw_create(query_type);
1093
1094   if (sscreen->use_ngg_streamout &&
1095       (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1096        query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1097        query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1098        query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1099      return gfx10_sh_query_create(sscreen, query_type, index);
1100
1101   return si_query_hw_create(sscreen, query_type, index);
1102}
1103
1104static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1105{
1106   struct si_context *sctx = (struct si_context *)ctx;
1107   struct si_query *squery = (struct si_query *)query;
1108
1109   squery->ops->destroy(sctx, squery);
1110}
1111
1112static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1113{
1114   struct si_context *sctx = (struct si_context *)ctx;
1115   struct si_query *squery = (struct si_query *)query;
1116
1117   return squery->ops->begin(sctx, squery);
1118}
1119
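/* Begin a hardware query: recycle the buffer chain (unless begin is also used
 * to resume), emit the start sample, and add the query to the context's
 * active list for later suspend/resume. */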
1120bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1121{
1122   struct si_query_hw *query = (struct si_query_hw *)squery;
1123
1124   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1125      assert(0);
1126      return false;
1127   }
1128
1129   if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1130      si_query_buffer_reset(sctx, &query->buffer);
1131
1132   si_resource_reference(&query->workaround_buf, NULL);
1133
1134   si_query_hw_emit_start(sctx, query);
1135   if (!query->buffer.buf)
1136      return false;
1137
1138   list_addtail(&query->b.active_list, &sctx->active_queries);
1139   sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1140   return true;
1141}
1142
1143static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1144{
1145   struct si_context *sctx = (struct si_context *)ctx;
1146   struct si_query *squery = (struct si_query *)query;
1147
1148   return squery->ops->end(sctx, squery);
1149}
1150
1151bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1152{
1153   struct si_query_hw *query = (struct si_query_hw *)squery;
1154
1155   if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1156      si_query_buffer_reset(sctx, &query->buffer);
1157
1158   si_query_hw_emit_stop(sctx, query);
1159
1160   if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1161      list_delinit(&query->b.active_list);
1162      sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1163   }
1164
1165   if (!query->buffer.buf)
1166      return false;
1167
1168   return true;
1169}
1170
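/* Describe, for one query type and result index, where the begin/end values
 * and the fence live inside a single result slot, and how per-RB or
 * per-stream pairs are strided. Used by the compute-based result resolve. */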
1171static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1172                                   struct si_hw_query_params *params)
1173{
1174   unsigned max_rbs = sctx->screen->info.max_render_backends;
1175
1176   params->pair_stride = 0;
1177   params->pair_count = 1;
1178
1179   switch (squery->b.type) {
1180   case PIPE_QUERY_OCCLUSION_COUNTER:
1181   case PIPE_QUERY_OCCLUSION_PREDICATE:
1182   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1183      params->start_offset = 0;
1184      params->end_offset = 8;
1185      params->fence_offset = max_rbs * 16;
1186      params->pair_stride = 16;
1187      params->pair_count = max_rbs;
1188      break;
1189   case PIPE_QUERY_TIME_ELAPSED:
1190      params->start_offset = 0;
1191      params->end_offset = 8;
1192      params->fence_offset = 16;
1193      break;
1194   case PIPE_QUERY_TIMESTAMP:
1195      params->start_offset = 0;
1196      params->end_offset = 0;
1197      params->fence_offset = 8;
1198      break;
1199   case PIPE_QUERY_PRIMITIVES_EMITTED:
1200      params->start_offset = 8;
1201      params->end_offset = 24;
1202      params->fence_offset = params->end_offset + 4;
1203      break;
1204   case PIPE_QUERY_PRIMITIVES_GENERATED:
1205      params->start_offset = 0;
1206      params->end_offset = 16;
1207      params->fence_offset = params->end_offset + 4;
1208      break;
1209   case PIPE_QUERY_SO_STATISTICS:
1210      params->start_offset = 8 - index * 8;
1211      params->end_offset = 24 - index * 8;
1212      params->fence_offset = params->end_offset + 4;
1213      break;
1214   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1215      params->pair_count = SI_MAX_STREAMS;
1216      params->pair_stride = 32;
1217      FALLTHROUGH;
1218   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1219      params->start_offset = 0;
1220      params->end_offset = 16;
1221
1222      /* We can re-use the high dword of the last 64-bit value as a
1223       * fence: it is initialized as 0, and the high bit is set by
1224       * the write of the streamout stats event.
1225       */
1226      params->fence_offset = squery->result_size - 4;
1227      break;
1228   case PIPE_QUERY_PIPELINE_STATISTICS: {
1229      static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1230      params->start_offset = offsets[index];
1231      params->end_offset = 88 + offsets[index];
1232      params->fence_offset = 2 * 88;
1233      break;
1234   }
1235   default:
1236      unreachable("si_get_hw_query_params unsupported");
1237   }
1238}
1239
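/* Read a 64-bit begin/end pair from a mapped result slot (dword indices) and
 * return end - start. With test_status_bit, bit 63 must be set in both values
 * (the "result written" marker), otherwise the sample contributes 0. */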
1240static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1241                                     bool test_status_bit)
1242{
1243   uint32_t *current_result = (uint32_t *)map;
1244   uint64_t start, end;
1245
1246   start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1247   end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1248
1249   if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1250      return end - start;
1251   }
1252   return 0;
1253}
1254
1255static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1256                                   void *buffer, union pipe_query_result *result)
1257{
1258   unsigned max_rbs = sscreen->info.max_render_backends;
1259
1260   switch (query->b.type) {
1261   case PIPE_QUERY_OCCLUSION_COUNTER: {
1262      for (unsigned i = 0; i < max_rbs; ++i) {
1263         unsigned results_base = i * 16;
1264         result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1265      }
1266      break;
1267   }
1268   case PIPE_QUERY_OCCLUSION_PREDICATE:
1269   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1270      for (unsigned i = 0; i < max_rbs; ++i) {
1271         unsigned results_base = i * 16;
1272         result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1273      }
1274      break;
1275   }
1276   case PIPE_QUERY_TIME_ELAPSED:
1277      result->u64 += si_query_read_result(buffer, 0, 2, false);
1278      break;
1279   case PIPE_QUERY_TIMESTAMP:
1280      result->u64 = *(uint64_t *)buffer;
1281      break;
1282   case PIPE_QUERY_PRIMITIVES_EMITTED:
1283      /* SAMPLE_STREAMOUTSTATS stores this structure:
1284       * {
1285       *    u64 NumPrimitivesWritten;
1286       *    u64 PrimitiveStorageNeeded;
1287       * }
1288       * We only need NumPrimitivesWritten here. */
1289      result->u64 += si_query_read_result(buffer, 2, 6, true);
1290      break;
1291   case PIPE_QUERY_PRIMITIVES_GENERATED:
1292      /* Here we read PrimitiveStorageNeeded. */
1293      result->u64 += si_query_read_result(buffer, 0, 4, true);
1294      break;
1295   case PIPE_QUERY_SO_STATISTICS:
1296      result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1297      result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1298      break;
1299   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1300      result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1301                                  si_query_read_result(buffer, 0, 4, true);
1302      break;
1303   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1304      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1305         result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1306                                     si_query_read_result(buffer, 0, 4, true);
1307         buffer = (char *)buffer + 32;
1308      }
1309      break;
1310   case PIPE_QUERY_PIPELINE_STATISTICS:
1311      result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
1312      result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
1313      result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
1314      result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
1315      result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
1316      result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
1317      result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
1318      result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
1319      result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
1320      result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
1321      result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
1322#if 0 /* for testing */
1323      printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1324             "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1325             "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1326             result->pipeline_statistics.ia_vertices,
1327             result->pipeline_statistics.ia_primitives,
1328             result->pipeline_statistics.vs_invocations,
1329             result->pipeline_statistics.hs_invocations,
1330             result->pipeline_statistics.ds_invocations,
1331             result->pipeline_statistics.gs_invocations,
1332             result->pipeline_statistics.gs_primitives,
1333             result->pipeline_statistics.c_invocations,
1334             result->pipeline_statistics.c_primitives,
1335             result->pipeline_statistics.ps_invocations,
1336             result->pipeline_statistics.cs_invocations);
1337#endif
1338      break;
1339   default:
1340      assert(0);
1341   }
1342}
1343
1344void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1345{
1346   si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1347}
1348
1349void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1350{
1351   si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1352}
1353
1354static const struct si_query_ops query_hw_ops = {
1355   .destroy = si_query_hw_destroy,
1356   .begin = si_query_hw_begin,
1357   .end = si_query_hw_end,
1358   .get_result = si_query_hw_get_result,
1359   .get_result_resource = si_query_hw_get_result_resource,
1360
1361   .suspend = si_query_hw_suspend,
1362   .resume = si_query_hw_resume,
1363};
1364
1365static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1366                                union pipe_query_result *result)
1367{
1368   struct si_context *sctx = (struct si_context *)ctx;
1369   struct si_query *squery = (struct si_query *)query;
1370
1371   return squery->ops->get_result(sctx, squery, wait, result);
1372}
1373
1374static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1375                                         bool wait, enum pipe_query_value_type result_type,
1376                                         int index, struct pipe_resource *resource, unsigned offset)
1377{
1378   struct si_context *sctx = (struct si_context *)ctx;
1379   struct si_query *squery = (struct si_query *)query;
1380
1381   squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
1382}
1383
1384static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1385{
1386   util_query_clear_result(result, query->b.type);
1387}
1388
1389bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1390                            union pipe_query_result *result)
1391{
1392   struct si_screen *sscreen = sctx->screen;
1393   struct si_query_hw *query = (struct si_query_hw *)squery;
1394   struct si_query_buffer *qbuf;
1395
1396   query->ops->clear_result(query, result);
1397
1398   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1399      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1400      unsigned results_base = 0;
1401      void *map;
1402
1403      if (squery->b.flushed)
1404         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1405      else
1406         map = si_buffer_map(sctx, qbuf->buf, usage);
1407
1408      if (!map)
1409         return false;
1410
1411      while (results_base != qbuf->results_end) {
1412         query->ops->add_result(sscreen, query, map + results_base, result);
1413         results_base += query->result_size;
1414      }
1415   }
1416
1417   /* Convert the time to expected units. */
1418   if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1419       squery->type == PIPE_QUERY_TIMESTAMP) {
1420      result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1421   }
1422   return true;
1423}
1424
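/* Resolve query results on the GPU: bind each query buffer in the chain and
 * the destination resource as shader buffers, upload the slot layout from
 * si_hw_query_params plus config flags as a constant buffer, and launch the
 * query_result compute shader. Optionally waits for the last fence first. */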
1425static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1426                                            bool wait, enum pipe_query_value_type result_type,
1427                                            int index, struct pipe_resource *resource,
1428                                            unsigned offset)
1429{
1430   struct si_query_hw *query = (struct si_query_hw *)squery;
1431   struct si_query_buffer *qbuf;
1432   struct si_query_buffer *qbuf_prev;
1433   struct pipe_resource *tmp_buffer = NULL;
1434   unsigned tmp_buffer_offset = 0;
1435   struct si_qbo_state saved_state = {};
1436   struct pipe_grid_info grid = {};
1437   struct pipe_constant_buffer constant_buffer = {};
1438   struct pipe_shader_buffer ssbo[3];
1439   struct si_hw_query_params params;
1440   struct {
1441      uint32_t end_offset;
1442      uint32_t result_stride;
1443      uint32_t result_count;
1444      uint32_t config;
1445      uint32_t fence_offset;
1446      uint32_t pair_stride;
1447      uint32_t pair_count;
1448   } consts;
1449
1450   if (!sctx->query_result_shader) {
1451      sctx->query_result_shader = si_create_query_result_cs(sctx);
1452      if (!sctx->query_result_shader)
1453         return;
1454   }
1455
1456   if (query->buffer.previous) {
1457      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1458      if (!tmp_buffer)
1459         return;
1460   }
1461
1462   si_save_qbo_state(sctx, &saved_state);
1463
1464   si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1465   consts.end_offset = params.end_offset - params.start_offset;
1466   consts.fence_offset = params.fence_offset - params.start_offset;
1467   consts.result_stride = query->result_size;
1468   consts.pair_stride = params.pair_stride;
1469   consts.pair_count = params.pair_count;
1470
1471   constant_buffer.buffer_size = sizeof(consts);
1472   constant_buffer.user_buffer = &consts;
1473
1474   ssbo[1].buffer = tmp_buffer;
1475   ssbo[1].buffer_offset = tmp_buffer_offset;
1476   ssbo[1].buffer_size = 16;
1477
1478   ssbo[2] = ssbo[1];
1479
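   /* Launch a single invocation; the result shader loops over all records
    * of a buffer itself (result_count is passed in the constants). */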
1480   grid.block[0] = 1;
1481   grid.block[1] = 1;
1482   grid.block[2] = 1;
1483   grid.grid[0] = 1;
1484   grid.grid[1] = 1;
1485   grid.grid[2] = 1;
1486
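   /* Config bits consumed by the result shader (see
    * si_create_query_result_cs): 1 = read the previously accumulated value,
    * 2 = write the accumulated value for chaining, 4 = write result
    * availability instead of the value, 8 = convert the result to a
    * boolean, 16 = read only the last dword (timestamps), 32 = apply the
    * GPU-tick-to-nanosecond conversion, 64 = store a 64-bit result,
    * 128 = store a signed 32-bit result, 256 = stream-out overflow mode. */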
1487   consts.config = 0;
1488   if (index < 0)
1489      consts.config |= 4;
1490   if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1491       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1492      consts.config |= 8;
1493   else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1494            query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1495      consts.config |= 8 | 256;
1496   else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1497      consts.config |= 32;
1498
1499   switch (result_type) {
1500   case PIPE_QUERY_TYPE_U64:
1501   case PIPE_QUERY_TYPE_I64:
1502      consts.config |= 64;
1503      break;
1504   case PIPE_QUERY_TYPE_I32:
1505      consts.config |= 128;
1506      break;
1507   case PIPE_QUERY_TYPE_U32:
1508      break;
1509   }
1510
1511   sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1512
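   /* Dispatch the result shader once per buffer in the chain, starting with
    * the most recent buffer; only the dispatch for the oldest buffer (the
    * one without a predecessor) writes to the destination resource. */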
1513   for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1514      if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1515         qbuf_prev = qbuf->previous;
1516         consts.result_count = qbuf->results_end / query->result_size;
1517         consts.config &= ~3;
1518         if (qbuf != &query->buffer)
1519            consts.config |= 1;
1520         if (qbuf->previous)
1521            consts.config |= 2;
1522      } else {
1523         /* Only read the last timestamp. */
1524         qbuf_prev = NULL;
1525         consts.result_count = 0;
1526         consts.config |= 16;
1527         params.start_offset += qbuf->results_end - query->result_size;
1528      }
1529
1530      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1531
1532      ssbo[0].buffer = &qbuf->buf->b.b;
1533      ssbo[0].buffer_offset = params.start_offset;
1534      ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1535
1536      if (!qbuf->previous) {
1537         ssbo[2].buffer = resource;
1538         ssbo[2].buffer_offset = offset;
1539         ssbo[2].buffer_size = 8;
1540
1541         si_resource(resource)->TC_L2_dirty = true;
1542      }
1543
1544      if (wait && qbuf == &query->buffer) {
1545         uint64_t va;
1546
1547         /* Wait for result availability. Wait only for readiness
1548          * of the last entry, since the fence writes should be
1549          * serialized in the CP.
1550          */
1551         va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1552         va += params.fence_offset;
1553
1554         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1555      }
1556      si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1557                                    SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1558                                    3, ssbo, 0x4);
1559   }
1560
1561   si_restore_qbo_state(sctx, &saved_state);
1562   pipe_resource_reference(&tmp_buffer, NULL);
1563}
1564
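/* Set or clear the render condition. On chips with the SET_PREDICATION
 * firmware bug, the stream-out overflow result is first resolved into a
 * small workaround buffer via get_query_result_resource, and the predicate
 * then reads that buffer instead of the raw query results. */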
1565static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1566                                enum pipe_render_cond_flag mode)
1567{
1568   struct si_context *sctx = (struct si_context *)ctx;
1569   struct si_query_hw *squery = (struct si_query_hw *)query;
1570   struct si_atom *atom = &sctx->atoms.s.render_cond;
1571
1572   if (query) {
1573      bool needs_workaround = false;
1574
1575      /* A firmware regression in GFX8 (also present in early GFX9
1576       * firmware) causes successive SET_PREDICATION packets to give the
1577       * wrong answer for non-inverted stream overflow predication.
1578       */
1579      if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1580           (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1581          !condition &&
1582          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1583           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1584            (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1585         needs_workaround = true;
1586      }
1587
1588      if (needs_workaround && !squery->workaround_buf) {
1589         bool old_render_cond_enabled = sctx->render_cond_enabled;
1590         sctx->render_cond_enabled = false;
1591
1592         u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1593                              (struct pipe_resource **)&squery->workaround_buf);
1594
1595         /* Reset to NULL so that launching the compute grid below
1596          * doesn't emit a redundant SET_PREDICATION.
1597          */
1598         sctx->render_cond = NULL;
1599
1600         ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1601                                        &squery->workaround_buf->b.b, squery->workaround_offset);
1602
1603         /* Setting this in the render cond atom is too late,
1604          * so set it here. */
1605         sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
1606
1607         sctx->render_cond_enabled = old_render_cond_enabled;
1608      }
1609   }
1610
1611   sctx->render_cond = query;
1612   sctx->render_cond_invert = condition;
1613   sctx->render_cond_mode = mode;
1614   sctx->render_cond_enabled = query;
1615
1616   si_set_atom_dirty(sctx, atom, query != NULL);
1617}
1618
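/* Stop all currently active queries without removing them from the active
 * list, e.g. before the gfx IB is flushed; si_resume_queries restarts them
 * afterwards. */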
1619void si_suspend_queries(struct si_context *sctx)
1620{
1621   struct si_query *query;
1622
1623   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1624      query->ops->suspend(sctx, query);
1625}
1626
1627void si_resume_queries(struct si_context *sctx)
1628{
1629   struct si_query *query;
1630
1631   /* Check CS space here. Resuming must not be interrupted by flushes. */
1632   si_need_gfx_cs_space(sctx, 0);
1633
1634   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1635      query->ops->resume(sctx, query);
1636}
1637
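/* Helpers for declaring pipe_driver_query_info entries: X() declares a
 * query without a group, XG() puts it in one of the software query groups. */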
1638#define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1639   {                                                                                               \
1640      .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1641      .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1642   }
1643
1644#define X(name_, query_type_, type_, result_type_)                                                 \
1645   XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1646
1647#define XG(group_, name_, query_type_, type_, result_type_)                                        \
1648   XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1649
1650static struct pipe_driver_query_info si_driver_query_list[] = {
1651   X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1652   X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1653   X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1654   X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1655   X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1656   X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1657   X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1658   X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1659   X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1660   X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1661   X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1662   X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1663   X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1664   X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1665   X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1666   X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1667   X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1668   X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1669   X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1670   X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1671   X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1672   X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1673   X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1674   X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1675   X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1676   X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1677   X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1678   X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1679   X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1680   X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1681   X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1682   X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1683   X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1684   X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1685   X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1686   X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1687   X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1688   X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1689   X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1690   X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1691   X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1692   X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1693   X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1694   X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1695
1696   /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1697    * which use them as a fallback path to detect the GPU type.
1698    *
1699    * Note: The names of these queries are significant for GPUPerfStudio
1700    * (and possibly their order as well). */
1701   XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1702   XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1703   XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1704   XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1705   XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1706
1707   X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1708   X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1709   X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1710
1711   /* The following queries must be at the end of the list because their
1712    * availability is adjusted dynamically based on the DRM version. */
1713   X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1714   X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1715   X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1716   X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1717   X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1718   X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1719   X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1720   X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1721   X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1722   X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1723   X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1724   X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1725   X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1726   X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1727
1728   /* SRBM_STATUS2 */
1729   X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1730
1731   /* CP_STAT */
1732   X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1733   X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1734   X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1735   X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1736   X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1737   X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1738};
1739
1740#undef X
1741#undef XG
1742#undef XFULL
1743
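/* Drop entries from the end of si_driver_query_list that the kernel
 * interface in use cannot report (see the comment above the GPU-load
 * entry). */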
1744static unsigned si_get_num_queries(struct si_screen *sscreen)
1745{
1746   /* amdgpu */
1747   if (sscreen->info.is_amdgpu) {
1748      if (sscreen->info.chip_class >= GFX8)
1749         return ARRAY_SIZE(si_driver_query_list);
1750      else
1751         return ARRAY_SIZE(si_driver_query_list) - 7;
1752   }
1753
1754   /* radeon */
1755   if (sscreen->info.has_read_registers_query) {
1756      if (sscreen->info.chip_class == GFX7)
1757         return ARRAY_SIZE(si_driver_query_list) - 6;
1758      else
1759         return ARRAY_SIZE(si_driver_query_list) - 7;
1760   }
1761
1762   return ARRAY_SIZE(si_driver_query_list) - 21;
1763}
1764
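/* pipe_screen::get_driver_query_info: called with info == NULL to get the
 * total query count; software queries come first, perfcounter queries are
 * appended after them. */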
1765static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1766                                    struct pipe_driver_query_info *info)
1767{
1768   struct si_screen *sscreen = (struct si_screen *)screen;
1769   unsigned num_queries = si_get_num_queries(sscreen);
1770
1771   if (!info) {
1772      unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1773
1774      return num_queries + num_perfcounters;
1775   }
1776
1777   if (index >= num_queries)
1778      return si_get_perfcounter_info(sscreen, index - num_queries, info);
1779
1780   *info = si_driver_query_list[index];
1781
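   /* Fill in max_value where a meaningful upper bound is known (used e.g.
    * for scaling HUD graphs). */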
1782   switch (info->query_type) {
1783   case SI_QUERY_REQUESTED_VRAM:
1784   case SI_QUERY_VRAM_USAGE:
1785   case SI_QUERY_MAPPED_VRAM:
1786   case SI_QUERY_SLAB_WASTED_VRAM:
1787      info->max_value.u64 = sscreen->info.vram_size;
1788      break;
1789   case SI_QUERY_REQUESTED_GTT:
1790   case SI_QUERY_GTT_USAGE:
1791   case SI_QUERY_MAPPED_GTT:
1792   case SI_QUERY_SLAB_WASTED_GTT:
1793      info->max_value.u64 = sscreen->info.gart_size;
1794      break;
1795   case SI_QUERY_GPU_TEMPERATURE:
1796      info->max_value.u64 = 125;
1797      break;
1798   case SI_QUERY_VRAM_VIS_USAGE:
1799      info->max_value.u64 = sscreen->info.vram_vis_size;
1800      break;
1801   }
1802
1803   if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1804      info->group_id += sscreen->perfcounters->base.num_groups;
1805
1806   return 1;
1807}
1808
1809/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1810 * performance counter groups, so be careful when changing this and related
1811 * functions.
1812 */
1813static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1814                                          struct pipe_driver_query_group_info *info)
1815{
1816   struct si_screen *sscreen = (struct si_screen *)screen;
1817   unsigned num_pc_groups = 0;
1818
1819   if (sscreen->perfcounters)
1820      num_pc_groups = sscreen->perfcounters->base.num_groups;
1821
1822   if (!info)
1823      return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1824
1825   if (index < num_pc_groups)
1826      return si_get_perfcounter_group_info(sscreen, index, info);
1827
1828   index -= num_pc_groups;
1829   if (index >= SI_NUM_SW_QUERY_GROUPS)
1830      return 0;
1831
1832   info->name = "GPIN";
1833   info->max_active_queries = 5;
1834   info->num_queries = 5;
1835   return 1;
1836}
1837
1838void si_init_query_functions(struct si_context *sctx)
1839{
1840   sctx->b.create_query = si_create_query;
1841   sctx->b.create_batch_query = si_create_batch_query;
1842   sctx->b.destroy_query = si_destroy_query;
1843   sctx->b.begin_query = si_begin_query;
1844   sctx->b.end_query = si_end_query;
1845   sctx->b.get_query_result = si_get_query_result;
1846   sctx->b.get_query_result_resource = si_get_query_result_resource;
1847
1848   if (sctx->has_graphics) {
1849      sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1850      sctx->b.render_condition = si_render_condition;
1851   }
1852
1853   list_inithead(&sctx->active_queries);
1854}
1855
1856void si_init_screen_query_functions(struct si_screen *sscreen)
1857{
1858   sscreen->b.get_driver_query_info = si_get_driver_query_info;
1859   sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1860}
1861