1/*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "si_build_pm4.h"
26#include "si_query.h"
27#include "util/u_memory.h"
28
29#include "ac_perfcounter.h"
30
/* One group of hardware counters within a single block, targeting one
 * SE/instance combination (or broadcast). Groups form a singly-linked list
 * owned by si_query_pc. */
struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;               /* shader engine index, or -1 for broadcast/all */
   int instance;         /* block instance index, or -1 for broadcast/all */
   unsigned num_counters;            /* number of used entries in selectors[] */
   unsigned selectors[AC_QUERY_MAX_COUNTERS]; /* per-counter event selectors */
};
41
/* Maps one user-requested counter to its location(s) in the result buffer.
 * A counter may span multiple qwords when it is read once per SE and/or
 * per block instance. */
struct si_query_counter {
   unsigned base;   /* index of the first uint64 result for this counter */
   unsigned qwords; /* number of uint64 values to accumulate */
   unsigned stride; /* in uint64s */
};
47
/* A batch performance-counter query: a set of counters sampled together
 * into one GPU result buffer. */
struct si_query_pc {
   struct si_query b;              /* base query (must be first) */
   struct si_query_buffer buffer;  /* GPU buffer receiving counter values */

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;               /* shader-stage mask for SQ counters; 0 = none */
   unsigned num_counters;          /* number of user-requested counters */
   struct si_query_counter *counters; /* per-request result mapping, num_counters entries */
   struct si_query_group *groups;     /* linked list of per-block counter groups */
};
60
61static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
62{
63   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
64   unsigned value = S_030800_SH_BROADCAST_WRITES(1);
65
66   if (se >= 0) {
67      value |= S_030800_SE_INDEX(se);
68   } else {
69      value |= S_030800_SE_BROADCAST_WRITES(1);
70   }
71
72   if (sctx->chip_class >= GFX10) {
73      /* TODO: Expose counters from each shader array separately if needed. */
74      value |= S_030800_SA_BROADCAST_WRITES(1);
75   }
76
77   if (instance >= 0) {
78      value |= S_030800_INSTANCE_INDEX(instance);
79   } else {
80      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
81   }
82
83   radeon_begin(cs);
84   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
85   radeon_end();
86}
87
/* Restrict SQ performance counters to the given shader-stage mask.
 * Writes two consecutive uconfig registers starting at SQ_PERFCOUNTER_CTRL:
 * the low 7 bits of the stage mask, then 0xffffffff (presumably a companion
 * mask register — NOTE(review): register name not visible here, confirm). */
static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}
98
99static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
100                              unsigned *selectors)
101{
102   struct ac_pc_block_base *regs = block->b->b;
103   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
104   unsigned idx;
105
106   assert(count <= regs->num_counters);
107
108   /* Fake counters. */
109   if (!regs->select0)
110      return;
111
112   radeon_begin(cs);
113
114   for (idx = 0; idx < count; ++idx) {
115      radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false);
116      radeon_emit(selectors[idx] | regs->select_or);
117   }
118
119   for (idx = 0; idx < regs->num_spm_counters; idx++) {
120      radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false);
121      radeon_emit(0);
122   }
123
124   radeon_end();
125}
126
/* Emit commands that start counting: write a non-zero marker dword at `va`
 * (later overwritten with 0 by the fence in si_pc_emit_stop, which waits on
 * it), reset the perfmon state machine, emit PERFCOUNTER_START, and switch
 * the perfmon to counting. */
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* Write 1 at va; si_pc_emit_stop waits for this dword to become 0. */
   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   /* Reset counters to a known state before starting. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}
143
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
/* Emit commands that stop counting: release a bottom-of-pipe fence writing 0
 * at `va` (clearing the marker written by si_pc_emit_start), wait for it so
 * all prior work has drained, then sample and stop the counters. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* Fence: write 0 at va after all prior work completes, then wait for it. */
   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   /* Latch the current counter values, then stop counting. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}
164
/* Copy `count` 64-bit counter values of the given block into memory at `va`,
 * one COPY_DATA packet per counter. For fake counters (no select registers),
 * write 64-bit zeros instead so the result layout stays consistent. */
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   /* LO/HI register pairs are 8 bytes apart (used when regs->counters
    * doesn't provide explicit per-counter register offsets). */
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         /* Some blocks list counter registers explicitly. */
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2); /* perf-register source is dword-addressed */
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}
206
207static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
208{
209   struct si_query_pc *query = (struct si_query_pc *)squery;
210
211   while (query->groups) {
212      struct si_query_group *group = query->groups;
213      query->groups = group->next;
214      FREE(group);
215   }
216
217   FREE(query->counters);
218
219   si_query_buffer_destroy(sctx->screen, &query->buffer);
220   FREE(query);
221}
222
223void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
224{
225   radeon_begin(&sctx->gfx_cs);
226
227   if (sctx->chip_class >= GFX10) {
228      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
229                             S_037390_PERFMON_CLOCK_STATE(inhibit));
230   } else if (sctx->chip_class >= GFX8) {
231      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
232                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
233   }
234   radeon_end();
235}
236
237static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
238/*
239                                   struct si_query_hw *hwquery,
240                                   struct si_resource *buffer, uint64_t va)*/
241{
242   struct si_query_pc *query = (struct si_query_pc *)squery;
243   int current_se = -1;
244   int current_instance = -1;
245
246   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
247      return;
248   si_need_gfx_cs_space(sctx, 0);
249
250   if (query->shaders)
251      si_pc_emit_shaders(sctx, query->shaders);
252
253   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);
254
255   for (struct si_query_group *group = query->groups; group; group = group->next) {
256      struct ac_pc_block *block = group->block;
257
258      if (group->se != current_se || group->instance != current_instance) {
259         current_se = group->se;
260         current_instance = group->instance;
261         si_pc_emit_instance(sctx, group->se, group->instance);
262      }
263
264      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
265   }
266
267   if (current_se != -1 || current_instance != -1)
268      si_pc_emit_instance(sctx, -1, -1);
269
270   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
271   si_pc_emit_start(sctx, query->buffer.buf, va);
272}
273
/* Emit commands that stop counting and read every group's counters into the
 * result buffer. For groups targeting all SEs and/or all instances, one read
 * is emitted per SE/instance combination; the result layout must match the
 * strides computed in si_create_batch_query. */
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   /* Reserve this result slot; reads below fill [va, va + result_size). */
   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      /* se < 0 means "all SEs": iterate 0..max_se-1 for per-SE blocks. */
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         /* instance < 0 means "all instances": iterate every instance. */
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   /* Back to broadcast, and re-enable clock gating. */
   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}
309
310static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
311{
312   struct si_query_pc *query = (struct si_query_pc *)squery;
313
314   si_query_buffer_reset(ctx, &query->buffer);
315
316   list_addtail(&query->b.active_list, &ctx->active_queries);
317   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
318
319   si_pc_query_resume(ctx, squery);
320
321   return true;
322}
323
324static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
325{
326   struct si_query_pc *query = (struct si_query_pc *)squery;
327
328   si_pc_query_suspend(ctx, squery);
329
330   list_del(&squery->active_list);
331   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
332
333   return query->buffer.buf != NULL;
334}
335
/* Accumulate one result snapshot from `buffer` into `result`: for each
 * user counter, sum its qwords (one per SE/instance it was read from)
 * using the base/stride mapping built in si_create_batch_query. */
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         /* NOTE(review): deliberately truncates each stored qword to its low
          * 32 bits before accumulating into the 64-bit total — presumably the
          * upper half of the 64-bit COPY_DATA read is not meaningful; confirm
          * against the counter register definitions. */
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}
351
352static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
353                                   union pipe_query_result *result)
354{
355   struct si_query_pc *query = (struct si_query_pc *)squery;
356
357   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
358
359   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
360      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
361      unsigned results_base = 0;
362      void *map;
363
364      if (squery->b.flushed)
365         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
366      else
367         map = si_buffer_map(sctx, qbuf->buf, usage);
368
369      if (!map)
370         return false;
371
372      while (results_base != qbuf->results_end) {
373         si_pc_query_add_result(query, map + results_base, result);
374         results_base += query->result_size;
375      }
376   }
377
378   return true;
379}
380
/* Virtual-function table hooking batch perf-counter queries into the
 * generic si_query machinery. */
static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};
390
/* Find the query's group for (block, sub_gid), creating it if needed.
 * sub_gid encodes (shader stage ×) SE × instance depending on the block's
 * flags; this function decomposes it and fills in se/instance (or -1 for
 * broadcast). Returns NULL on allocation failure or if the requested shader
 * group conflicts with one already selected for the query. */
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   /* Reuse an existing group for the same block and sub-group id. */
   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      /* The shader stage is encoded in the high part of sub_gid:
       * sub_gid = shader_id * (instances [* max_se]) + remainder. */
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      /* All SQ-windowed counters in one query must use the same stage mask. */
      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   /* Remaining sub_gid encodes SE × instance (when per-SE/per-instance
    * groups are exposed); otherwise broadcast (-1). */
   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   /* Prepend to the query's group list. */
   group->next = query->groups;
   query->groups = group;

   return group;
}
456
457struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
458                                         unsigned *query_types)
459{
460   struct si_screen *screen = (struct si_screen *)ctx->screen;
461   struct si_perfcounters *pc = screen->perfcounters;
462   struct ac_pc_block *block;
463   struct si_query_group *group;
464   struct si_query_pc *query;
465   unsigned base_gid, sub_gid, sub_index;
466   unsigned i, j;
467
468   if (!pc)
469      return NULL;
470
471   query = CALLOC_STRUCT(si_query_pc);
472   if (!query)
473      return NULL;
474
475   query->b.ops = &batch_query_ops;
476
477   query->num_counters = num_queries;
478
479   /* Collect selectors per group */
480   for (i = 0; i < num_queries; ++i) {
481      unsigned sub_gid;
482
483      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
484         goto error;
485
486      block =
487         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
488      if (!block)
489         goto error;
490
491      sub_gid = sub_index / block->b->selectors;
492      sub_index = sub_index % block->b->selectors;
493
494      group = get_group_state(screen, query, block, sub_gid);
495      if (!group)
496         goto error;
497
498      if (group->num_counters >= block->b->b->num_counters) {
499         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
500         goto error;
501      }
502      group->selectors[group->num_counters] = sub_index;
503      ++group->num_counters;
504   }
505
506   /* Compute result bases and CS size per group */
507   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
508   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
509
510   i = 0;
511   for (group = query->groups; group; group = group->next) {
512      struct ac_pc_block *block = group->block;
513      unsigned read_dw;
514      unsigned instances = 1;
515
516      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
517         instances = screen->info.max_se;
518      if (group->instance < 0)
519         instances *= block->num_instances;
520
521      group->result_base = i;
522      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
523      i += instances * group->num_counters;
524
525      read_dw = 6 * group->num_counters;
526      query->b.num_cs_dw_suspend += instances * read_dw;
527      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
528   }
529
530   if (query->shaders) {
531      if (query->shaders == AC_PC_SHADERS_WINDOWING)
532         query->shaders = 0xffffffff;
533   }
534
535   /* Map user-supplied query array to result indices */
536   query->counters = CALLOC(num_queries, sizeof(*query->counters));
537   for (i = 0; i < num_queries; ++i) {
538      struct si_query_counter *counter = &query->counters[i];
539      struct ac_pc_block *block;
540
541      block =
542         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
543
544      sub_gid = sub_index / block->b->selectors;
545      sub_index = sub_index % block->b->selectors;
546
547      group = get_group_state(screen, query, block, sub_gid);
548      assert(group != NULL);
549
550      for (j = 0; j < group->num_counters; ++j) {
551         if (group->selectors[j] == sub_index)
552            break;
553      }
554
555      counter->base = group->result_base + j;
556      counter->stride = group->num_counters;
557
558      counter->qwords = 1;
559      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
560         counter->qwords = screen->info.max_se;
561      if (group->instance < 0)
562         counter->qwords *= block->num_instances;
563   }
564
565   return (struct pipe_query *)query;
566
567error:
568   si_pc_query_destroy((struct si_context *)ctx, &query->b);
569   return NULL;
570}
571
572int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
573                            struct pipe_driver_query_info *info)
574{
575   struct si_perfcounters *pc = screen->perfcounters;
576   struct ac_pc_block *block;
577   unsigned base_gid, sub;
578
579   if (!pc)
580      return 0;
581
582   if (!info) {
583      unsigned bid, num_queries = 0;
584
585      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
586         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
587      }
588
589      return num_queries;
590   }
591
592   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
593   if (!block)
594      return 0;
595
596   if (!block->selector_names) {
597      if (!ac_init_block_names(&screen->info, &pc->base, block))
598         return 0;
599   }
600   info->name = block->selector_names + sub * block->selector_name_stride;
601   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
602   info->max_value.u64 = 0;
603   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
604   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
605   info->group_id = base_gid + sub / block->b->selectors;
606   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
607   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
608      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
609   return 1;
610}
611
612int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
613                                  struct pipe_driver_query_group_info *info)
614{
615   struct si_perfcounters *pc = screen->perfcounters;
616   struct ac_pc_block *block;
617
618   if (!pc)
619      return 0;
620
621   if (!info)
622      return pc->base.num_groups;
623
624   block = ac_lookup_group(&pc->base, &index);
625   if (!block)
626      return 0;
627
628   if (!block->group_names) {
629      if (!ac_init_block_names(&screen->info, &pc->base, block))
630         return 0;
631   }
632   info->name = block->group_names + index * block->group_name_stride;
633   info->num_queries = block->b->selectors;
634   info->max_active_queries = block->b->b->num_counters;
635   return 1;
636}
637
638void si_destroy_perfcounters(struct si_screen *screen)
639{
640   struct si_perfcounters *pc = screen->perfcounters;
641
642   if (!pc)
643      return;
644
645   ac_destroy_perfcounters(&pc->base);
646   FREE(pc);
647   screen->perfcounters = NULL;
648}
649
650void si_init_perfcounters(struct si_screen *screen)
651{
652   bool separate_se, separate_instance;
653
654   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
655   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
656
657   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
658   if (!screen->perfcounters)
659      return;
660
661   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
662   screen->perfcounters->num_instance_cs_dwords = 3;
663
664   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
665                             &screen->perfcounters->base)) {
666      si_destroy_perfcounters(screen);
667   }
668}
669