1/*
2 * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Rob Clark <robclark@freedesktop.org>
25 */
26
27/* NOTE: see https://github.com/freedreno/freedreno/wiki/A5xx-Queries */
28
29#include "freedreno_query_acc.h"
30#include "freedreno_resource.h"
31
32#include "fd5_context.h"
33#include "fd5_format.h"
34#include "fd5_query.h"
35
/* Per-query sample layout in the query buffer.  'start' and 'stop' are
 * snapshotted at resume/pause, and 'result' accumulates (stop - start)
 * across suspend/resume cycles (see the CP_MEM_TO_MEM "result += stop -
 * start" sequences below), since a query can span multiple batches/tiles.
 */
struct PACKED fd5_query_sample {
	uint64_t start;
	uint64_t result;
	uint64_t stop;
};
41
/* Expands to the (bo, offset, or, shift) argument list expected by the
 * OUT_RELOC/OUT_RELOCW helpers, addressing a single field of element
 * 'idx' of an array of fd5_query_sample in the query's backing resource.
 *
 * Note: 'idx' is parenthesized so expression arguments (e.g. 'i + 1')
 * expand with the intended precedence.
 */
#define query_sample_idx(aq, idx, field)        \
	fd_resource((aq)->prsc)->bo,                \
	((idx) * sizeof(struct fd5_query_sample)) + \
	offsetof(struct fd5_query_sample, field),   \
	0, 0

/* offset of a single field of fd5_query_sample (non-array case): */
#define query_sample(aq, field)                 \
	query_sample_idx(aq, 0, field)
52
/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

/* Emit cmds to snapshot the current sample count into 'start', marking
 * the point where the query becomes active in this batch.
 */
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	/* Request a copy-out of the sample counter... */
	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

	/* ...destined for the 'start' field of our sample buffer: */
	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
	OUT_RELOCW(ring, query_sample(aq, start));

	/* ZPASS_DONE event triggers the actual counter write: */
	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
	OUT_RING(ring, ZPASS_DONE);
	fd_reset_wfi(batch);

	/* bookkeeping: # of currently-active samples-passed queries: */
	fd5_context(batch->ctx)->samples_passed_queries++;
}
77
/* Emit cmds to snapshot the sample count into 'stop' and accumulate the
 * delta since 'start' into 'result', marking the point where the query
 * stops counting in this batch.
 */
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	/* Seed 'stop' with a sentinel so we can poll (below) for the hw
	 * to overwrite it with the real count: */
	OUT_PKT7(ring, CP_MEM_WRITE, 4);
	OUT_RELOCW(ring, query_sample(aq, stop));
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0xffffffff);

	OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

	/* Request copy-out of the sample counter, into 'stop': */
	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
	OUT_RELOCW(ring, query_sample(aq, stop));

	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
	OUT_RING(ring, ZPASS_DONE);
	fd_reset_wfi(batch);

	/* Poll 'stop' until the sentinel has been overwritten, ie. the
	 * ZPASS_DONE write has landed.  (The 0x14/0x10 magic values are
	 * undeciphered — presumably poll/function flags; TODO confirm
	 * against CP_WAIT_REG_MEM docs.) */
	OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
	OUT_RING(ring, 0x00000014);   // XXX
	OUT_RELOC(ring, query_sample(aq, stop));
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0x00000010);   // XXX

	/* result += stop - start: */
	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
			CP_MEM_TO_MEM_0_NEG_C);
	OUT_RELOCW(ring, query_sample(aq, result));     /* dst */
	OUT_RELOC(ring, query_sample(aq, result));      /* srcA */
	OUT_RELOC(ring, query_sample(aq, stop));        /* srcB */
	OUT_RELOC(ring, query_sample(aq, start));       /* srcC */

	fd5_context(batch->ctx)->samples_passed_queries--;
}
118
119static void
120occlusion_counter_result(struct fd_acc_query *aq, void *buf,
121		union pipe_query_result *result)
122{
123	struct fd5_query_sample *sp = buf;
124	result->u64 = sp->result;
125}
126
127static void
128occlusion_predicate_result(struct fd_acc_query *aq, void *buf,
129		union pipe_query_result *result)
130{
131	struct fd5_query_sample *sp = buf;
132	result->b = !!sp->result;
133}
134
135static const struct fd_acc_sample_provider occlusion_counter = {
136		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
137		.active = FD_STAGE_DRAW,
138		.size = sizeof(struct fd5_query_sample),
139		.resume = occlusion_resume,
140		.pause = occlusion_pause,
141		.result = occlusion_counter_result,
142};
143
144static const struct fd_acc_sample_provider occlusion_predicate = {
145		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
146		.active = FD_STAGE_DRAW,
147		.size = sizeof(struct fd5_query_sample),
148		.resume = occlusion_resume,
149		.pause = occlusion_pause,
150		.result = occlusion_predicate_result,
151};
152
153static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
154		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
155		.active = FD_STAGE_DRAW,
156		.size = sizeof(struct fd5_query_sample),
157		.resume = occlusion_resume,
158		.pause = occlusion_pause,
159		.result = occlusion_predicate_result,
160};
161
/*
 * Timestamp Queries:
 */

/* Emit a timestamped CACHE_FLUSH_AND_INV event which writes the current
 * timer value (presumably the 19.2MHz always-on rbbm timer referenced in
 * ticks_to_ns() — TODO confirm) into 'start'.
 */
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_AND_INV_EVENT) |
			CP_EVENT_WRITE_0_TIMESTAMP);
	OUT_RELOCW(ring, query_sample(aq, start));
	OUT_RING(ring, 0x00000000);

	fd_reset_wfi(batch);
}
179
/* Emit a timestamped event to capture 'stop', then accumulate the
 * elapsed ticks since 'start' into 'result' on the GPU.
 */
static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_AND_INV_EVENT) |
			CP_EVENT_WRITE_0_TIMESTAMP);
	OUT_RELOCW(ring, query_sample(aq, stop));
	OUT_RING(ring, 0x00000000);

	/* ensure the timestamp write has landed before the CP reads it
	 * back in the MEM_TO_MEM below: */
	fd_reset_wfi(batch);
	fd_wfi(batch, ring);

	/* result += stop - start: */
	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
			CP_MEM_TO_MEM_0_NEG_C);
	OUT_RELOCW(ring, query_sample(aq, result));     /* dst */
	OUT_RELOC(ring, query_sample(aq, result));      /* srcA */
	OUT_RELOC(ring, query_sample(aq, stop));        /* srcB */
	OUT_RELOC(ring, query_sample(aq, start));       /* srcC */
}
203
/* Convert always-on timer ticks to nanoseconds.
 *
 * Note: the parameter is uint64_t — both callers pass the 64-bit
 * accumulated 'result' field; the previous uint32_t parameter silently
 * truncated timestamps/elapsed-time to 32 bits (~3.7 minutes of ticks).
 */
static uint64_t
ticks_to_ns(uint64_t ts)
{
	/* This is based on the 19.2MHz always-on rbbm timer.
	 *
	 * TODO we should probably query this value from kernel..
	 *
	 * (Integer division gives 52 ns/tick vs the exact 52.08, ie.
	 * ~0.16% error, matching previous behavior.)
	 */
	return ts * (1000000000 / 19200000);
}
213
214static void
215time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf,
216		union pipe_query_result *result)
217{
218	struct fd5_query_sample *sp = buf;
219	result->u64 = ticks_to_ns(sp->result);
220}
221
222static void
223timestamp_accumulate_result(struct fd_acc_query *aq, void *buf,
224		union pipe_query_result *result)
225{
226	struct fd5_query_sample *sp = buf;
227	result->u64 = ticks_to_ns(sp->result);
228}
229
230static const struct fd_acc_sample_provider time_elapsed = {
231		.query_type = PIPE_QUERY_TIME_ELAPSED,
232		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
233		.size = sizeof(struct fd5_query_sample),
234		.resume = timestamp_resume,
235		.pause = timestamp_pause,
236		.result = time_elapsed_accumulate_result,
237};
238
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */

static const struct fd_acc_sample_provider timestamp = {
		.query_type = PIPE_QUERY_TIMESTAMP,
		.active = FD_STAGE_ALL,
		.size = sizeof(struct fd5_query_sample),
		.resume = timestamp_resume,
		.pause = timestamp_pause,
		.result = timestamp_accumulate_result,
};
254
255/*
256 * Performance Counter (batch) queries:
257 *
258 * Only one of these is active at a time, per design of the gallium
259 * batch_query API design.  On perfcntr query tracks N query_types,
260 * each of which has a 'fd_batch_query_entry' that maps it back to
261 * the associated group and counter.
262 */
263
/* Maps one tracked query_type back to its perfcntr group and countable
 * (filled in by fd5_create_batch_query):
 */
struct fd_batch_query_entry {
	uint8_t gid;        /* group-id */
	uint8_t cid;        /* countable-id within the group */
};
268
/* Per-batch-query state; hangs off fd_acc_query::query_data.  One
 * query_entries[] element per tracked query_type.
 */
struct fd_batch_query_data {
	struct fd_screen *screen;
	unsigned num_query_entries;
	struct fd_batch_query_entry query_entries[];   /* flexible array, num_query_entries long */
};
274
/* Program the hw counters for each tracked query_type and snapshot their
 * starting values into sample[i].start.
 *
 * Counter assignment is positional: the i'th query_type requesting a
 * given group gets the i'th counter of that group — both loops below
 * (and perfcntr_pause) must walk data->query_entries in the same order
 * so the assignment stays consistent.
 */
static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_batch_query_data *data = aq->query_data;
	struct fd_screen *screen = data->screen;
	struct fd_ringbuffer *ring = batch->draw;

	/* next free counter index, per group: */
	unsigned counters_per_group[screen->num_perfcntr_groups];
	memset(counters_per_group, 0, sizeof(counters_per_group));

	fd_wfi(batch, ring);

	/* configure performance counters for the requested queries: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;

		/* guaranteed by the validation in fd5_create_batch_query: */
		debug_assert(counter_idx < g->num_counters);

		OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
		OUT_RING(ring, g->countables[entry->cid].selector);
	}

	/* restart positional assignment for the second pass: */
	memset(counters_per_group, 0, sizeof(counters_per_group));

	/* and snapshot the start values */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;
		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
			CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
		OUT_RELOCW(ring, query_sample_idx(aq, i, start));
	}
}
314
/* Snapshot the hw counter values into sample[i].stop, and accumulate
 * (stop - start) into sample[i].result on the GPU.  Must walk
 * data->query_entries in the same order as perfcntr_resume so the
 * positional counter assignment matches.
 */
static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_batch_query_data *data = aq->query_data;
	struct fd_screen *screen = data->screen;
	struct fd_ringbuffer *ring = batch->draw;

	/* next free counter index, per group (mirrors perfcntr_resume): */
	unsigned counters_per_group[screen->num_perfcntr_groups];
	memset(counters_per_group, 0, sizeof(counters_per_group));

	fd_wfi(batch, ring);

	/* TODO do we need to bother to turn anything off? */

	/* snapshot the end values: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;
		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
			CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
		OUT_RELOCW(ring, query_sample_idx(aq, i, stop));
	}

	/* and compute the result: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		/* result += stop - start: */
		OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
		OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
				CP_MEM_TO_MEM_0_NEG_C);
		OUT_RELOCW(ring, query_sample_idx(aq, i, result));     /* dst */
		OUT_RELOC(ring, query_sample_idx(aq, i, result));      /* srcA */
		OUT_RELOC(ring, query_sample_idx(aq, i, stop));        /* srcB */
		OUT_RELOC(ring, query_sample_idx(aq, i, start));       /* srcC */
	}
}
354
355static void
356perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
357		union pipe_query_result *result)
358{
359	struct fd_batch_query_data *data = aq->query_data;
360	struct fd5_query_sample *sp = buf;
361
362	for (unsigned i = 0; i < data->num_query_entries; i++) {
363		result->batch[i].u64 = sp[i].result;
364	}
365}
366
static const struct fd_acc_sample_provider perfcntr = {
		.query_type = FD_QUERY_FIRST_PERFCNTR,
		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
		/* note: .size is intentionally unset; the sample buffer size
		 * depends on the # of queries tracked, so fd5_create_batch_query
		 * sets aq->size per query instead. */
		.resume = perfcntr_resume,
		.pause = perfcntr_pause,
		.result = perfcntr_accumulate_result,
};
374
375static struct pipe_query *
376fd5_create_batch_query(struct pipe_context *pctx,
377		unsigned num_queries, unsigned *query_types)
378{
379	struct fd_context *ctx = fd_context(pctx);
380	struct fd_screen *screen = ctx->screen;
381	struct fd_query *q;
382	struct fd_acc_query *aq;
383	struct fd_batch_query_data *data;
384
385	data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data,
386			num_queries * sizeof(data->query_entries[0]));
387
388	data->screen = screen;
389	data->num_query_entries = num_queries;
390
391	/* validate the requested query_types and ensure we don't try
392	 * to request more query_types of a given group than we have
393	 * counters:
394	 */
395	unsigned counters_per_group[screen->num_perfcntr_groups];
396	memset(counters_per_group, 0, sizeof(counters_per_group));
397
398	for (unsigned i = 0; i < num_queries; i++) {
399		unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
400
401		/* verify valid query_type, ie. is it actually a perfcntr? */
402		if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
403				(idx >= screen->num_perfcntr_queries)) {
404			debug_printf("invalid batch query query_type: %u\n", query_types[i]);
405			goto error;
406		}
407
408		struct fd_batch_query_entry *entry = &data->query_entries[i];
409		struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
410
411		entry->gid = pq->group_id;
412
413		/* the perfcntr_queries[] table flattens all the countables
414		 * for each group in series, ie:
415		 *
416		 *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
417		 *
418		 * So to find the countable index just step back through the
419		 * table to find the first entry with the same group-id.
420		 */
421		while (pq > screen->perfcntr_queries) {
422			pq--;
423			if (pq->group_id == entry->gid)
424				entry->cid++;
425		}
426
427		if (counters_per_group[entry->gid] >=
428				screen->perfcntr_groups[entry->gid].num_counters) {
429			debug_printf("too many counters for group %u\n", entry->gid);
430			goto error;
431		}
432
433		counters_per_group[entry->gid]++;
434	}
435
436	q = fd_acc_create_query2(ctx, 0, &perfcntr);
437	aq = fd_acc_query(q);
438
439	/* sample buffer size is based on # of queries: */
440	aq->size = num_queries * sizeof(struct fd5_query_sample);
441	aq->query_data = data;
442
443	return (struct pipe_query *)q;
444
445error:
446	free(data);
447	return NULL;
448}
449
450void
451fd5_query_context_init(struct pipe_context *pctx)
452{
453	struct fd_context *ctx = fd_context(pctx);
454
455	ctx->create_query = fd_acc_create_query;
456	ctx->query_set_stage = fd_acc_query_set_stage;
457
458	pctx->create_batch_query = fd5_create_batch_query;
459
460	fd_acc_query_register_provider(pctx, &occlusion_counter);
461	fd_acc_query_register_provider(pctx, &occlusion_predicate);
462	fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);
463
464	fd_acc_query_register_provider(pctx, &time_elapsed);
465	fd_acc_query_register_provider(pctx, &timestamp);
466}
467