radv_query.c revision b8e80941
1/*
 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include <assert.h>
27#include <stdbool.h>
28#include <string.h>
29#include <unistd.h>
30#include <fcntl.h>
31
32#include "nir/nir_builder.h"
33#include "radv_meta.h"
34#include "radv_private.h"
35#include "radv_cs.h"
36#include "sid.h"
37
/* Sentinel written into a timestamp query slot before the GPU writes the real
 * value; a slot still holding this value means "result not available yet". */
#define TIMESTAMP_NOT_READY UINT64_MAX

/* One pipeline-statistics snapshot is 11 64-bit hardware counters. */
static const int pipelinestat_block_size = 11 * 8;
/* Maps each VkQueryPipelineStatisticFlagBits bit position to the 64-bit slot
 * inside the hardware stats block. NOTE(review): hardware slot order assumed
 * from usage in build_pipeline_statistics_query_shader — confirm against the
 * EVENT_WRITE SAMPLE_PIPELINESTAT layout. */
static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
42
43static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
44{
45	return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
46}
47
48static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
49{
50	nir_ssa_def *counter = nir_load_var(b, var);
51
52	nir_if *if_stmt = nir_if_create(b->shader);
53	if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, count));
54	nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
55
56	b->cursor = nir_after_cf_list(&if_stmt->then_list);
57
58	nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
59	nir_builder_instr_insert(b, &instr->instr);
60
61	b->cursor = nir_after_cf_node(&if_stmt->cf_node);
62	counter = nir_iadd(b, counter, nir_imm_int(b, 1));
63	nir_store_var(b, var, counter, 0x1);
64}
65
66static struct nir_ssa_def *
67radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
68{
69	nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
70	nir_intrinsic_set_base(flags, 0);
71	nir_intrinsic_set_range(flags, 16);
72	flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
73	flags->num_components = 1;
74	nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
75	nir_builder_instr_insert(b, &flags->instr);
76	return &flags->dest.ssa;
77}
78
/* Builds the compute shader that resolves occlusion query results from the
 * per-render-backend begin/end ZPASS counter pairs into the user buffer. */
static nir_shader *
build_occlusion_query_shader(struct radv_device *device) {
	/* the shader this builds is roughly
	 *
	 * push constants {
	 * 	uint32_t flags;
	 * 	uint32_t dst_stride;
	 * };
	 *
	 * uint32_t src_stride = 16 * db_count;
	 *
	 * location(binding = 0) buffer dst_buf;
	 * location(binding = 1) buffer src_buf;
	 *
	 * void main() {
	 * 	uint64_t result = 0;
	 * 	uint64_t src_offset = src_stride * global_id.x;
	 * 	uint64_t dst_offset = dst_stride * global_id.x;
	 * 	bool available = true;
	 * 	for (int i = 0; i < db_count; ++i) {
	 *		if (enabled_rb_mask & (1 << i)) {
	 *			uint64_t start = src_buf[src_offset + 16 * i];
	 *			uint64_t end = src_buf[src_offset + 16 * i + 8];
	 *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
	 *				result += end - start;
	 *			else
	 *				available = false;
	 *		}
	 * 	}
	 * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
	 * 		if (flags & VK_QUERY_RESULT_64_BIT)
	 * 			dst_buf[dst_offset] = result;
	 * 		else
	 * 			dst_buf[dst_offset] = (uint32_t)result;
	 * 	}
	 * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
	 * 		dst_buf[dst_offset + elem_size] = available;
	 * 	}
	 * }
	 */
	nir_builder b;
	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
	b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
	/* One invocation resolves one query; 64 queries per workgroup. */
	b.shader->info.cs.local_size[0] = 64;
	b.shader->info.cs.local_size[1] = 1;
	b.shader->info.cs.local_size[2] = 1;

	/* Local variables backing the pseudocode above. */
	nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
	nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
	nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
	nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
	nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
	unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
	unsigned db_count = device->physical_device->rad_info.num_render_backends;

	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");

	/* Destination (user result) buffer: descriptor set 0, binding 0. */
	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
	                                                          nir_intrinsic_vulkan_resource_index);
	dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	dst_buf->num_components = 1;
	nir_intrinsic_set_desc_set(dst_buf, 0);
	nir_intrinsic_set_binding(dst_buf, 0);
	nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &dst_buf->instr);

	/* Source (query pool) buffer: descriptor set 0, binding 1. */
	nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
	                                                          nir_intrinsic_vulkan_resource_index);
	src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	src_buf->num_components = 1;
	nir_intrinsic_set_desc_set(src_buf, 0);
	nir_intrinsic_set_binding(src_buf, 1);
	nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &src_buf->instr);

	/* global_id.x = wg_id * local_size + local_invocation_id. */
	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
	nir_ssa_def *wg_id = nir_load_work_group_id(&b);
	nir_ssa_def *block_size = nir_imm_ivec4(&b,
	                                        b.shader->info.cs.local_size[0],
	                                        b.shader->info.cs.local_size[1],
	                                        b.shader->info.cs.local_size[2], 0);
	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
	global_id = nir_channel(&b, global_id, 0); // We only care about x here.

	/* 16 bytes per render backend: a begin/end pair of 64-bit counters. */
	nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);


	nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
	nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
	nir_store_var(&b, available, nir_imm_true(&b), 0x1);

	/* Loop over all render backends, accumulating end - start per RB. */
	nir_loop *outer_loop = nir_loop_create(b.shader);
	nir_builder_cf_insert(&b, &outer_loop->cf_node);
	b.cursor = nir_after_cf_list(&outer_loop->body);

	nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
	radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));

	/* Only consider RBs that are actually enabled on this chip. */
	nir_ssa_def *enabled_cond =
		nir_iand(&b, nir_imm_int(&b, enabled_rb_mask),
			     nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count));

	nir_if *enabled_if = nir_if_create(b.shader);
	enabled_if->condition = nir_src_for_ssa(nir_i2b(&b, enabled_cond));
	nir_cf_node_insert(b.cursor, &enabled_if->cf_node);

	b.cursor = nir_after_cf_list(&enabled_if->then_list);

	/* Load this RB's (start, end) 64-bit counter pair in one vec2 load. */
	nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
	load_offset = nir_iadd(&b, input_base, load_offset);

	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
	load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
	load->src[1] = nir_src_for_ssa(load_offset);
	nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
	load->num_components = 2;
	nir_builder_instr_insert(&b, &load->instr);

	nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
	nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);

	/* Bit 63 set (i.e. value negative as signed) marks a written snapshot;
	 * both start and end must be written for this RB to count. */
	nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
	nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));

	nir_if *update_if = nir_if_create(b.shader);
	update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
	nir_cf_node_insert(b.cursor, &update_if->cf_node);

	b.cursor = nir_after_cf_list(&update_if->then_list);

	/* result += end - start. */
	nir_store_var(&b, result,
	              nir_iadd(&b, nir_load_var(&b, result),
	                           nir_isub(&b, nir_load_var(&b, end),
	                                        nir_load_var(&b, start))), 0x1);

	b.cursor = nir_after_cf_list(&update_if->else_list);

	/* Any unwritten RB makes the whole query unavailable. */
	nir_store_var(&b, available, nir_imm_false(&b), 0x1);

	b.cursor = nir_after_cf_node(&outer_loop->cf_node);

	/* Store the result if complete or if partial results have been requested. */

	nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
	nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));

	nir_if *store_if = nir_if_create(b.shader);
	store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
	nir_cf_node_insert(b.cursor, &store_if->cf_node);

	b.cursor = nir_after_cf_list(&store_if->then_list);

	/* 64-bit vs 32-bit store, selected by VK_QUERY_RESULT_64_BIT. */
	nir_if *store_64bit_if = nir_if_create(b.shader);
	store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
	nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);

	b.cursor = nir_after_cf_list(&store_64bit_if->then_list);

	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_list(&store_64bit_if->else_list);

	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_node(&store_if->cf_node);

	/* Store the availability bit if requested. */

	nir_if *availability_if = nir_if_create(b.shader);
	availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
	nir_cf_node_insert(b.cursor, &availability_if->cf_node);

	b.cursor = nir_after_cf_list(&availability_if->then_list);

	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	return b.shader;
}
279
280static nir_shader *
281build_pipeline_statistics_query_shader(struct radv_device *device) {
282	/* the shader this builds is roughly
283	 *
284	 * push constants {
285	 * 	uint32_t flags;
286	 * 	uint32_t dst_stride;
287	 * 	uint32_t stats_mask;
288	 * 	uint32_t avail_offset;
289	 * };
290	 *
291	 * uint32_t src_stride = pipelinestat_block_size * 2;
292	 *
293	 * location(binding = 0) buffer dst_buf;
294	 * location(binding = 1) buffer src_buf;
295	 *
296	 * void main() {
297	 * 	uint64_t src_offset = src_stride * global_id.x;
298	 * 	uint64_t dst_base = dst_stride * global_id.x;
299	 * 	uint64_t dst_offset = dst_base;
300	 * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
301	 * 	uint32_t elem_count = stats_mask >> 16;
302	 * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
303	 * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
304	 * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
305	 * 	}
306	 * 	if ((bool)available32) {
307	 * 		// repeat 11 times:
308	 * 		if (stats_mask & (1 << 0)) {
309	 * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
310	 * 			uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
311	 * 			uint64_t result = end - start;
312	 * 			if (flags & VK_QUERY_RESULT_64_BIT)
313	 * 				dst_buf[dst_offset] = result;
314	 * 			else
315	 * 				dst_buf[dst_offset] = (uint32_t)result.
316	 * 			dst_offset += elem_size;
317	 * 		}
318	 * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
319	 *              // Set everything to 0 as we don't know what is valid.
320	 * 		for (int i = 0; i < elem_count; ++i)
321	 * 			dst_buf[dst_base + elem_size * i] = 0;
322	 * 	}
323	 * }
324	 */
325	nir_builder b;
326	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
327	b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
328	b.shader->info.cs.local_size[0] = 64;
329	b.shader->info.cs.local_size[1] = 1;
330	b.shader->info.cs.local_size[2] = 1;
331
332	nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
333
334	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
335	nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
336	nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
337
338	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
339	                                                          nir_intrinsic_vulkan_resource_index);
340	dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
341	dst_buf->num_components = 1;;
342	nir_intrinsic_set_desc_set(dst_buf, 0);
343	nir_intrinsic_set_binding(dst_buf, 0);
344	nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
345	nir_builder_instr_insert(&b, &dst_buf->instr);
346
347	nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
348	                                                          nir_intrinsic_vulkan_resource_index);
349	src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
350	src_buf->num_components = 1;
351	nir_intrinsic_set_desc_set(src_buf, 0);
352	nir_intrinsic_set_binding(src_buf, 1);
353	nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
354	nir_builder_instr_insert(&b, &src_buf->instr);
355
356	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
357	nir_ssa_def *wg_id = nir_load_work_group_id(&b);
358	nir_ssa_def *block_size = nir_imm_ivec4(&b,
359	                                        b.shader->info.cs.local_size[0],
360	                                        b.shader->info.cs.local_size[1],
361	                                        b.shader->info.cs.local_size[2], 0);
362	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
363	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
364
365	nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
366	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
367	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
368	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
369
370
371	avail_offset = nir_iadd(&b, avail_offset,
372	                            nir_imul(&b, global_id, nir_imm_int(&b, 4)));
373
374	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
375	load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
376	load->src[1] = nir_src_for_ssa(avail_offset);
377	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
378	load->num_components = 1;
379	nir_builder_instr_insert(&b, &load->instr);
380	nir_ssa_def *available32 = &load->dest.ssa;
381
382	nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
383	nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
384	nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
385
386	/* Store the availability bit if requested. */
387
388	nir_if *availability_if = nir_if_create(b.shader);
389	availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
390	nir_cf_node_insert(b.cursor, &availability_if->cf_node);
391
392	b.cursor = nir_after_cf_list(&availability_if->then_list);
393
394	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
395	store->src[0] = nir_src_for_ssa(available32);
396	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
397	store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)));
398	nir_intrinsic_set_write_mask(store, 0x1);
399	store->num_components = 1;
400	nir_builder_instr_insert(&b, &store->instr);
401
402	b.cursor = nir_after_cf_node(&availability_if->cf_node);
403
404	nir_if *available_if = nir_if_create(b.shader);
405	available_if->condition = nir_src_for_ssa(nir_i2b(&b, available32));
406	nir_cf_node_insert(b.cursor, &available_if->cf_node);
407
408	b.cursor = nir_after_cf_list(&available_if->then_list);
409
410	nir_store_var(&b, output_offset, output_base, 0x1);
411	for (int i = 0; i < 11; ++i) {
412		nir_if *store_if = nir_if_create(b.shader);
413		store_if->condition = nir_src_for_ssa(nir_test_flag(&b, stats_mask, 1u << i));
414		nir_cf_node_insert(b.cursor, &store_if->cf_node);
415
416		b.cursor = nir_after_cf_list(&store_if->then_list);
417
418		load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
419		load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
420		load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
421		                                            nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
422		nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
423		load->num_components = 1;
424		nir_builder_instr_insert(&b, &load->instr);
425		nir_ssa_def *start = &load->dest.ssa;
426
427		load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
428		load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
429		load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
430		                                            nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
431		nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
432		load->num_components = 1;
433		nir_builder_instr_insert(&b, &load->instr);
434		nir_ssa_def *end = &load->dest.ssa;
435
436		nir_ssa_def *result = nir_isub(&b, end, start);
437
438		/* Store result */
439		nir_if *store_64bit_if = nir_if_create(b.shader);
440		store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
441		nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
442
443		b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
444
445		nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
446		store->src[0] = nir_src_for_ssa(result);
447		store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
448		store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
449		nir_intrinsic_set_write_mask(store, 0x1);
450		store->num_components = 1;
451		nir_builder_instr_insert(&b, &store->instr);
452
453		b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
454
455		store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
456		store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
457		store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
458		store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
459		nir_intrinsic_set_write_mask(store, 0x1);
460		store->num_components = 1;
461		nir_builder_instr_insert(&b, &store->instr);
462
463		b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
464
465		nir_store_var(&b, output_offset,
466		                  nir_iadd(&b, nir_load_var(&b, output_offset),
467		                               elem_size), 0x1);
468
469		b.cursor = nir_after_cf_node(&store_if->cf_node);
470	}
471
472	b.cursor = nir_after_cf_list(&available_if->else_list);
473
474	available_if = nir_if_create(b.shader);
475	available_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
476	nir_cf_node_insert(b.cursor, &available_if->cf_node);
477
478	b.cursor = nir_after_cf_list(&available_if->then_list);
479
480	/* Stores zeros in all outputs. */
481
482	nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
483	nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
484
485	nir_loop *loop = nir_loop_create(b.shader);
486	nir_builder_cf_insert(&b, &loop->cf_node);
487	b.cursor = nir_after_cf_list(&loop->body);
488
489	nir_ssa_def *current_counter = nir_load_var(&b, counter);
490	radv_break_on_count(&b, counter, elem_count);
491
492	nir_ssa_def *output_elem = nir_iadd(&b, output_base,
493	                                        nir_imul(&b, elem_size, current_counter));
494
495	nir_if *store_64bit_if = nir_if_create(b.shader);
496	store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
497	nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
498
499	b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
500
501	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
502	store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
503	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
504	store->src[2] = nir_src_for_ssa(output_elem);
505	nir_intrinsic_set_write_mask(store, 0x1);
506	store->num_components = 1;
507	nir_builder_instr_insert(&b, &store->instr);
508
509	b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
510
511	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
512	store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
513	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
514	store->src[2] = nir_src_for_ssa(output_elem);
515	nir_intrinsic_set_write_mask(store, 0x1);
516	store->num_components = 1;
517	nir_builder_instr_insert(&b, &store->instr);
518
519	b.cursor = nir_after_cf_node(&loop->cf_node);
520	return b.shader;
521}
522
/* Builds the compute shader that resolves transform-feedback stream queries
 * (primitives written / primitives needed) into the user buffer. */
static nir_shader *
build_tfb_query_shader(struct radv_device *device)
{
	/* the shader this builds is roughly
	 *
	 * uint32_t src_stride = 32;
	 *
	 * location(binding = 0) buffer dst_buf;
	 * location(binding = 1) buffer src_buf;
	 *
	 * void main() {
	 *	uint64_t result[2] = {};
	 *	bool available = false;
	 *	uint64_t src_offset = src_stride * global_id.x;
	 * 	uint64_t dst_offset = dst_stride * global_id.x;
	 * 	uint64_t *src_data = src_buf[src_offset];
	 *	uint32_t avail = (src_data[0] >> 32) &
	 *			 (src_data[1] >> 32) &
	 *			 (src_data[2] >> 32) &
	 *			 (src_data[3] >> 32);
	 *	if (avail & 0x80000000) {
	 *		result[0] = src_data[3] - src_data[1];
	 *		result[1] = src_data[2] - src_data[0];
	 *		available = true;
	 *	}
	 * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
	 *		if (flags & VK_QUERY_RESULT_64_BIT) {
	 *			dst_buf[dst_offset] = result;
	 *		} else {
	 *			dst_buf[dst_offset] = (uint32_t)result;
	 *		}
	 *	}
	 *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
	 *		dst_buf[dst_offset + result_size] = available;
	 * 	}
	 * }
	 */
	nir_builder b;
	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
	b.shader->info.name = ralloc_strdup(b.shader, "tfb_query");
	/* One invocation resolves one query; 64 queries per workgroup. */
	b.shader->info.cs.local_size[0] = 64;
	b.shader->info.cs.local_size[1] = 1;
	b.shader->info.cs.local_size[2] = 1;

	/* Create and initialize local variables. */
	nir_variable *result =
		nir_local_variable_create(b.impl,
					  glsl_vector_type(GLSL_TYPE_UINT64, 2),
					  "result");
	nir_variable *available =
		nir_local_variable_create(b.impl, glsl_bool_type(), "available");

	nir_store_var(&b, result,
		      nir_vec2(&b, nir_imm_int64(&b, 0),
				   nir_imm_int64(&b, 0)), 0x3);
	nir_store_var(&b, available, nir_imm_false(&b), 0x1);

	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");

	/* Load resources. */
	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
	                                                          nir_intrinsic_vulkan_resource_index);
	dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	dst_buf->num_components = 1;
	nir_intrinsic_set_desc_set(dst_buf, 0);
	nir_intrinsic_set_binding(dst_buf, 0);
	nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &dst_buf->instr);

	nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
	                                                          nir_intrinsic_vulkan_resource_index);
	src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
	src_buf->num_components = 1;
	nir_intrinsic_set_desc_set(src_buf, 0);
	nir_intrinsic_set_binding(src_buf, 1);
	nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
	nir_builder_instr_insert(&b, &src_buf->instr);

	/* Compute global ID. */
	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
	nir_ssa_def *wg_id = nir_load_work_group_id(&b);
	nir_ssa_def *block_size = nir_imm_ivec4(&b,
	                                        b.shader->info.cs.local_size[0],
	                                        b.shader->info.cs.local_size[1],
	                                        b.shader->info.cs.local_size[2], 0);
	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
	global_id = nir_channel(&b, global_id, 0); // We only care about x here.

	/* Compute src/dst strides. */
	nir_ssa_def *input_stride = nir_imm_int(&b, 32);
	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);

	/* Load data from the query pool: two vec4 loads cover the four 64-bit
	 * counters (begin/end pairs) as 32-bit lo/hi channel pairs. */
	nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
	load1->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
	load1->src[1] = nir_src_for_ssa(input_base);
	nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL);
	load1->num_components = 4;
	nir_builder_instr_insert(&b, &load1->instr);

	nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
	load2->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
	load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16)));
	nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL);
	load2->num_components = 4;
	nir_builder_instr_insert(&b, &load2->instr);

	/* Check if result is available: bit 31 must be set in the high dword
	 * (channels 1 and 3) of every counter. */
	nir_ssa_def *avails[2];
	avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1),
				 nir_channel(&b, &load1->dest.ssa, 3));
	avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1),
				 nir_channel(&b, &load2->dest.ssa, 3));
	nir_ssa_def *result_is_available =
		nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
			                 nir_imm_int(&b, 0x80000000)));

	/* Only compute result if available. */
	nir_if *available_if = nir_if_create(b.shader);
	available_if->condition = nir_src_for_ssa(result_is_available);
	nir_cf_node_insert(b.cursor, &available_if->cf_node);

	b.cursor = nir_after_cf_list(&available_if->then_list);

	/* Pack values: reassemble each 64-bit counter from its lo/hi dwords. */
	nir_ssa_def *packed64[4];
	packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b,
						    nir_channel(&b, &load1->dest.ssa, 0),
						    nir_channel(&b, &load1->dest.ssa, 1)));
	packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b,
						    nir_channel(&b, &load1->dest.ssa, 2),
						    nir_channel(&b, &load1->dest.ssa, 3)));
	packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b,
						    nir_channel(&b, &load2->dest.ssa, 0),
						    nir_channel(&b, &load2->dest.ssa, 1)));
	packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b,
						    nir_channel(&b, &load2->dest.ssa, 2),
						    nir_channel(&b, &load2->dest.ssa, 3)));

	/* Compute result: end - begin for each of the two TFB counters. */
	nir_ssa_def *num_primitive_written =
		nir_isub(&b, packed64[3], packed64[1]);
	nir_ssa_def *primitive_storage_needed =
		nir_isub(&b, packed64[2], packed64[0]);

	nir_store_var(&b, result,
		      nir_vec2(&b, num_primitive_written,
				   primitive_storage_needed), 0x3);
	nir_store_var(&b, available, nir_imm_true(&b), 0x1);

	b.cursor = nir_after_cf_node(&available_if->cf_node);

	/* Determine if result is 64 or 32 bit. */
	nir_ssa_def *result_is_64bit =
		nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
	nir_ssa_def *result_size =
		nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16),
			  nir_imm_int(&b, 8));

	/* Store the result if complete or partial results have been requested. */
	nir_if *store_if = nir_if_create(b.shader);
	store_if->condition =
		nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
					nir_load_var(&b, available)));
	nir_cf_node_insert(b.cursor, &store_if->cf_node);

	b.cursor = nir_after_cf_list(&store_if->then_list);

	/* Store result. */
	nir_if *store_64bit_if = nir_if_create(b.shader);
	store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
	nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);

	b.cursor = nir_after_cf_list(&store_64bit_if->then_list);

	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x3);
	store->num_components = 2;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_list(&store_64bit_if->else_list);

	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(output_base);
	nir_intrinsic_set_write_mask(store, 0x3);
	store->num_components = 2;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);

	b.cursor = nir_after_cf_node(&store_if->cf_node);

	/* Store the availability bit if requested. */
	nir_if *availability_if = nir_if_create(b.shader);
	availability_if->condition =
		nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
	nir_cf_node_insert(b.cursor, &availability_if->cf_node);

	b.cursor = nir_after_cf_list(&availability_if->then_list);

	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
	store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
	store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
	store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
	nir_intrinsic_set_write_mask(store, 0x1);
	store->num_components = 1;
	nir_builder_instr_insert(&b, &store->instr);

	b.cursor = nir_after_cf_node(&availability_if->cf_node);

	return b.shader;
}
743
/* Create the compute pipelines used to resolve query results on the GPU
 * (occlusion, pipeline statistics and transform feedback), together with the
 * descriptor set layout and pipeline layout shared by all three.
 *
 * Serialized by meta_state.mtx and idempotent: returns immediately if a
 * racing thread already built the pipelines. On failure, all partially
 * created state is torn down via radv_device_finish_meta_query_state().
 */
static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device)
{
	VkResult result;
	struct radv_shader_module occlusion_cs = { .nir = NULL };
	struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
	struct radv_shader_module tfb_cs = { .nir = NULL };

	mtx_lock(&device->meta_state.mtx);
	if (device->meta_state.query.pipeline_statistics_query_pipeline) {
		/* Another thread initialized the state first. */
		mtx_unlock(&device->meta_state.mtx);
		return VK_SUCCESS;
	}
	occlusion_cs.nir = build_occlusion_query_shader(device);
	pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
	tfb_cs.nir = build_tfb_query_shader(device);

	/* One descriptor layout for all resolve shaders:
	 * binding 0 = destination buffer, binding 1 = query pool buffer. */
	VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
		.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
		.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
		.bindingCount = 2,
		.pBindings = (VkDescriptorSetLayoutBinding[]) {
			{
				.binding = 0,
				.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
				.descriptorCount = 1,
				.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
				.pImmutableSamplers = NULL
			},
			{
				.binding = 1,
				.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
				.descriptorCount = 1,
				.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
				.pImmutableSamplers = NULL
			},
		}
	};

	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
						&occlusion_ds_create_info,
						&device->meta_state.alloc,
						&device->meta_state.query.ds_layout);
	if (result != VK_SUCCESS)
		goto fail;

	/* 16 bytes of push constants: flags, dst_stride, pipeline_stats_mask
	 * and avail_offset — must match the struct in radv_query_shader(). */
	VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
		.setLayoutCount = 1,
		.pSetLayouts = &device->meta_state.query.ds_layout,
		.pushConstantRangeCount = 1,
		.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
	};

	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
					  &occlusion_pl_create_info,
					  &device->meta_state.alloc,
					  &device->meta_state.query.p_layout);
	if (result != VK_SUCCESS)
		goto fail;

	VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
		.module = radv_shader_module_to_handle(&occlusion_cs),
		.pName = "main",
		.pSpecializationInfo = NULL,
	};

	VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
		.stage = occlusion_pipeline_shader_stage,
		.flags = 0,
		.layout = device->meta_state.query.p_layout,
	};

	result = radv_CreateComputePipelines(radv_device_to_handle(device),
					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
					     1, &occlusion_vk_pipeline_info, NULL,
					     &device->meta_state.query.occlusion_query_pipeline);
	if (result != VK_SUCCESS)
		goto fail;

	VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
		.module = radv_shader_module_to_handle(&pipeline_statistics_cs),
		.pName = "main",
		.pSpecializationInfo = NULL,
	};

	VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
		.stage = pipeline_statistics_pipeline_shader_stage,
		.flags = 0,
		.layout = device->meta_state.query.p_layout,
	};

	result = radv_CreateComputePipelines(radv_device_to_handle(device),
					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
					     1, &pipeline_statistics_vk_pipeline_info, NULL,
					     &device->meta_state.query.pipeline_statistics_query_pipeline);
	if (result != VK_SUCCESS)
		goto fail;

	VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
		.module = radv_shader_module_to_handle(&tfb_cs),
		.pName = "main",
		.pSpecializationInfo = NULL,
	};

	VkComputePipelineCreateInfo tfb_pipeline_info = {
		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
		.stage = tfb_pipeline_shader_stage,
		.flags = 0,
		.layout = device->meta_state.query.p_layout,
	};

	result = radv_CreateComputePipelines(radv_device_to_handle(device),
					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
					     1, &tfb_pipeline_info, NULL,
					     &device->meta_state.query.tfb_query_pipeline);
fail:
	/* Fallthrough: reached on success too. The NIR modules are no longer
	 * needed once the pipelines have been created (or creation failed). */
	if (result != VK_SUCCESS)
		radv_device_finish_meta_query_state(device);
	ralloc_free(occlusion_cs.nir);
	ralloc_free(pipeline_statistics_cs.nir);
	ralloc_free(tfb_cs.nir);
	mtx_unlock(&device->meta_state.mtx);
	return result;
}
876
877VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
878{
879	if (on_demand)
880		return VK_SUCCESS;
881
882	return radv_device_init_meta_query_state_internal(device);
883}
884
/* Destroy the query meta pipelines and their shared layouts. Safe to call on
 * partially initialized state (each object is checked before destruction),
 * which is how the failure path of the init function cleans up. */
void radv_device_finish_meta_query_state(struct radv_device *device)
{
	if (device->meta_state.query.tfb_query_pipeline)
		radv_DestroyPipeline(radv_device_to_handle(device),
				     device->meta_state.query.tfb_query_pipeline,
				     &device->meta_state.alloc);

	if (device->meta_state.query.pipeline_statistics_query_pipeline)
		radv_DestroyPipeline(radv_device_to_handle(device),
				     device->meta_state.query.pipeline_statistics_query_pipeline,
				     &device->meta_state.alloc);

	if (device->meta_state.query.occlusion_query_pipeline)
		radv_DestroyPipeline(radv_device_to_handle(device),
				     device->meta_state.query.occlusion_query_pipeline,
				     &device->meta_state.alloc);

	if (device->meta_state.query.p_layout)
		radv_DestroyPipelineLayout(radv_device_to_handle(device),
					   device->meta_state.query.p_layout,
					   &device->meta_state.alloc);

	if (device->meta_state.query.ds_layout)
		radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
						device->meta_state.query.ds_layout,
						&device->meta_state.alloc);
}
912
/* Resolve 'count' queries with a compute shader: read the raw per-query data
 * from src_bo at src_offset (src_stride bytes apart) and write results in the
 * caller-visible layout to dst_bo at dst_offset (dst_stride bytes apart).
 * 'pipeline_stats_mask' and 'avail_offset' are only meaningful for the
 * pipeline statistics resolve shader. Builds the meta pipelines on demand.
 */
static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
                              VkPipeline *pipeline,
                              struct radeon_winsys_bo *src_bo,
                              struct radeon_winsys_bo *dst_bo,
                              uint64_t src_offset, uint64_t dst_offset,
                              uint32_t src_stride, uint32_t dst_stride,
                              uint32_t count, uint32_t flags,
                              uint32_t pipeline_stats_mask, uint32_t avail_offset)
{
	struct radv_device *device = cmd_buffer->device;
	struct radv_meta_saved_state saved_state;
	bool old_predicating;

	/* On-demand path: the pipelines may not have been built yet. */
	if (!*pipeline) {
		VkResult ret = radv_device_init_meta_query_state_internal(device);
		if (ret != VK_SUCCESS) {
			cmd_buffer->record_result = ret;
			return;
		}
	}

	radv_meta_save(&saved_state, cmd_buffer,
		       RADV_META_SAVE_COMPUTE_PIPELINE |
		       RADV_META_SAVE_CONSTANTS |
		       RADV_META_SAVE_DESCRIPTORS);

	/* VK_EXT_conditional_rendering says that copy commands should not be
	 * affected by conditional rendering.
	 */
	old_predicating = cmd_buffer->state.predicating;
	cmd_buffer->state.predicating = false;

	struct radv_buffer dst_buffer = {
		.bo = dst_bo,
		.offset = dst_offset,
		.size = dst_stride * count
	};

	/* Size the source view so it also covers the availability dwords
	 * used by pipeline statistics queries. */
	struct radv_buffer src_buffer = {
		.bo = src_bo,
		.offset = src_offset,
		.size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
	};

	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
			     VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);

	/* Binding 0 = destination buffer, binding 1 = query pool buffer,
	 * matching the meta descriptor set layout. */
	radv_meta_push_descriptor_set(cmd_buffer,
				      VK_PIPELINE_BIND_POINT_COMPUTE,
				      device->meta_state.query.p_layout,
				      0, /* set */
				      2, /* descriptorWriteCount */
				      (VkWriteDescriptorSet[]) {
				              {
				                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
				                      .dstBinding = 0,
				                      .dstArrayElement = 0,
				                      .descriptorCount = 1,
				                      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
				                      .pBufferInfo = &(VkDescriptorBufferInfo) {
				                              .buffer = radv_buffer_to_handle(&dst_buffer),
				                              .offset = 0,
				                              .range = VK_WHOLE_SIZE
				                      }
				              },
				              {
				                      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
				                      .dstBinding = 1,
				                      .dstArrayElement = 0,
				                      .descriptorCount = 1,
				                      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
				                      .pBufferInfo = &(VkDescriptorBufferInfo) {
				                              .buffer = radv_buffer_to_handle(&src_buffer),
				                              .offset = 0,
				                              .range = VK_WHOLE_SIZE
				                      }
				              }
				      });

	/* Encode the number of elements for easy access by the shader. */
	pipeline_stats_mask &= 0x7ff;
	pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;

	/* The shader addresses the availability dwords relative to the start
	 * of the source buffer view. */
	avail_offset -= src_offset;

	/* Must match the 16-byte push constant range declared in the meta
	 * pipeline layout. */
	struct {
		uint32_t flags;
		uint32_t dst_stride;
		uint32_t pipeline_stats_mask;
		uint32_t avail_offset;
	} push_constants = {
		flags,
		dst_stride,
		pipeline_stats_mask,
		avail_offset
	};

	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
				      device->meta_state.query.p_layout,
				      VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
				      &push_constants);

	/* Invalidate caches so the resolve shader sees query data written by
	 * the GPU. */
	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
	                                RADV_CMD_FLAG_INV_VMEM_L1;

	/* NOTE(review): when the shader may busy-wait on results, the DB/CB
	 * caches are flushed first — presumably so in-flight occlusion
	 * results reach memory instead of deadlocking the wait. */
	if (flags & VK_QUERY_RESULT_WAIT_BIT)
		cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;

	/* One shader invocation per query. */
	radv_unaligned_dispatch(cmd_buffer, count, 1, 1);

	/* Restore conditional rendering. */
	cmd_buffer->state.predicating = old_predicating;

	radv_meta_restore(&saved_state, cmd_buffer);
}
1028
/* vkCreateQueryPool: allocate a GPU buffer holding 'stride' bytes of raw
 * result data per query, plus (for pipeline statistics) one availability
 * dword per query, and keep it persistently mapped for CPU readback. */
VkResult radv_CreateQueryPool(
	VkDevice                                    _device,
	const VkQueryPoolCreateInfo*                pCreateInfo,
	const VkAllocationCallbacks*                pAllocator,
	VkQueryPool*                                pQueryPool)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
					       sizeof(*pool), 8,
					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	/* Timestamps are initialized to a sentinel so readback can tell
	 * "not written yet" apart from a real value. */
	uint32_t initial_value = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP
				 ? TIMESTAMP_NOT_READY : 0;

	if (!pool)
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);


	switch(pCreateInfo->queryType) {
	case VK_QUERY_TYPE_OCCLUSION:
		/* One begin/end u64 pair per render backend. */
		pool->stride = 16 * device->physical_device->rad_info.num_render_backends;
		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		/* Begin and end snapshots of all 11 counters. */
		pool->stride = pipelinestat_block_size * 2;
		break;
	case VK_QUERY_TYPE_TIMESTAMP:
		pool->stride = 8;
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		/* Begin/end pairs of {NumPrimitivesWritten, PrimitiveStorageNeeded}. */
		pool->stride = 32;
		break;
	default:
		unreachable("creating unhandled query type");
	}

	pool->type = pCreateInfo->queryType;
	pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
	pool->availability_offset = pool->stride * pCreateInfo->queryCount;
	pool->size = pool->availability_offset;
	if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
		pool->size += 4 * pCreateInfo->queryCount;

	pool->bo = device->ws->buffer_create(device->ws, pool->size,
					     64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING,
					     RADV_BO_PRIORITY_QUERY_POOL);

	if (!pool->bo) {
		vk_free2(&device->alloc, pAllocator, pool);
		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
	}

	pool->ptr = device->ws->buffer_map(pool->bo);

	if (!pool->ptr) {
		device->ws->buffer_destroy(pool->bo);
		vk_free2(&device->alloc, pAllocator, pool);
		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
	}
	/* Byte-wise fill is correct for the 64-bit sentinel because
	 * TIMESTAMP_NOT_READY (UINT64_MAX) is all 0xff bytes. */
	memset(pool->ptr, initial_value, pool->size);

	*pQueryPool = radv_query_pool_to_handle(pool);
	return VK_SUCCESS;
}
1091
1092void radv_DestroyQueryPool(
1093	VkDevice                                    _device,
1094	VkQueryPool                                 _pool,
1095	const VkAllocationCallbacks*                pAllocator)
1096{
1097	RADV_FROM_HANDLE(radv_device, device, _device);
1098	RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1099
1100	if (!pool)
1101		return;
1102
1103	device->ws->buffer_destroy(pool->bo);
1104	vk_free2(&device->alloc, pAllocator, pool);
1105}
1106
1107VkResult radv_GetQueryPoolResults(
1108	VkDevice                                    _device,
1109	VkQueryPool                                 queryPool,
1110	uint32_t                                    firstQuery,
1111	uint32_t                                    queryCount,
1112	size_t                                      dataSize,
1113	void*                                       pData,
1114	VkDeviceSize                                stride,
1115	VkQueryResultFlags                          flags)
1116{
1117	RADV_FROM_HANDLE(radv_device, device, _device);
1118	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1119	char *data = pData;
1120	VkResult result = VK_SUCCESS;
1121
1122	for(unsigned i = 0; i < queryCount; ++i, data += stride) {
1123		char *dest = data;
1124		unsigned query = firstQuery + i;
1125		char *src = pool->ptr + query * pool->stride;
1126		uint32_t available;
1127
1128		if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1129			if (flags & VK_QUERY_RESULT_WAIT_BIT)
1130				while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
1131					;
1132			available = *(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
1133		}
1134
1135		switch (pool->type) {
1136		case VK_QUERY_TYPE_TIMESTAMP: {
1137			volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1138			available = *src64 != TIMESTAMP_NOT_READY;
1139
1140			if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1141				while (*src64 == TIMESTAMP_NOT_READY)
1142					;
1143				available = true;
1144			}
1145
1146			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1147				result = VK_NOT_READY;
1148
1149			if (flags & VK_QUERY_RESULT_64_BIT) {
1150				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1151					*(uint64_t*)dest = *src64;
1152				dest += 8;
1153			} else {
1154				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1155					*(uint32_t*)dest = *(volatile uint32_t*)src;
1156				dest += 4;
1157			}
1158			break;
1159		}
1160		case VK_QUERY_TYPE_OCCLUSION: {
1161			volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1162			uint32_t db_count = device->physical_device->rad_info.num_render_backends;
1163			uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1164			uint64_t sample_count = 0;
1165			available = 1;
1166
1167			for (int i = 0; i < db_count; ++i) {
1168				uint64_t start, end;
1169
1170				if (!(enabled_rb_mask & (1 << i)))
1171					continue;
1172
1173				do {
1174					start = src64[2 * i];
1175					end = src64[2 * i + 1];
1176				} while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
1177
1178				if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1179					available = 0;
1180				else {
1181					sample_count += end - start;
1182				}
1183			}
1184
1185			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1186				result = VK_NOT_READY;
1187
1188			if (flags & VK_QUERY_RESULT_64_BIT) {
1189				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1190					*(uint64_t*)dest = sample_count;
1191				dest += 8;
1192			} else {
1193				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1194					*(uint32_t*)dest = sample_count;
1195				dest += 4;
1196			}
1197			break;
1198		}
1199		case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1200			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1201				result = VK_NOT_READY;
1202
1203			const volatile uint64_t *start = (uint64_t*)src;
1204			const volatile uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
1205			if (flags & VK_QUERY_RESULT_64_BIT) {
1206				uint64_t *dst = (uint64_t*)dest;
1207				dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1208				for(int i = 0; i < 11; ++i) {
1209					if(pool->pipeline_stats_mask & (1u << i)) {
1210						if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1211							*dst = stop[pipeline_statistics_indices[i]] -
1212							       start[pipeline_statistics_indices[i]];
1213						dst++;
1214					}
1215				}
1216
1217			} else {
1218				uint32_t *dst = (uint32_t*)dest;
1219				dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1220				for(int i = 0; i < 11; ++i) {
1221					if(pool->pipeline_stats_mask & (1u << i)) {
1222						if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1223							*dst = stop[pipeline_statistics_indices[i]] -
1224							       start[pipeline_statistics_indices[i]];
1225						dst++;
1226					}
1227				}
1228			}
1229			break;
1230		}
1231		case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1232			volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1233			uint64_t num_primitives_written;
1234			uint64_t primitive_storage_needed;
1235
1236			/* SAMPLE_STREAMOUTSTATS stores this structure:
1237			 * {
1238			 *	u64 NumPrimitivesWritten;
1239			 *	u64 PrimitiveStorageNeeded;
1240			 * }
1241			 */
1242			available = 1;
1243			for (int j = 0; j < 4; j++) {
1244				if (!(src64[j] & 0x8000000000000000UL))
1245					available = 0;
1246			}
1247
1248			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1249				result = VK_NOT_READY;
1250
1251			num_primitives_written = src64[3] - src64[1];
1252			primitive_storage_needed = src64[2] - src64[0];
1253
1254			if (flags & VK_QUERY_RESULT_64_BIT) {
1255				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1256					*(uint64_t *)dest = num_primitives_written;
1257				dest += 8;
1258				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1259					*(uint64_t *)dest = primitive_storage_needed;
1260				dest += 8;
1261			} else {
1262				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1263					*(uint32_t *)dest = num_primitives_written;
1264				dest += 4;
1265				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1266					*(uint32_t *)dest = primitive_storage_needed;
1267				dest += 4;
1268			}
1269			break;
1270		}
1271		default:
1272			unreachable("trying to get results of unhandled query type");
1273		}
1274
1275		if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1276			if (flags & VK_QUERY_RESULT_64_BIT) {
1277				*(uint64_t*)dest = available;
1278			} else {
1279				*(uint32_t*)dest = available;
1280			}
1281		}
1282	}
1283
1284	return result;
1285}
1286
/* vkCmdCopyQueryPoolResults: copy query results to a buffer on the GPU.
 * Occlusion, pipeline-statistics and transform-feedback queries are resolved
 * with a compute shader (optionally preceded by CP waits for WAIT_BIT);
 * timestamps are copied directly with CP COPY_DATA packets. */
void radv_CmdCopyQueryPoolResults(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    dstBuffer,
    VkDeviceSize                                dstOffset,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
	uint64_t va = radv_buffer_get_va(pool->bo);
	uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
	dest_va += dst_buffer->offset + dstOffset;

	/* Keep both BOs resident for this command stream. */
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);

	switch (pool->type) {
	case VK_QUERY_TYPE_OCCLUSION:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
				unsigned query = firstQuery + i;
				uint64_t src_va = va + query * pool->stride + pool->stride - 4;

				radeon_check_space(cmd_buffer->device->ws, cs, 7);

				/* Waits on the upper word of the last DB entry */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
						 src_va, 0x80000000, 0xffffffff);
			}
		}
		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
		                  pool->bo, dst_buffer->bo, firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride,
		                  queryCount, flags, 0, 0);
		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
				unsigned query = firstQuery + i;

				radeon_check_space(cmd_buffer->device->ws, cs, 7);

				uint64_t avail_va = va + pool->availability_offset + 4 * query;

				/* This waits on the ME. All copies below are done on the ME */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL,
						 avail_va, 1, 0xffffffff);
			}
		}
		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
		                  pool->bo, dst_buffer->bo, firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride, queryCount, flags,
		                  pool->pipeline_stats_mask,
		                  pool->availability_offset + 4 * firstQuery);
		break;
	case VK_QUERY_TYPE_TIMESTAMP:
		/* Timestamps need no shader: copy each 64-bit (or truncated
		 * 32-bit) value straight with CP COPY_DATA. */
		for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
			unsigned query = firstQuery + i;
			uint64_t local_src_va = va  + query * pool->stride;

			MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 19);


			if (flags & VK_QUERY_RESULT_WAIT_BIT) {
				/* Wait on the high 32 bits of the timestamp in
				 * case the low part is 0xffffffff.
				 */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL,
						 local_src_va + 4,
						 TIMESTAMP_NOT_READY >> 32,
						 0xffffffff);
			}
			if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
				uint64_t avail_dest_va = dest_va + elem_size;

				/* NOTE(review): this copies the timestamp's
				 * low dword as the availability value; a
				 * ready timestamp whose low dword happens to
				 * be 0 would read back as unavailable —
				 * verify against the spec's requirements. */
				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
						COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM));
				radeon_emit(cs, local_src_va);
				radeon_emit(cs, local_src_va >> 32);
				radeon_emit(cs, avail_dest_va);
				radeon_emit(cs, avail_dest_va >> 32);
			}

			/* COUNT_SEL selects a 64-bit copy when requested. */
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
					((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0));
			radeon_emit(cs, local_src_va);
			radeon_emit(cs, local_src_va >> 32);
			radeon_emit(cs, dest_va);
			radeon_emit(cs, dest_va >> 32);


			assert(cs->cdw <= cdw_max);
		}
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; i++) {
				unsigned query = firstQuery + i;
				uint64_t src_va = va + query * pool->stride;

				radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);

				/* Wait on the upper word of all results. */
				for (unsigned j = 0; j < 4; j++, src_va += 8) {
					radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
							 src_va + 4, 0x80000000,
							 0xffffffff);
				}
			}
		}

		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
		                  pool->bo, dst_buffer->bo,
				  firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride,
				  queryCount, flags, 0, 0);
		break;
	default:
		unreachable("trying to get results of unhandled query type");
	}

}
1421
/* vkCmdResetQueryPool: reset a range of queries on the GPU. Timestamp slots
 * are refilled with the not-ready sentinel, everything else with zero; the
 * pipeline statistics availability dwords are cleared as well. */
void radv_CmdResetQueryPool(
	VkCommandBuffer                             commandBuffer,
	VkQueryPool                                 queryPool,
	uint32_t                                    firstQuery,
	uint32_t                                    queryCount)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	/* TIMESTAMP_NOT_READY deliberately truncates to 0xffffffff here: the
	 * fill works in 32-bit words and all-ones words recompose the 64-bit
	 * sentinel. */
	uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
			 ? TIMESTAMP_NOT_READY : 0;
	uint32_t flush_bits = 0;

	flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
				       firstQuery * pool->stride,
				       queryCount * pool->stride, value);

	if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
		flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
					       pool->availability_offset + firstQuery * 4,
					       queryCount * 4, 0);
	}

	if (flush_bits) {
		/* Only need to flush caches for the compute shader path. */
		cmd_buffer->pending_reset_query = true;
		cmd_buffer->state.flush_bits |= flush_bits;
	}
}
1450
1451void radv_ResetQueryPoolEXT(
1452	VkDevice                                   _device,
1453	VkQueryPool                                 queryPool,
1454	uint32_t                                    firstQuery,
1455	uint32_t                                    queryCount)
1456{
1457	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1458
1459	uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1460			 ? TIMESTAMP_NOT_READY : 0;
1461	uint32_t *data =  (uint32_t*)(pool->ptr + firstQuery * pool->stride);
1462	uint32_t *data_end = (uint32_t*)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1463
1464	for(uint32_t *p = data; p != data_end; ++p)
1465		*p = value;
1466
1467	if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1468		memset(pool->ptr + pool->availability_offset + firstQuery * 4,
1469		       0, queryCount * 4);
1470	}
1471}
1472
1473static unsigned event_type_for_stream(unsigned stream)
1474{
1475	switch (stream) {
1476	default:
1477	case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
1478	case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
1479	case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
1480	case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
1481	}
1482}
1483
1484static void emit_query_flush(struct radv_cmd_buffer *cmd_buffer,
1485			     struct radv_query_pool *pool)
1486{
1487	if (cmd_buffer->pending_reset_query) {
1488		if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1489			/* Only need to flush caches if the query pool size is
1490			 * large enough to be resetted using the compute shader
1491			 * path. Small pools don't need any cache flushes
1492			 * because we use a CP dma clear.
1493			 */
1494			si_emit_cache_flush(cmd_buffer);
1495		}
1496	}
1497}
1498
/* Emit the packets that begin a query: an event that makes the GPU write the
 * "begin" counter snapshot at 'va', plus any required per-type state changes.
 * 'index' is the stream for transform feedback queries, ignored otherwise. */
static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
			     uint64_t va,
			     VkQueryType query_type,
			     VkQueryControlFlags flags,
			     uint32_t index)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	switch (query_type) {
	case VK_QUERY_TYPE_OCCLUSION:
		radeon_check_space(cmd_buffer->device->ws, cs, 7);

		++cmd_buffer->state.active_occlusion_queries;
		if (cmd_buffer->state.active_occlusion_queries == 1) {
			if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
				/* This is the first occlusion query, enable
				 * the hint if the precision bit is set.
				 */
				cmd_buffer->state.perfect_occlusion_queries_enabled = true;
			}

			radv_set_db_count_control(cmd_buffer);
		} else {
			if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
			    !cmd_buffer->state.perfect_occlusion_queries_enabled) {
				/* This is not the first query, but this one
				 * needs to enable precision, DB_COUNT_CONTROL
				 * has to be updated accordingly.
				 */
				cmd_buffer->state.perfect_occlusion_queries_enabled = true;

				radv_set_db_count_control(cmd_buffer);
			}
		}

		/* ZPASS_DONE writes the begin snapshots of the per-render-
		 * backend begin/end pairs starting at 'va'. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		/* Keep pipeline statistics counting enabled for as long as
		 * any query of this type is active. */
		++cmd_buffer->state.active_pipeline_queries;
		if (cmd_buffer->state.active_pipeline_queries == 1) {
			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
		}

		/* Write the "begin" statistics block at 'va'. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		assert(index < MAX_SO_STREAMS);

		/* Write the "begin" streamout stats for this stream at 'va'. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		break;
	default:
		unreachable("beginning unhandled query type");
	}

}
1567
/* Emit the packets that end a query: an event that makes the GPU write the
 * "end" counter snapshot (offset from 'va' per type), plus — for pipeline
 * statistics — an end-of-pipe write of 1 to the availability dword at
 * 'avail_va'. */
static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
			   uint64_t va, uint64_t avail_va,
			   VkQueryType query_type, uint32_t index)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	switch (query_type) {
	case VK_QUERY_TYPE_OCCLUSION:
		radeon_check_space(cmd_buffer->device->ws, cs, 14);

		cmd_buffer->state.active_occlusion_queries--;
		if (cmd_buffer->state.active_occlusion_queries == 0) {
			radv_set_db_count_control(cmd_buffer);

			/* Reset the perfect occlusion queries hint now that no
			 * queries are active.
			 */
			cmd_buffer->state.perfect_occlusion_queries_enabled = false;
		}

		/* va + 8: the "end" slot of each per-render-backend
		 * begin/end pair. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va + 8);
		radeon_emit(cs, (va + 8) >> 32);

		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		radeon_check_space(cmd_buffer->device->ws, cs, 16);

		cmd_buffer->state.active_pipeline_queries--;
		if (cmd_buffer->state.active_pipeline_queries == 0) {
			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
		}
		/* The "end" statistics block follows the "begin" block. */
		va += pipelinestat_block_size;

		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		/* Mark the query available once all prior work has drained. */
		si_cs_emit_write_event_eop(cs,
					   cmd_buffer->device->physical_device->rad_info.chip_class,
					   radv_cmd_buffer_uses_mec(cmd_buffer),
					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
					   EOP_DATA_SEL_VALUE_32BIT,
					   avail_va, 1,
					   cmd_buffer->gfx9_eop_bug_va);
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		assert(index < MAX_SO_STREAMS);

		/* va + 16: the "end" half of the 32-byte streamout slot. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
		radeon_emit(cs, (va + 16));
		radeon_emit(cs, (va + 16) >> 32);
		break;
	default:
		unreachable("ending unhandled query type");
	}
}
1630
1631void radv_CmdBeginQueryIndexedEXT(
1632    VkCommandBuffer                             commandBuffer,
1633    VkQueryPool                                 queryPool,
1634    uint32_t                                    query,
1635    VkQueryControlFlags                         flags,
1636    uint32_t                                    index)
1637{
1638	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1639	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1640	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1641	uint64_t va = radv_buffer_get_va(pool->bo);
1642
1643	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1644
1645	emit_query_flush(cmd_buffer, pool);
1646
1647	va += pool->stride * query;
1648
1649	emit_begin_query(cmd_buffer, va, pool->type, flags, index);
1650}
1651
/* vkCmdBeginQuery: core-Vulkan entry point; forwards to the
 * VK_EXT_transform_feedback indexed variant with stream index 0. */
void radv_CmdBeginQuery(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
	radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
}
1660
1661void radv_CmdEndQueryIndexedEXT(
1662    VkCommandBuffer                             commandBuffer,
1663    VkQueryPool                                 queryPool,
1664    uint32_t                                    query,
1665    uint32_t                                    index)
1666{
1667	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1668	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1669	uint64_t va = radv_buffer_get_va(pool->bo);
1670	uint64_t avail_va = va + pool->availability_offset + 4 * query;
1671	va += pool->stride * query;
1672
1673	/* Do not need to add the pool BO to the list because the query must
1674	 * currently be active, which means the BO is already in the list.
1675	 */
1676	emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
1677
1678	/*
1679	 * For multiview we have to emit a query for each bit in the mask,
1680	 * however the first query we emit will get the totals for all the
1681	 * operations, so we don't want to get a real value in the other
1682	 * queries. This emits a fake begin/end sequence so the waiting
1683	 * code gets a completed query value and doesn't hang, but the
1684	 * query returns 0.
1685	 */
1686	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1687		uint64_t avail_va = va + pool->availability_offset + 4 * query;
1688
1689
1690		for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
1691			va += pool->stride;
1692			avail_va += 4;
1693			emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
1694			emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
1695		}
1696	}
1697}
1698
/* vkCmdEndQuery: core-Vulkan entry point; forwards to the
 * VK_EXT_transform_feedback indexed variant with stream index 0. */
void radv_CmdEndQuery(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
	radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
}
1706
1707void radv_CmdWriteTimestamp(
1708    VkCommandBuffer                             commandBuffer,
1709    VkPipelineStageFlagBits                     pipelineStage,
1710    VkQueryPool                                 queryPool,
1711    uint32_t                                    query)
1712{
1713	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1714	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1715	bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
1716	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1717	uint64_t va = radv_buffer_get_va(pool->bo);
1718	uint64_t query_va = va + pool->stride * query;
1719
1720	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1721
1722	emit_query_flush(cmd_buffer, pool);
1723
1724	int num_queries = 1;
1725	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
1726		num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
1727
1728	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
1729
1730	for (unsigned i = 0; i < num_queries; i++) {
1731		switch(pipelineStage) {
1732		case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1733			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1734			radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
1735				    COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
1736				    COPY_DATA_DST_SEL(V_370_MEM));
1737			radeon_emit(cs, 0);
1738			radeon_emit(cs, 0);
1739			radeon_emit(cs, query_va);
1740			radeon_emit(cs, query_va >> 32);
1741			break;
1742		default:
1743			si_cs_emit_write_event_eop(cs,
1744						   cmd_buffer->device->physical_device->rad_info.chip_class,
1745						   mec,
1746						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
1747						   EOP_DATA_SEL_TIMESTAMP,
1748						   query_va, 0,
1749						   cmd_buffer->gfx9_eop_bug_va);
1750			break;
1751		}
1752		query_va += pool->stride;
1753	}
1754	assert(cmd_buffer->cs->cdw <= cdw_max);
1755}
1756