1/*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28#include "radv_private.h"
29#include "radv_radeon_winsys.h"
30#include "radv_shader.h"
31#include "radv_cs.h"
32#include "sid.h"
33#include "gfx9d.h"
34#include "vk_format.h"
35#include "radv_debug.h"
36#include "radv_meta.h"
37
38#include "ac_debug.h"
39
/* Bitmask of objects that can be prefetched into L2 ahead of a draw.
 * RADV_PREFETCH_SHADERS covers every shader stage; the VBO descriptors
 * are tracked separately so the fast pre-draw path can prefetch only
 * the vertex stage and its buffers (see radv_emit_prefetch_L2).
 */
enum {
	RADV_PREFETCH_VBO_DESCRIPTORS	= (1 << 0),
	RADV_PREFETCH_VS		= (1 << 1),
	RADV_PREFETCH_TCS		= (1 << 2),
	RADV_PREFETCH_TES		= (1 << 3),
	RADV_PREFETCH_GS		= (1 << 4),
	RADV_PREFETCH_PS		= (1 << 5),
	RADV_PREFETCH_SHADERS		= (RADV_PREFETCH_VS  |
					   RADV_PREFETCH_TCS |
					   RADV_PREFETCH_TES |
					   RADV_PREFETCH_GS  |
					   RADV_PREFETCH_PS)
};
53
54static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
55					 struct radv_image *image,
56					 VkImageLayout src_layout,
57					 VkImageLayout dst_layout,
58					 uint32_t src_family,
59					 uint32_t dst_family,
60					 const VkImageSubresourceRange *range);
61
/* Default values for every piece of dynamic state; used to initialize a
 * command buffer's dynamic state before any pipeline or vkCmdSet* call
 * overrides it. Counts of 0 mean "no viewports/scissors specified yet".
 */
const struct radv_dynamic_state default_dynamic_state = {
	.viewport = {
		.count = 0,
	},
	.scissor = {
		.count = 0,
	},
	.line_width = 1.0f,
	.depth_bias = {
		.bias = 0.0f,
		.clamp = 0.0f,
		.slope = 0.0f,
	},
	.blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
	.depth_bounds = {
		.min = 0.0f,
		.max = 1.0f,
	},
	.stencil_compare_mask = {
		.front = ~0u,
		.back = ~0u,
	},
	.stencil_write_mask = {
		.front = ~0u,
		.back = ~0u,
	},
	.stencil_reference = {
		.front = 0u,
		.back = 0u,
	},
};
93
/* Copy the pipeline-provided dynamic state 'src' into the command buffer's
 * current dynamic state, considering only the fields selected by src->mask.
 * Each field is compared first and only marked dirty (for later re-emission)
 * when its value actually changed, which avoids redundant register writes.
 */
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
			const struct radv_dynamic_state *src)
{
	struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
	uint32_t copy_mask = src->mask;
	uint32_t dest_mask = 0;

	/* Make sure to copy the number of viewports/scissors because they can
	 * only be specified at pipeline creation time.
	 */
	dest->viewport.count = src->viewport.count;
	dest->scissor.count = src->scissor.count;
	dest->discard_rectangle.count = src->discard_rectangle.count;

	if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
		if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
			   src->viewport.count * sizeof(VkViewport))) {
			typed_memcpy(dest->viewport.viewports,
				     src->viewport.viewports,
				     src->viewport.count);
			dest_mask |= RADV_DYNAMIC_VIEWPORT;
		}
	}

	if (copy_mask & RADV_DYNAMIC_SCISSOR) {
		if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
			   src->scissor.count * sizeof(VkRect2D))) {
			typed_memcpy(dest->scissor.scissors,
				     src->scissor.scissors, src->scissor.count);
			dest_mask |= RADV_DYNAMIC_SCISSOR;
		}
	}

	if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
		if (dest->line_width != src->line_width) {
			dest->line_width = src->line_width;
			dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
		if (memcmp(&dest->depth_bias, &src->depth_bias,
			   sizeof(src->depth_bias))) {
			dest->depth_bias = src->depth_bias;
			dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
		if (memcmp(&dest->blend_constants, &src->blend_constants,
			   sizeof(src->blend_constants))) {
			typed_memcpy(dest->blend_constants,
				     src->blend_constants, 4);
			dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
		if (memcmp(&dest->depth_bounds, &src->depth_bounds,
			   sizeof(src->depth_bounds))) {
			dest->depth_bounds = src->depth_bounds;
			dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
		if (memcmp(&dest->stencil_compare_mask,
			   &src->stencil_compare_mask,
			   sizeof(src->stencil_compare_mask))) {
			dest->stencil_compare_mask = src->stencil_compare_mask;
			dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
		if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
			   sizeof(src->stencil_write_mask))) {
			dest->stencil_write_mask = src->stencil_write_mask;
			dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
		}
	}

	if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
		if (memcmp(&dest->stencil_reference, &src->stencil_reference,
			   sizeof(src->stencil_reference))) {
			dest->stencil_reference = src->stencil_reference;
			dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
		}
	}

	if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
		if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
			   src->discard_rectangle.count * sizeof(VkRect2D))) {
			typed_memcpy(dest->discard_rectangle.rectangles,
				     src->discard_rectangle.rectangles,
				     src->discard_rectangle.count);
			dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
		}
	}

	/* Only the fields that really changed need to be re-emitted. */
	cmd_buffer->state.dirty |= dest_mask;
}
197
198static void
199radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
200			  struct radv_pipeline *pipeline)
201{
202	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
203	struct radv_shader_info *info;
204
205	if (!pipeline->streamout_shader)
206		return;
207
208	info = &pipeline->streamout_shader->info.info;
209	for (int i = 0; i < MAX_SO_BUFFERS; i++)
210		so->stride_in_dw[i] = info->so.strides[i];
211
212	so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
213}
214
215bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
216{
217	return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
218	       cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
219}
220
221enum ring_type radv_queue_family_to_ring(int f) {
222	switch (f) {
223	case RADV_QUEUE_GENERAL:
224		return RING_GFX;
225	case RADV_QUEUE_COMPUTE:
226		return RING_COMPUTE;
227	case RADV_QUEUE_TRANSFER:
228		return RING_DMA;
229	default:
230		unreachable("Unknown queue family");
231	}
232}
233
/* Allocate and initialize a radv_cmd_buffer from 'pool', create its winsys
 * command stream for the matching ring, and return its Vulkan handle in
 * *pCommandBuffer. Returns VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static VkResult radv_create_cmd_buffer(
	struct radv_device *                         device,
	struct radv_cmd_pool *                       pool,
	VkCommandBufferLevel                        level,
	VkCommandBuffer*                            pCommandBuffer)
{
	struct radv_cmd_buffer *cmd_buffer;
	unsigned ring;
	/* NOTE(review): this deref assumes pool != NULL, yet the branch below
	 * guards against a NULL pool — the two are inconsistent; confirm
	 * whether a NULL pool is actually possible here.
	 */
	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if (cmd_buffer == NULL)
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

	cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
	cmd_buffer->device = device;
	cmd_buffer->pool = pool;
	cmd_buffer->level = level;

	if (pool) {
		list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
		cmd_buffer->queue_family_index = pool->queue_family_index;

	} else {
		/* Init the pool_link so we can safely call list_del when we destroy
		 * the command buffer
		 */
		list_inithead(&cmd_buffer->pool_link);
		cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
	}

	ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
	if (!cmd_buffer->cs) {
		vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
	}

	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

	/* List of upload BOs retired by radv_cmd_buffer_resize_upload_buf. */
	list_inithead(&cmd_buffer->upload.list);

	return VK_SUCCESS;
}
278
/* Free everything owned by a command buffer: retired upload BOs, the
 * current upload BO, the winsys command stream, any CPU-side push
 * descriptor storage, and finally the command buffer itself.
 */
static void
radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
{
	list_del(&cmd_buffer->pool_link);

	/* Destroy every upload BO retired during recording. */
	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
				 &cmd_buffer->upload.list, list) {
		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
		list_del(&up->list);
		free(up);
	}

	if (cmd_buffer->upload.upload_bo)
		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
		free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);

	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}
300
/* Reset a command buffer back to its initial state so it can be re-recorded:
 * resets the winsys CS, releases retired upload BOs (keeping the current
 * one), clears all cached state, and on GFX9 re-allocates the cache-flush
 * fence and EOP-bug scratch buffers from the upload BO.
 */
static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
	cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

	/* Old upload BOs are no longer referenced once the CS is reset. */
	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
				 &cmd_buffer->upload.list, list) {
		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
		list_del(&up->list);
		free(up);
	}

	cmd_buffer->push_constant_stages = 0;
	cmd_buffer->scratch_size_needed = 0;
	cmd_buffer->compute_scratch_size_needed = 0;
	cmd_buffer->esgs_ring_size_needed = 0;
	cmd_buffer->gsvs_ring_size_needed = 0;
	cmd_buffer->tess_rings_needed = false;
	cmd_buffer->sample_positions_needed = false;

	/* The current upload BO is reused; re-register it with the new CS. */
	if (cmd_buffer->upload.upload_bo)
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   cmd_buffer->upload.upload_bo);
	cmd_buffer->upload.offset = 0;

	cmd_buffer->record_result = VK_SUCCESS;

	memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
		cmd_buffer->descriptors[i].dirty = 0;
		cmd_buffer->descriptors[i].valid = 0;
		cmd_buffer->descriptors[i].push_dirty = false;
	}

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
	    cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
		unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
		unsigned fence_offset, eop_bug_offset;
		void *fence_ptr;

		/* 8-byte fence used by si_cs_emit_cache_flush on GFX9. */
		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
					     &fence_ptr);

		cmd_buffer->gfx9_fence_va =
			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		cmd_buffer->gfx9_fence_va += fence_offset;

		/* Allocate a buffer for the EOP bug on GFX9. */
		radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
					     &eop_bug_offset, &fence_ptr);
		cmd_buffer->gfx9_eop_bug_va =
			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
	}

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

	/* record_result may have been set by a failed upload alloc above. */
	return cmd_buffer->record_result;
}
361
/* Replace the upload BO with a larger one that can hold at least
 * 'min_needed' bytes. The old BO (if any) is moved onto the retired list so
 * it stays alive until the command buffer is reset or destroyed. Returns
 * false (and records the error on the command buffer) on failure.
 */
static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
				  uint64_t min_needed)
{
	uint64_t new_size;
	struct radeon_winsys_bo *bo;
	struct radv_cmd_buffer_upload *upload;
	struct radv_device *device = cmd_buffer->device;

	/* Grow geometrically (at least doubling) with a 16 KiB floor to
	 * amortize reallocations.
	 */
	new_size = MAX2(min_needed, 16 * 1024);
	new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

	bo = device->ws->buffer_create(device->ws,
				       new_size, 4096,
				       RADEON_DOMAIN_GTT,
				       RADEON_FLAG_CPU_ACCESS|
				       RADEON_FLAG_NO_INTERPROCESS_SHARING |
				       RADEON_FLAG_32BIT,
				       RADV_BO_PRIORITY_UPLOAD_BUFFER);

	if (!bo) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
		return false;
	}

	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
	if (cmd_buffer->upload.upload_bo) {
		/* Park the old upload state on the retired list; the GPU may
		 * still read from the old BO.
		 */
		upload = malloc(sizeof(*upload));

		if (!upload) {
			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
			device->ws->buffer_destroy(bo);
			return false;
		}

		memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
		list_add(&upload->list, &cmd_buffer->upload.list);
	}

	cmd_buffer->upload.upload_bo = bo;
	cmd_buffer->upload.size = new_size;
	cmd_buffer->upload.offset = 0;
	cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

	if (!cmd_buffer->upload.map) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
		return false;
	}

	return true;
}
413
414bool
415radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
416			     unsigned size,
417			     unsigned alignment,
418			     unsigned *out_offset,
419			     void **ptr)
420{
421	assert(util_is_power_of_two_nonzero(alignment));
422
423	uint64_t offset = align(cmd_buffer->upload.offset, alignment);
424	if (offset + size > cmd_buffer->upload.size) {
425		if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
426			return false;
427		offset = 0;
428	}
429
430	*out_offset = offset;
431	*ptr = cmd_buffer->upload.map + offset;
432
433	cmd_buffer->upload.offset = offset + size;
434	return true;
435}
436
/* Copy 'size' bytes of 'data' into freshly allocated upload-BO space with
 * the given alignment; *out_offset receives the resulting offset. Returns
 * false if the underlying allocation failed.
 */
bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
			    unsigned size, unsigned alignment,
			    const void *data, unsigned *out_offset)
{
	void *dst = NULL;

	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
					  out_offset, &dst))
		return false;

	if (dst)
		memcpy(dst, data, size);

	return true;
}
453
/* Emit a PM4 WRITE_DATA packet that writes 'count' dwords from 'data' to
 * the GPU virtual address 'va' through the ME engine, with write
 * confirmation enabled.
 */
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
			    unsigned count, const uint32_t *data)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;

	/* 4 dwords of packet header/address plus the payload. */
	radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
		    S_370_WR_CONFIRM(1) |
		    S_370_ENGINE_SEL(V_370_ME));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit_array(cs, data, count);
}
470
/* Debug aid: bump the per-command-buffer trace id, write it into the
 * device's trace BO (secondary command buffers use a separate slot at
 * offset 4), and also embed it in the CS as a NOP trace point so hang
 * dumps can correlate CS position with the trace BO contents.
 */
void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_device *device = cmd_buffer->device;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va;

	va = radv_buffer_get_va(device->trace_bo);
	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
		va += 4;

	++cmd_buffer->state.trace_id;
	radv_emit_write_data_packet(cmd_buffer, va, 1,
				    &cmd_buffer->state.trace_id);

	radeon_check_space(cmd_buffer->device->ws, cs, 2);

	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
	radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}
490
/* Hook executed after each draw/dispatch. With RADV_DEBUG=syncshaders it
 * forces the GPU idle via a cache flush (useful to isolate misbehaving
 * shaders), and with a trace BO present it emits a trace point.
 */
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
			   enum radv_cmd_flush_bits flags)
{
	if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
		assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
				RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

		/* Force wait for graphics or compute engines to be idle. */
		si_cs_emit_cache_flush(cmd_buffer->cs,
				       cmd_buffer->device->physical_device->rad_info.chip_class,
				       &cmd_buffer->gfx9_fence_idx,
				       cmd_buffer->gfx9_fence_va,
				       radv_cmd_buffer_uses_mec(cmd_buffer),
				       flags, cmd_buffer->gfx9_eop_bug_va);
	}

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_cmd_buffer_trace_emit(cmd_buffer);
}
513
514static void
515radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
516		   struct radv_pipeline *pipeline, enum ring_type ring)
517{
518	struct radv_device *device = cmd_buffer->device;
519	uint32_t data[2];
520	uint64_t va;
521
522	va = radv_buffer_get_va(device->trace_bo);
523
524	switch (ring) {
525	case RING_GFX:
526		va += 8;
527		break;
528	case RING_COMPUTE:
529		va += 16;
530		break;
531	default:
532		assert(!"invalid ring type");
533	}
534
535	data[0] = (uintptr_t)pipeline;
536	data[1] = (uintptr_t)pipeline >> 32;
537
538	radv_emit_write_data_packet(cmd_buffer, va, 2, data);
539}
540
541void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
542			     VkPipelineBindPoint bind_point,
543			     struct radv_descriptor_set *set,
544			     unsigned idx)
545{
546	struct radv_descriptor_state *descriptors_state =
547		radv_get_descriptors_state(cmd_buffer, bind_point);
548
549	descriptors_state->sets[idx] = set;
550
551	descriptors_state->valid |= (1u << idx); /* active descriptors */
552	descriptors_state->dirty |= (1u << idx);
553}
554
/* Debug aid: record the CPU pointers of all currently valid descriptor
 * sets into the device's trace BO (starting at offset 24, two dwords per
 * set) for post-mortem hang analysis.
 */
static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
		      VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_device *device = cmd_buffer->device;
	uint32_t data[MAX_SETS * 2] = {};
	uint64_t va;
	unsigned i;
	va = radv_buffer_get_va(device->trace_bo) + 24;

	/* Split each set pointer into low/high dwords; unused slots stay 0. */
	for_each_bit(i, descriptors_state->valid) {
		struct radv_descriptor_set *set = descriptors_state->sets[i];
		data[i * 2] = (uint64_t)(uintptr_t)set;
		data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
	}

	radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
}
575
576struct radv_userdata_info *
577radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
578		      gl_shader_stage stage,
579		      int idx)
580{
581	struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
582	return &shader->info.user_sgprs_locs.shader_data[idx];
583}
584
585static void
586radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
587			   struct radv_pipeline *pipeline,
588			   gl_shader_stage stage,
589			   int idx, uint64_t va)
590{
591	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
592	uint32_t base_reg = pipeline->user_data_0[stage];
593	if (loc->sgpr_idx == -1)
594		return;
595
596	assert(loc->num_sgprs == 1);
597
598	radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
599				 base_reg + loc->sgpr_idx * 4, va, false);
600}
601
/* Emit shader pointers for all descriptor sets that are enabled by the
 * shader, currently valid and marked dirty. Consecutive dirty sets are
 * emitted in one multi-dword write per contiguous range of SGPRs.
 */
static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_pipeline *pipeline,
			      struct radv_descriptor_state *descriptors_state,
			      gl_shader_stage stage)
{
	struct radv_device *device = cmd_buffer->device;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint32_t sh_base = pipeline->user_data_0[stage];
	struct radv_userdata_locations *locs =
		&pipeline->shaders[stage]->info.user_sgprs_locs;
	unsigned mask = locs->descriptor_sets_enabled;

	/* Only sets that are both bound and pending re-emission. */
	mask &= descriptors_state->dirty & descriptors_state->valid;

	while (mask) {
		int start, count;

		u_bit_scan_consecutive_range(&mask, &start, &count);

		struct radv_userdata_info *loc = &locs->descriptor_sets[start];
		unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

		/* One header followed by 'count' consecutive set addresses. */
		radv_emit_shader_pointer_head(cs, sh_offset, count, true);
		for (int i = 0; i < count; i++) {
			struct radv_descriptor_set *set =
				descriptors_state->sets[start + i];

			radv_emit_shader_pointer_body(device, cs, set->va, true);
		}
	}
}
634
635static void
636radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
637			     struct radv_pipeline *pipeline,
638			     gl_shader_stage stage,
639			     int idx, int count, uint32_t *values)
640{
641	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
642	uint32_t base_reg = pipeline->user_data_0[stage];
643	if (loc->sgpr_idx == -1)
644		return;
645
646	assert(loc->num_sgprs == count);
647
648	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
649	radeon_emit_array(cmd_buffer->cs, values, count);
650}
651
/* Emit multisample-related context registers for the bound pipeline, and
 * record whether the fragment shader needs sample positions uploaded.
 * Skips the register writes when the sample count did not change relative
 * to the previously emitted pipeline.
 */
static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_pipeline *pipeline)
{
	int num_samples = pipeline->graphics.ms.num_samples;
	struct radv_multisample_state *ms = &pipeline->graphics.ms;
	struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions)
		cmd_buffer->sample_positions_needed = true;

	/* The registers below only depend on the sample count. */
	if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
		return;

	radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
	radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);

	radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);

	radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);

	/* GFX9: Flush DFSM when the AA mode changes. */
	if (cmd_buffer->device->dfsm_allowed) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
	}

	/* Context registers were written without touching the scissor. */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
682
683static void
684radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
685			  struct radv_shader_variant *shader)
686{
687	uint64_t va;
688
689	if (!shader)
690		return;
691
692	va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
693
694	si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
695}
696
/* Prefetch pending shaders and VBO descriptors (per the command buffer's
 * prefetch_L2_mask) into L2. With 'vertex_stage_only' set, only the vertex
 * shader and VBO descriptors are prefetched so the first draw can start as
 * early as possible; the rest is prefetched later. Handled bits are cleared
 * from the mask.
 */
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
		      struct radv_pipeline *pipeline,
		      bool vertex_stage_only)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	uint32_t mask = state->prefetch_L2_mask;

	if (vertex_stage_only) {
		/* Fast prefetch path for starting draws as soon as possible.
		 */
		mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
						  RADV_PREFETCH_VBO_DESCRIPTORS);
	}

	if (mask & RADV_PREFETCH_VS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_VERTEX]);

	if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
		si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);

	if (mask & RADV_PREFETCH_TCS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_TESS_CTRL]);

	if (mask & RADV_PREFETCH_TES)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_TESS_EVAL]);

	if (mask & RADV_PREFETCH_GS) {
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_GEOMETRY]);
		/* The GS copy shader lives in its own BO. */
		radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
	}

	if (mask & RADV_PREFETCH_PS)
		radv_emit_shader_prefetch(cmd_buffer,
					  pipeline->shaders[MESA_SHADER_FRAGMENT]);

	state->prefetch_L2_mask &= ~mask;
}
739
/* Program the RB+ (render backend plus) blend-optimization registers
 * (SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL) from the
 * current subpass attachments and the pipeline's color export formats.
 * No-op on hardware without RB+ support.
 */
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
	if (!cmd_buffer->device->physical_device->rbplus_allowed)
		return;

	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;

	unsigned sx_ps_downconvert = 0;
	unsigned sx_blend_opt_epsilon = 0;
	unsigned sx_blend_opt_control = 0;

	for (unsigned i = 0; i < subpass->color_count; ++i) {
		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
			/* Unused attachment: disable the blend optimization
			 * entirely for this MRT.
			 */
			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
			continue;
		}

		int idx = subpass->color_attachments[i].attachment;
		struct radv_color_buffer_info *cb = &framebuffer->attachments[idx].cb;

		unsigned format = G_028C70_FORMAT(cb->cb_color_info);
		unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
		uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
		uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;

		bool has_alpha, has_rgb;

		/* Set if RGB and A are present. */
		has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);

		if (format == V_028C70_COLOR_8 ||
		    format == V_028C70_COLOR_16 ||
		    format == V_028C70_COLOR_32)
			has_rgb = !has_alpha;
		else
			has_rgb = true;

		/* Check the colormask and export format. */
		if (!(colormask & 0x7))
			has_rgb = false;
		if (!(colormask & 0x8))
			has_alpha = false;

		if (spi_format == V_028714_SPI_SHADER_ZERO) {
			has_rgb = false;
			has_alpha = false;
		}

		/* Disable value checking for disabled channels. */
		if (!has_rgb)
			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
		if (!has_alpha)
			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

		/* Enable down-conversion for 32bpp and smaller formats. */
		switch (format) {
		case V_028C70_COLOR_8:
		case V_028C70_COLOR_8_8:
		case V_028C70_COLOR_8_8_8_8:
			/* For 1 and 2-channel formats, use the superset thereof. */
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_5_6_5:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_1_5_5_5:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_4_4_4_4:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_32:
			if (swap == V_028C70_SWAP_STD &&
			    spi_format == V_028714_SPI_SHADER_32_R)
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
			else if (swap == V_028C70_SWAP_ALT_REV &&
				 spi_format == V_028714_SPI_SHADER_32_AR)
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
			break;

		case V_028C70_COLOR_16:
		case V_028C70_COLOR_16_16:
			/* For 1-channel formats, use the superset thereof. */
			if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
				if (swap == V_028C70_SWAP_STD ||
				    swap == V_028C70_SWAP_STD_REV)
					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
				else
					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
			}
			break;

		case V_028C70_COLOR_10_11_11:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
			}
			break;

		case V_028C70_COLOR_2_10_10_10:
			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
				sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
			}
			break;
		}
	}

	/* Disable the optimization for all MRT slots past the subpass. */
	for (unsigned i = subpass->color_count; i < 8; ++i) {
		sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
		sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
	}
	/* TODO: avoid redundantly setting context registers */
	radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
	radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
	radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
	radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
885
/* Emit the bound graphics pipeline's pre-built register streams into the
 * command buffer, register its shader BOs with the CS, and clear the
 * pipeline-dirty flag. No-op when the same pipeline was already emitted.
 */
static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;

	if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
		return;

	radv_update_multisample_state(cmd_buffer, pipeline);

	/* Track the maximum scratch requirement across all bound pipelines. */
	cmd_buffer->scratch_size_needed =
	                          MAX2(cmd_buffer->scratch_size_needed,
	                               pipeline->max_waves * pipeline->scratch_bytes_per_wave);

	if (!cmd_buffer->state.emitted_pipeline ||
	    cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
	     pipeline->graphics.can_use_guardband)
		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

	/* Only re-emit the context register stream when it actually differs
	 * from the previously emitted pipeline's (compare hash, then bytes).
	 */
	if (!cmd_buffer->state.emitted_pipeline ||
	    cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
	    cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
	    memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
	           pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
		radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
		cmd_buffer->state.context_roll_without_scissor_emitted = true;
	}

	for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
		if (!pipeline->shaders[i])
			continue;

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   pipeline->shaders[i]->bo);
	}

	if (radv_pipeline_has_gs(pipeline))
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   pipeline->gs_copy_shader->bo);

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);

	cmd_buffer->state.emitted_pipeline = pipeline;

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
}
935
936static void
937radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
938{
939	si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
940			  cmd_buffer->state.dynamic.viewport.viewports);
941}
942
943static void
944radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
945{
946	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
947
948	si_write_scissors(cmd_buffer->cs, 0, count,
949			  cmd_buffer->state.dynamic.scissor.scissors,
950			  cmd_buffer->state.dynamic.viewport.viewports,
951			  cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
952
953	cmd_buffer->state.context_roll_without_scissor_emitted = false;
954}
955
/* Emit the dynamic discard rectangles as PA_SC_CLIPRECT registers
 * (one top-left/bottom-right register pair per rectangle). No-op when no
 * discard rectangles are set.
 */
static void
radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
{
	if (!cmd_buffer->state.dynamic.discard_rectangle.count)
		return;

	radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
	                           cmd_buffer->state.dynamic.discard_rectangle.count * 2);
	for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
		VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
		radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
		radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
		                            S_028214_BR_Y(rect.offset.y + rect.extent.height));
	}
}
971
972static void
973radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
974{
975	unsigned width = cmd_buffer->state.dynamic.line_width * 8;
976
977	radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
978			       S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
979}
980
981static void
982radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
983{
984	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
985
986	radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
987	radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
988}
989
990static void
991radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
992{
993	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
994
995	radeon_set_context_reg_seq(cmd_buffer->cs,
996				   R_028430_DB_STENCILREFMASK, 2);
997	radeon_emit(cmd_buffer->cs,
998		    S_028430_STENCILTESTVAL(d->stencil_reference.front) |
999		    S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1000		    S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1001		    S_028430_STENCILOPVAL(1));
1002	radeon_emit(cmd_buffer->cs,
1003		    S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1004		    S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1005		    S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1006		    S_028434_STENCILOPVAL_BF(1));
1007}
1008
1009static void
1010radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1011{
1012	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1013
1014	radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
1015			       fui(d->depth_bounds.min));
1016	radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
1017			       fui(d->depth_bounds.max));
1018}
1019
1020static void
1021radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1022{
1023	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1024	unsigned slope = fui(d->depth_bias.slope * 16.0f);
1025	unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
1026
1027
1028	radeon_set_context_reg_seq(cmd_buffer->cs,
1029				   R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1030	radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1031	radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1032	radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
1033	radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1034	radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
1035}
1036
/* Emit the CB_COLOR*[index] register block for one bound color attachment.
 * When the current layout does not allow DCC-compressed access for this
 * queue family, the DCC_ENABLE bit is masked out of CB_COLOR_INFO.
 */
static void
radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
			 int index,
			 struct radv_attachment_info *att,
			 struct radv_image *image,
			 VkImageLayout layout)
{
	bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI;
	struct radv_color_buffer_info *cb = &att->cb;
	uint32_t cb_color_info = cb->cb_color_info;

	if (!radv_layout_dcc_compressed(image, layout,
	                                radv_image_queue_family_mask(image,
	                                                             cmd_buffer->queue_family_index,
	                                                             cmd_buffer->queue_family_index))) {
		cb_color_info &= C_028C70_DCC_ENABLE;
	}

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		/* GFX9 register layout: each base address has a high-dword
		 * companion register, and the mip pitch is programmed
		 * separately in CB_MRT*_EPITCH.
		 */
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
		radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
		radeon_emit(cmd_buffer->cs, cb_color_info);
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
		radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
		radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
		radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));

		radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
				       cb->cb_mrt_epitch);
	} else {
		/* Pre-GFX9 layout: 32-bit bases, separate pitch/slice
		 * registers inside the same sequence.
		 */
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
		radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
		radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
		radeon_emit(cmd_buffer->cs, cb_color_info);
		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);

		if (is_vi) { /* DCC BASE */
			radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
		}
	}

	if (radv_image_has_dcc(image)) {
		/* Drawing with DCC enabled also compresses colorbuffers. */
		radv_update_dcc_metadata(cmd_buffer, image, true);
	}
}
1099
/* Re-emit DB_Z_INFO with ZRANGE_PRECISION cleared for TC-compatible HTILE
 * images (workaround for the TC-compat zrange bug, see callers). When
 * \p requires_cond_exec is set, the write is wrapped in a COND_EXEC
 * packet that tests the image's tc_compat_zrange metadata dword, so the
 * register write only executes when that predicate is non-zero.
 */
static void
radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
			     struct radv_ds_buffer_info *ds,
			     struct radv_image *image, VkImageLayout layout,
			     bool requires_cond_exec)
{
	uint32_t db_z_info = ds->db_z_info;
	uint32_t db_z_info_reg;

	/* Only TC-compatible HTILE images are affected by the bug. */
	if (!radv_image_is_tc_compat_htile(image))
		return;

	/* Mirror radv_emit_fb_ds_state(): HTILE is disabled when the layout
	 * does not allow it for this queue family.
	 */
	if (!radv_layout_has_htile(image, layout,
	                           radv_image_queue_family_mask(image,
	                                                        cmd_buffer->queue_family_index,
	                                                        cmd_buffer->queue_family_index))) {
		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
	}

	db_z_info &= C_028040_ZRANGE_PRECISION;

	/* DB_Z_INFO moved on GFX9. */
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		db_z_info_reg = R_028038_DB_Z_INFO;
	} else {
		db_z_info_reg = R_028040_DB_Z_INFO;
	}

	/* When we don't know the last fast clear value we need to emit a
	 * conditional packet that will eventually skip the following
	 * SET_CONTEXT_REG packet.
	 */
	if (requires_cond_exec) {
		uint64_t va = radv_buffer_get_va(image->bo);
		va += image->offset + image->tc_compat_zrange_offset;

		radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
		radeon_emit(cmd_buffer->cs, va);
		radeon_emit(cmd_buffer->cs, va >> 32);
		radeon_emit(cmd_buffer->cs, 0);
		radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
	}

	radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
}
1144
/* Emit the full depth/stencil buffer register state for the bound DS
 * attachment. HTILE (depth) and the stencil tile surface are disabled
 * when the current layout does not allow HTILE for this queue family.
 */
static void
radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
		      struct radv_ds_buffer_info *ds,
		      struct radv_image *image,
		      VkImageLayout layout)
{
	uint32_t db_z_info = ds->db_z_info;
	uint32_t db_stencil_info = ds->db_stencil_info;

	if (!radv_layout_has_htile(image, layout,
	                           radv_image_queue_family_mask(image,
	                                                        cmd_buffer->queue_family_index,
	                                                        cmd_buffer->queue_family_index))) {
		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
		db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
	}

	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);


	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		/* GFX9 layout: bases have high-dword companions and there are
		 * separate Z/stencil INFO2 registers.
		 */
		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
		radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
		radeon_emit(cmd_buffer->cs, ds->db_depth_size);

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
		radeon_emit(cmd_buffer->cs, db_z_info);			/* DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* DB_STENCIL_INFO */
		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* DB_Z_READ_BASE */
		radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32));	/* DB_Z_READ_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* DB_STENCIL_READ_BASE */
		radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* DB_Z_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32));	/* DB_Z_WRITE_BASE_HI */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* DB_STENCIL_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */

		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
		radeon_emit(cmd_buffer->cs, ds->db_z_info2);
		radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
	} else {
		/* Pre-GFX9 layout: 32-bit bases in one register sequence. */
		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);

		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
		radeon_emit(cmd_buffer->cs, ds->db_depth_info);	/* R_02803C_DB_DEPTH_INFO */
		radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* R_028044_DB_STENCIL_INFO */
		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
		radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */

	}

	/* Update the ZRANGE_PRECISION value for the TC-compat bug. */
	radv_update_zrange_precision(cmd_buffer, ds, image, layout, true);

	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
			       ds->pa_su_poly_offset_db_fmt_cntl);
}
1209
1210/**
1211 * Update the fast clear depth/stencil values if the image is bound as a
1212 * depth/stencil buffer.
1213 */
static void
radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
				struct radv_image *image,
				VkClearDepthStencilValue ds_clear_value,
				VkImageAspectFlags aspects)
{
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_attachment_info *att;
	uint32_t att_idx;

	/* Nothing to do outside of a render pass (e.g. secondary recording). */
	if (!framebuffer || !subpass)
		return;

	if (!subpass->depth_stencil_attachment)
		return;

	/* Only update the registers if this image is the currently bound
	 * depth/stencil attachment.
	 */
	att_idx = subpass->depth_stencil_attachment->attachment;
	att = &framebuffer->attachments[att_idx];
	if (att->attachment->image != image)
		return;

	/* DB_STENCIL_CLEAR and DB_DEPTH_CLEAR are consecutive registers. */
	radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
	radeon_emit(cs, ds_clear_value.stencil);
	radeon_emit(cs, fui(ds_clear_value.depth));

	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
	 * only needed when clearing Z to 0.0.
	 */
	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
	    ds_clear_value.depth == 0.0) {
		VkImageLayout layout = subpass->depth_stencil_attachment->layout;

		/* The clear value is known here, so no COND_EXEC is needed. */
		radv_update_zrange_precision(cmd_buffer, &att->ds, image,
					     layout, false);
	}

	/* The register writes above rolled the context without re-emitting
	 * the scissors.
	 */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
1254
1255/**
1256 * Set the clear depth/stencil values to the image's metadata.
1257 */
1258static void
1259radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1260			   struct radv_image *image,
1261			   VkClearDepthStencilValue ds_clear_value,
1262			   VkImageAspectFlags aspects)
1263{
1264	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1265	uint64_t va = radv_buffer_get_va(image->bo);
1266	unsigned reg_offset = 0, reg_count = 0;
1267
1268	va += image->offset + image->clear_value_offset;
1269
1270	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1271		++reg_count;
1272	} else {
1273		++reg_offset;
1274		va += 4;
1275	}
1276	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1277		++reg_count;
1278
1279	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating));
1280	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1281			S_370_WR_CONFIRM(1) |
1282			S_370_ENGINE_SEL(V_370_PFP));
1283	radeon_emit(cs, va);
1284	radeon_emit(cs, va >> 32);
1285	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
1286		radeon_emit(cs, ds_clear_value.stencil);
1287	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1288		radeon_emit(cs, fui(ds_clear_value.depth));
1289}
1290
1291/**
1292 * Update the TC-compat metadata value for this image.
1293 */
static void
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
				   struct radv_image *image,
				   uint32_t value)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	/* This dword is the predicate tested by the COND_EXEC emitted in
	 * radv_update_zrange_precision().
	 */
	uint64_t va = radv_buffer_get_va(image->bo);
	va += image->offset + image->tc_compat_zrange_offset;

	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
			S_370_WR_CONFIRM(1) |
			S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, value);
}
1311
1312static void
1313radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1314				      struct radv_image *image,
1315				      VkClearDepthStencilValue ds_clear_value)
1316{
1317	uint64_t va = radv_buffer_get_va(image->bo);
1318	va += image->offset + image->tc_compat_zrange_offset;
1319	uint32_t cond_val;
1320
1321	/* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
1322	 * depth clear value is 0.0f.
1323	 */
1324	cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
1325
1326	radv_set_tc_compat_zrange_metadata(cmd_buffer, image, cond_val);
1327}
1328
1329/**
1330 * Update the clear depth/stencil values for this image.
1331 */
void
radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_image *image,
			      VkClearDepthStencilValue ds_clear_value,
			      VkImageAspectFlags aspects)
{
	assert(radv_image_has_htile(image));

	/* Persist the clear values in the image's metadata... */
	radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects);

	/* ...keep the TC-compat zrange predicate in sync for depth clears... */
	if (radv_image_is_tc_compat_htile(image) &&
	    (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
		radv_update_tc_compat_zrange_metadata(cmd_buffer, image,
						      ds_clear_value);
	}

	/* ...and refresh the DB clear registers if the image is currently
	 * bound as the depth/stencil attachment.
	 */
	radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value,
				        aspects);
}
1351
1352/**
1353 * Load the clear depth/stencil values from the image's metadata.
1354 */
static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			    struct radv_image *image)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
	uint64_t va = radv_buffer_get_va(image->bo);
	unsigned reg_offset = 0, reg_count = 0;

	va += image->offset + image->clear_value_offset;

	/* Only HTILE images carry fast-clear metadata. */
	if (!radv_image_has_htile(image))
		return;

	/* Metadata layout is [stencil dword][depth dword]; for depth-only
	 * formats skip the stencil slot in both the metadata and the
	 * register offset.
	 */
	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
		++reg_count;
	} else {
		++reg_offset;
		va += 4;
	}
	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
		++reg_count;

	uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;

	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
		/* Firmware supports loading context registers straight from
		 * memory.
		 */
		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
		radeon_emit(cs, reg_count);
	} else {
		/* Fallback: COPY_DATA from memory into the registers. */
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, reg >> 2);
		radeon_emit(cs, 0);

		/* Keep the PFP from racing ahead of the copy. */
		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cs, 0);
	}
}
1400
1401/*
1402 * With DCC some colors don't require CMASK elimination before being
1403 * used as a texture. This sets a predicate value to determine if the
1404 * cmask eliminate is required.
1405 */
1406void
1407radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
1408			 struct radv_image *image, bool value)
1409{
1410	uint64_t pred_val = value;
1411	uint64_t va = radv_buffer_get_va(image->bo);
1412	va += image->offset + image->fce_pred_offset;
1413
1414	assert(radv_image_has_dcc(image));
1415
1416	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
1417	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1418				    S_370_WR_CONFIRM(1) |
1419				    S_370_ENGINE_SEL(V_370_PFP));
1420	radeon_emit(cmd_buffer->cs, va);
1421	radeon_emit(cmd_buffer->cs, va >> 32);
1422	radeon_emit(cmd_buffer->cs, pred_val);
1423	radeon_emit(cmd_buffer->cs, pred_val >> 32);
1424}
1425
1426/**
1427 * Update the DCC predicate to reflect the compression state.
1428 */
1429void
1430radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
1431			 struct radv_image *image, bool value)
1432{
1433	uint64_t pred_val = value;
1434	uint64_t va = radv_buffer_get_va(image->bo);
1435	va += image->offset + image->dcc_pred_offset;
1436
1437	assert(radv_image_has_dcc(image));
1438
1439	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
1440	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1441				    S_370_WR_CONFIRM(1) |
1442				    S_370_ENGINE_SEL(V_370_PFP));
1443	radeon_emit(cmd_buffer->cs, va);
1444	radeon_emit(cmd_buffer->cs, va >> 32);
1445	radeon_emit(cmd_buffer->cs, pred_val);
1446	radeon_emit(cmd_buffer->cs, pred_val >> 32);
1447}
1448
1449/**
1450 * Update the fast clear color values if the image is bound as a color buffer.
1451 */
static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
				   struct radv_image *image,
				   int cb_idx,
				   uint32_t color_values[2])
{
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_attachment_info *att;
	uint32_t att_idx;

	/* Nothing to do outside of a render pass (e.g. secondary recording). */
	if (!framebuffer || !subpass)
		return;

	att_idx = subpass->color_attachments[cb_idx].attachment;
	if (att_idx == VK_ATTACHMENT_UNUSED)
		return;

	/* Only update the registers if this image is bound at slot cb_idx. */
	att = &framebuffer->attachments[att_idx];
	if (att->attachment->image != image)
		return;

	/* CB_COLOR*_CLEAR_WORD0/1 are consecutive registers. */
	radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
	radeon_emit(cs, color_values[0]);
	radeon_emit(cs, color_values[1]);

	/* The register writes above rolled the context without re-emitting
	 * the scissors.
	 */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
1481
1482/**
1483 * Set the clear color values to the image's metadata.
1484 */
static void
radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			      struct radv_image *image,
			      uint32_t color_values[2])
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);

	va += image->offset + image->clear_value_offset;

	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));

	/* Write both clear-value dwords through the PFP, predicated like
	 * the rest of the command buffer.
	 */
	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, cmd_buffer->state.predicating));
	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
			S_370_WR_CONFIRM(1) |
			S_370_ENGINE_SEL(V_370_PFP));
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, color_values[0]);
	radeon_emit(cs, color_values[1]);
}
1506
1507/**
1508 * Update the clear color values for this image.
1509 */
1510void
1511radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1512				 struct radv_image *image,
1513				 int cb_idx,
1514				 uint32_t color_values[2])
1515{
1516	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));
1517
1518	radv_set_color_clear_metadata(cmd_buffer, image, color_values);
1519
1520	radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
1521					   color_values);
1522}
1523
1524/**
1525 * Load the clear color values from the image's metadata.
1526 */
static void
radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
			       struct radv_image *image,
			       int cb_idx)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(image->bo);

	va += image->offset + image->clear_value_offset;

	/* Only CMASK/DCC images carry fast-clear metadata. */
	if (!radv_image_has_cmask(image) && !radv_image_has_dcc(image))
		return;

	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
		/* Firmware supports loading context registers straight from
		 * memory.
		 */
		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
		radeon_emit(cs, 2);
	} else {
		/* Fallback: COPY_DATA of both dwords into the registers. */
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				COPY_DATA_COUNT_SEL);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, reg >> 2);
		radeon_emit(cs, 0);

		/* Keep the PFP from racing ahead of the copy. */
		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
		radeon_emit(cs, 0);
	}
}
1562
/* Re-emit all framebuffer state: each color attachment (or an
 * invalid-format placeholder for unused slots), the depth/stencil
 * attachment (or invalid Z/stencil formats when absent), the window
 * scissor, and the DCC overwrite-combiner watermark.
 */
static void
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
{
	int i;
	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
	unsigned num_bpp64_colorbufs = 0;

	/* this may happen for inherited secondary recording */
	if (!framebuffer)
		return;

	for (i = 0; i < 8; ++i) {
		/* Unused color slots are marked with an invalid format. */
		if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
			radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
				       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
			continue;
		}

		int idx = subpass->color_attachments[i].attachment;
		struct radv_attachment_info *att = &framebuffer->attachments[idx];
		struct radv_image *image = att->attachment->image;
		VkImageLayout layout = subpass->color_attachments[i].layout;

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);

		assert(att->attachment->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
		                                       VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
		radv_emit_fb_color_state(cmd_buffer, i, att, image, layout);

		radv_load_color_clear_metadata(cmd_buffer, image, i);

		/* Count 64bpp+ color buffers; they influence the DCC
		 * overwrite-combiner watermark below.
		 */
		if (image->planes[0].surface.bpe >= 8)
			num_bpp64_colorbufs++;
	}

	if (subpass->depth_stencil_attachment) {
		int idx = subpass->depth_stencil_attachment->attachment;
		VkImageLayout layout = subpass->depth_stencil_attachment->layout;
		struct radv_attachment_info *att = &framebuffer->attachments[idx];
		struct radv_image *image = att->attachment->image;
		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
		MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image,
										cmd_buffer->queue_family_index,
										cmd_buffer->queue_family_index);
		/* We currently don't support writing decompressed HTILE */
		assert(radv_layout_has_htile(image, layout, queue_mask) ==
		       radv_layout_is_htile_compressed(image, layout, queue_mask));

		radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout);

		/* A new offset scale invalidates the programmed depth bias. */
		if (att->ds.offset_scale != cmd_buffer->state.offset_scale) {
			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
			cmd_buffer->state.offset_scale = att->ds.offset_scale;
		}
		radv_load_ds_clear_metadata(cmd_buffer, image);
	} else {
		/* No DS attachment: program invalid Z/stencil formats. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
		else
			radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);

		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
	}
	radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
			       S_028208_BR_X(framebuffer->width) |
			       S_028208_BR_Y(framebuffer->height));

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
		uint8_t watermark = 4; /* Default value for VI. */

		/* For optimal DCC performance. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			if (num_bpp64_colorbufs >= 5) {
				watermark = 8;
			} else {
				watermark = 6;
			}
		}

		radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
				       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
				       S_028424_OVERWRITE_COMBINER_WATERMARK(watermark));
	}

	if (cmd_buffer->device->dfsm_allowed) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
	}

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
}
1656
/* Emit the index buffer type, GPU address and size. The index type is
 * only re-emitted when it changed since the last draw.
 */
static void
radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->index_type != state->last_index_type) {
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			/* GFX9 programs the index type via an indexed uconfig
			 * register write.
			 */
			radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
						   2, state->index_type);
		} else {
			radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
			radeon_emit(cs, state->index_type);
		}

		state->last_index_type = state->index_type;
	}

	radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
	radeon_emit(cs, state->index_va);
	radeon_emit(cs, state->index_va >> 32);

	radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
	radeon_emit(cs, state->max_index_count);

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
}
1684
/* Program DB_COUNT_CONTROL according to whether occlusion queries are
 * active, and toggle out-of-order rasterization around perfect occlusion
 * queries (which it would make inaccurate). Always rolls the context.
 */
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
{
	bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
	/* pipeline may be NULL; the OUT_OF_ORDER checks below then see 0 and
	 * short-circuit before dereferencing it.
	 */
	uint32_t pa_sc_mode_cntl_1 =
		pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
	uint32_t db_count_control;

	if(!cmd_buffer->state.active_occlusion_queries) {
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
			    has_perfect_queries) {
				/* Re-enable out-of-order rasterization if the
				 * bound pipeline supports it and if it's has
				 * been disabled before starting any perfect
				 * occlusion queries.
				 */
				radeon_set_context_reg(cmd_buffer->cs,
						       R_028A4C_PA_SC_MODE_CNTL_1,
						       pa_sc_mode_cntl_1);
			}
		}
		db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
	} else {
		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
		uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;

		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			db_count_control =
				S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
				S_028004_SAMPLE_RATE(sample_rate) |
				S_028004_ZPASS_ENABLE(1) |
				S_028004_SLICE_EVEN_ENABLE(1) |
				S_028004_SLICE_ODD_ENABLE(1);

			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
			    has_perfect_queries) {
				/* If the bound pipeline has enabled
				 * out-of-order rasterization, we should
				 * disable it before starting any perfect
				 * occlusion queries.
				 */
				pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;

				radeon_set_context_reg(cmd_buffer->cs,
						       R_028A4C_PA_SC_MODE_CNTL_1,
						       pa_sc_mode_cntl_1);
			}
		} else {
			db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
				S_028004_SAMPLE_RATE(sample_rate);
		}
	}

	radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);

	/* The register writes above rolled the context without re-emitting
	 * the scissors.
	 */
	cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
1745
/* Emit every piece of dynamic state that is both dirty and actually
 * consumed by the bound pipeline, then clear the corresponding dirty
 * bits.
 */
static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
{
	uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;

	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
		radv_emit_viewport(cmd_buffer);

	/* Scissors depend on the viewports (guardband); on chips with the
	 * scissor HW bug they are not emitted here.
	 */
	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
	    !cmd_buffer->device->physical_device->has_scissor_bug)
		radv_emit_scissor(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
		radv_emit_line_width(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
		radv_emit_blend_constants(cmd_buffer);

	/* Reference, write mask and compare mask share one register pair. */
	if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
		radv_emit_stencil(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
		radv_emit_depth_bounds(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
		radv_emit_depth_bias(cmd_buffer);

	if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
		radv_emit_discard_rectangle(cmd_buffer);

	cmd_buffer->state.dirty &= ~states;
}
1780
1781static void
1782radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
1783			    VkPipelineBindPoint bind_point)
1784{
1785	struct radv_descriptor_state *descriptors_state =
1786		radv_get_descriptors_state(cmd_buffer, bind_point);
1787	struct radv_descriptor_set *set = &descriptors_state->push_set.set;
1788	unsigned bo_offset;
1789
1790	if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
1791					 set->mapped_ptr,
1792					 &bo_offset))
1793		return;
1794
1795	set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
1796	set->va += bo_offset;
1797}
1798
1799static void
1800radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
1801				    VkPipelineBindPoint bind_point)
1802{
1803	struct radv_descriptor_state *descriptors_state =
1804		radv_get_descriptors_state(cmd_buffer, bind_point);
1805	uint32_t size = MAX_SETS * 4;
1806	uint32_t offset;
1807	void *ptr;
1808
1809	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
1810					  256, &offset, &ptr))
1811		return;
1812
1813	for (unsigned i = 0; i < MAX_SETS; i++) {
1814		uint32_t *uptr = ((uint32_t *)ptr) + i;
1815		uint64_t set_va = 0;
1816		struct radv_descriptor_set *set = descriptors_state->sets[i];
1817		if (descriptors_state->valid & (1u << i))
1818			set_va = set->va;
1819		uptr[0] = set_va & 0xffffffff;
1820	}
1821
1822	uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
1823	va += offset;
1824
1825	if (cmd_buffer->state.pipeline) {
1826		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
1827			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
1828						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1829
1830		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
1831			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
1832						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1833
1834		if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
1835			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
1836						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1837
1838		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
1839			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
1840						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1841
1842		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
1843			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
1844						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1845	}
1846
1847	if (cmd_buffer->state.compute_pipeline)
1848		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
1849					   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
1850}
1851
/* Emit the descriptor-set user SGPR pointers for all dirty sets of the bind
 * point implied by 'stages' (compute if the compute bit is set, graphics
 * otherwise). Also uploads the push descriptor set and the indirect
 * descriptor table when needed. No-op if nothing is dirty.
 */
static void
radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
		       VkShaderStageFlags stages)
{
	VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
					 VK_PIPELINE_BIND_POINT_COMPUTE :
					 VK_PIPELINE_BIND_POINT_GRAPHICS;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_cmd_state *state = &cmd_buffer->state;
	bool flush_indirect_descriptors;

	if (!descriptors_state->dirty)
		return;

	/* Push descriptors live in CPU memory until now; upload them so the
	 * pointer emission below has a valid VA. */
	if (descriptors_state->push_dirty)
		radv_flush_push_descriptors(cmd_buffer, bind_point);

	flush_indirect_descriptors =
		(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
		 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
		(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
		 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);

	if (flush_indirect_descriptors)
		radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);

	/* Worst case: one pointer per set per stage. */
	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
	                                                   cmd_buffer->cs,
	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);

	if (cmd_buffer->state.pipeline) {
		radv_foreach_stage(stage, stages) {
			if (!cmd_buffer->state.pipeline->shaders[stage])
				continue;

			radv_emit_descriptor_pointers(cmd_buffer,
						      cmd_buffer->state.pipeline,
						      descriptors_state, stage);
		}
	}

	if (cmd_buffer->state.compute_pipeline &&
	    (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
		radv_emit_descriptor_pointers(cmd_buffer,
					      cmd_buffer->state.compute_pipeline,
					      descriptors_state,
					      MESA_SHADER_COMPUTE);
	}

	descriptors_state->dirty = 0;
	descriptors_state->push_dirty = false;

	assert(cmd_buffer->cs->cdw <= cdw_max);

	/* Dump the bound descriptors for GPU hang debugging (RADV_TRACE). */
	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_descriptors(cmd_buffer, bind_point);
}
1910
/* Upload push constants and dynamic buffer descriptors for all stages in
 * 'stages' that still have pending push-constant updates. Small constants
 * may be emitted inline as user SGPRs; the rest (plus the dynamic offsets)
 * are copied into the upload BO and referenced via AC_UD_PUSH_CONSTANTS.
 */
static void
radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
		     VkShaderStageFlags stages)
{
	struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
					 ? cmd_buffer->state.compute_pipeline
					 : cmd_buffer->state.pipeline;
	VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
					 VK_PIPELINE_BIND_POINT_COMPUTE :
					 VK_PIPELINE_BIND_POINT_GRAPHICS;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_pipeline_layout *layout = pipeline->layout;
	struct radv_shader_variant *shader, *prev_shader;
	bool need_push_constants = false;
	unsigned offset;
	void *ptr;
	uint64_t va;

	/* Only flush stages with pending updates; bail out early if the
	 * layout uses neither push constants nor dynamic offsets. */
	stages &= cmd_buffer->push_constant_stages;
	if (!stages ||
	    (!layout->push_constant_size && !layout->dynamic_offset_count))
		return;

	radv_foreach_stage(stage, stages) {
		if (!pipeline->shaders[stage])
			continue;

		/* A memory upload is needed only if some stage actually loads
		 * push constants or dynamic offsets from memory. */
		need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
		need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;

		/* Constants the compiler promoted to user SGPRs are emitted
		 * inline, directly from the CPU-side push constant buffer. */
		uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
		uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;

		radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
					     AC_UD_INLINE_PUSH_CONSTANTS,
					     count,
					     (uint32_t *)&cmd_buffer->push_constants[base * 4]);
	}

	if (need_push_constants) {
		/* Upload push constants followed by the dynamic buffer
		 * descriptors (16 bytes each). */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
						  16 * layout->dynamic_offset_count,
						  256, &offset, &ptr))
			return;

		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
		memcpy((char*)ptr + layout->push_constant_size,
		       descriptors_state->dynamic_buffers,
		       16 * layout->dynamic_offset_count);

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += offset;

		MAYBE_UNUSED unsigned cdw_max =
			radeon_check_space(cmd_buffer->device->ws,
	                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);

		prev_shader = NULL;
		radv_foreach_stage(stage, stages) {
			shader = radv_get_shader(pipeline, stage);

			/* Avoid redundantly emitting the address for merged stages. */
			if (shader && shader != prev_shader) {
				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
							   AC_UD_PUSH_CONSTANTS, va);

				prev_shader = shader;
			}
		}
		assert(cmd_buffer->cs->cdw <= cdw_max);
	}

	cmd_buffer->push_constant_stages &= ~stages;
}
1986
/* Build and upload the vertex buffer descriptor table (one typed buffer
 * descriptor per binding) and point the vertex shader's
 * AC_UD_VS_VERTEX_BUFFERS user SGPR at it. Only done when the pipeline or
 * the bound vertex buffers changed, and only if the VS actually reads
 * vertex buffers.
 */
static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
			      bool pipeline_is_dirty)
{
	if ((pipeline_is_dirty ||
	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
	    cmd_buffer->state.pipeline->num_vertex_bindings &&
	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
		unsigned vb_offset;
		void *vb_ptr;
		uint32_t i = 0;
		uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
		uint64_t va;

		/* allocate some descriptor state for vertex buffers */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
						  &vb_offset, &vb_ptr))
			return;

		for (i = 0; i < count; i++) {
			/* Each descriptor is 4 dwords (16 bytes). */
			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
			uint32_t offset;
			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];

			/* Unbound slots keep whatever the upload BO contained;
			 * the VS must not read them. */
			if (!buffer)
				continue;

			va = radv_buffer_get_va(buffer->bo);

			offset = cmd_buffer->vertex_bindings[i].offset;
			va += offset + buffer->offset;
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
			/* On CIK and earlier, num_records is in units of
			 * stride, so compute the number of whole fetchable
			 * elements; later chips take the size in bytes. */
			if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
				desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
			else
				desc[2] = buffer->size - offset;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
		}

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += vb_offset;

		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
					   AC_UD_VS_VERTEX_BUFFERS, va);

		/* Remember the table so it can be prefetched into L2. */
		cmd_buffer->state.vb_va = va;
		cmd_buffer->state.vb_size = count * 16;
		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
	}
	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
}
2046
2047static void
2048radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2049{
2050	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2051	struct radv_userdata_info *loc;
2052	uint32_t base_reg;
2053
2054	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2055		if (!radv_get_shader(pipeline, stage))
2056			continue;
2057
2058		loc = radv_lookup_user_sgpr(pipeline, stage,
2059					    AC_UD_STREAMOUT_BUFFERS);
2060		if (loc->sgpr_idx == -1)
2061			continue;
2062
2063		base_reg = pipeline->user_data_0[stage];
2064
2065		radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2066					 base_reg + loc->sgpr_idx * 4, va, false);
2067	}
2068
2069	if (pipeline->gs_copy_shader) {
2070		loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
2071		if (loc->sgpr_idx != -1) {
2072			base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
2073
2074			radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2075						 base_reg + loc->sgpr_idx * 4, va, false);
2076		}
2077	}
2078}
2079
/* Build and upload buffer descriptors for all enabled streamout buffers and
 * emit the table address to the relevant shader stages. Only runs when the
 * streamout buffer bindings are dirty.
 */
static void
radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
{
	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
		struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
		struct radv_streamout_state *so = &cmd_buffer->state.streamout;
		unsigned so_offset;
		void *so_ptr;
		uint64_t va;

		/* Allocate some descriptor state for streamout buffers. */
		if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
						  MAX_SO_BUFFERS * 16, 256,
						  &so_offset, &so_ptr))
			return;

		for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
			struct radv_buffer *buffer = sb[i].buffer;
			/* 4 dwords (16 bytes) per descriptor slot. */
			uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];

			/* Disabled slots are left untouched; shaders must not
			 * access them. */
			if (!(so->enabled_mask & (1 << i)))
				continue;

			va = radv_buffer_get_va(buffer->bo) + buffer->offset;

			va += sb[i].offset;

			/* Set the descriptor.
			 *
			 * On VI, the format must be non-INVALID, otherwise
			 * the buffer will be considered not bound and store
			 * instructions will be no-ops.
			 */
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			/* num_records = ~0 so the range check never clips
			 * streamout stores. */
			desc[2] = 0xffffffff;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
		}

		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
		va += so_offset;

		radv_emit_streamout_buffers(cmd_buffer, va);
	}

	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
2131
/* Upload everything graphics shaders read through memory before a draw:
 * vertex buffer descriptors, streamout buffer descriptors, descriptor set
 * pointers and push constants. Each helper is a no-op when its state is
 * clean.
 */
static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
	radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
	radv_flush_streamout_descriptors(cmd_buffer);
	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
}
2140
/* Unified description of a draw call, shared by the direct, indexed and
 * indirect draw entry points.
 */
struct radv_draw_info {
	/**
	 * Number of vertices.
	 */
	uint32_t count;

	/**
	 * Index of the first vertex.
	 */
	int32_t vertex_offset;

	/**
	 * First instance id.
	 */
	uint32_t first_instance;

	/**
	 * Number of instances.
	 */
	uint32_t instance_count;

	/**
	 * First index (indexed draws only).
	 */
	uint32_t first_index;

	/**
	 * Whether it's an indexed draw.
	 */
	bool indexed;

	/**
	 * Indirect draw parameters resource (NULL for direct draws).
	 */
	struct radv_buffer *indirect;
	uint64_t indirect_offset;
	uint32_t stride;

	/**
	 * Draw count parameters resource (vkCmdDraw*IndirectCount*).
	 */
	struct radv_buffer *count_buffer;
	uint64_t count_buffer_offset;

	/**
	 * Stream output parameters resource (draws sourced from transform
	 * feedback, i.e. vkCmdDrawIndirectByteCountEXT).
	 */
	struct radv_buffer *strmout_buffer;
	uint64_t strmout_buffer_offset;
};
2191
/* Emit per-draw register state: IA_MULTI_VGT_PARAM, primitive restart
 * enable/index, and - for draws sourced from transform feedback - the opaque
 * vertex stride and buffer-filled size. Registers are only written when
 * their value changed since the last draw.
 */
static void
radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
			 const struct radv_draw_info *draw_info)
{
	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
	struct radv_cmd_state *state = &cmd_buffer->state;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint32_t ia_multi_vgt_param;
	int32_t primitive_reset_en;

	/* Draw state. */
	ia_multi_vgt_param =
		si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
					  draw_info->indirect,
					  !!draw_info->strmout_buffer,
					  draw_info->indirect ? 0 : draw_info->count);

	if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
		/* The register moved and changed write semantics across chip
		 * generations; use the generation-appropriate packet. */
		if (info->chip_class >= GFX9) {
			radeon_set_uconfig_reg_idx(cs,
						   R_030960_IA_MULTI_VGT_PARAM,
						   4, ia_multi_vgt_param);
		} else if (info->chip_class >= CIK) {
			radeon_set_context_reg_idx(cs,
						   R_028AA8_IA_MULTI_VGT_PARAM,
						   1, ia_multi_vgt_param);
		} else {
			radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
					       ia_multi_vgt_param);
		}
		state->last_ia_multi_vgt_param = ia_multi_vgt_param;
	}

	/* Primitive restart. */
	primitive_reset_en =
		draw_info->indexed && state->pipeline->graphics.prim_restart_enable;

	if (primitive_reset_en != state->last_primitive_reset_en) {
		state->last_primitive_reset_en = primitive_reset_en;
		if (info->chip_class >= GFX9) {
			radeon_set_uconfig_reg(cs,
					       R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
					       primitive_reset_en);
		} else {
			radeon_set_context_reg(cs,
					       R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
					       primitive_reset_en);
		}
	}

	if (primitive_reset_en) {
		/* The restart index is all-ones for the current index size
		 * (index_type non-zero means 32-bit indices here). */
		uint32_t primitive_reset_index =
			state->index_type ? 0xffffffffu : 0xffffu;

		if (primitive_reset_index != state->last_primitive_reset_index) {
			radeon_set_context_reg(cs,
					       R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
					       primitive_reset_index);
			state->last_primitive_reset_index = primitive_reset_index;
		}
	}

	if (draw_info->strmout_buffer) {
		uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);

		va += draw_info->strmout_buffer->offset +
		      draw_info->strmout_buffer_offset;

		radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
				       draw_info->stride);

		/* Copy the buffer-filled size from memory into the register
		 * the VGT uses to size the opaque draw. */
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
				COPY_DATA_DST_SEL(COPY_DATA_REG) |
				COPY_DATA_WR_CONFIRM);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
		radeon_emit(cs, 0); /* unused */

		radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
	}
}
2275
2276static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
2277			     VkPipelineStageFlags src_stage_mask)
2278{
2279	if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
2280	                      VK_PIPELINE_STAGE_TRANSFER_BIT |
2281	                      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2282	                      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2283		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2284	}
2285
2286	if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
2287			      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
2288			      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
2289			      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
2290			      VK_PIPELINE_STAGE_TRANSFER_BIT |
2291			      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2292			      VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
2293			      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2294		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
2295	} else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
2296	                             VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
2297	                             VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
2298				     VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
2299				     VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
2300				     VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
2301				     VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
2302		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
2303	}
2304}
2305
2306static enum radv_cmd_flush_bits
2307radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
2308		      VkAccessFlags src_flags,
2309		      struct radv_image *image)
2310{
2311	bool flush_CB_meta = true, flush_DB_meta = true;
2312	enum radv_cmd_flush_bits flush_bits = 0;
2313	uint32_t b;
2314
2315	if (image) {
2316		if (!radv_image_has_CB_metadata(image))
2317			flush_CB_meta = false;
2318		if (!radv_image_has_htile(image))
2319			flush_DB_meta = false;
2320	}
2321
2322	for_each_bit(b, src_flags) {
2323		switch ((VkAccessFlagBits)(1 << b)) {
2324		case VK_ACCESS_SHADER_WRITE_BIT:
2325		case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
2326		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2327			flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
2328			break;
2329		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
2330			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2331			if (flush_CB_meta)
2332				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2333			break;
2334		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
2335			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2336			if (flush_DB_meta)
2337				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2338			break;
2339		case VK_ACCESS_TRANSFER_WRITE_BIT:
2340			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2341			              RADV_CMD_FLAG_FLUSH_AND_INV_DB |
2342			              RADV_CMD_FLAG_INV_GLOBAL_L2;
2343
2344			if (flush_CB_meta)
2345				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2346			if (flush_DB_meta)
2347				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2348			break;
2349		default:
2350			break;
2351		}
2352	}
2353	return flush_bits;
2354}
2355
/* Compute the cache-invalidation bits required before the given destination
 * accesses can safely read. Knowing the image lets us skip flushes that
 * cannot apply (no metadata, not a storage image, shader-coherent on GFX9).
 */
static enum radv_cmd_flush_bits
radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
                      VkAccessFlags dst_flags,
                      struct radv_image *image)
{
	bool flush_CB_meta = true, flush_DB_meta = true;
	enum radv_cmd_flush_bits flush_bits = 0;
	bool flush_CB = true, flush_DB = true;
	bool image_is_coherent = false;
	uint32_t b;

	if (image) {
		/* CB/DB caches only need flushing if the image could have
		 * been written through them while also being shader-read,
		 * which requires STORAGE usage. */
		if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
			flush_CB = false;
			flush_DB = false;
		}

		if (!radv_image_has_CB_metadata(image))
			flush_CB_meta = false;
		if (!radv_image_has_htile(image))
			flush_DB_meta = false;

		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
			if (image->info.samples == 1 &&
			    (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
					     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
			    !vk_format_is_stencil(image->vk_format)) {
				/* Single-sample color and single-sample depth
				 * (not stencil) are coherent with shaders on
				 * GFX9.
				 */
				image_is_coherent = true;
			}
		}
	}

	for_each_bit(b, dst_flags) {
		switch ((VkAccessFlagBits)(1 << b)) {
		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
		case VK_ACCESS_INDEX_READ_BIT:
		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
			/* These readers need no cache invalidation. */
			break;
		case VK_ACCESS_UNIFORM_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
			break;
		case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
		case VK_ACCESS_TRANSFER_READ_BIT:
		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 |
			              RADV_CMD_FLAG_INV_GLOBAL_L2;
			break;
		case VK_ACCESS_SHADER_READ_BIT:
			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1;

			/* L2 invalidation can be skipped for images proven
			 * shader-coherent above. */
			if (!image_is_coherent)
				flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2;
			break;
		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
			if (flush_CB)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
			if (flush_CB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
			break;
		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
			if (flush_DB)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
			if (flush_DB_meta)
				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
			break;
		default:
			break;
		}
	}
	return flush_bits;
}
2431
2432void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
2433			  const struct radv_subpass_barrier *barrier)
2434{
2435	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
2436							      NULL);
2437	radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
2438	cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
2439	                                                      NULL);
2440}
2441
2442static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
2443						 struct radv_subpass_attachment att)
2444{
2445	unsigned idx = att.attachment;
2446	struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment;
2447	VkImageSubresourceRange range;
2448	range.aspectMask = 0;
2449	range.baseMipLevel = view->base_mip;
2450	range.levelCount = 1;
2451	range.baseArrayLayer = view->base_layer;
2452	range.layerCount = cmd_buffer->state.framebuffer->layers;
2453
2454	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
2455		/* If the current subpass uses multiview, the driver might have
2456		 * performed a fast color/depth clear to the whole image
2457		 * (including all layers). To make sure the driver will
2458		 * decompress the image correctly (if needed), we have to
2459		 * account for the "real" number of layers. If the view mask is
2460		 * sparse, this will decompress more layers than needed.
2461		 */
2462		range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
2463	}
2464
2465	radv_handle_image_transition(cmd_buffer,
2466				     view->image,
2467				     cmd_buffer->state.attachments[idx].current_layout,
2468				     att.layout, 0, 0, &range);
2469
2470	cmd_buffer->state.attachments[idx].current_layout = att.layout;
2471
2472
2473}
2474
2475void
2476radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
2477			    const struct radv_subpass *subpass)
2478{
2479	cmd_buffer->state.subpass = subpass;
2480
2481	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
2482}
2483
/* Allocate and initialize per-attachment command-buffer state for a render
 * pass: which aspects need clearing, the clear values, and the initial
 * layout. 'info' may be NULL (secondary command buffers inheriting a render
 * pass), in which case clear values are left unset.
 */
static VkResult
radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
				 struct radv_render_pass *pass,
				 const VkRenderPassBeginInfo *info)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (pass->attachment_count == 0) {
		state->attachments = NULL;
		return VK_SUCCESS;
	}

	state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
					pass->attachment_count *
					sizeof(state->attachments[0]),
					8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if (state->attachments == NULL) {
		cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
		return cmd_buffer->record_result;
	}

	for (uint32_t i = 0; i < pass->attachment_count; ++i) {
		struct radv_render_pass_attachment *att = &pass->attachments[i];
		VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
		VkImageAspectFlags clear_aspects = 0;

		if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
			/* color attachment */
			if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
			}
		} else {
			/* depthstencil attachment */
			if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
			    att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
				/* When depth is cleared and stencil is
				 * DONT_CARE, stencil is also cleared - its
				 * contents are undefined anyway, and this
				 * presumably lets the clear take a faster
				 * combined path (NOTE(review): confirm). */
				if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
				    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
					clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
			}
			if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
			    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
				clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
			}
		}

		state->attachments[i].pending_clear_aspects = clear_aspects;
		state->attachments[i].cleared_views = 0;
		if (clear_aspects && info) {
			/* The spec requires clearValueCount to cover every
			 * attachment that is cleared. */
			assert(info->clearValueCount > i);
			state->attachments[i].clear_value = info->pClearValues[i];
		}

		state->attachments[i].current_layout = att->initial_layout;
	}

	return VK_SUCCESS;
}
2542
/* vkAllocateCommandBuffers: hand out command buffers, recycling entries from
 * the pool's free list when possible. On any failure, all buffers created so
 * far are freed and the output array is zeroed, as required by the spec.
 */
VkResult radv_AllocateCommandBuffers(
	VkDevice _device,
	const VkCommandBufferAllocateInfo *pAllocateInfo,
	VkCommandBuffer *pCommandBuffers)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);

	VkResult result = VK_SUCCESS;
	uint32_t i;

	for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

		if (!list_empty(&pool->free_cmd_buffers)) {
			/* Recycle a previously freed command buffer instead
			 * of allocating a new one. */
			struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);

			list_del(&cmd_buffer->pool_link);
			list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

			result = radv_reset_cmd_buffer(cmd_buffer);
			cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
			cmd_buffer->level = pAllocateInfo->level;

			pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
		} else {
			result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
			                                &pCommandBuffers[i]);
		}
		if (result != VK_SUCCESS)
			break;
	}

	if (result != VK_SUCCESS) {
		/* Free the i buffers successfully created before the failure. */
		radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
					i, pCommandBuffers);

		/* From the Vulkan 1.0.66 spec:
		 *
		 * "vkAllocateCommandBuffers can be used to create multiple
		 *  command buffers. If the creation of any of those command
		 *  buffers fails, the implementation must destroy all
		 *  successfully created command buffer objects from this
		 *  command, set all entries of the pCommandBuffers array to
		 *  NULL and return the error."
		 */
		memset(pCommandBuffers, 0,
		       sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
	}

	return result;
}
2594
2595void radv_FreeCommandBuffers(
2596	VkDevice device,
2597	VkCommandPool commandPool,
2598	uint32_t commandBufferCount,
2599	const VkCommandBuffer *pCommandBuffers)
2600{
2601	for (uint32_t i = 0; i < commandBufferCount; i++) {
2602		RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
2603
2604		if (cmd_buffer) {
2605			if (cmd_buffer->pool) {
2606				list_del(&cmd_buffer->pool_link);
2607				list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
2608			} else
2609				radv_cmd_buffer_destroy(cmd_buffer);
2610
2611		}
2612	}
2613}
2614
2615VkResult radv_ResetCommandBuffer(
2616	VkCommandBuffer commandBuffer,
2617	VkCommandBufferResetFlags flags)
2618{
2619	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2620	return radv_reset_cmd_buffer(cmd_buffer);
2621}
2622
/* vkBeginCommandBuffer: (re)initialize recording state. Secondary command
 * buffers continuing a render pass inherit the framebuffer, render pass and
 * subpass from pInheritanceInfo.
 */
VkResult radv_BeginCommandBuffer(
	VkCommandBuffer commandBuffer,
	const VkCommandBufferBeginInfo *pBeginInfo)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	VkResult result = VK_SUCCESS;

	if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
		/* If the command buffer has already been reset with
		 * vkResetCommandBuffer, no need to do it again.
		 */
		result = radv_reset_cmd_buffer(cmd_buffer);
		if (result != VK_SUCCESS)
			return result;
	}

	/* Set all "last seen" tracking values to -1 so the first emission of
	 * each state is never elided. */
	memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
	cmd_buffer->state.last_primitive_reset_en = -1;
	cmd_buffer->state.last_index_type = -1;
	cmd_buffer->state.last_num_instances = -1;
	cmd_buffer->state.last_vertex_offset = -1;
	cmd_buffer->state.last_first_instance = -1;
	cmd_buffer->state.predication_type = -1;
	cmd_buffer->usage_flags = pBeginInfo->flags;

	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
	    (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
		assert(pBeginInfo->pInheritanceInfo);
		cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
		cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);

		struct radv_subpass *subpass =
			&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];

		/* No VkRenderPassBeginInfo here, so clear values stay unset. */
		result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
		if (result != VK_SUCCESS)
			return result;

		radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
	}

	/* GPU hang debugging (RADV_TRACE): make the trace BO resident and
	 * emit an initial trace marker. */
	if (unlikely(cmd_buffer->device->trace_bo)) {
		struct radv_device *device = cmd_buffer->device;

		radv_cs_add_buffer(device->ws, cmd_buffer->cs,
				   device->trace_bo);

		radv_cmd_buffer_trace_emit(cmd_buffer);
	}

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;

	return result;
}
2677
2678void radv_CmdBindVertexBuffers(
2679	VkCommandBuffer                             commandBuffer,
2680	uint32_t                                    firstBinding,
2681	uint32_t                                    bindingCount,
2682	const VkBuffer*                             pBuffers,
2683	const VkDeviceSize*                         pOffsets)
2684{
2685	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2686	struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
2687	bool changed = false;
2688
2689	/* We have to defer setting up vertex buffer since we need the buffer
2690	 * stride from the pipeline. */
2691
2692	assert(firstBinding + bindingCount <= MAX_VBS);
2693	for (uint32_t i = 0; i < bindingCount; i++) {
2694		uint32_t idx = firstBinding + i;
2695
2696		if (!changed &&
2697		    (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
2698		     vb[idx].offset != pOffsets[i])) {
2699			changed = true;
2700		}
2701
2702		vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
2703		vb[idx].offset = pOffsets[i];
2704
2705		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2706				   vb[idx].buffer->bo);
2707	}
2708
2709	if (!changed) {
2710		/* No state changes. */
2711		return;
2712	}
2713
2714	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
2715}
2716
2717void radv_CmdBindIndexBuffer(
2718	VkCommandBuffer                             commandBuffer,
2719	VkBuffer buffer,
2720	VkDeviceSize offset,
2721	VkIndexType indexType)
2722{
2723	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2724	RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
2725
2726	if (cmd_buffer->state.index_buffer == index_buffer &&
2727	    cmd_buffer->state.index_offset == offset &&
2728	    cmd_buffer->state.index_type == indexType) {
2729		/* No state changes. */
2730		return;
2731	}
2732
2733	cmd_buffer->state.index_buffer = index_buffer;
2734	cmd_buffer->state.index_offset = offset;
2735	cmd_buffer->state.index_type = indexType; /* vk matches hw */
2736	cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
2737	cmd_buffer->state.index_va += index_buffer->offset + offset;
2738
2739	int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
2740	cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
2741	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
2742	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
2743}
2744
2745
2746static void
2747radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
2748			 VkPipelineBindPoint bind_point,
2749			 struct radv_descriptor_set *set, unsigned idx)
2750{
2751	struct radeon_winsys *ws = cmd_buffer->device->ws;
2752
2753	radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
2754
2755	assert(set);
2756	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
2757
2758	if (!cmd_buffer->device->use_global_bo_list) {
2759		for (unsigned j = 0; j < set->layout->buffer_count; ++j)
2760			if (set->descriptors[j])
2761				radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
2762	}
2763
2764	if(set->bo)
2765		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
2766}
2767
/* Bind descriptorSetCount sets starting at firstSet and rebuild the
 * hardware buffer descriptors for every dynamic buffer they contain,
 * applying the caller-supplied dynamic offsets.
 */
void radv_CmdBindDescriptorSets(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipelineLayout                            _layout,
	uint32_t                                    firstSet,
	uint32_t                                    descriptorSetCount,
	const VkDescriptorSet*                      pDescriptorSets,
	uint32_t                                    dynamicOffsetCount,
	const uint32_t*                             pDynamicOffsets)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	unsigned dyn_idx = 0;

	/* Debug option: give dynamic buffers the maximum size instead of
	 * their real range. */
	const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);

	for (unsigned i = 0; i < descriptorSetCount; ++i) {
		unsigned idx = i + firstSet;
		RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
		radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, idx);

		/* Write a 4-dword buffer descriptor per dynamic buffer in
		 * this set, offset by the matching entry of
		 * pDynamicOffsets. */
		for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
			unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
			uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
			assert(dyn_idx < dynamicOffsetCount);

			struct radv_descriptor_range *range = set->dynamic_descriptors + j;
			uint64_t va = range->va + pDynamicOffsets[dyn_idx];
			dst[0] = va;
			dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
			dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
			/* Dirty the stages that consume these dynamic
			 * descriptors so the updated values get re-emitted. */
			cmd_buffer->push_constant_stages |=
			                     set->layout->dynamic_shader_stages;
		}
	}
}
2812
2813static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
2814                                          struct radv_descriptor_set *set,
2815                                          struct radv_descriptor_set_layout *layout,
2816					  VkPipelineBindPoint bind_point)
2817{
2818	struct radv_descriptor_state *descriptors_state =
2819		radv_get_descriptors_state(cmd_buffer, bind_point);
2820	set->size = layout->size;
2821	set->layout = layout;
2822
2823	if (descriptors_state->push_set.capacity < set->size) {
2824		size_t new_size = MAX2(set->size, 1024);
2825		new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
2826		new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
2827
2828		free(set->mapped_ptr);
2829		set->mapped_ptr = malloc(new_size);
2830
2831		if (!set->mapped_ptr) {
2832			descriptors_state->push_set.capacity = 0;
2833			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
2834			return false;
2835		}
2836
2837		descriptors_state->push_set.capacity = new_size;
2838	}
2839
2840	return true;
2841}
2842
/* Internal meta-operation variant of vkCmdPushDescriptorSetKHR: uploads the
 * descriptors into the command buffer's upload BO instead of the regular
 * push-set storage. Only set 0 is supported.
 */
void radv_meta_push_descriptor_set(
	struct radv_cmd_buffer*              cmd_buffer,
	VkPipelineBindPoint                  pipelineBindPoint,
	VkPipelineLayout                     _layout,
	uint32_t                             set,
	uint32_t                             descriptorWriteCount,
	const VkWriteDescriptorSet*          pDescriptorWrites)
{
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
	unsigned bo_offset;

	assert(set == 0);
	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	push_set->size = layout->set[set].layout->size;
	push_set->layout = layout->set[set].layout;

	/* Allocation failure is silently dropped here; the upload helper is
	 * expected to record the error on the command buffer. */
	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
	                                  &bo_offset,
	                                  (void**) &push_set->mapped_ptr))
		return;

	/* GPU address of the freshly allocated descriptor memory. */
	push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
	push_set->va += bo_offset;

	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
	                            radv_descriptor_set_to_handle(push_set),
	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);

	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
}
2875
2876void radv_CmdPushDescriptorSetKHR(
2877	VkCommandBuffer                             commandBuffer,
2878	VkPipelineBindPoint                         pipelineBindPoint,
2879	VkPipelineLayout                            _layout,
2880	uint32_t                                    set,
2881	uint32_t                                    descriptorWriteCount,
2882	const VkWriteDescriptorSet*                 pDescriptorWrites)
2883{
2884	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2885	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
2886	struct radv_descriptor_state *descriptors_state =
2887		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
2888	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
2889
2890	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
2891
2892	if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
2893					   layout->set[set].layout,
2894					   pipelineBindPoint))
2895		return;
2896
2897	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
2898	                            radv_descriptor_set_to_handle(push_set),
2899	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);
2900
2901	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
2902	descriptors_state->push_dirty = true;
2903}
2904
2905void radv_CmdPushDescriptorSetWithTemplateKHR(
2906	VkCommandBuffer                             commandBuffer,
2907	VkDescriptorUpdateTemplate                  descriptorUpdateTemplate,
2908	VkPipelineLayout                            _layout,
2909	uint32_t                                    set,
2910	const void*                                 pData)
2911{
2912	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2913	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
2914	RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
2915	struct radv_descriptor_state *descriptors_state =
2916		radv_get_descriptors_state(cmd_buffer, templ->bind_point);
2917	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
2918
2919	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
2920
2921	if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
2922					   layout->set[set].layout,
2923					   templ->bind_point))
2924		return;
2925
2926	radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
2927						 descriptorUpdateTemplate, pData);
2928
2929	radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
2930	descriptors_state->push_dirty = true;
2931}
2932
2933void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
2934			   VkPipelineLayout layout,
2935			   VkShaderStageFlags stageFlags,
2936			   uint32_t offset,
2937			   uint32_t size,
2938			   const void* pValues)
2939{
2940	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2941	memcpy(cmd_buffer->push_constants + offset, pValues, size);
2942	cmd_buffer->push_constant_stages |= stageFlags;
2943}
2944
/* Finish recording: emit the trailing cache flushes, wait for CP DMA,
 * finalize the CS and return any error recorded during recording.
 */
VkResult radv_EndCommandBuffer(
	VkCommandBuffer                             commandBuffer)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
		/* SI needs extra flushes at the end of the IB. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class == SI)
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
		si_emit_cache_flush(cmd_buffer);
	}

	/* Make sure CP DMA is idle at the end of IBs because the kernel
	 * doesn't wait for it.
	 */
	si_cp_dma_wait_for_idle(cmd_buffer);

	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
		return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);

	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

	/* Propagate any deferred error (e.g. OOM) hit while recording. */
	return cmd_buffer->record_result;
}
2970
/* Emit the bound compute pipeline's pre-built command stream if it differs
 * from the one already emitted, and track its scratch and shader BO needs.
 */
static void
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;

	/* Nothing bound, or already emitted: no work to do. */
	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
		return;

	/* Compute pipelines never carry context registers. */
	assert(!pipeline->ctx_cs.cdw);

	cmd_buffer->state.emitted_compute_pipeline = pipeline;

	/* Copy the pipeline's pre-recorded packets into our CS. */
	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);

	cmd_buffer->compute_scratch_size_needed =
	                          MAX2(cmd_buffer->compute_scratch_size_needed,
	                               pipeline->max_waves * pipeline->scratch_bytes_per_wave);

	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
			   pipeline->shaders[MESA_SHADER_COMPUTE]->bo);

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
}
2996
2997static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
2998					    VkPipelineBindPoint bind_point)
2999{
3000	struct radv_descriptor_state *descriptors_state =
3001		radv_get_descriptors_state(cmd_buffer, bind_point);
3002
3003	descriptors_state->dirty |= descriptors_state->valid;
3004}
3005
/* Bind a compute or graphics pipeline, dirtying the dependent state so it
 * is re-emitted at the next dispatch/draw.
 */
void radv_CmdBindPipeline(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipeline                                  _pipeline)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

	switch (pipelineBindPoint) {
	case VK_PIPELINE_BIND_POINT_COMPUTE:
		/* Re-binding the same pipeline is a no-op. */
		if (cmd_buffer->state.compute_pipeline == pipeline)
			return;
		radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

		cmd_buffer->state.compute_pipeline = pipeline;
		cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
		break;
	case VK_PIPELINE_BIND_POINT_GRAPHICS:
		if (cmd_buffer->state.pipeline == pipeline)
			return;
		radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

		cmd_buffer->state.pipeline = pipeline;
		if (!pipeline)
			break;

		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
		cmd_buffer->push_constant_stages |= pipeline->active_stages;

		/* the new vertex shader might not have the same user regs */
		cmd_buffer->state.last_first_instance = -1;
		cmd_buffer->state.last_vertex_offset = -1;

		/* Prefetch all pipeline shaders at first draw time. */
		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

		radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
		radv_bind_streamout_state(cmd_buffer, pipeline);

		/* Grow the ring-size requirements to cover this pipeline;
		 * actual rings are (re)allocated at submit time. */
		if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
			cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
		if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
			cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;

		if (radv_pipeline_has_tess(pipeline))
			cmd_buffer->tess_rings_needed = true;
		break;
	default:
		assert(!"invalid bind point");
		break;
	}
}
3058
3059void radv_CmdSetViewport(
3060	VkCommandBuffer                             commandBuffer,
3061	uint32_t                                    firstViewport,
3062	uint32_t                                    viewportCount,
3063	const VkViewport*                           pViewports)
3064{
3065	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3066	struct radv_cmd_state *state = &cmd_buffer->state;
3067	MAYBE_UNUSED const uint32_t total_count = firstViewport + viewportCount;
3068
3069	assert(firstViewport < MAX_VIEWPORTS);
3070	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
3071
3072	if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
3073		    pViewports, viewportCount * sizeof(*pViewports))) {
3074		return;
3075	}
3076
3077	memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
3078	       viewportCount * sizeof(*pViewports));
3079
3080	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
3081}
3082
3083void radv_CmdSetScissor(
3084	VkCommandBuffer                             commandBuffer,
3085	uint32_t                                    firstScissor,
3086	uint32_t                                    scissorCount,
3087	const VkRect2D*                             pScissors)
3088{
3089	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3090	struct radv_cmd_state *state = &cmd_buffer->state;
3091	MAYBE_UNUSED const uint32_t total_count = firstScissor + scissorCount;
3092
3093	assert(firstScissor < MAX_SCISSORS);
3094	assert(total_count >= 1 && total_count <= MAX_SCISSORS);
3095
3096	if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
3097		    scissorCount * sizeof(*pScissors))) {
3098		return;
3099	}
3100
3101	memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
3102	       scissorCount * sizeof(*pScissors));
3103
3104	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
3105}
3106
3107void radv_CmdSetLineWidth(
3108	VkCommandBuffer                             commandBuffer,
3109	float                                       lineWidth)
3110{
3111	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3112
3113	if (cmd_buffer->state.dynamic.line_width == lineWidth)
3114		return;
3115
3116	cmd_buffer->state.dynamic.line_width = lineWidth;
3117	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
3118}
3119
3120void radv_CmdSetDepthBias(
3121	VkCommandBuffer                             commandBuffer,
3122	float                                       depthBiasConstantFactor,
3123	float                                       depthBiasClamp,
3124	float                                       depthBiasSlopeFactor)
3125{
3126	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3127	struct radv_cmd_state *state = &cmd_buffer->state;
3128
3129	if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
3130	    state->dynamic.depth_bias.clamp == depthBiasClamp &&
3131	    state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
3132		return;
3133	}
3134
3135	state->dynamic.depth_bias.bias = depthBiasConstantFactor;
3136	state->dynamic.depth_bias.clamp = depthBiasClamp;
3137	state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
3138
3139	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
3140}
3141
3142void radv_CmdSetBlendConstants(
3143	VkCommandBuffer                             commandBuffer,
3144	const float                                 blendConstants[4])
3145{
3146	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3147	struct radv_cmd_state *state = &cmd_buffer->state;
3148
3149	if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
3150		return;
3151
3152	memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
3153
3154	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
3155}
3156
3157void radv_CmdSetDepthBounds(
3158	VkCommandBuffer                             commandBuffer,
3159	float                                       minDepthBounds,
3160	float                                       maxDepthBounds)
3161{
3162	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3163	struct radv_cmd_state *state = &cmd_buffer->state;
3164
3165	if (state->dynamic.depth_bounds.min == minDepthBounds &&
3166	    state->dynamic.depth_bounds.max == maxDepthBounds) {
3167		return;
3168	}
3169
3170	state->dynamic.depth_bounds.min = minDepthBounds;
3171	state->dynamic.depth_bounds.max = maxDepthBounds;
3172
3173	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
3174}
3175
3176void radv_CmdSetStencilCompareMask(
3177	VkCommandBuffer                             commandBuffer,
3178	VkStencilFaceFlags                          faceMask,
3179	uint32_t                                    compareMask)
3180{
3181	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3182	struct radv_cmd_state *state = &cmd_buffer->state;
3183	bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
3184	bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
3185
3186	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3187	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3188		return;
3189	}
3190
3191	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3192		state->dynamic.stencil_compare_mask.front = compareMask;
3193	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3194		state->dynamic.stencil_compare_mask.back = compareMask;
3195
3196	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
3197}
3198
3199void radv_CmdSetStencilWriteMask(
3200	VkCommandBuffer                             commandBuffer,
3201	VkStencilFaceFlags                          faceMask,
3202	uint32_t                                    writeMask)
3203{
3204	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3205	struct radv_cmd_state *state = &cmd_buffer->state;
3206	bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
3207	bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
3208
3209	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3210	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3211		return;
3212	}
3213
3214	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3215		state->dynamic.stencil_write_mask.front = writeMask;
3216	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3217		state->dynamic.stencil_write_mask.back = writeMask;
3218
3219	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
3220}
3221
3222void radv_CmdSetStencilReference(
3223	VkCommandBuffer                             commandBuffer,
3224	VkStencilFaceFlags                          faceMask,
3225	uint32_t                                    reference)
3226{
3227	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3228	struct radv_cmd_state *state = &cmd_buffer->state;
3229	bool front_same = state->dynamic.stencil_reference.front == reference;
3230	bool back_same = state->dynamic.stencil_reference.back == reference;
3231
3232	if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3233	    (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3234		return;
3235	}
3236
3237	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3238		cmd_buffer->state.dynamic.stencil_reference.front = reference;
3239	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3240		cmd_buffer->state.dynamic.stencil_reference.back = reference;
3241
3242	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
3243}
3244
3245void radv_CmdSetDiscardRectangleEXT(
3246	VkCommandBuffer                             commandBuffer,
3247	uint32_t                                    firstDiscardRectangle,
3248	uint32_t                                    discardRectangleCount,
3249	const VkRect2D*                             pDiscardRectangles)
3250{
3251	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3252	struct radv_cmd_state *state = &cmd_buffer->state;
3253	MAYBE_UNUSED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
3254
3255	assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
3256	assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
3257
3258	if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
3259		    pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
3260		return;
3261	}
3262
3263	typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
3264	             pDiscardRectangles, discardRectangleCount);
3265
3266	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
3267}
3268
/* Execute secondary command buffers on a primary: chain their command
 * streams and merge resource requirements and last-emitted state so the
 * primary does not redundantly re-emit packets afterwards.
 */
void radv_CmdExecuteCommands(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    commandBufferCount,
	const VkCommandBuffer*                      pCmdBuffers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

	assert(commandBufferCount > 0);

	/* Emit pending flushes on primary prior to executing secondary */
	si_emit_cache_flush(primary);

	for (uint32_t i = 0; i < commandBufferCount; i++) {
		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);

		/* Take the maximum of every scratch/ring requirement so the
		 * primary's submission covers the secondaries. */
		primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
		                                    secondary->scratch_size_needed);
		primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
		                                            secondary->compute_scratch_size_needed);

		if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
			primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
		if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
			primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
		if (secondary->tess_rings_needed)
			primary->tess_rings_needed = true;
		if (secondary->sample_positions_needed)
			primary->sample_positions_needed = true;

		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);


		/* When the secondary command buffer is compute only we don't
		 * need to re-emit the current graphics pipeline.
		 */
		if (secondary->state.emitted_pipeline) {
			primary->state.emitted_pipeline =
				secondary->state.emitted_pipeline;
		}

		/* When the secondary command buffer is graphics only we don't
		 * need to re-emit the current compute pipeline.
		 */
		if (secondary->state.emitted_compute_pipeline) {
			primary->state.emitted_compute_pipeline =
				secondary->state.emitted_compute_pipeline;
		}

		/* Only re-emit the draw packets when needed. */
		if (secondary->state.last_primitive_reset_en != -1) {
			primary->state.last_primitive_reset_en =
				secondary->state.last_primitive_reset_en;
		}

		if (secondary->state.last_primitive_reset_index) {
			primary->state.last_primitive_reset_index =
				secondary->state.last_primitive_reset_index;
		}

		if (secondary->state.last_ia_multi_vgt_param) {
			primary->state.last_ia_multi_vgt_param =
				secondary->state.last_ia_multi_vgt_param;
		}

		/* These are unconditionally inherited — the secondary's draw
		 * packets determine the hardware's view of them now. */
		primary->state.last_first_instance = secondary->state.last_first_instance;
		primary->state.last_num_instances = secondary->state.last_num_instances;
		primary->state.last_vertex_offset = secondary->state.last_vertex_offset;

		if (secondary->state.last_index_type != -1) {
			primary->state.last_index_type =
				secondary->state.last_index_type;
		}
	}

	/* After executing commands from secondary buffers we have to dirty
	 * some states.
	 */
	primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
				RADV_CMD_DIRTY_INDEX_BUFFER |
				RADV_CMD_DIRTY_DYNAMIC_ALL;
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}
3352
3353VkResult radv_CreateCommandPool(
3354	VkDevice                                    _device,
3355	const VkCommandPoolCreateInfo*              pCreateInfo,
3356	const VkAllocationCallbacks*                pAllocator,
3357	VkCommandPool*                              pCmdPool)
3358{
3359	RADV_FROM_HANDLE(radv_device, device, _device);
3360	struct radv_cmd_pool *pool;
3361
3362	pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
3363			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3364	if (pool == NULL)
3365		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
3366
3367	if (pAllocator)
3368		pool->alloc = *pAllocator;
3369	else
3370		pool->alloc = device->alloc;
3371
3372	list_inithead(&pool->cmd_buffers);
3373	list_inithead(&pool->free_cmd_buffers);
3374
3375	pool->queue_family_index = pCreateInfo->queueFamilyIndex;
3376
3377	*pCmdPool = radv_cmd_pool_to_handle(pool);
3378
3379	return VK_SUCCESS;
3380
3381}
3382
3383void radv_DestroyCommandPool(
3384	VkDevice                                    _device,
3385	VkCommandPool                               commandPool,
3386	const VkAllocationCallbacks*                pAllocator)
3387{
3388	RADV_FROM_HANDLE(radv_device, device, _device);
3389	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3390
3391	if (!pool)
3392		return;
3393
3394	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3395				 &pool->cmd_buffers, pool_link) {
3396		radv_cmd_buffer_destroy(cmd_buffer);
3397	}
3398
3399	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3400				 &pool->free_cmd_buffers, pool_link) {
3401		radv_cmd_buffer_destroy(cmd_buffer);
3402	}
3403
3404	vk_free2(&device->alloc, pAllocator, pool);
3405}
3406
3407VkResult radv_ResetCommandPool(
3408	VkDevice                                    device,
3409	VkCommandPool                               commandPool,
3410	VkCommandPoolResetFlags                     flags)
3411{
3412	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3413	VkResult result;
3414
3415	list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
3416			    &pool->cmd_buffers, pool_link) {
3417		result = radv_reset_cmd_buffer(cmd_buffer);
3418		if (result != VK_SUCCESS)
3419			return result;
3420	}
3421
3422	return VK_SUCCESS;
3423}
3424
3425void radv_TrimCommandPool(
3426    VkDevice                                    device,
3427    VkCommandPool                               commandPool,
3428    VkCommandPoolTrimFlags                      flags)
3429{
3430	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3431
3432	if (!pool)
3433		return;
3434
3435	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3436				 &pool->free_cmd_buffers, pool_link) {
3437		radv_cmd_buffer_destroy(cmd_buffer);
3438	}
3439}
3440
3441static uint32_t
3442radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
3443{
3444	struct radv_cmd_state *state = &cmd_buffer->state;
3445	uint32_t subpass_id = state->subpass - state->pass->subpasses;
3446
3447	/* The id of this subpass shouldn't exceed the number of subpasses in
3448	 * this render pass minus 1.
3449	 */
3450	assert(subpass_id < state->pass->subpass_count);
3451	return subpass_id;
3452}
3453
/* Enter subpass 'subpass_id': emit its start barrier, transition every
 * attachment it uses, make it current and perform its load-op clears.
 */
static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
			      uint32_t subpass_id)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

	/* Reserve a generous amount of CS space up front; the assert below
	 * verifies we stayed within it. */
	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
							   cmd_buffer->cs, 4096);

	radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);

	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
		const uint32_t a = subpass->attachments[i].attachment;
		if (a == VK_ATTACHMENT_UNUSED)
			continue;

		radv_handle_subpass_image_transition(cmd_buffer,
						     subpass->attachments[i]);
	}

	radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
	radv_cmd_buffer_clear_subpass(cmd_buffer);

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
3480
/* Leave the current subpass: run its resolves, then transition every
 * attachment whose last use is this subpass to its final layout.
 */
static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	const struct radv_subpass *subpass = state->subpass;
	uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);

	radv_cmd_buffer_resolve_subpass(cmd_buffer);

	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
		const uint32_t a = subpass->attachments[i].attachment;
		if (a == VK_ATTACHMENT_UNUSED)
			continue;

		/* Only transition attachments whose last use is this
		 * subpass; later subpasses handle the rest. */
		if (state->pass->attachments[a].last_subpass_idx != subpass_id)
			continue;

		VkImageLayout layout = state->pass->attachments[a].final_layout;
		radv_handle_subpass_image_transition(cmd_buffer,
		                      (struct radv_subpass_attachment){a, layout});
	}
}
3503
3504void radv_CmdBeginRenderPass(
3505	VkCommandBuffer                             commandBuffer,
3506	const VkRenderPassBeginInfo*                pRenderPassBegin,
3507	VkSubpassContents                           contents)
3508{
3509	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3510	RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
3511	RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
3512	VkResult result;
3513
3514	cmd_buffer->state.framebuffer = framebuffer;
3515	cmd_buffer->state.pass = pass;
3516	cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
3517
3518	result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
3519	if (result != VK_SUCCESS)
3520		return;
3521
3522	radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
3523}
3524
void radv_CmdBeginRenderPass2KHR(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
{
	/* VK_KHR_create_renderpass2 entry point: identical to the core
	 * version, except the subpass contents come from pSubpassBeginInfo.
	 */
	radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
				pSubpassBeginInfo->contents);
}
3533
3534void radv_CmdNextSubpass(
3535    VkCommandBuffer                             commandBuffer,
3536    VkSubpassContents                           contents)
3537{
3538	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3539
3540	uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
3541	radv_cmd_buffer_end_subpass(cmd_buffer);
3542	radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
3543}
3544
void radv_CmdNextSubpass2KHR(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
	/* VK_KHR_create_renderpass2 entry point: forwards to the core
	 * version using the contents from pSubpassBeginInfo.
	 */
	radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
}
3552
3553static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
3554{
3555	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3556	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
3557		if (!radv_get_shader(pipeline, stage))
3558			continue;
3559
3560		struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
3561		if (loc->sgpr_idx == -1)
3562			continue;
3563		uint32_t base_reg = pipeline->user_data_0[stage];
3564		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
3565
3566	}
3567	if (pipeline->gs_copy_shader) {
3568		struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
3569		if (loc->sgpr_idx != -1) {
3570			uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3571			radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
3572		}
3573	}
3574}
3575
3576static void
3577radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
3578                         uint32_t vertex_count,
3579			 bool use_opaque)
3580{
3581	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
3582	radeon_emit(cmd_buffer->cs, vertex_count);
3583	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
3584	                            S_0287F0_USE_OPAQUE(use_opaque));
3585}
3586
3587static void
3588radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
3589                                 uint64_t index_va,
3590                                 uint32_t index_count)
3591{
3592	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
3593	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
3594	radeon_emit(cmd_buffer->cs, index_va);
3595	radeon_emit(cmd_buffer->cs, index_va >> 32);
3596	radeon_emit(cmd_buffer->cs, index_count);
3597	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
3598}
3599
/* Emit an indirect draw packet. The indirect arguments base address must
 * already have been programmed with SET_BASE by the caller.
 */
static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
                                  bool indexed,
                                  uint32_t draw_count,
                                  uint64_t count_va,
                                  uint32_t stride)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
	                              : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
	bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id;
	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
	bool predicating = cmd_buffer->state.predicating;
	assert(base_reg);

	/* just reset draw state for vertex data */
	cmd_buffer->state.last_first_instance = -1;
	cmd_buffer->state.last_num_instances = -1;
	cmd_buffer->state.last_vertex_offset = -1;

	if (draw_count == 1 && !count_va && !draw_id_enable) {
		/* Single draw without draw-count buffer: the simpler
		 * non-MULTI packet suffices.
		 */
		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
				     PKT3_DRAW_INDIRECT, 3, predicating));
		radeon_emit(cs, 0);
		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, di_src_sel);
	} else {
		/* MULTI variant: the CP loops over the argument buffer and
		 * optionally reads the real draw count from count_va.
		 */
		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
				     PKT3_DRAW_INDIRECT_MULTI,
				     8, predicating));
		radeon_emit(cs, 0);
		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
			    S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
			    S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
		radeon_emit(cs, draw_count); /* count */
		radeon_emit(cs, count_va); /* count_addr */
		radeon_emit(cs, count_va >> 32);
		radeon_emit(cs, stride); /* stride */
		radeon_emit(cs, di_src_sel);
	}
}
3644
/* Emit the draw packets for a direct or indirect draw, replicating the
 * draw once per view when the subpass uses multiview.
 */
static void
radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
		       const struct radv_draw_info *info)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	struct radeon_winsys *ws = cmd_buffer->device->ws;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;

	if (info->indirect) {
		uint64_t va = radv_buffer_get_va(info->indirect->bo);
		uint64_t count_va = 0;

		va += info->indirect->offset + info->indirect_offset;

		radv_cs_add_buffer(ws, cs, info->indirect->bo);

		/* Program the indirect arguments base address. */
		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
		radeon_emit(cs, 1);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		if (info->count_buffer) {
			count_va = radv_buffer_get_va(info->count_buffer->bo);
			count_va += info->count_buffer->offset +
				    info->count_buffer_offset;

			radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
		}

		if (!state->subpass->view_mask) {
			radv_cs_emit_indirect_draw_packet(cmd_buffer,
							  info->indexed,
							  info->count,
							  count_va,
							  info->stride);
		} else {
			/* Multiview: replay the draw for every active view. */
			unsigned i;
			for_each_bit(i, state->subpass->view_mask) {
				radv_emit_view_index(cmd_buffer, i);

				radv_cs_emit_indirect_draw_packet(cmd_buffer,
								  info->indexed,
								  info->count,
								  count_va,
								  info->stride);
			}
		}
	} else {
		assert(state->pipeline->graphics.vtx_base_sgpr);

		/* Only (re-)emit the base vertex / first instance SGPRs when
		 * they changed since the last draw.
		 */
		if (info->vertex_offset != state->last_vertex_offset ||
		    info->first_instance != state->last_first_instance) {
			radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
					      state->pipeline->graphics.vtx_emit_num);

			radeon_emit(cs, info->vertex_offset);
			radeon_emit(cs, info->first_instance);
			/* With 3 emitted SGPRs, the third one (draw id) is 0
			 * for direct draws.
			 */
			if (state->pipeline->graphics.vtx_emit_num == 3)
				radeon_emit(cs, 0);
			state->last_first_instance = info->first_instance;
			state->last_vertex_offset = info->vertex_offset;
		}

		if (state->last_num_instances != info->instance_count) {
			radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
			radeon_emit(cs, info->instance_count);
			state->last_num_instances = info->instance_count;
		}

		if (info->indexed) {
			/* index_type selects 32-bit vs 16-bit indices. */
			int index_size = state->index_type ? 4 : 2;
			uint64_t index_va;

			index_va = state->index_va;
			index_va += info->first_index * index_size;

			if (!state->subpass->view_mask) {
				radv_cs_emit_draw_indexed_packet(cmd_buffer,
								 index_va,
								 info->count);
			} else {
				unsigned i;
				for_each_bit(i, state->subpass->view_mask) {
					radv_emit_view_index(cmd_buffer, i);

					radv_cs_emit_draw_indexed_packet(cmd_buffer,
									 index_va,
									 info->count);
				}
			}
		} else {
			if (!state->subpass->view_mask) {
				radv_cs_emit_draw_packet(cmd_buffer,
							 info->count,
							 !!info->strmout_buffer);
			} else {
				unsigned i;
				for_each_bit(i, state->subpass->view_mask) {
					radv_emit_view_index(cmd_buffer, i);

					radv_cs_emit_draw_packet(cmd_buffer,
								 info->count,
								 !!info->strmout_buffer);
				}
			}
		}
	}
}
3753
3754/*
3755 * Vega and raven have a bug which triggers if there are multiple context
3756 * register contexts active at the same time with different scissor values.
3757 *
3758 * There are two possible workarounds:
3759 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
3760 *    there is only ever 1 active set of scissor values at the same time.
3761 *
3762 * 2) Whenever the hardware switches contexts we have to set the scissor
3763 *    registers again even if it is a noop. That way the new context gets
3764 *    the correct scissor values.
3765 *
3766 * This implements option 2. radv_need_late_scissor_emission needs to
3767 * return true on affected HW if radv_emit_all_graphics_states sets
3768 * any context registers.
3769 */
3770static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
3771                                            const struct radv_draw_info *info)
3772{
3773	struct radv_cmd_state *state = &cmd_buffer->state;
3774
3775	if (!cmd_buffer->device->physical_device->has_scissor_bug)
3776		return false;
3777
3778	if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
3779		return true;
3780
3781	uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
3782
3783	/* Index, vertex and streamout buffers don't change context regs, and
3784	 * pipeline is already handled.
3785	 */
3786	used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
3787			 RADV_CMD_DIRTY_VERTEX_BUFFER |
3788			 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
3789			 RADV_CMD_DIRTY_PIPELINE);
3790
3791	if (cmd_buffer->state.dirty & used_states)
3792		return true;
3793
3794	if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
3795	    (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
3796		return true;
3797
3798	return false;
3799}
3800
/* Emit every piece of dirty graphics state needed before a draw. The
 * ordering of the calls below is significant (see inline comments).
 */
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
			      const struct radv_draw_info *info)
{
	bool late_scissor_emission;

	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
		radv_emit_rbplus_state(cmd_buffer);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
		radv_emit_graphics_pipeline(cmd_buffer);

	/* This should be before the cmd_buffer->state.dirty is cleared
	 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
	 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
	late_scissor_emission =
		radv_need_late_scissor_emission(cmd_buffer, info);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
		radv_emit_framebuffer_state(cmd_buffer);

	if (info->indexed) {
		if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
			radv_emit_index_buffer(cmd_buffer);
	} else {
		/* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
		 * so the state must be re-emitted before the next indexed
		 * draw.
		 */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
			cmd_buffer->state.last_index_type = -1;
			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
		}
	}

	radv_cmd_buffer_flush_dynamic_state(cmd_buffer);

	radv_emit_draw_registers(cmd_buffer, info);

	/* Re-emit the scissor last to work around the Vega/Raven context
	 * switch bug (see radv_need_late_scissor_emission).
	 */
	if (late_scissor_emission)
		radv_emit_scissor(cmd_buffer);
}
3844
/* Common entry point for all draw commands: emits state, flushes caches,
 * uploads descriptors and emits the draw packets, choosing a packet order
 * that minimizes the time the CUs are idle.
 */
static void
radv_draw(struct radv_cmd_buffer *cmd_buffer,
	  const struct radv_draw_info *info)
{
	struct radeon_info *rad_info =
		&cmd_buffer->device->physical_device->rad_info;
	bool has_prefetch =
		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
	bool pipeline_is_dirty =
		(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
		cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;

	/* Reserve space up front so the emission below cannot overflow. */
	MAYBE_UNUSED unsigned cdw_max =
		radeon_check_space(cmd_buffer->device->ws,
				   cmd_buffer->cs, 4096);

	if (likely(!info->indirect)) {
		/* SI-CI treat instance_count==0 as instance_count==1. There is
		 * no workaround for indirect draws, but we can at least skip
		 * direct draws.
		 */
		if (unlikely(!info->instance_count))
			return;

		/* Handle count == 0. */
		if (unlikely(!info->count && !info->strmout_buffer))
			return;
	}

	/* Use optimal packet order based on whether we need to sync the
	 * pipeline.
	 */
	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
		/* If we have to wait for idle, set all states first, so that
		 * all SET packets are processed in parallel with previous draw
		 * calls. Then upload descriptors, set shader pointers, and
		 * draw, and prefetch at the end. This ensures that the time
		 * the CUs are idle is very short. (there are only SET_SH
		 * packets between the wait and the draw)
		 */
		radv_emit_all_graphics_states(cmd_buffer, info);
		si_emit_cache_flush(cmd_buffer);
		/* <-- CUs are idle here --> */

		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

		radv_emit_draw_packets(cmd_buffer, info);
		/* <-- CUs are busy here --> */

		/* Start prefetches after the draw has been started. Both will
		 * run in parallel, but starting the draw first is more
		 * important.
		 */
		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, false);
		}
	} else {
		/* If we don't wait for idle, start prefetches first, then set
		 * states, and draw at the end.
		 */
		si_emit_cache_flush(cmd_buffer);

		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			/* Only prefetch the vertex shader and VBO descriptors
			 * in order to start the draw as soon as possible.
			 */
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, true);
		}

		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

		radv_emit_all_graphics_states(cmd_buffer, info);
		radv_emit_draw_packets(cmd_buffer, info);

		/* Prefetch the remaining shaders after the draw has been
		 * started.
		 */
		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
			radv_emit_prefetch_L2(cmd_buffer,
					      cmd_buffer->state.pipeline, false);
		}
	}

	/* Workaround for a VGT hang when streamout is enabled.
	 * It must be done after drawing.
	 */
	if (cmd_buffer->state.streamout.streamout_enabled &&
	    (rad_info->family == CHIP_HAWAII ||
	     rad_info->family == CHIP_TONGA ||
	     rad_info->family == CHIP_FIJI)) {
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}
3946
3947void radv_CmdDraw(
3948	VkCommandBuffer                             commandBuffer,
3949	uint32_t                                    vertexCount,
3950	uint32_t                                    instanceCount,
3951	uint32_t                                    firstVertex,
3952	uint32_t                                    firstInstance)
3953{
3954	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3955	struct radv_draw_info info = {};
3956
3957	info.count = vertexCount;
3958	info.instance_count = instanceCount;
3959	info.first_instance = firstInstance;
3960	info.vertex_offset = firstVertex;
3961
3962	radv_draw(cmd_buffer, &info);
3963}
3964
3965void radv_CmdDrawIndexed(
3966	VkCommandBuffer                             commandBuffer,
3967	uint32_t                                    indexCount,
3968	uint32_t                                    instanceCount,
3969	uint32_t                                    firstIndex,
3970	int32_t                                     vertexOffset,
3971	uint32_t                                    firstInstance)
3972{
3973	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3974	struct radv_draw_info info = {};
3975
3976	info.indexed = true;
3977	info.count = indexCount;
3978	info.instance_count = instanceCount;
3979	info.first_index = firstIndex;
3980	info.vertex_offset = vertexOffset;
3981	info.first_instance = firstInstance;
3982
3983	radv_draw(cmd_buffer, &info);
3984}
3985
3986void radv_CmdDrawIndirect(
3987	VkCommandBuffer                             commandBuffer,
3988	VkBuffer                                    _buffer,
3989	VkDeviceSize                                offset,
3990	uint32_t                                    drawCount,
3991	uint32_t                                    stride)
3992{
3993	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3994	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
3995	struct radv_draw_info info = {};
3996
3997	info.count = drawCount;
3998	info.indirect = buffer;
3999	info.indirect_offset = offset;
4000	info.stride = stride;
4001
4002	radv_draw(cmd_buffer, &info);
4003}
4004
4005void radv_CmdDrawIndexedIndirect(
4006	VkCommandBuffer                             commandBuffer,
4007	VkBuffer                                    _buffer,
4008	VkDeviceSize                                offset,
4009	uint32_t                                    drawCount,
4010	uint32_t                                    stride)
4011{
4012	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4013	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4014	struct radv_draw_info info = {};
4015
4016	info.indexed = true;
4017	info.count = drawCount;
4018	info.indirect = buffer;
4019	info.indirect_offset = offset;
4020	info.stride = stride;
4021
4022	radv_draw(cmd_buffer, &info);
4023}
4024
4025void radv_CmdDrawIndirectCountAMD(
4026	VkCommandBuffer                             commandBuffer,
4027	VkBuffer                                    _buffer,
4028	VkDeviceSize                                offset,
4029	VkBuffer                                    _countBuffer,
4030	VkDeviceSize                                countBufferOffset,
4031	uint32_t                                    maxDrawCount,
4032	uint32_t                                    stride)
4033{
4034	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4035	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4036	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4037	struct radv_draw_info info = {};
4038
4039	info.count = maxDrawCount;
4040	info.indirect = buffer;
4041	info.indirect_offset = offset;
4042	info.count_buffer = count_buffer;
4043	info.count_buffer_offset = countBufferOffset;
4044	info.stride = stride;
4045
4046	radv_draw(cmd_buffer, &info);
4047}
4048
4049void radv_CmdDrawIndexedIndirectCountAMD(
4050	VkCommandBuffer                             commandBuffer,
4051	VkBuffer                                    _buffer,
4052	VkDeviceSize                                offset,
4053	VkBuffer                                    _countBuffer,
4054	VkDeviceSize                                countBufferOffset,
4055	uint32_t                                    maxDrawCount,
4056	uint32_t                                    stride)
4057{
4058	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4059	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4060	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4061	struct radv_draw_info info = {};
4062
4063	info.indexed = true;
4064	info.count = maxDrawCount;
4065	info.indirect = buffer;
4066	info.indirect_offset = offset;
4067	info.count_buffer = count_buffer;
4068	info.count_buffer_offset = countBufferOffset;
4069	info.stride = stride;
4070
4071	radv_draw(cmd_buffer, &info);
4072}
4073
4074void radv_CmdDrawIndirectCountKHR(
4075	VkCommandBuffer                             commandBuffer,
4076	VkBuffer                                    _buffer,
4077	VkDeviceSize                                offset,
4078	VkBuffer                                    _countBuffer,
4079	VkDeviceSize                                countBufferOffset,
4080	uint32_t                                    maxDrawCount,
4081	uint32_t                                    stride)
4082{
4083	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4084	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4085	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4086	struct radv_draw_info info = {};
4087
4088	info.count = maxDrawCount;
4089	info.indirect = buffer;
4090	info.indirect_offset = offset;
4091	info.count_buffer = count_buffer;
4092	info.count_buffer_offset = countBufferOffset;
4093	info.stride = stride;
4094
4095	radv_draw(cmd_buffer, &info);
4096}
4097
4098void radv_CmdDrawIndexedIndirectCountKHR(
4099	VkCommandBuffer                             commandBuffer,
4100	VkBuffer                                    _buffer,
4101	VkDeviceSize                                offset,
4102	VkBuffer                                    _countBuffer,
4103	VkDeviceSize                                countBufferOffset,
4104	uint32_t                                    maxDrawCount,
4105	uint32_t                                    stride)
4106{
4107	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4108	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4109	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4110	struct radv_draw_info info = {};
4111
4112	info.indexed = true;
4113	info.count = maxDrawCount;
4114	info.indirect = buffer;
4115	info.indirect_offset = offset;
4116	info.count_buffer = count_buffer;
4117	info.count_buffer_offset = countBufferOffset;
4118	info.stride = stride;
4119
4120	radv_draw(cmd_buffer, &info);
4121}
4122
/* Description of a compute dispatch, shared by the direct, base-offset,
 * unaligned and indirect dispatch entry points.
 */
struct radv_dispatch_info {
	/**
	 * Determine the layout of the grid (in block units) to be used.
	 */
	uint32_t blocks[3];

	/**
	 * A starting offset for the grid. If unaligned is set, the offset
	 * must still be aligned.
	 */
	uint32_t offsets[3];
	/**
	 * Whether it's an unaligned compute dispatch (blocks are given in
	 * thread units rather than whole threadgroups).
	 */
	bool unaligned;

	/**
	 * Indirect compute parameters resource.
	 */
	struct radv_buffer *indirect;
	uint64_t indirect_offset;
};
4145
/* Emit the PM4 packets for a direct or indirect compute dispatch,
 * including the grid-size user SGPRs and, for unaligned dispatches, the
 * partial threadgroup setup.
 */
static void
radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
			   const struct radv_dispatch_info *info)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
	struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
	unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
	struct radeon_winsys *ws = cmd_buffer->device->ws;
	bool predicating = cmd_buffer->state.predicating;
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	struct radv_userdata_info *loc;

	/* Location of the grid-size user SGPRs, if the shader uses them. */
	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
				    AC_UD_CS_GRID_SIZE);

	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);

	if (info->indirect) {
		uint64_t va = radv_buffer_get_va(info->indirect->bo);

		va += info->indirect->offset + info->indirect_offset;

		radv_cs_add_buffer(ws, cs, info->indirect->bo);

		/* Copy the grid size from the indirect buffer into the
		 * shader's grid-size user SGPRs.
		 */
		if (loc->sgpr_idx != -1) {
			for (unsigned i = 0; i < 3; ++i) {
				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
						COPY_DATA_DST_SEL(COPY_DATA_REG));
				radeon_emit(cs, (va +  4 * i));
				radeon_emit(cs, (va + 4 * i) >> 32);
				radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
						 + loc->sgpr_idx * 4) >> 2) + i);
				radeon_emit(cs, 0);
			}
		}

		if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
			/* The MEC (compute queue) can dispatch directly from
			 * a memory address.
			 */
			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);
			radeon_emit(cs, dispatch_initiator);
		} else {
			/* On the GFX queue, program the base address first,
			 * then dispatch with a zero offset from it.
			 */
			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, 1);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);

			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
					PKT3_SHADER_TYPE_S(1));
			radeon_emit(cs, 0);
			radeon_emit(cs, dispatch_initiator);
		}
	} else {
		unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
		unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };

		if (info->unaligned) {
			unsigned *cs_block_size = compute_shader->info.cs.block_size;
			unsigned remainder[3];

			/* If aligned, these should be an entire block size,
			 * not 0.
			 */
			remainder[0] = blocks[0] + cs_block_size[0] -
				       align_u32_npot(blocks[0], cs_block_size[0]);
			remainder[1] = blocks[1] + cs_block_size[1] -
				       align_u32_npot(blocks[1], cs_block_size[1]);
			remainder[2] = blocks[2] + cs_block_size[2] -
				       align_u32_npot(blocks[2], cs_block_size[2]);

			/* Convert thread counts to threadgroup counts. */
			blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
			blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
			blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);

			for(unsigned i = 0; i < 3; ++i) {
				assert(offsets[i] % cs_block_size[i] == 0);
				offsets[i] /= cs_block_size[i];
			}

			radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
			radeon_emit(cs,
				    S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
				    S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));

			dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
		}

		/* Pass the grid size to the shader via user SGPRs. */
		if (loc->sgpr_idx != -1) {
			assert(loc->num_sgprs == 3);

			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
						  loc->sgpr_idx * 4, 3);
			radeon_emit(cs, blocks[0]);
			radeon_emit(cs, blocks[1]);
			radeon_emit(cs, blocks[2]);
		}

		if (offsets[0] || offsets[1] || offsets[2]) {
			radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
			radeon_emit(cs, offsets[0]);
			radeon_emit(cs, offsets[1]);
			radeon_emit(cs, offsets[2]);

			/* The blocks in the packet are not counts but end values. */
			for (unsigned i = 0; i < 3; ++i)
				blocks[i] += offsets[i];
		} else {
			dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
		}

		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
				PKT3_SHADER_TYPE_S(1));
		radeon_emit(cs, blocks[0]);
		radeon_emit(cs, blocks[1]);
		radeon_emit(cs, blocks[2]);
		radeon_emit(cs, dispatch_initiator);
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
4275
static void
radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
{
	/* Flush dirty descriptor sets and push constants for the compute
	 * stage so the upcoming dispatch sees up-to-date resources.
	 */
	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
}
4282
/* Common entry point for all dispatch commands: emits pipeline state,
 * flushes caches, uploads descriptors and emits the dispatch packets,
 * mirroring the packet-ordering strategy used for draws in radv_draw().
 */
static void
radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
	      const struct radv_dispatch_info *info)
{
	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
	bool has_prefetch =
		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
	bool pipeline_is_dirty = pipeline &&
				 pipeline != cmd_buffer->state.emitted_compute_pipeline;

	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
		/* If we have to wait for idle, set all states first, so that
		 * all SET packets are processed in parallel with previous draw
		 * calls. Then upload descriptors, set shader pointers, and
		 * dispatch, and prefetch at the end. This ensures that the
		 * time the CUs are idle is very short. (there are only SET_SH
		 * packets between the wait and the draw)
		 */
		radv_emit_compute_pipeline(cmd_buffer);
		si_emit_cache_flush(cmd_buffer);
		/* <-- CUs are idle here --> */

		radv_upload_compute_shader_descriptors(cmd_buffer);

		radv_emit_dispatch_packets(cmd_buffer, info);
		/* <-- CUs are busy here --> */

		/* Start prefetches after the dispatch has been started. Both
		 * will run in parallel, but starting the dispatch first is
		 * more important.
		 */
		if (has_prefetch && pipeline_is_dirty) {
			radv_emit_shader_prefetch(cmd_buffer,
						  pipeline->shaders[MESA_SHADER_COMPUTE]);
		}
	} else {
		/* If we don't wait for idle, start prefetches first, then set
		 * states, and dispatch at the end.
		 */
		si_emit_cache_flush(cmd_buffer);

		if (has_prefetch && pipeline_is_dirty) {
			radv_emit_shader_prefetch(cmd_buffer,
						  pipeline->shaders[MESA_SHADER_COMPUTE]);
		}

		radv_upload_compute_shader_descriptors(cmd_buffer);

		radv_emit_compute_pipeline(cmd_buffer);
		radv_emit_dispatch_packets(cmd_buffer, info);
	}

	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
}
4340
4341void radv_CmdDispatchBase(
4342	VkCommandBuffer                             commandBuffer,
4343	uint32_t                                    base_x,
4344	uint32_t                                    base_y,
4345	uint32_t                                    base_z,
4346	uint32_t                                    x,
4347	uint32_t                                    y,
4348	uint32_t                                    z)
4349{
4350	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4351	struct radv_dispatch_info info = {};
4352
4353	info.blocks[0] = x;
4354	info.blocks[1] = y;
4355	info.blocks[2] = z;
4356
4357	info.offsets[0] = base_x;
4358	info.offsets[1] = base_y;
4359	info.offsets[2] = base_z;
4360	radv_dispatch(cmd_buffer, &info);
4361}
4362
4363void radv_CmdDispatch(
4364	VkCommandBuffer                             commandBuffer,
4365	uint32_t                                    x,
4366	uint32_t                                    y,
4367	uint32_t                                    z)
4368{
4369	radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
4370}
4371
4372void radv_CmdDispatchIndirect(
4373	VkCommandBuffer                             commandBuffer,
4374	VkBuffer                                    _buffer,
4375	VkDeviceSize                                offset)
4376{
4377	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4378	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4379	struct radv_dispatch_info info = {};
4380
4381	info.indirect = buffer;
4382	info.indirect_offset = offset;
4383
4384	radv_dispatch(cmd_buffer, &info);
4385}
4386
4387void radv_unaligned_dispatch(
4388	struct radv_cmd_buffer                      *cmd_buffer,
4389	uint32_t                                    x,
4390	uint32_t                                    y,
4391	uint32_t                                    z)
4392{
4393	struct radv_dispatch_info info = {};
4394
4395	info.blocks[0] = x;
4396	info.blocks[1] = y;
4397	info.blocks[2] = z;
4398	info.unaligned = 1;
4399
4400	radv_dispatch(cmd_buffer, &info);
4401}
4402
4403void radv_CmdEndRenderPass(
4404	VkCommandBuffer                             commandBuffer)
4405{
4406	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4407
4408	radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
4409
4410	radv_cmd_buffer_end_subpass(cmd_buffer);
4411
4412	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
4413
4414	cmd_buffer->state.pass = NULL;
4415	cmd_buffer->state.subpass = NULL;
4416	cmd_buffer->state.attachments = NULL;
4417	cmd_buffer->state.framebuffer = NULL;
4418}
4419
void radv_CmdEndRenderPass2KHR(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
	/* VK_KHR_create_renderpass2 entry point: the subpass-end info carries
	 * nothing this driver uses, so defer to the core implementation. */
	radv_CmdEndRenderPass(commandBuffer);
}
4426
4427/*
4428 * For HTILE we have the following interesting clear words:
4429 *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
4430 *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
4431 *   0xfffffff0: Clear depth to 1.0
4432 *   0x00000000: Clear depth to 0.0
4433 */
4434static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
4435                                  struct radv_image *image,
4436                                  const VkImageSubresourceRange *range,
4437                                  uint32_t clear_word)
4438{
4439	assert(range->baseMipLevel == 0);
4440	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS);
4441	unsigned layer_count = radv_get_layerCount(image, range);
4442	uint64_t size = image->planes[0].surface.htile_slice_size * layer_count;
4443	VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
4444	uint64_t offset = image->offset + image->htile_offset +
4445	                  image->planes[0].surface.htile_slice_size * range->baseArrayLayer;
4446	struct radv_cmd_state *state = &cmd_buffer->state;
4447	VkClearDepthStencilValue value = {};
4448
4449	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4450			     RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4451
4452	state->flush_bits |= radv_fill_buffer(cmd_buffer, image->bo, offset,
4453					      size, clear_word);
4454
4455	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4456
4457	if (vk_format_is_stencil(image->vk_format))
4458		aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4459
4460	radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects);
4461
4462	if (radv_image_is_tc_compat_htile(image)) {
4463		/* Initialize the TC-compat metada value to 0 because by
4464		 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
4465		 * need have to conditionally update its value when performing
4466		 * a fast depth clear.
4467		 */
4468		radv_set_tc_compat_zrange_metadata(cmd_buffer, image, 0);
4469	}
4470}
4471
4472static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
4473					       struct radv_image *image,
4474					       VkImageLayout src_layout,
4475					       VkImageLayout dst_layout,
4476					       unsigned src_queue_mask,
4477					       unsigned dst_queue_mask,
4478					       const VkImageSubresourceRange *range)
4479{
4480	if (!radv_image_has_htile(image))
4481		return;
4482
4483	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
4484		uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
4485
4486		if (radv_layout_is_htile_compressed(image, dst_layout,
4487						    dst_queue_mask)) {
4488			clear_value = 0;
4489		}
4490
4491		radv_initialize_htile(cmd_buffer, image, range, clear_value);
4492	} else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
4493	           radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
4494		uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
4495		radv_initialize_htile(cmd_buffer, image, range, clear_value);
4496	} else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
4497	           !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
4498		VkImageSubresourceRange local_range = *range;
4499		local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4500		local_range.baseMipLevel = 0;
4501		local_range.levelCount = 1;
4502
4503		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4504		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4505
4506		radv_decompress_depth_image_inplace(cmd_buffer, image, &local_range);
4507
4508		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4509		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4510	}
4511}
4512
/* Fill the CMASK metadata of a color image with the given value.  The
 * statement order matters: the pre-fill flush bits must be accumulated
 * before radv_clear_cmask() records its commands. */
static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
				  struct radv_image *image, uint32_t value)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	/* Idle the CB and flush its metadata caches before the rewrite. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			    RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	/* The clear reports which additional caches it dirtied. */
	state->flush_bits |= radv_clear_cmask(cmd_buffer, image, value);

	/* Invalidate the CB metadata caches again once the fill is done. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}
4525
/* Initialize the FMASK metadata of a multisampled color image with the
 * clear value matching its sample count. */
void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
			   struct radv_image *image)
{
	struct radv_cmd_state *state = &cmd_buffer->state;
	/* Per-sample-count FMASK clear words, indexed by log2(samples). */
	static const uint32_t fmask_clear_values[4] = {
		0x00000000,
		0x02020202,
		0xE4E4E4E4,
		0x76543210
	};
	uint32_t log2_samples = util_logbase2(image->info.samples);
	uint32_t value = fmask_clear_values[log2_samples];

	/* Idle the CB and flush its metadata caches before the rewrite. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	/* The clear reports which additional caches it dirtied. */
	state->flush_bits |= radv_clear_fmask(cmd_buffer, image, value);

	/* Invalidate the CB metadata caches again once the fill is done. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}
4546
/* Fill the DCC metadata of a color image with the given value (e.g.
 * 0xffffffff for fully expanded, 0x20202020 for compressed-cleared). */
void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
			 struct radv_image *image, uint32_t value)
{
	struct radv_cmd_state *state = &cmd_buffer->state;

	/* Idle the CB and flush its metadata caches before the rewrite. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;

	/* The clear reports which additional caches it dirtied. */
	state->flush_bits |= radv_clear_dcc(cmd_buffer, image, value);

	/* Flush the CB and its metadata caches again once the fill is done. */
	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
}
4560
4561/**
4562 * Initialize DCC/FMASK/CMASK metadata for a color image.
4563 */
4564static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
4565					   struct radv_image *image,
4566					   VkImageLayout src_layout,
4567					   VkImageLayout dst_layout,
4568					   unsigned src_queue_mask,
4569					   unsigned dst_queue_mask)
4570{
4571	if (radv_image_has_cmask(image)) {
4572		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
4573
4574		/*  TODO: clarify this. */
4575		if (radv_image_has_fmask(image)) {
4576			value = 0xccccccccu;
4577		}
4578
4579		radv_initialise_cmask(cmd_buffer, image, value);
4580	}
4581
4582	if (radv_image_has_fmask(image)) {
4583		radv_initialize_fmask(cmd_buffer, image);
4584	}
4585
4586	if (radv_image_has_dcc(image)) {
4587		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
4588		bool need_decompress_pass = false;
4589
4590		if (radv_layout_dcc_compressed(image, dst_layout,
4591					       dst_queue_mask)) {
4592			value = 0x20202020u;
4593			need_decompress_pass = true;
4594		}
4595
4596		radv_initialize_dcc(cmd_buffer, image, value);
4597
4598		radv_update_fce_metadata(cmd_buffer, image,
4599					 need_decompress_pass);
4600	}
4601
4602	if (radv_image_has_cmask(image) || radv_image_has_dcc(image)) {
4603		uint32_t color_values[2] = {};
4604		radv_set_color_clear_metadata(cmd_buffer, image, color_values);
4605	}
4606}
4607
4608/**
4609 * Handle color image transitions for DCC/FMASK/CMASK.
4610 */
4611static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
4612					       struct radv_image *image,
4613					       VkImageLayout src_layout,
4614					       VkImageLayout dst_layout,
4615					       unsigned src_queue_mask,
4616					       unsigned dst_queue_mask,
4617					       const VkImageSubresourceRange *range)
4618{
4619	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
4620		radv_init_color_image_metadata(cmd_buffer, image,
4621					       src_layout, dst_layout,
4622					       src_queue_mask, dst_queue_mask);
4623		return;
4624	}
4625
4626	if (radv_image_has_dcc(image)) {
4627		if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
4628			radv_initialize_dcc(cmd_buffer, image, 0xffffffffu);
4629		} else if (radv_layout_dcc_compressed(image, src_layout, src_queue_mask) &&
4630		           !radv_layout_dcc_compressed(image, dst_layout, dst_queue_mask)) {
4631			radv_decompress_dcc(cmd_buffer, image, range);
4632		} else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
4633			   !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
4634			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
4635		}
4636	} else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
4637		if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
4638		    !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
4639			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
4640		}
4641
4642		if (radv_image_has_fmask(image)) {
4643			if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
4644			    dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
4645				radv_expand_fmask_image_inplace(cmd_buffer, image, range);
4646			}
4647		}
4648	}
4649}
4650
4651static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
4652					 struct radv_image *image,
4653					 VkImageLayout src_layout,
4654					 VkImageLayout dst_layout,
4655					 uint32_t src_family,
4656					 uint32_t dst_family,
4657					 const VkImageSubresourceRange *range)
4658{
4659	if (image->exclusive && src_family != dst_family) {
4660		/* This is an acquire or a release operation and there will be
4661		 * a corresponding release/acquire. Do the transition in the
4662		 * most flexible queue. */
4663
4664		assert(src_family == cmd_buffer->queue_family_index ||
4665		       dst_family == cmd_buffer->queue_family_index);
4666
4667		if (src_family == VK_QUEUE_FAMILY_EXTERNAL)
4668			return;
4669
4670		if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
4671			return;
4672
4673		if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
4674		    (src_family == RADV_QUEUE_GENERAL ||
4675		     dst_family == RADV_QUEUE_GENERAL))
4676			return;
4677	}
4678
4679	if (src_layout == dst_layout)
4680		return;
4681
4682	unsigned src_queue_mask =
4683		radv_image_queue_family_mask(image, src_family,
4684					     cmd_buffer->queue_family_index);
4685	unsigned dst_queue_mask =
4686		radv_image_queue_family_mask(image, dst_family,
4687					     cmd_buffer->queue_family_index);
4688
4689	if (vk_format_is_depth(image->vk_format)) {
4690		radv_handle_depth_image_transition(cmd_buffer, image,
4691						   src_layout, dst_layout,
4692						   src_queue_mask, dst_queue_mask,
4693						   range);
4694	} else {
4695		radv_handle_color_image_transition(cmd_buffer, image,
4696						   src_layout, dst_layout,
4697						   src_queue_mask, dst_queue_mask,
4698						   range);
4699	}
4700}
4701
/* Parameters shared by the vkCmdPipelineBarrier and vkCmdWaitEvents
 * implementations (see radv_barrier()). */
struct radv_barrier_info {
	/* Events to wait on before applying the barrier (vkCmdWaitEvents
	 * only; zero/NULL for pipeline barriers). */
	uint32_t eventCount;
	const VkEvent *pEvents;
	/* Source and destination stage masks of the dependency. */
	VkPipelineStageFlags srcStageMask;
	VkPipelineStageFlags dstStageMask;
};
4708
/**
 * Common implementation of vkCmdPipelineBarrier and vkCmdWaitEvents:
 * wait for events, translate access masks into cache flush bits, flush
 * stages, and perform image layout transitions.
 */
static void
radv_barrier(struct radv_cmd_buffer *cmd_buffer,
	     uint32_t memoryBarrierCount,
	     const VkMemoryBarrier *pMemoryBarriers,
	     uint32_t bufferMemoryBarrierCount,
	     const VkBufferMemoryBarrier *pBufferMemoryBarriers,
	     uint32_t imageMemoryBarrierCount,
	     const VkImageMemoryBarrier *pImageMemoryBarriers,
	     const struct radv_barrier_info *info)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	enum radv_cmd_flush_bits src_flush_bits = 0;
	enum radv_cmd_flush_bits dst_flush_bits = 0;

	/* Wait for each event BO to contain 1 (written by write_event()). */
	for (unsigned i = 0; i < info->eventCount; ++i) {
		RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
		uint64_t va = radv_buffer_get_va(event->bo);

		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);

		radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
		assert(cmd_buffer->cs->cdw <= cdw_max);
	}

	/* Accumulate the cache flush/invalidate bits implied by the access
	 * masks of all barriers.  Buffer/memory barriers pass NULL because
	 * no image-specific flushes apply. */
	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
							NULL);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
		                                        NULL);
	}

	for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
							NULL);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
		                                        NULL);
	}

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);

		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
							image);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
		                                        image);
	}

	/* The Vulkan spec 1.1.98 says:
	 *
	 * "An execution dependency with only
	 *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
	 *  will only prevent that stage from executing in subsequently
	 *  submitted commands. As this stage does not perform any actual
	 *  execution, this is not observable - in effect, it does not delay
	 *  processing of subsequent commands. Similarly an execution dependency
	 *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
	 *  will effectively not wait for any prior commands to complete."
	 */
	if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
		radv_stage_flush(cmd_buffer, info->srcStageMask);
	cmd_buffer->state.flush_bits |= src_flush_bits;

	/* With the source caches flushed, perform the layout transitions. */
	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
		radv_handle_image_transition(cmd_buffer, image,
					     pImageMemoryBarriers[i].oldLayout,
					     pImageMemoryBarriers[i].newLayout,
					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
					     &pImageMemoryBarriers[i].subresourceRange);
	}

	/* Make sure CP DMA is idle because the driver might have performed a
	 * DMA operation for copying or filling buffers/images.
	 */
	if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
				  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
		si_cp_dma_wait_for_idle(cmd_buffer);

	cmd_buffer->state.flush_bits |= dst_flush_bits;
}
4792
4793void radv_CmdPipelineBarrier(
4794	VkCommandBuffer                             commandBuffer,
4795	VkPipelineStageFlags                        srcStageMask,
4796	VkPipelineStageFlags                        destStageMask,
4797	VkBool32                                    byRegion,
4798	uint32_t                                    memoryBarrierCount,
4799	const VkMemoryBarrier*                      pMemoryBarriers,
4800	uint32_t                                    bufferMemoryBarrierCount,
4801	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
4802	uint32_t                                    imageMemoryBarrierCount,
4803	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
4804{
4805	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4806	struct radv_barrier_info info;
4807
4808	info.eventCount = 0;
4809	info.pEvents = NULL;
4810	info.srcStageMask = srcStageMask;
4811	info.dstStageMask = destStageMask;
4812
4813	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4814		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
4815		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4816}
4817
4818
/* Write 'value' to the event's BO once all work in 'stageMask' has
 * completed, using the cheapest packet that satisfies the mask. */
static void write_event(struct radv_cmd_buffer *cmd_buffer,
			struct radv_event *event,
			VkPipelineStageFlags stageMask,
			unsigned value)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(event->bo);

	/* Emit any pending cache flushes before touching the event. */
	si_emit_cache_flush(cmd_buffer);

	radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21);

	/* Flags that only require a top-of-pipe event. */
	VkPipelineStageFlags top_of_pipe_flags =
		VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

	/* Flags that only require a post-index-fetch event. */
	VkPipelineStageFlags post_index_fetch_flags =
		top_of_pipe_flags |
		VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
		VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;

	/* Make sure CP DMA is idle because the driver might have performed a
	 * DMA operation for copying or filling buffers/images.
	 */
	if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
			 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
		si_cp_dma_wait_for_idle(cmd_buffer);

	/* TODO: Emit EOS events for syncing PS/CS stages. */

	if (!(stageMask & ~top_of_pipe_flags)) {
		/* Just need to sync the PFP engine. */
		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
		radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
				S_370_WR_CONFIRM(1) |
				S_370_ENGINE_SEL(V_370_PFP));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, value);
	} else if (!(stageMask & ~post_index_fetch_flags)) {
		/* Sync ME because PFP reads index and indirect buffers. */
		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
		radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
				S_370_WR_CONFIRM(1) |
				S_370_ENGINE_SEL(V_370_ME));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, value);
	} else {
		/* Otherwise, sync all prior GPU work using an EOP event. */
		si_cs_emit_write_event_eop(cs,
					   cmd_buffer->device->physical_device->rad_info.chip_class,
					   radv_cmd_buffer_uses_mec(cmd_buffer),
					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
					   EOP_DATA_SEL_VALUE_32BIT, va, value,
					   cmd_buffer->gfx9_eop_bug_va);
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
4882
void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
		      VkEvent _event,
		      VkPipelineStageFlags stageMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_event, event, _event);

	/* Signal: write 1 to the event BO once stageMask work completes. */
	write_event(cmd_buffer, event, stageMask, 1);
}
4892
void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
			VkEvent _event,
			VkPipelineStageFlags stageMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_event, event, _event);

	/* Unsignal: write 0 to the event BO once stageMask work completes. */
	write_event(cmd_buffer, event, stageMask, 0);
}
4902
4903void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
4904			uint32_t eventCount,
4905			const VkEvent* pEvents,
4906			VkPipelineStageFlags srcStageMask,
4907			VkPipelineStageFlags dstStageMask,
4908			uint32_t memoryBarrierCount,
4909			const VkMemoryBarrier* pMemoryBarriers,
4910			uint32_t bufferMemoryBarrierCount,
4911			const VkBufferMemoryBarrier* pBufferMemoryBarriers,
4912			uint32_t imageMemoryBarrierCount,
4913			const VkImageMemoryBarrier* pImageMemoryBarriers)
4914{
4915	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4916	struct radv_barrier_info info;
4917
4918	info.eventCount = eventCount;
4919	info.pEvents = pEvents;
4920	info.srcStageMask = 0;
4921
4922	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4923		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
4924		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4925}
4926
4927
void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer,
                           uint32_t deviceMask)
{
   /* No-op: the device mask is intentionally ignored here. */
}
4933
4934/* VK_EXT_conditional_rendering */
void radv_CmdBeginConditionalRenderingEXT(
	VkCommandBuffer                             commandBuffer,
	const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	bool draw_visible = true;
	uint64_t pred_value = 0;
	uint64_t va, new_va;
	unsigned pred_offset;

	va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;

	/* By default, if the 32-bit value at offset in buffer memory is zero,
	 * then the rendering commands are discarded, otherwise they are
	 * executed as normal. If the inverted flag is set, all commands are
	 * discarded if the value is non zero.
	 */
	if (pConditionalRenderingBegin->flags &
	    VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
		draw_visible = false;
	}

	/* Flush pending caches before reading the predicate value. */
	si_emit_cache_flush(cmd_buffer);

	/* From the Vulkan spec 1.1.107:
	 *
	 * "If the 32-bit value at offset in buffer memory is zero, then the
	 *  rendering commands are discarded, otherwise they are executed as
	 *  normal. If the value of the predicate in buffer memory changes while
	 *  conditional rendering is active, the rendering commands may be
	 *  discarded in an implementation-dependent way. Some implementations
	 *  may latch the value of the predicate upon beginning conditional
	 *  rendering while others may read it before every rendering command."
	 *
	 * But, the AMD hardware treats the predicate as a 64-bit value which
	 * means we need a workaround in the driver. Luckily, it's not required
	 * to support the case where the value changes while predication is
	 * active.
	 *
	 * The workaround is as follows:
	 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
	 * 2) copy the 32-bit predicate value to the upload BO
	 * 3) use the new allocated VA address for predication
	 *
	 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
	 * in ME (+ sync PFP) instead of PFP.
	 */
	radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);

	new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

	/* Copy the 32-bit predicate into the zero-initialized 64-bit slot. */
	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
			COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
			COPY_DATA_WR_CONFIRM);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, new_va);
	radeon_emit(cs, new_va >> 32);

	/* Make the PFP wait for the ME copy above before it starts
	 * fetching predicated packets. */
	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
	radeon_emit(cs, 0);

	/* Enable predication for this command buffer. */
	si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
	cmd_buffer->state.predicating = true;

	/* Store conditional rendering user info. */
	cmd_buffer->state.predication_type = draw_visible;
	cmd_buffer->state.predication_va = new_va;
}
5007
5008void radv_CmdEndConditionalRenderingEXT(
5009	VkCommandBuffer                             commandBuffer)
5010{
5011	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5012
5013	/* Disable predication for this command buffer. */
5014	si_emit_set_predication_state(cmd_buffer, false, 0);
5015	cmd_buffer->state.predicating = false;
5016
5017	/* Reset conditional rendering user info. */
5018	cmd_buffer->state.predication_type = -1;
5019	cmd_buffer->state.predication_va = 0;
5020}
5021
5022/* VK_EXT_transform_feedback */
5023void radv_CmdBindTransformFeedbackBuffersEXT(
5024    VkCommandBuffer                             commandBuffer,
5025    uint32_t                                    firstBinding,
5026    uint32_t                                    bindingCount,
5027    const VkBuffer*                             pBuffers,
5028    const VkDeviceSize*                         pOffsets,
5029    const VkDeviceSize*                         pSizes)
5030{
5031	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5032	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5033	uint8_t enabled_mask = 0;
5034
5035	assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
5036	for (uint32_t i = 0; i < bindingCount; i++) {
5037		uint32_t idx = firstBinding + i;
5038
5039		sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
5040		sb[idx].offset = pOffsets[i];
5041		sb[idx].size = pSizes[i];
5042
5043		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
5044				   sb[idx].buffer->bo);
5045
5046		enabled_mask |= 1 << idx;
5047	}
5048
5049	cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
5050
5051	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5052}
5053
5054static void
5055radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
5056{
5057	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5058	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5059
5060	radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
5061	radeon_emit(cs,
5062		    S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
5063		    S_028B94_RAST_STREAM(0) |
5064		    S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
5065		    S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
5066		    S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
5067	radeon_emit(cs, so->hw_enabled_mask &
5068			so->enabled_stream_buffers_mask);
5069
5070	cmd_buffer->state.context_roll_without_scissor_emitted = true;
5071}
5072
5073static void
5074radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
5075{
5076	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5077	bool old_streamout_enabled = so->streamout_enabled;
5078	uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
5079
5080	so->streamout_enabled = enable;
5081
5082	so->hw_enabled_mask = so->enabled_mask |
5083			      (so->enabled_mask << 4) |
5084			      (so->enabled_mask << 8) |
5085			      (so->enabled_mask << 12);
5086
5087	if ((old_streamout_enabled != so->streamout_enabled) ||
5088	    (old_hw_enabled_mask != so->hw_enabled_mask))
5089		radv_emit_streamout_enable(cmd_buffer);
5090}
5091
/* Make the VGT flush its streamout state and wait for the flush to
 * complete before the streamout registers are (re)programmed. */
static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	unsigned reg_strmout_cntl;

	/* The register is at different places on different ASICs. */
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
	} else {
		reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
		radeon_set_config_reg(cs, reg_strmout_cntl, 0);
	}

	/* Ask the VGT to flush; it sets OFFSET_UPDATE_DONE when finished. */
	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
	radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
	radeon_emit(cs, 0);
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}
5117
5118void radv_CmdBeginTransformFeedbackEXT(
5119    VkCommandBuffer                             commandBuffer,
5120    uint32_t                                    firstCounterBuffer,
5121    uint32_t                                    counterBufferCount,
5122    const VkBuffer*                             pCounterBuffers,
5123    const VkDeviceSize*                         pCounterBufferOffsets)
5124{
5125	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5126	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5127	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5128	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5129	uint32_t i;
5130
5131	radv_flush_vgt_streamout(cmd_buffer);
5132
5133	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5134	for_each_bit(i, so->enabled_mask) {
5135		int32_t counter_buffer_idx = i - firstCounterBuffer;
5136		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5137			counter_buffer_idx = -1;
5138
5139		/* SI binds streamout buffers as shader resources.
5140		 * VGT only counts primitives and tells the shader through
5141		 * SGPRs what to do.
5142		 */
5143		radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
5144		radeon_emit(cs, sb[i].size >> 2);	/* BUFFER_SIZE (in DW) */
5145		radeon_emit(cs, so->stride_in_dw[i]);			/* VTX_STRIDE (in DW) */
5146
5147		cmd_buffer->state.context_roll_without_scissor_emitted = true;
5148
5149		if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5150			/* The array of counter buffers is optional. */
5151			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5152			uint64_t va = radv_buffer_get_va(buffer->bo);
5153
5154			va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5155
5156			/* Append */
5157			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5158			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5159					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5160					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
5161			radeon_emit(cs, 0); /* unused */
5162			radeon_emit(cs, 0); /* unused */
5163			radeon_emit(cs, va); /* src address lo */
5164			radeon_emit(cs, va >> 32); /* src address hi */
5165
5166			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5167		} else {
5168			/* Start from the beginning. */
5169			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5170			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5171					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5172					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
5173			radeon_emit(cs, 0); /* unused */
5174			radeon_emit(cs, 0); /* unused */
5175			radeon_emit(cs, 0); /* unused */
5176			radeon_emit(cs, 0); /* unused */
5177		}
5178	}
5179
5180	radv_set_streamout_enable(cmd_buffer, true);
5181}
5182
5183void radv_CmdEndTransformFeedbackEXT(
5184    VkCommandBuffer                             commandBuffer,
5185    uint32_t                                    firstCounterBuffer,
5186    uint32_t                                    counterBufferCount,
5187    const VkBuffer*                             pCounterBuffers,
5188    const VkDeviceSize*                         pCounterBufferOffsets)
5189{
5190	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5191	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5192	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5193	uint32_t i;
5194
5195	radv_flush_vgt_streamout(cmd_buffer);
5196
5197	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5198	for_each_bit(i, so->enabled_mask) {
5199		int32_t counter_buffer_idx = i - firstCounterBuffer;
5200		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5201			counter_buffer_idx = -1;
5202
5203		if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5204			/* The array of counters buffer is optional. */
5205			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5206			uint64_t va = radv_buffer_get_va(buffer->bo);
5207
5208			va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5209
5210			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5211			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5212					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5213					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
5214					STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
5215			radeon_emit(cs, va);		/* dst address lo */
5216			radeon_emit(cs, va >> 32);	/* dst address hi */
5217			radeon_emit(cs, 0);		/* unused */
5218			radeon_emit(cs, 0);		/* unused */
5219
5220			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5221		}
5222
5223		/* Deactivate transform feedback by zeroing the buffer size.
5224		 * The counters (primitives generated, primitives emitted) may
5225		 * be enabled even if there is not buffer bound. This ensures
5226		 * that the primitives-emitted query won't increment.
5227		 */
5228		radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
5229
5230		cmd_buffer->state.context_roll_without_scissor_emitted = true;
5231	}
5232
5233	radv_set_streamout_enable(cmd_buffer, false);
5234}
5235
5236void radv_CmdDrawIndirectByteCountEXT(
5237    VkCommandBuffer                             commandBuffer,
5238    uint32_t                                    instanceCount,
5239    uint32_t                                    firstInstance,
5240    VkBuffer                                    _counterBuffer,
5241    VkDeviceSize                                counterBufferOffset,
5242    uint32_t                                    counterOffset,
5243    uint32_t                                    vertexStride)
5244{
5245	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5246	RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
5247	struct radv_draw_info info = {};
5248
5249	info.instance_count = instanceCount;
5250	info.first_instance = firstInstance;
5251	info.strmout_buffer = counterBuffer;
5252	info.strmout_buffer_offset = counterBufferOffset;
5253	info.stride = vertexStride;
5254
5255	radv_draw(cmd_buffer, &info);
5256}
5257