1/*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26#include "si_pipe.h"
27#include "util/u_format.h"
28#include "util/format_srgb.h"
29
30/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
31 * and L2_STREAM for src.
32 */
33static enum si_cache_policy get_cache_policy(struct si_context *sctx,
34					     enum si_coherency coher,
35					     uint64_t size)
36{
37	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
38					  coher == SI_COHERENCY_CP)) ||
39	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
40		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
41
42	return L2_BYPASS;
43}
44
45unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
46			    enum si_cache_policy cache_policy)
47{
48	switch (coher) {
49	default:
50	case SI_COHERENCY_NONE:
51	case SI_COHERENCY_CP:
52		return 0;
53	case SI_COHERENCY_SHADER:
54		return SI_CONTEXT_INV_SMEM_L1 |
55		       SI_CONTEXT_INV_VMEM_L1 |
56		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
57	case SI_COHERENCY_CB_META:
58		return SI_CONTEXT_FLUSH_AND_INV_CB;
59	}
60}
61
62static void si_compute_internal_begin(struct si_context *sctx)
63{
64	sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
65	sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
66	sctx->render_cond_force_off = true;
67}
68
69static void si_compute_internal_end(struct si_context *sctx)
70{
71	sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
72	sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
73	sctx->render_cond_force_off = false;
74}
75
/* Clear or copy a dword-aligned buffer range with a compute shader.
 *
 * If src != NULL: copy "size" bytes from src+src_offset to dst+dst_offset.
 * If src == NULL: fill dst+dst_offset..+size by repeating "clear_value"
 * (clear_value_size bytes; must be a power of two between 4 and 16).
 *
 * All offsets and the size must be multiples of 4. "coher" selects which
 * caches must observe the result; flush flags are derived from it.
 * Compute shader state and shader buffer slots 0..1 are saved and restored
 * around the dispatch.
 */
static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

	si_compute_internal_begin(sctx);
	/* Wait for prior gfx/compute work and flush caches before the shader runs. */
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* Record which of the saved slots were writable, so the restore call
	 * below can reproduce the original writable_mask. */
	unsigned saved_writable_mask = 0;
	for (unsigned i = 0; i < (src ? 2 : 1); i++) {
		if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
		    (1u << si_get_shaderbuf_slot(i)))
			saved_writable_mask |= 1 << i;
	}

	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
	 * writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64; /* 64 threads per wave */

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	/* Slot 0 is always the destination buffer. */
	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

	if (src) {
		/* Copy path: source goes into slot 1; lazily create the copy shader. */
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

		if (!sctx->cs_copy_buffer) {
			sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
							     SI_COMPUTE_COPY_DW_PER_THREAD,
							     shader_dst_stream_policy, true);
		}
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		/* Clear path: the clear value is passed via user-data SGPRs,
		 * replicated to fill all 4 dwords. */
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

		if (!sctx->cs_clear_buffer) {
			sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
							     SI_COMPUTE_CLEAR_DW_PER_THREAD,
							     shader_dst_stream_policy, false);
		}
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	/* Make the result visible: wait for the CS and write back L2 when the
	 * consumer's cache policy bypasses it. */
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	if (cache_policy != L2_BYPASS)
		si_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
				saved_writable_mask);
	si_compute_internal_end(sctx);
}
186
/* Fill "size" bytes of "dst" starting at "offset" with "clear_value"
 * (clear_value_size bytes: 1, 2, 4, 8, 12, or 16). The clear value is
 * first normalized (shrunk to a dword when all dwords are equal, or
 * expanded from 1/2 bytes to a dword), then dispatched to:
 *  - transform feedback (via the blitter) for 12-byte values,
 *  - a compute clear or CP DMA for the dword-aligned portion,
 *  - a CPU buffer write for any remaining unaligned tail bytes.
 */
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher,
		     bool force_cpdma)
{
	if (!size)
		return;

	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size. */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		if (clear_value_size == 1) {
			/* Replicate the byte into all 4 bytes of a dword. */
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			/* Replicate the 16-bit value into both dword halves. */
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

	/* Dword-aligned portion, handled by compute or CP DMA. */
	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (!force_cpdma &&
		     clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
					       aligned_size, *clear_value, 0, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}
281
282static void si_pipe_clear_buffer(struct pipe_context *ctx,
283				 struct pipe_resource *dst,
284				 unsigned offset, unsigned size,
285				 const void *clear_value,
286				 int clear_value_size)
287{
288	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
289			clear_value_size, SI_COHERENCY_SHADER, false);
290}
291
292void si_copy_buffer(struct si_context *sctx,
293		    struct pipe_resource *dst, struct pipe_resource *src,
294		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
295{
296	if (!size)
297		return;
298
299	enum si_coherency coher = SI_COHERENCY_SHADER;
300	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
301
302	/* Only use compute for VRAM copies on dGPUs. */
303	if (sctx->screen->info.has_dedicated_vram &&
304	    si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
305	    si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
306	    size > 32 * 1024 &&
307	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
308		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
309					    size, NULL, 0, coher);
310	} else {
311		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
312				      0, coher, cache_policy);
313	}
314}
315
/* Copy a box of texels from src (src_level, src_box) to dst at
 * (dstx, dsty, dstz, dst_level) with a compute shader, binding both
 * textures as shader images. Compute shader state, image slots 0..1, and
 * constant buffer slot 0 are saved and restored around the dispatch.
 */
void si_compute_copy_image(struct si_context *sctx,
			   struct pipe_resource *dst,
			   unsigned dst_level,
			   struct pipe_resource *src,
			   unsigned src_level,
			   unsigned dstx, unsigned dsty, unsigned dstz,
			   const struct pipe_box *src_box)
{
	struct pipe_context *ctx = &sctx->b;
	unsigned width = src_box->width;
	unsigned height = src_box->height;
	unsigned depth = src_box->depth;

	/* Constant buffer layout: src origin (x, y, z, pad), dst origin (x, y, z, pad). */
	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

	if (width == 0 || height == 0)
		return;

	si_compute_internal_begin(sctx);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

	/* src and dst have the same number of samples. */
	/* NOTE(review): surface.u.gfx9 is read unconditionally; on pre-GFX9
	 * chips this reads another union member's storage — confirm the value
	 * is ignored/harmless there. */
	si_make_CB_shader_coherent(sctx, src->nr_samples, true,
				   /* Only src can have DCC.*/
				   ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned);

	/* Save states. */
	struct pipe_constant_buffer saved_cb = {};
	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
	struct pipe_image_view saved_image[2] = {0};
	util_copy_image_view(&saved_image[0], &images->views[0]);
	util_copy_image_view(&saved_image[1], &images->views[1]);

	void *saved_cs = sctx->cs_shader_state.program;

	/* Upload the src/dst origins as user constants. */
	struct pipe_constant_buffer cb = {};
	cb.buffer_size = sizeof(data);
	cb.user_buffer = data;
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

	/* Image 0 = source (read), image 1 = destination (write).
	 * Bind all layers/slices so the shader can address in z. */
	struct pipe_image_view image[2] = {0};
	image[0].resource = src;
	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
	image[0].format = util_format_linear(src->format);
	image[0].u.tex.level = src_level;
	image[0].u.tex.first_layer = 0;
	image[0].u.tex.last_layer =
		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
						: (unsigned)(src->array_size - 1);
	image[1].resource = dst;
	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
	image[1].format = util_format_linear(dst->format);
	image[1].u.tex.level = dst_level;
	image[1].u.tex.first_layer = 0;
	image[1].u.tex.last_layer =
		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
						: (unsigned)(dst->array_size - 1);

	/* Shared-exponent float can't be stored directly; move raw bits as R32. */
	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

	/* SNORM8 blitting has precision issues on some chips. Use the SINT
	 * equivalent instead, which doesn't force DCC decompression.
	 * Note that some chips avoid this issue by using SDMA.
	 */
	if (util_format_is_snorm8(dst->format)) {
		image[0].format = image[1].format =
			util_format_snorm8_to_sint8(dst->format);
	}

	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

	struct pipe_grid_info info = {0};

	if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
		/* 1D array copies use a 64x1 block over (width, depth). */
		if (!sctx->cs_copy_image_1d_array)
			sctx->cs_copy_image_1d_array =
				si_create_copy_image_compute_shader_1d_array(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
		info.block[0] = 64;
		info.last_block[0] = width % 64;
		info.block[1] = 1;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 64);
		info.grid[1] = depth;
		info.grid[2] = 1;
	} else {
		/* General path: 8x8 blocks over (width, height), one grid slice per z. */
		if (!sctx->cs_copy_image)
			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
		info.block[0] = 8;
		info.last_block[0] = width % 8;
		info.block[1] = 8;
		info.last_block[1] = height % 8;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 8);
		info.grid[1] = DIV_ROUND_UP(height, 8);
		info.grid[2] = depth;
	}

	ctx->launch_grid(ctx, &info);

	/* Wait for the CS, write back L2 where needed, and restore states. */
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
	si_compute_internal_end(sctx);
}
428
429void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
430{
431	struct pipe_context *ctx = &sctx->b;
432
433	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
434		       SI_CONTEXT_CS_PARTIAL_FLUSH |
435		       si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
436		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
437	si_emit_cache_flush(sctx);
438
439	/* Save states. */
440	void *saved_cs = sctx->cs_shader_state.program;
441	struct pipe_image_view saved_img[3] = {};
442
443	for (unsigned i = 0; i < 3; i++) {
444		util_copy_image_view(&saved_img[i],
445				     &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
446	}
447
448	/* Set images. */
449	bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
450	unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
451	struct pipe_image_view img[3];
452
453	assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX);
454	assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX);
455	assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX);
456
457	for (unsigned i = 0; i < 3; i++) {
458		img[i].resource = &tex->buffer.b.b;
459		img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
460		img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
461	}
462
463	img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
464				     PIPE_FORMAT_R32G32B32A32_UINT;
465	img[0].u.buf.offset = tex->dcc_retile_map_offset;
466	img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
467
468	img[1].format = PIPE_FORMAT_R8_UINT;
469	img[1].u.buf.offset = tex->dcc_offset;
470	img[1].u.buf.size = tex->surface.dcc_size;
471
472	img[2].format = PIPE_FORMAT_R8_UINT;
473	img[2].u.buf.offset = tex->display_dcc_offset;
474	img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
475
476	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
477
478	/* Bind the compute shader. */
479	if (!sctx->cs_dcc_retile)
480		sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
481	ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
482
483	/* Dispatch compute. */
484	/* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
485	unsigned num_threads = num_elements / 4;
486
487	struct pipe_grid_info info = {};
488	info.block[0] = 64;
489	info.block[1] = 1;
490	info.block[2] = 1;
491	info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
492	info.grid[1] = 1;
493	info.grid[2] = 1;
494	info.last_block[0] = num_threads % 64;
495
496	ctx->launch_grid(ctx, &info);
497
498	/* Don't flush caches or wait. The driver will wait at the end of this IB,
499	 * and L2 will be flushed by the kernel fence.
500	 */
501
502	/* Restore states. */
503	ctx->bind_compute_state(ctx, saved_cs);
504	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
505}
506
/* Install the compute-based blit entry points into the pipe_context vtable. */
void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}
511
512/* Clear a region of a color surface to a constant value. */
513void si_compute_clear_render_target(struct pipe_context *ctx,
514				    struct pipe_surface *dstsurf,
515				    const union pipe_color_union *color,
516				    unsigned dstx, unsigned dsty,
517				    unsigned width, unsigned height,
518				    bool render_condition_enabled)
519{
520	struct si_context *sctx = (struct si_context *)ctx;
521	unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
522	unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
523
524	if (width == 0 || height == 0)
525		return;
526
527	if (util_format_is_srgb(dstsurf->format)) {
528		union pipe_color_union color_srgb;
529		for (int i = 0; i < 3; i++)
530			color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
531		color_srgb.f[3] = color->f[3];
532		memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
533	} else {
534		memcpy(data + 4, color->ui, sizeof(color->ui));
535	}
536
537	si_compute_internal_begin(sctx);
538	sctx->render_cond_force_off = !render_condition_enabled;
539
540	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
541		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
542	si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
543				   true /* DCC is not possible with image stores */);
544
545	struct pipe_constant_buffer saved_cb = {};
546	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
547
548	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
549	struct pipe_image_view saved_image = {0};
550	util_copy_image_view(&saved_image, &images->views[0]);
551
552	void *saved_cs = sctx->cs_shader_state.program;
553
554	struct pipe_constant_buffer cb = {};
555	cb.buffer_size = sizeof(data);
556	cb.user_buffer = data;
557	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
558
559	struct pipe_image_view image = {0};
560	image.resource = dstsurf->texture;
561	image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
562	image.format = util_format_linear(dstsurf->format);
563	image.u.tex.level = dstsurf->u.tex.level;
564	image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
565	image.u.tex.last_layer = dstsurf->u.tex.last_layer;
566
567	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
568
569	struct pipe_grid_info info = {0};
570
571	if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
572		if (!sctx->cs_clear_render_target)
573			sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
574		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
575		info.block[0] = 8;
576		info.last_block[0] = width % 8;
577		info.block[1] = 8;
578		info.last_block[1] = height % 8;
579		info.block[2] = 1;
580		info.grid[0] = DIV_ROUND_UP(width, 8);
581		info.grid[1] = DIV_ROUND_UP(height, 8);
582		info.grid[2] = num_layers;
583	} else {
584		if (!sctx->cs_clear_render_target_1d_array)
585			sctx->cs_clear_render_target_1d_array =
586				si_clear_render_target_shader_1d_array(ctx);
587		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
588		info.block[0] = 64;
589		info.last_block[0] = width % 64;
590		info.block[1] = 1;
591		info.block[2] = 1;
592		info.grid[0] = DIV_ROUND_UP(width, 64);
593		info.grid[1] = num_layers;
594		info.grid[2] = 1;
595	}
596
597	ctx->launch_grid(ctx, &info);
598
599	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
600		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
601		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
602	ctx->bind_compute_state(ctx, saved_cs);
603	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
604	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
605	si_compute_internal_end(sctx);
606}
607