/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "util/u_format.h"
#include "util/format_srgb.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */

/* Pick the L2 cache policy for an internal clear/copy touching "size" bytes.
 *
 * GFX9+ CB_META/CP accesses and CIK+ shader accesses can go through L2:
 * small transfers (<= 256 KiB) use L2_LRU (cacheable), larger ones use
 * L2_STREAM. Everything else bypasses L2 entirely.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
					     enum si_coherency coher,
					     uint64_t size)
{
	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
					  coher == SI_COHERENCY_CP)) ||
	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

	return L2_BYPASS;
}

/* Translate a coherency requirement + cache policy into the SI_CONTEXT_*
 * cache-flush flags that must be set before the consumer sees the data.
 * Global L2 is only invalidated when the policy actually bypasses L2.
 */
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
			    enum si_cache_policy cache_policy)
{
	switch (coher) {
	default:
	case SI_COHERENCY_NONE:
	case SI_COHERENCY_CP:
		return 0;
	case SI_COHERENCY_SHADER:
		return SI_CONTEXT_INV_SMEM_L1 |
		       SI_CONTEXT_INV_VMEM_L1 |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
	case SI_COHERENCY_CB_META:
		return SI_CONTEXT_FLUSH_AND_INV_CB;
	}
}

/* Enter "internal dispatch" mode: stop pipeline-statistics counters and
 * force the render condition off, so driver-internal compute work doesn't
 * perturb app-visible queries or conditional rendering.
 */
static void si_compute_internal_begin(struct si_context *sctx)
{
	sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
	sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
	sctx->render_cond_force_off = true;
}

/* Leave "internal dispatch" mode.
 * NOTE(review): this unconditionally requests START_PIPELINE_STATS —
 * it appears to assume stats were running before begin(); confirm.
 */
static void si_compute_internal_end(struct si_context *sctx)
{
	sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
	sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
	sctx->render_cond_force_off = false;
}

/* Run a compute-shader buffer clear (src == NULL) or buffer copy.
 *
 * \param dst_offset        byte offset into dst; must be dword-aligned
 * \param src               source buffer, or NULL for a clear
 * \param src_offset        byte offset into src; must be dword-aligned
 * \param size              byte count; must be a multiple of 4
 * \param clear_value       clear pattern (used only when src == NULL)
 * \param clear_value_size  pattern size in bytes: power of two, 4..16
 * \param coher             coherency mode used to pick flush flags
 *
 * Saves and restores the bound compute shader and shader-buffer slots,
 * so callers' compute state is preserved.
 */
static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

	si_compute_internal_begin(sctx);
	/* Wait for previous work and make its output visible to the CS. */
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* Remember which of the saved slots were writable so they can be
	 * restored with the same writability.
	 */
	unsigned saved_writable_mask = 0;
	for (unsigned i = 0; i < (src ? 2 : 1); i++) {
		if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
		    (1u << si_get_shaderbuf_slot(i)))
			saved_writable_mask |= 1 << i;
	}

	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
	 * writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64; /* 64 threads per wave */

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	/* One workgroup per wave; one grid dimension covers the buffer. */
	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	/* Slot 0 is always the destination. */
	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

	if (src) {
		/* Copy: bind src in slot 1 and lazily create the copy shader. */
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

		if (!sctx->cs_copy_buffer) {
			sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
							   SI_COMPUTE_COPY_DW_PER_THREAD,
							   shader_dst_stream_policy, true);
		}
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		/* Clear: replicate the pattern into 4 user-data dwords. */
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

		if (!sctx->cs_clear_buffer) {
			sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
							   SI_COMPUTE_CLEAR_DW_PER_THREAD,
							   shader_dst_stream_policy, false);
		}
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	/* Make the dispatch's writes visible; write back L2 only if bypassed. */
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	if (cache_policy != L2_BYPASS)
		si_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
				saved_writable_mask);
	si_compute_internal_end(sctx);
}

/* Fill a buffer range with a repeating clear pattern.
 *
 * \param offset            byte offset; must be aligned to min(pattern, 4)
 * \param size              byte count; same alignment requirement
 * \param clear_value       pattern bytes (1, 2, 4, 8, 12, or 16 bytes)
 * \param clear_value_size  pattern size in bytes (3 and 6 are rejected)
 * \param coher             coherency mode
 * \param force_cpdma       prefer CP DMA over compute when possible
 *
 * Dispatches to compute, CP DMA, streamout, or a CPU write depending on
 * the pattern size, range size, and chip generation.
 */
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher,
		     bool force_cpdma)
{
	if (!size)
		return;

	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size.
	 */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		/* Replicate 1- or 2-byte patterns into a full dword. */
		if (clear_value_size == 1) {
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

	/* Clear the dword-aligned prefix first. */
	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (!force_cpdma &&
		     clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
					       aligned_size, *clear_value, 0, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment: write the trailing 1-3 bytes on the CPU. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}

/* pipe_context::clear_buffer entry point: forwards to si_clear_buffer
 * with shader coherency and no CP DMA preference.
 */
static void si_pipe_clear_buffer(struct pipe_context *ctx,
				 struct pipe_resource *dst,
				 unsigned offset, unsigned size,
				 const void *clear_value,
				 int clear_value_size)
{
	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
			clear_value_size, SI_COHERENCY_SHADER, false);
}

/* Copy a buffer range, choosing between a compute copy and CP DMA.
 * Offsets/size in bytes; a zero size is a no-op.
 */
void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
	if (!size)
		return;

	enum si_coherency coher = SI_COHERENCY_SHADER;
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

	/* Only use compute for VRAM copies on dGPUs. */
	if (sctx->screen->info.has_dedicated_vram &&
	    si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
	    si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
	    size > 32 * 1024 &&
	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
					    size, NULL, 0, coher);
	} else {
		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
				      0, coher, cache_policy);
	}
}

/* Copy a box between two images with a compute shader.
 * The source box coordinates and the destination origin are passed to the
 * shader through constant buffer slot 0 (two uvec4s: src xyz0, dst xyz0).
 * Compute state (shader, images 0-1, const buffer 0) is saved and restored.
 */
void si_compute_copy_image(struct si_context *sctx,
			   struct pipe_resource *dst,
			   unsigned dst_level,
			   struct pipe_resource *src,
			   unsigned src_level,
			   unsigned dstx, unsigned dsty, unsigned dstz,
			   const struct pipe_box *src_box)
{
	struct pipe_context *ctx = &sctx->b;
	unsigned width = src_box->width;
	unsigned height = src_box->height;
	unsigned depth = src_box->depth;

	/* Shader constants: src offset (xyz, pad), dst offset (xyz, pad). */
	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

	if (width == 0 || height == 0)
		return;

	si_compute_internal_begin(sctx);
	sctx->flags |= 
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

	/* src and dst have the same number of samples. */
	si_make_CB_shader_coherent(sctx, src->nr_samples, true,
				   /* Only src can have DCC.*/
				   ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned);

	/* Save compute state that this function clobbers. */
	struct pipe_constant_buffer saved_cb = {};
	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
	struct pipe_image_view saved_image[2] = {0};
	util_copy_image_view(&saved_image[0], &images->views[0]);
	util_copy_image_view(&saved_image[1], &images->views[1]);

	void *saved_cs = sctx->cs_shader_state.program;

	/* Upload the copy parameters as constant buffer 0. */
	struct pipe_constant_buffer cb = {};
	cb.buffer_size = sizeof(data);
	cb.user_buffer = data;
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

	/* Image 0 = read-only src, image 1 = write-only dst; both bound with
	 * the linear (non-sRGB) format so the copy is a raw transfer.
	 */
	struct pipe_image_view image[2] = {0};
	image[0].resource = src;
	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
	image[0].format = util_format_linear(src->format);
	image[0].u.tex.level = src_level;
	image[0].u.tex.first_layer = 0;
	image[0].u.tex.last_layer =
		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
					       : (unsigned)(src->array_size - 1);
	image[1].resource = dst;
	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
	image[1].format = util_format_linear(dst->format);
	image[1].u.tex.level = dst_level;
	image[1].u.tex.first_layer = 0;
	image[1].u.tex.last_layer =
		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
					       : (unsigned)(dst->array_size - 1);

	/* R9G9B9E5 has no image-store support; copy the raw 32-bit payload. */
	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

	/* SNORM8 blitting has precision issues on some chips. Use the SINT
	 * equivalent instead, which doesn't force DCC decompression.
	 * Note that some chips avoid this issue by using SDMA.
	 */
	if (util_format_is_snorm8(dst->format)) {
		image[0].format = image[1].format =
			util_format_snorm8_to_sint8(dst->format);
	}

	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

	struct pipe_grid_info info = {0};

	if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
		/* 1D arrays: 64-wide workgroups, one grid row per layer. */
		if (!sctx->cs_copy_image_1d_array)
			sctx->cs_copy_image_1d_array =
				si_create_copy_image_compute_shader_1d_array(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
		info.block[0] = 64;
		info.last_block[0] = width % 64;
		info.block[1] = 1;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 64);
		info.grid[1] = depth;
		info.grid[2] = 1;
	} else {
		/* General case: 8x8 tiles, one grid slice per depth/layer. */
		if (!sctx->cs_copy_image)
			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
		info.block[0] = 8;
		info.last_block[0] = width % 8;
		info.block[1] = 8;
		info.last_block[1] = height % 8;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 8);
		info.grid[1] = DIV_ROUND_UP(height, 8);
		info.grid[2] = depth;
	}

	ctx->launch_grid(ctx, &info);

	/* Make the image stores visible to later consumers. */
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (sctx->chip_class <= VI ? 
					 SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	/* Restore the saved compute state. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
	si_compute_internal_end(sctx);
}

/* Re-tile DCC metadata for display with a compute shader.
 *
 * Reads the texture's DCC retile map (image 0) and its DCC buffer
 * (image 1), and writes the display-ordered DCC buffer (image 2), all
 * bound as buffer views of the texture's backing buffer. Compute shader
 * and image slots 0-2 are saved and restored.
 */
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
	struct pipe_context *ctx = &sctx->b;

	/* Flush before the dispatch so the shader reads up-to-date DCC. */
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
	si_emit_cache_flush(sctx);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_image_view saved_img[3] = {};

	for (unsigned i = 0; i < 3; i++) {
		util_copy_image_view(&saved_img[i],
				     &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
	}

	/* Set images. */
	bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
	unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
	struct pipe_image_view img[3];

	/* Offsets are stored as 64-bit but must fit the 32-bit buffer views. */
	assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX);
	assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX);
	assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX);

	for (unsigned i = 0; i < 3; i++) {
		img[i].resource = &tex->buffer.b.b;
		img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
		img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
	}

	/* Image 0: the retile map (16- or 32-bit offset pairs). */
	img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
				     PIPE_FORMAT_R32G32B32A32_UINT;
	img[0].u.buf.offset = tex->dcc_retile_map_offset;
	img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);

	/* Image 1: source DCC bytes. */
	img[1].format = PIPE_FORMAT_R8_UINT;
	img[1].u.buf.offset = tex->dcc_offset;
	img[1].u.buf.size = tex->surface.dcc_size;

	/* Image 2: destination display DCC bytes. */
	img[2].format = PIPE_FORMAT_R8_UINT;
	img[2].u.buf.offset = tex->display_dcc_offset;
	img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;

	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);

	/* Bind the compute shader. */
	if (!sctx->cs_dcc_retile)
		sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
	ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);

	/* Dispatch compute. */
	/* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
	unsigned num_threads = num_elements / 4;

	struct pipe_grid_info info = {};
	info.block[0] = 64;
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
	info.grid[1] = 1;
	info.grid[2] = 1;
	info.last_block[0] = num_threads % 64;

	ctx->launch_grid(ctx, &info);

	/* Don't flush caches or wait. The driver will wait at the end of this IB,
	 * and L2 will be flushed by the kernel fence.
	 */

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
}

/* Hook the compute-based implementations into the pipe_context vtable. */
void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}

/* Clear a region of a color surface to a constant value.
 */
/* Compute-shader clear of a render-target region.
 *
 * \param dstsurf  surface to clear (level/layer range taken from it)
 * \param color    clear color; converted to sRGB space for sRGB surfaces
 * \param dstx/dsty, width/height  region in pixels; empty region is a no-op
 * \param render_condition_enabled  honor the current render condition
 *
 * Shader constants: {dstx, dsty, first_layer, 0} followed by the 4 color
 * dwords. Compute shader, image 0, and const buffer 0 are saved/restored.
 */
void si_compute_clear_render_target(struct pipe_context *ctx,
				    struct pipe_surface *dstsurf,
				    const union pipe_color_union *color,
				    unsigned dstx, unsigned dsty,
				    unsigned width, unsigned height,
				    bool render_condition_enabled)
{
	struct si_context *sctx = (struct si_context *)ctx;
	unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
	/* NOTE(review): sized 4 + sizeof(color->ui) ELEMENTS (20 dwords) while
	 * only 8 dwords are written/uploaded — over-allocation looks
	 * unintentional but is harmless; confirm before changing.
	 */
	unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};

	if (width == 0 || height == 0)
		return;

	/* Image stores are linear; pre-convert RGB to sRGB for sRGB surfaces
	 * (alpha stays linear).
	 */
	if (util_format_is_srgb(dstsurf->format)) {
		union pipe_color_union color_srgb;
		for (int i = 0; i < 3; i++)
			color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
		color_srgb.f[3] = color->f[3];
		memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
	} else {
		memcpy(data + 4, color->ui, sizeof(color->ui));
	}

	si_compute_internal_begin(sctx);
	/* Unlike other internal dispatches, this one may honor the app's
	 * render condition.
	 */
	sctx->render_cond_force_off = !render_condition_enabled;

	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
				   true /* DCC is not possible with image stores */);

	/* Save compute state that this function clobbers. */
	struct pipe_constant_buffer saved_cb = {};
	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
	struct pipe_image_view saved_image = {0};
	util_copy_image_view(&saved_image, &images->views[0]);

	void *saved_cs = sctx->cs_shader_state.program;

	struct pipe_constant_buffer cb = {};
	cb.buffer_size = sizeof(data);
	cb.user_buffer = data;
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

	struct pipe_image_view image = {0};
	image.resource = dstsurf->texture;
	image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
	image.format = util_format_linear(dstsurf->format);
	image.u.tex.level = dstsurf->u.tex.level;
	image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
	image.u.tex.last_layer = dstsurf->u.tex.last_layer;

	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

	struct pipe_grid_info info = {0};

	if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
		/* General case: 8x8 tiles, one grid slice per layer. */
		if (!sctx->cs_clear_render_target)
			sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
		info.block[0] = 8;
		info.last_block[0] = width % 8;
		info.block[1] = 8;
		info.last_block[1] = height % 8;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 8);
		info.grid[1] = DIV_ROUND_UP(height, 8);
		info.grid[2] = num_layers;
	} else {
		/* 1D arrays: 64-wide workgroups, one grid row per layer. */
		if (!sctx->cs_clear_render_target_1d_array)
			sctx->cs_clear_render_target_1d_array =
				si_clear_render_target_shader_1d_array(ctx);
		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
		info.block[0] = 64;
		info.last_block[0] = width % 64;
		info.block[1] = 1;
		info.block[2] = 1;
		info.grid[0] = DIV_ROUND_UP(width, 64);
		info.grid[1] = num_layers;
		info.grid[2] = 1;
	}

	ctx->launch_grid(ctx, &info);

	/* Make the image stores visible to later consumers. */
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
	si_compute_internal_end(sctx);
}