1/* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 * 24 */ 25 26#include "si_pipe.h" 27#include "util/format/u_format.h" 28#include "util/format_srgb.h" 29#include "util/u_helpers.h" 30 31/* Determine the cache policy. */ 32static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher, 33 uint64_t size) 34{ 35 if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || 36 coher == SI_COHERENCY_DB_META || 37 coher == SI_COHERENCY_CP)) || 38 (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER)) 39 return L2_LRU; /* it's faster if L2 doesn't evict anything */ 40 41 return L2_BYPASS; 42} 43 44unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, 45 enum si_cache_policy cache_policy) 46{ 47 switch (coher) { 48 default: 49 case SI_COHERENCY_NONE: 50 case SI_COHERENCY_CP: 51 return 0; 52 case SI_COHERENCY_SHADER: 53 return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | 54 (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); 55 case SI_COHERENCY_CB_META: 56 return SI_CONTEXT_FLUSH_AND_INV_CB; 57 case SI_COHERENCY_DB_META: 58 return SI_CONTEXT_FLUSH_AND_INV_DB; 59 } 60} 61 62void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info, 63 void *shader, unsigned flags) 64{ 65 66 /* Wait for previous shaders to finish. */ 67 if (flags & SI_OP_SYNC_PS_BEFORE) 68 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; 69 70 if (flags & SI_OP_SYNC_CS_BEFORE) 71 sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; 72 73 if (!(flags & SI_OP_CS_IMAGE)) 74 sctx->flags |= SI_CONTEXT_PFP_SYNC_ME; 75 76 /* Invalidate L0-L1 caches. */ 77 /* sL0 is never invalidated, because src resources don't use it. */ 78 if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) 79 sctx->flags |= SI_CONTEXT_INV_VCACHE; 80 81 /* Set settings for driver-internal compute dispatches. */ 82 sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; 83 sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; 84 85 if (!(flags & SI_OP_CS_RENDER_COND_ENABLE)) 86 sctx->render_cond_enabled = false; 87 88 /* Skip decompression to prevent infinite recursion. */ 89 sctx->blitter_running = true; 90 91 /* Dispatch compute. */ 92 void *saved_cs = sctx->cs_shader_state.program; 93 sctx->b.bind_compute_state(&sctx->b, shader); 94 sctx->b.launch_grid(&sctx->b, info); 95 sctx->b.bind_compute_state(&sctx->b, saved_cs); 96 97 /* Restore default settings. */ 98 sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; 99 sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; 100 sctx->render_cond_enabled = sctx->render_cond; 101 sctx->blitter_running = false; 102 103 if (flags & SI_OP_SYNC_AFTER) { 104 sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; 105 106 if (flags & SI_OP_CS_IMAGE) { 107 /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */ 108 sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0; 109 /* Make sure image stores are visible to all CUs. */ 110 sctx->flags |= SI_CONTEXT_INV_VCACHE; 111 } else { 112 /* Make sure buffer stores are visible to all CUs. */ 113 sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME; 114 } 115 } 116} 117 118void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info, 119 void *shader, unsigned flags, enum si_coherency coher, 120 unsigned num_buffers, const struct pipe_shader_buffer *buffers, 121 unsigned writeable_bitmask) 122{ 123 if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) 124 sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); 125 126 /* Save states. */ 127 struct pipe_shader_buffer saved_sb[3] = {}; 128 assert(num_buffers <= ARRAY_SIZE(saved_sb)); 129 si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, num_buffers, saved_sb); 130 131 unsigned saved_writable_mask = 0; 132 for (unsigned i = 0; i < num_buffers; i++) { 133 if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & 134 (1u << si_get_shaderbuf_slot(i))) 135 saved_writable_mask |= 1 << i; 136 } 137 138 /* Bind buffers and launch compute. */ 139 sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers, 140 writeable_bitmask); 141 si_launch_grid_internal(sctx, info, shader, flags); 142 143 /* Do cache flushing at the end. */ 144 if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) { 145 if (flags & SI_OP_SYNC_AFTER) 146 sctx->flags |= SI_CONTEXT_WB_L2; 147 } else { 148 while (writeable_bitmask) 149 si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true; 150 } 151 152 /* Restore states. */ 153 sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, saved_sb, 154 saved_writable_mask); 155 for (int i = 0; i < num_buffers; i++) 156 pipe_resource_reference(&saved_sb[i].buffer, NULL); 157} 158 159/** 160 * Clear a buffer using read-modify-write with a 32-bit write bitmask. 161 * The clear value has 32 bits. 162 */ 163void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst, 164 unsigned dst_offset, unsigned size, 165 uint32_t clear_value, uint32_t writebitmask, 166 unsigned flags, enum si_coherency coher) 167{ 168 assert(dst_offset % 4 == 0); 169 assert(size % 4 == 0); 170 171 assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); 172 173 /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */ 174 unsigned dwords_per_instruction = 4; 175 unsigned wave_size = sctx->screen->compute_wave_size; 176 unsigned dwords_per_wave = dwords_per_instruction * wave_size; 177 178 unsigned num_dwords = size / 4; 179 unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); 180 181 struct pipe_grid_info info = {}; 182 info.block[0] = MIN2(wave_size, num_instructions); 183 info.block[1] = 1; 184 info.block[2] = 1; 185 info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); 186 info.grid[1] = 1; 187 info.grid[2] = 1; 188 189 struct pipe_shader_buffer sb = {}; 190 sb.buffer = dst; 191 sb.buffer_offset = dst_offset; 192 sb.buffer_size = size; 193 194 sctx->cs_user_data[0] = clear_value & writebitmask; 195 sctx->cs_user_data[1] = ~writebitmask; 196 197 if (!sctx->cs_clear_buffer_rmw) 198 sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b); 199 200 si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer_rmw, flags, coher, 201 1, &sb, 0x1); 202} 203 204static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, 205 unsigned dst_offset, unsigned size, 206 const uint32_t *clear_value, unsigned flags, 207 enum si_coherency coher) 208{ 209 struct pipe_context *ctx = &sctx->b; 210 211 assert(dst_offset % 4 == 0); 212 assert(size % 4 == 0); 213 unsigned size_12 = DIV_ROUND_UP(size, 12); 214 215 struct pipe_shader_buffer sb = {0}; 216 sb.buffer = dst; 217 sb.buffer_offset = dst_offset; 218 sb.buffer_size = size; 219 220 memcpy(sctx->cs_user_data, clear_value, 12); 221 222 struct pipe_grid_info info = {0}; 223 224 if (!sctx->cs_clear_12bytes_buffer) 225 sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx); 226 227 info.block[0] = 64; 228 info.last_block[0] = size_12 % 64; 229 info.block[1] = 1; 230 info.block[2] = 1; 231 info.grid[0] = DIV_ROUND_UP(size_12, 64); 232 info.grid[1] = 1; 233 info.grid[2] = 1; 234 235 si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_12bytes_buffer, flags, coher, 236 1, &sb, 0x1); 237} 238 239static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst, 240 unsigned dst_offset, struct pipe_resource *src, 241 unsigned src_offset, unsigned size, 242 const uint32_t *clear_value, unsigned clear_value_size, 243 unsigned flags, enum si_coherency coher) 244{ 245 assert(src_offset % 4 == 0); 246 assert(dst_offset % 4 == 0); 247 assert(size % 4 == 0); 248 249 assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); 250 assert(!src || src_offset + size <= src->width0); 251 252 /* The memory accesses are coalesced, meaning that the 1st instruction writes 253 * the 1st contiguous block of data for the whole wave, the 2nd instruction 254 * writes the 2nd contiguous block of data, etc. 255 */ 256 unsigned dwords_per_thread = 257 src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD; 258 unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); 259 unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; 260 unsigned wave_size = sctx->screen->compute_wave_size; 261 unsigned dwords_per_wave = dwords_per_thread * wave_size; 262 263 unsigned num_dwords = size / 4; 264 unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); 265 266 struct pipe_grid_info info = {}; 267 info.block[0] = MIN2(wave_size, num_instructions); 268 info.block[1] = 1; 269 info.block[2] = 1; 270 info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); 271 info.grid[1] = 1; 272 info.grid[2] = 1; 273 274 struct pipe_shader_buffer sb[2] = {}; 275 sb[0].buffer = dst; 276 sb[0].buffer_offset = dst_offset; 277 sb[0].buffer_size = size; 278 279 bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; 280 281 if (src) { 282 sb[1].buffer = src; 283 sb[1].buffer_offset = src_offset; 284 sb[1].buffer_size = size; 285 286 if (!sctx->cs_copy_buffer) { 287 sctx->cs_copy_buffer = si_create_dma_compute_shader( 288 &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); 289 } 290 291 si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher, 292 2, sb, 0x1); 293 } else { 294 assert(clear_value_size >= 4 && clear_value_size <= 16 && 295 util_is_power_of_two_or_zero(clear_value_size)); 296 297 for (unsigned i = 0; i < 4; i++) 298 sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; 299 300 if (!sctx->cs_clear_buffer) { 301 sctx->cs_clear_buffer = si_create_dma_compute_shader( 302 &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); 303 } 304 305 si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher, 306 1, sb, 0x1); 307 } 308} 309 310void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, 311 uint64_t offset, uint64_t size, uint32_t *clear_value, 312 uint32_t clear_value_size, unsigned flags, 313 enum si_coherency coher, enum si_clear_method method) 314{ 315 if (!size) 316 return; 317 318 ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); 319 320 assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ 321 assert(offset % clear_alignment == 0); 322 assert(size % clear_alignment == 0); 323 assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */ 324 325 uint32_t clamped; 326 if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped)) 327 clear_value = &clamped; 328 329 if (clear_value_size == 12) { 330 si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, flags, coher); 331 return; 332 } 333 334 uint64_t aligned_size = size & ~3ull; 335 if (aligned_size >= 4) { 336 uint64_t compute_min_size; 337 338 if (sctx->chip_class <= GFX8) { 339 /* CP DMA clears are terribly slow with GTT on GFX6-8, which can always 340 * happen due to BO evictions. 341 */ 342 compute_min_size = 0; 343 } else { 344 /* Use a small enough size because CP DMA is slower than compute with bigger sizes. */ 345 compute_min_size = 4 * 1024; 346 } 347 348 if (method == SI_AUTO_SELECT_CLEAR_METHOD && ( 349 clear_value_size > 4 || 350 (clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) { 351 method = SI_COMPUTE_CLEAR_METHOD; 352 } 353 if (method == SI_COMPUTE_CLEAR_METHOD) { 354 si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, 355 clear_value_size, flags, coher); 356 } else { 357 assert(clear_value_size == 4); 358 si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 359 flags, coher, get_cache_policy(sctx, coher, size)); 360 } 361 362 offset += aligned_size; 363 size -= aligned_size; 364 } 365 366 /* Handle non-dword alignment. */ 367 if (size) { 368 assert(dst); 369 assert(dst->target == PIPE_BUFFER); 370 assert(size < 4); 371 372 pipe_buffer_write(&sctx->b, dst, offset, size, clear_value); 373 } 374} 375 376void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, 377 uint64_t size, unsigned value, unsigned flags) 378{ 379 struct si_context *ctx = (struct si_context *)sscreen->aux_context; 380 381 simple_mtx_lock(&sscreen->aux_context_lock); 382 si_clear_buffer(ctx, dst, offset, size, &value, 4, flags, 383 SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD); 384 sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); 385 simple_mtx_unlock(&sscreen->aux_context_lock); 386} 387 388static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, 389 unsigned offset, unsigned size, const void *clear_value, 390 int clear_value_size) 391{ 392 si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value, 393 clear_value_size, SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER, 394 SI_AUTO_SELECT_CLEAR_METHOD); 395} 396 397void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, 398 uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags) 399{ 400 if (!size) 401 return; 402 403 enum si_coherency coher = SI_COHERENCY_SHADER; 404 enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); 405 uint64_t compute_min_size = 8 * 1024; 406 407 /* Only use compute for VRAM copies on dGPUs. */ 408 if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && 409 si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size && 410 dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) { 411 si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, 412 flags, coher); 413 } else { 414 si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 415 flags, coher, cache_policy); 416 } 417} 418 419void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, 420 struct pipe_resource *src, unsigned src_level, unsigned dstx, 421 unsigned dsty, unsigned dstz, const struct pipe_box *src_box, 422 bool is_dcc_decompress, unsigned flags) 423{ 424 struct pipe_context *ctx = &sctx->b; 425 struct si_texture *ssrc = (struct si_texture*)src; 426 struct si_texture *sdst = (struct si_texture*)dst; 427 unsigned width = src_box->width; 428 unsigned height = src_box->height; 429 unsigned depth = src_box->depth; 430 enum pipe_format src_format = util_format_linear(src->format); 431 enum pipe_format dst_format = util_format_linear(dst->format); 432 bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear; 433 434 assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format)); 435 436 if (!vi_dcc_enabled(ssrc, src_level) && 437 !vi_dcc_enabled(sdst, dst_level) && 438 src_format == dst_format && 439 util_format_is_float(src_format) && 440 !util_format_is_compressed(src_format)) { 441 /* Interpret as integer values to avoid NaN issues */ 442 switch(util_format_get_blocksizebits(src_format)) { 443 case 16: 444 src_format = dst_format = PIPE_FORMAT_R16_UINT; 445 break; 446 case 32: 447 src_format = dst_format = PIPE_FORMAT_R32_UINT; 448 break; 449 case 64: 450 src_format = dst_format = PIPE_FORMAT_R32G32_UINT; 451 break; 452 case 128: 453 src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT; 454 break; 455 default: 456 assert(false); 457 } 458 } 459 460 if (util_format_is_subsampled_422(src_format)) { 461 src_format = dst_format = PIPE_FORMAT_R32_UINT; 462 /* Interpreting 422 subsampled format (16 bpp) as 32 bpp 463 * should force us to divide src_box->x, dstx and width by 2. 464 * But given that ac_surface allocates this format as 32 bpp 465 * and that surf_size is then modified to pack the values 466 * we must keep the original values to get the correct results. 467 */ 468 } 469 470 if (width == 0 || height == 0) 471 return; 472 473 /* The driver doesn't decompress resources automatically here. */ 474 si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz, 475 dstz + src_box->depth - 1); 476 si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, 477 src_box->z + src_box->depth - 1); 478 479 /* src and dst have the same number of samples. */ 480 si_make_CB_shader_coherent(sctx, src->nr_samples, true, 481 ssrc->surface.u.gfx9.color.dcc.pipe_aligned); 482 if (sctx->chip_class >= GFX10) { 483 /* GFX10+ uses DCC stores so si_make_CB_shader_coherent is required for dst too */ 484 si_make_CB_shader_coherent(sctx, dst->nr_samples, true, 485 sdst->surface.u.gfx9.color.dcc.pipe_aligned); 486 } 487 488 struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; 489 struct pipe_image_view saved_image[2] = {0}; 490 util_copy_image_view(&saved_image[0], &images->views[0]); 491 util_copy_image_view(&saved_image[1], &images->views[1]); 492 493 struct pipe_image_view image[2] = {0}; 494 image[0].resource = src; 495 image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; 496 image[0].format = src_format; 497 image[0].u.tex.level = src_level; 498 image[0].u.tex.first_layer = 0; 499 image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1 500 : (unsigned)(src->array_size - 1); 501 image[1].resource = dst; 502 image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; 503 image[1].format = dst_format; 504 image[1].u.tex.level = dst_level; 505 image[1].u.tex.first_layer = 0; 506 image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1 507 : (unsigned)(dst->array_size - 1); 508 509 /* SNORM8 blitting has precision issues on some chips. Use the SINT 510 * equivalent instead, which doesn't force DCC decompression. 511 */ 512 if (util_format_is_snorm8(dst->format)) { 513 image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format); 514 } 515 516 if (is_dcc_decompress) 517 image[1].access |= SI_IMAGE_ACCESS_DCC_OFF; 518 else if (sctx->chip_class >= GFX10) 519 image[1].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE; 520 521 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, image); 522 523 if (!is_dcc_decompress) { 524 sctx->cs_user_data[0] = src_box->x | (dstx << 16); 525 sctx->cs_user_data[1] = src_box->y | (dsty << 16); 526 sctx->cs_user_data[2] = src_box->z | (dstz << 16); 527 } 528 529 struct pipe_grid_info info = {0}; 530 531 if (is_dcc_decompress) { 532 /* The DCC decompression is a normal blit where the load is compressed 533 * and the store is uncompressed. The workgroup size is either equal to 534 * the DCC block size or a multiple thereof. The shader uses a barrier 535 * between loads and stores to safely overwrite each DCC block of pixels. 536 */ 537 unsigned dim[3] = {src_box->width, src_box->height, src_box->depth}; 538 539 assert(src == dst); 540 assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY); 541 542 if (!sctx->cs_dcc_decompress) 543 sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx); 544 545 info.block[0] = ssrc->surface.u.gfx9.color.dcc_block_width; 546 info.block[1] = ssrc->surface.u.gfx9.color.dcc_block_height; 547 info.block[2] = ssrc->surface.u.gfx9.color.dcc_block_depth; 548 549 /* Make sure the block size is at least the same as wave size. */ 550 while (info.block[0] * info.block[1] * info.block[2] < 551 sctx->screen->compute_wave_size) { 552 info.block[0] *= 2; 553 } 554 555 for (unsigned i = 0; i < 3; i++) { 556 info.last_block[i] = dim[i] % info.block[i]; 557 info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]); 558 } 559 560 si_launch_grid_internal(sctx, &info, sctx->cs_dcc_decompress, flags | SI_OP_CS_IMAGE); 561 } else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { 562 if (!sctx->cs_copy_image_1d_array) 563 sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx); 564 565 info.block[0] = 64; 566 info.last_block[0] = width % 64; 567 info.block[1] = 1; 568 info.block[2] = 1; 569 info.grid[0] = DIV_ROUND_UP(width, 64); 570 info.grid[1] = depth; 571 info.grid[2] = 1; 572 573 si_launch_grid_internal(sctx, &info, sctx->cs_copy_image_1d_array, flags | SI_OP_CS_IMAGE); 574 } else { 575 if (!sctx->cs_copy_image) 576 sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx); 577 578 /* This is better for access over PCIe. */ 579 if (is_linear) { 580 info.block[0] = 64; 581 info.block[1] = 1; 582 } else { 583 info.block[0] = 8; 584 info.block[1] = 8; 585 } 586 info.last_block[0] = width % info.block[0]; 587 info.last_block[1] = height % info.block[1]; 588 info.block[2] = 1; 589 info.grid[0] = DIV_ROUND_UP(width, info.block[0]); 590 info.grid[1] = DIV_ROUND_UP(height, info.block[1]); 591 info.grid[2] = depth; 592 593 si_launch_grid_internal(sctx, &info, sctx->cs_copy_image, flags | SI_OP_CS_IMAGE); 594 } 595 596 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, saved_image); 597 for (int i = 0; i < 2; i++) 598 pipe_resource_reference(&saved_image[i].resource, NULL); 599} 600 601void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) 602{ 603 /* Set the DCC buffer. */ 604 assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); 605 assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); 606 assert(tex->surface.display_dcc_offset < tex->surface.meta_offset); 607 assert(tex->buffer.bo_size <= UINT_MAX); 608 609 struct pipe_shader_buffer sb = {}; 610 sb.buffer = &tex->buffer.b.b; 611 sb.buffer_offset = tex->surface.display_dcc_offset; 612 sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset; 613 614 sctx->cs_user_data[0] = tex->surface.meta_offset - tex->surface.display_dcc_offset; 615 sctx->cs_user_data[1] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) | 616 (tex->surface.u.gfx9.color.dcc_height << 16); 617 sctx->cs_user_data[2] = (tex->surface.u.gfx9.color.display_dcc_pitch_max + 1) | 618 (tex->surface.u.gfx9.color.display_dcc_height << 16); 619 620 /* We have only 1 variant per bpp for now, so expect 32 bpp. */ 621 assert(tex->surface.bpe == 4); 622 623 void **shader = &sctx->cs_dcc_retile[tex->surface.u.gfx9.swizzle_mode]; 624 if (!*shader) 625 *shader = si_create_dcc_retile_cs(sctx, &tex->surface); 626 627 /* Dispatch compute. */ 628 unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width); 629 unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height); 630 631 struct pipe_grid_info info = {}; 632 info.block[0] = 8; 633 info.block[1] = 8; 634 info.block[2] = 1; 635 info.last_block[0] = width % info.block[0]; 636 info.last_block[1] = height % info.block[1]; 637 info.grid[0] = DIV_ROUND_UP(width, info.block[0]); 638 info.grid[1] = DIV_ROUND_UP(height, info.block[1]); 639 info.grid[2] = 1; 640 641 si_launch_grid_internal_ssbos(sctx, &info, *shader, SI_OP_SYNC_BEFORE, 642 SI_COHERENCY_CB_META, 1, &sb, 0x1); 643 644 /* Don't flush caches. L2 will be flushed by the kernel fence. */ 645} 646 647void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value, 648 unsigned flags, enum si_coherency coher) 649{ 650 struct si_texture *tex = (struct si_texture*)res; 651 652 /* Set the DCC buffer. */ 653 assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); 654 assert(tex->buffer.bo_size <= UINT_MAX); 655 656 struct pipe_shader_buffer sb = {}; 657 sb.buffer = &tex->buffer.b.b; 658 sb.buffer_offset = tex->surface.meta_offset; 659 sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset; 660 661 sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) | 662 (tex->surface.u.gfx9.color.dcc_height << 16); 663 sctx->cs_user_data[1] = (clear_value & 0xffff) | 664 ((uint32_t)tex->surface.tile_swizzle << 16); 665 666 /* These variables identify the shader variant. */ 667 unsigned swizzle_mode = tex->surface.u.gfx9.swizzle_mode; 668 unsigned bpe_log2 = util_logbase2(tex->surface.bpe); 669 unsigned log2_samples = util_logbase2(tex->buffer.b.b.nr_samples); 670 bool fragments8 = tex->buffer.b.b.nr_storage_samples == 8; 671 bool is_array = tex->buffer.b.b.array_size > 1; 672 void **shader = &sctx->cs_clear_dcc_msaa[swizzle_mode][bpe_log2][fragments8][log2_samples - 2][is_array]; 673 674 if (!*shader) 675 *shader = gfx9_create_clear_dcc_msaa_cs(sctx, tex); 676 677 /* Dispatch compute. */ 678 unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width); 679 unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height); 680 unsigned depth = DIV_ROUND_UP(tex->buffer.b.b.array_size, tex->surface.u.gfx9.color.dcc_block_depth); 681 682 struct pipe_grid_info info = {}; 683 info.block[0] = 8; 684 info.block[1] = 8; 685 info.block[2] = 1; 686 info.last_block[0] = width % info.block[0]; 687 info.last_block[1] = height % info.block[1]; 688 info.grid[0] = DIV_ROUND_UP(width, info.block[0]); 689 info.grid[1] = DIV_ROUND_UP(height, info.block[1]); 690 info.grid[2] = depth; 691 692 si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, 1, &sb, 0x1); 693} 694 695/* Expand FMASK to make it identity, so that image stores can ignore it. */ 696void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex) 697{ 698 struct si_context *sctx = (struct si_context *)ctx; 699 bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; 700 unsigned log_fragments = util_logbase2(tex->nr_storage_samples); 701 unsigned log_samples = util_logbase2(tex->nr_samples); 702 assert(tex->nr_samples >= 2); 703 704 /* EQAA FMASK expansion is unimplemented. */ 705 if (tex->nr_samples != tex->nr_storage_samples) 706 return; 707 708 si_make_CB_shader_coherent(sctx, tex->nr_samples, true, 709 ((struct si_texture*)tex)->surface.u.gfx9.color.dcc.pipe_aligned); 710 711 /* Save states. */ 712 struct pipe_image_view saved_image = {0}; 713 util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); 714 715 /* Bind the image. */ 716 struct pipe_image_view image = {0}; 717 image.resource = tex; 718 /* Don't set WRITE so as not to trigger FMASK expansion, causing 719 * an infinite loop. */ 720 image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; 721 image.format = util_format_linear(tex->format); 722 if (is_array) 723 image.u.tex.last_layer = tex->array_size - 1; 724 725 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &image); 726 727 /* Bind the shader. */ 728 void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; 729 if (!*shader) 730 *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); 731 732 /* Dispatch compute. */ 733 struct pipe_grid_info info = {0}; 734 info.block[0] = 8; 735 info.last_block[0] = tex->width0 % 8; 736 info.block[1] = 8; 737 info.last_block[1] = tex->height0 % 8; 738 info.block[2] = 1; 739 info.grid[0] = DIV_ROUND_UP(tex->width0, 8); 740 info.grid[1] = DIV_ROUND_UP(tex->height0, 8); 741 info.grid[2] = is_array ? tex->array_size : 1; 742 743 si_launch_grid_internal(sctx, &info, *shader, SI_OP_SYNC_BEFORE_AFTER); 744 745 /* Restore previous states. */ 746 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &saved_image); 747 pipe_resource_reference(&saved_image.resource, NULL); 748 749 /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ 750#define INVALID 0 /* never used */ 751 static const uint64_t fmask_expand_values[][4] = { 752 /* samples */ 753 /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ 754 {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ 755 {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ 756 {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ 757 {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ 758 }; 759 760 /* Clear FMASK to identity. */ 761 struct si_texture *stex = (struct si_texture *)tex; 762 si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, 763 (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 764 log_fragments >= 2 && log_samples == 4 ? 8 : 4, SI_OP_SYNC_AFTER, 765 SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD); 766} 767 768void si_init_compute_blit_functions(struct si_context *sctx) 769{ 770 sctx->b.clear_buffer = si_pipe_clear_buffer; 771} 772 773/* Clear a region of a color surface to a constant value. */ 774void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, 775 const union pipe_color_union *color, unsigned dstx, 776 unsigned dsty, unsigned width, unsigned height, 777 bool render_condition_enabled) 778{ 779 struct si_context *sctx = (struct si_context *)ctx; 780 struct si_texture *tex = (struct si_texture*)dstsurf->texture; 781 unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; 782 unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; 783 784 if (width == 0 || height == 0) 785 return; 786 787 /* The driver doesn't decompress resources automatically here. */ 788 si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level, 789 dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer); 790 791 if (util_format_is_srgb(dstsurf->format)) { 792 union pipe_color_union color_srgb; 793 for (int i = 0; i < 3; i++) 794 color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); 795 color_srgb.f[3] = color->f[3]; 796 memcpy(data + 4, color_srgb.ui, sizeof(color->ui)); 797 } else { 798 memcpy(data + 4, color->ui, sizeof(color->ui)); 799 } 800 801 si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, 802 tex->surface.u.gfx9.color.dcc.pipe_aligned); 803 804 struct pipe_constant_buffer saved_cb = {}; 805 si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); 806 807 struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; 808 struct pipe_image_view saved_image = {0}; 809 util_copy_image_view(&saved_image, &images->views[0]); 810 811 struct pipe_constant_buffer cb = {}; 812 cb.buffer_size = sizeof(data); 813 cb.user_buffer = data; 814 ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, false, &cb); 815 816 struct pipe_image_view image = {0}; 817 image.resource = dstsurf->texture; 818 image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_ALLOW_DCC_STORE; 819 image.format = util_format_linear(dstsurf->format); 820 image.u.tex.level = dstsurf->u.tex.level; 821 image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ 822 image.u.tex.last_layer = dstsurf->u.tex.last_layer; 823 824 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &image); 825 826 struct pipe_grid_info info = {0}; 827 void *shader; 828 829 if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) { 830 if (!sctx->cs_clear_render_target) 831 sctx->cs_clear_render_target = si_clear_render_target_shader(ctx); 832 shader = sctx->cs_clear_render_target; 833 834 info.block[0] = 8; 835 info.last_block[0] = width % 8; 836 info.block[1] = 8; 837 info.last_block[1] = height % 8; 838 info.block[2] = 1; 839 info.grid[0] = DIV_ROUND_UP(width, 8); 840 info.grid[1] = DIV_ROUND_UP(height, 8); 841 info.grid[2] = num_layers; 842 } else { 843 if (!sctx->cs_clear_render_target_1d_array) 844 sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx); 845 shader = sctx->cs_clear_render_target_1d_array; 846 847 info.block[0] = 64; 848 info.last_block[0] = width % 64; 849 info.block[1] = 1; 850 info.block[2] = 1; 851 info.grid[0] = DIV_ROUND_UP(width, 64); 852 info.grid[1] = num_layers; 853 info.grid[2] = 1; 854 } 855 856 si_launch_grid_internal(sctx, &info, shader, SI_OP_SYNC_BEFORE_AFTER | SI_OP_CS_IMAGE | 857 (render_condition_enabled ? SI_OP_CS_RENDER_COND_ENABLE : 0)); 858 859 ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &saved_image); 860 ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb); 861 pipe_resource_reference(&saved_image.resource, NULL); 862} 863