/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"
#include "si_build_pm4.h"

/* Set this if you want the ME to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define CP_DMA_SYNC (1 << 0)

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define CP_DMA_RAW_WAIT (1 << 1)
#define CP_DMA_DST_IS_GDS (1 << 2)
#define CP_DMA_CLEAR (1 << 3)
#define CP_DMA_PFP_SYNC_ME (1 << 4)
#define CP_DMA_SRC_IS_GDS (1 << 5)

/* The max number of bytes that can be copied per packet. */
static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
{
   unsigned max =
      sctx->chip_class >= GFX9 ? S_415_BYTE_COUNT_GFX9(~0u) : S_415_BYTE_COUNT_GFX6(~0u);

   /* make it aligned for optimal performance */
   return max & ~(SI_CPDMA_ALIGNMENT - 1);
}

/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
 * a buffer. The size must not exceed cp_dma_max_byte_count() (bits [20:0] of
 * the byte-count field on GFX6). If CP_DMA_CLEAR is set, src_va is a 32-bit
 * clear value.
 */
static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va,
                           uint64_t src_va, unsigned size, unsigned flags,
                           enum si_cache_policy cache_policy)
{
   uint32_t header = 0, command = 0;

   assert(size <= cp_dma_max_byte_count(sctx));
   assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS);

   if (sctx->chip_class >= GFX9)
      command |= S_415_BYTE_COUNT_GFX9(size);
   else
      command |= S_415_BYTE_COUNT_GFX6(size);

   /* Sync flags. */
   if (flags & CP_DMA_SYNC)
      header |= S_411_CP_SYNC(1);

   if (flags & CP_DMA_RAW_WAIT)
      command |= S_415_RAW_WAIT(1);

   /* Src and dst flags. */
   if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) {
      header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
   } else if (flags & CP_DMA_DST_IS_GDS) {
      header |= S_411_DST_SEL(V_411_GDS);
      /* GDS increments the address, not CP. */
      command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT);
   } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
      header |=
         S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM);
   }

   if (flags & CP_DMA_CLEAR) {
      header |= S_411_SRC_SEL(V_411_DATA);
   } else if (flags & CP_DMA_SRC_IS_GDS) {
      header |= S_411_SRC_SEL(V_411_GDS);
      /* Both of these are required for GDS. It does increment the address. */
      command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT);
   } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
      header |=
         S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
   }

   radeon_begin(cs);

   if (sctx->chip_class >= GFX7) {
      radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(header);
      radeon_emit(src_va);       /* SRC_ADDR_LO [31:0] */
      radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */
      radeon_emit(dst_va);       /* DST_ADDR_LO [31:0] */
      radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */
      radeon_emit(command);
   } else {
      header |= S_411_SRC_ADDR_HI(src_va >> 32);

      radeon_emit(PKT3(PKT3_CP_DMA, 4, 0));
      radeon_emit(src_va);                  /* SRC_ADDR_LO [31:0] */
      radeon_emit(header);                  /* SRC_ADDR_HI [15:0] + flags. */
      radeon_emit(dst_va);                  /* DST_ADDR_LO [31:0] */
      radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
      radeon_emit(command);
   }

   /* CP DMA is executed in ME, but index buffers are read by PFP.
    * This ensures that ME (CP DMA) is idle before PFP starts fetching
    * indices. If we wanted to execute CP DMA in PFP, this packet
    * should precede it.
    */
   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
      radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(0);
   }
   radeon_end();
}
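/* Illustrative note on the size limit (not driver code): assuming the GFX6
 * byte-count field is 21 bits wide and SI_CPDMA_ALIGNMENT is 32,
 * cp_dma_max_byte_count() works out to:
 *
 *    max  = S_415_BYTE_COUNT_GFX6(~0u);   // 0x1fffff, all 21 bits set
 *    max &= ~(SI_CPDMA_ALIGNMENT - 1);    // 0x1fffe0, 32-byte aligned
 *
 * so anything larger (roughly 2 MB on GFX6) is split into multiple packets
 * by the chunking loops in the callers below. */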
void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   /* Issue a dummy DMA that copies zero bytes.
    *
    * The DMA engine will see that there's no work to do and skip this
    * DMA request, however, the CP will see the sync flag and still wait
    * for all DMAs to complete.
    */
   si_emit_cp_dma(sctx, cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
}

static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
                              struct pipe_resource *src, unsigned byte_count,
                              uint64_t remaining_size, unsigned user_flags, enum si_coherency coher,
                              bool *is_first, unsigned *packet_flags)
{
   /* Count the memory usage so that need_cs_space can take it into account. */
   if (dst)
      si_context_add_resource_size(sctx, dst);
   if (src)
      si_context_add_resource_size(sctx, src);

   if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE))
      si_need_gfx_cs_space(sctx, 0);

   /* This must be done after need_cs_space. */
   if (dst)
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst), RADEON_USAGE_WRITE,
                                RADEON_PRIO_CP_DMA);
   if (src)
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ,
                                RADEON_PRIO_CP_DMA);

   /* Flush the caches for the first copy only.
    * Also wait for the previous CP DMA operations.
    */
   if (*is_first && sctx->flags)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);

   if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
      *packet_flags |= CP_DMA_RAW_WAIT;

   *is_first = false;

   /* Do the synchronization after the last DMA, so that all data
    * is written to memory.
    */
   if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size) {
      *packet_flags |= CP_DMA_SYNC;

      if (coher == SI_COHERENCY_SHADER)
         *packet_flags |= CP_DMA_PFP_SYNC_ME;
   }
}
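/* Illustrative note (not driver code): how si_cp_dma_prepare() spreads the
 * packet flags across a chunked operation. For a copy split into three
 * packets, called with SI_OP_SYNC_CPDMA_BEFORE | SI_OP_SYNC_AFTER and
 * SI_COHERENCY_SHADER, the per-packet flags come out as:
 *
 *    packet 0: CP_DMA_RAW_WAIT                  (wait for prior CP DMA writes)
 *    packet 1: (no sync flags)
 *    packet 2: CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME (sync on the last packet only)
 */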
void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
                            struct pipe_resource *dst, uint64_t offset, uint64_t size,
                            unsigned value, unsigned user_flags, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   struct si_resource *sdst = si_resource(dst);
   uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
   bool is_first = true;

   assert(size && size % 4 == 0);

   if (user_flags & SI_OP_SYNC_CS_BEFORE)
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

   if (user_flags & SI_OP_SYNC_PS_BEFORE)
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   if (sdst) {
      util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

      if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
         sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
   }

   while (size) {
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
      unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);

      si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first,
                        &dma_flags);

      /* Emit the clear packet. */
      si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);

      size -= byte_count;
      va += byte_count;
   }

   if (sdst && cache_policy != L2_BYPASS)
      sdst->TC_L2_dirty = true;

   /* If it's not a framebuffer fast clear... */
   if (coher == SI_COHERENCY_SHADER)
      sctx->num_cp_dma_calls++;
}

/**
 * Realign the CP DMA engine. This must be done after a copy with an unaligned
 * size.
 *
 * \param size  Number of bytes remaining to reach CP DMA alignment;
 *              must be less than SI_CPDMA_ALIGNMENT.
 */
static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags,
                                     enum si_coherency coher, enum si_cache_policy cache_policy,
                                     bool *is_first)
{
   uint64_t va;
   unsigned dma_flags = 0;
   unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;

   assert(size < SI_CPDMA_ALIGNMENT);

   /* Use the scratch buffer as the dummy buffer. The 3D engine should be
    * idle at this point.
    */
   if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) {
      si_resource_reference(&sctx->scratch_buffer, NULL);
      sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b,
                                                      SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
                                                      PIPE_USAGE_DEFAULT, scratch_size, 256);
      if (!sctx->scratch_buffer)
         return;

      si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
   }

   si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size,
                     user_flags, coher, is_first, &dma_flags);

   va = sctx->scratch_buffer->gpu_address;
   si_emit_cp_dma(sctx, &sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy);
}
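/* Worked example (assuming SI_CPDMA_ALIGNMENT == 32): after a 100-byte copy,
 * the engine's internal counter sits 4 bytes past a 32-byte boundary, so the
 * caller invokes the function above with size = 32 - (100 % 32) = 28; the
 * dummy 28-byte copy inside the scratch buffer realigns the counter. */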
/**
 * Do memcpy between buffers using CP DMA.
 * If src or dst is NULL, it means read or write GDS, respectively.
 *
 * \param user_flags  bitmask of SI_OP_*
 */
void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                           struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                           unsigned size, unsigned user_flags, enum si_coherency coher,
                           enum si_cache_policy cache_policy)
{
   uint64_t main_dst_offset, main_src_offset;
   unsigned skipped_size = 0;
   unsigned realign_size = 0;
   unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
   bool is_first = true;

   assert(size);

   if (dst) {
      /* Skip this for the L2 prefetch. */
      if (dst != src || dst_offset != src_offset) {
         /* Mark the buffer range of destination as valid (initialized),
          * so that transfer_map knows it should wait for the GPU when mapping
          * that range. */
         util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
      }

      dst_offset += si_resource(dst)->gpu_address;
   }
   if (src)
      src_offset += si_resource(src)->gpu_address;

   /* The workarounds aren't needed on Fiji and beyond. */
   if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
      /* If the size is not aligned, we must add a dummy copy at the end
       * just to align the internal counter. Otherwise, the DMA engine
       * would slow down by an order of magnitude for following copies.
       */
      if (size % SI_CPDMA_ALIGNMENT)
         realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

      /* If the copy begins unaligned, we must start copying from the next
       * aligned block and the skipped part should be copied after everything
       * else has been copied. Only the src alignment matters, not dst.
       *
       * GDS doesn't need the source address to be aligned.
       */
      if (src && src_offset % SI_CPDMA_ALIGNMENT) {
         skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
         /* The main part will be skipped if the size is too small. */
         skipped_size = MIN2(skipped_size, size);
         size -= skipped_size;
      }
   }

   /* TMZ handling */
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
      bool secure = src && (si_resource(src)->flags & RADEON_FLAG_ENCRYPTED);
      assert(!secure || (!dst || (si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED)));
      if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
                         RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
      }
   }

   if (user_flags & SI_OP_SYNC_CS_BEFORE)
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

   if (user_flags & SI_OP_SYNC_PS_BEFORE)
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

   if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
      sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

   /* This is the main part doing the copying. Src is always aligned. */
   main_dst_offset = dst_offset + skipped_size;
   main_src_offset = src_offset + skipped_size;

   while (size) {
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
      unsigned dma_flags = gds_flags;

      si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags,
                        coher, &is_first, &dma_flags);

      si_emit_cp_dma(sctx, &sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags,
                     cache_policy);

      size -= byte_count;
      main_src_offset += byte_count;
      main_dst_offset += byte_count;
   }

   /* Copy the part we skipped because src wasn't aligned. */
   if (skipped_size) {
      unsigned dma_flags = gds_flags;

      si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
                        coher, &is_first, &dma_flags);

      si_emit_cp_dma(sctx, &sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags,
                     cache_policy);
   }

   /* Finally, realign the engine if the size wasn't aligned. */
   if (realign_size) {
      si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first);
   }

   if (dst && cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* If it's not a prefetch or GDS copy... */
   if (dst && src && (dst != src || dst_offset != src_offset))
      sctx->num_cp_dma_calls++;
}
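/* Worked example for the alignment handling above (assuming
 * SI_CPDMA_ALIGNMENT == 32): copying size = 100 from src_offset = 13 on an
 * affected chip yields:
 *
 *    realign_size = 32 - (100 % 32) = 28  (dummy copy emitted at the end)
 *    skipped_size = 32 - (13 % 32)  = 19  (unaligned head, copied second)
 *    main copy    = 81 bytes starting at the aligned source offset 32
 */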
void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
                        unsigned offset, unsigned size)
{
   uint64_t address = si_resource(buf)->gpu_address + offset;

   assert(sctx->chip_class >= GFX7);

   /* The prefetch address and size must be aligned, so that we don't have to apply
    * the complicated hw bug workaround.
    *
    * The size should also be less than 2 MB, so that we don't have to use a loop.
    * Callers shouldn't need to prefetch more than 2 MB.
    */
   assert(size % SI_CPDMA_ALIGNMENT == 0);
   assert(address % SI_CPDMA_ALIGNMENT == 0);
   assert(size < S_415_BYTE_COUNT_GFX6(~0u));

   uint32_t header = S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
   uint32_t command = S_415_BYTE_COUNT_GFX6(size);

   if (sctx->chip_class >= GFX9) {
      command |= S_415_DISABLE_WR_CONFIRM_GFX9(1);
      header |= S_411_DST_SEL(V_411_NOWHERE);
   } else {
      command |= S_415_DISABLE_WR_CONFIRM_GFX6(1);
      header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
   }

   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
   radeon_emit(header);
   radeon_emit(address);       /* SRC_ADDR_LO [31:0] */
   radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */
   radeon_emit(address);       /* DST_ADDR_LO [31:0] */
   radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */
   radeon_emit(command);
   radeon_end();
}
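/* Usage sketch (hypothetical caller): prefetching a shader binary into L2
 * before a draw. The arguments must satisfy the asserts above, i.e. the
 * address and size are multiples of SI_CPDMA_ALIGNMENT; "bo" and
 * "binary_size" are illustrative names, not real shader fields:
 *
 *    unsigned size = align(shader->binary_size, SI_CPDMA_ALIGNMENT);
 *    si_cp_dma_prefetch(sctx, &shader->bo->b.b, 0, size);
 */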
void si_test_gds(struct si_context *sctx)
{
   struct pipe_context *ctx = &sctx->b;
   struct pipe_resource *src, *dst;
   unsigned r[4] = {};
   unsigned offset = debug_get_num_option("OFFSET", 16);

   src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
   dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 0, 4, 0xabcdef01, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 4, 4, 0x23456789, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 8, 4, 0x87654321, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, src, 12, 4, 0xfedcba98, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);
   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, SI_OP_SYNC_BEFORE_AFTER,
                          SI_COHERENCY_SHADER, L2_BYPASS);

   si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);
   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);

   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
   printf("GDS copy  = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
          r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98
             ? "pass"
             : "fail");

   si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146,
                          SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE, L2_BYPASS);
   si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, SI_OP_SYNC_BEFORE_AFTER,
                         SI_COHERENCY_NONE, L2_BYPASS);

   pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
   printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
          r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146
             ? "pass"
             : "fail");

   pipe_resource_reference(&src, NULL);
   pipe_resource_reference(&dst, NULL);
   exit(0);
}

void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
                      unsigned size, unsigned dst_sel, unsigned engine, const void *data)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   assert(offset % 4 == 0);
   assert(size % 4 == 0);

   if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM)
      dst_sel = V_370_MEM_GRBM;

   radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
   uint64_t va = buf->gpu_address + offset;

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
   radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
   radeon_emit(va);
   radeon_emit(va >> 32);
   radeon_emit_array((const uint32_t *)data, size / 4);
   radeon_end();
}
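/* Usage sketch (hypothetical caller): writing two dwords into a buffer via
 * WRITE_DATA. V_370_MEM is the dst_sel handled above; V_370_ME as the
 * engine selector is an assumption here, not dictated by this function:
 *
 *    uint32_t values[2] = {0x11111111, 0x22222222};
 *    si_cp_write_data(sctx, buf, 0, sizeof(values), V_370_MEM, V_370_ME, values);
 */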
"pass" 472 : "fail"); 473 474 pipe_resource_reference(&src, NULL); 475 pipe_resource_reference(&dst, NULL); 476 exit(0); 477} 478 479void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, 480 unsigned size, unsigned dst_sel, unsigned engine, const void *data) 481{ 482 struct radeon_cmdbuf *cs = &sctx->gfx_cs; 483 484 assert(offset % 4 == 0); 485 assert(size % 4 == 0); 486 487 if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM) 488 dst_sel = V_370_MEM_GRBM; 489 490 radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); 491 uint64_t va = buf->gpu_address + offset; 492 493 radeon_begin(cs); 494 radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); 495 radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); 496 radeon_emit(va); 497 radeon_emit(va >> 32); 498 radeon_emit_array((const uint32_t *)data, size / 4); 499 radeon_end(); 500} 501 502void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, 503 struct si_resource *dst, unsigned dst_offset, unsigned src_sel, 504 struct si_resource *src, unsigned src_offset) 505{ 506 /* cs can point to the compute IB, which has the buffer list in gfx_cs. */ 507 if (dst) { 508 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); 509 } 510 if (src) { 511 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); 512 } 513 514 uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; 515 uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset; 516 517 radeon_begin(cs); 518 radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); 519 radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); 520 radeon_emit(src_va); 521 radeon_emit(src_va >> 32); 522 radeon_emit(dst_va); 523 radeon_emit(dst_va >> 32); 524 radeon_end(); 525} 526