/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"

static void si_dma_emit_wait_idle(struct si_context *sctx)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;

        /* NOP waits for idle. */
        if (sctx->chip_class >= CIK)
                radeon_emit(cs, 0x00000000); /* NOP */
        else
                radeon_emit(cs, 0xf0000000); /* NOP */
}

void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
                           uint64_t offset)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;
        uint64_t va = dst->gpu_address + offset;

        if (sctx->chip_class == SI) {
                unreachable("SI DMA doesn't support the timestamp packet.");
                return;
        }

        /* Mark the buffer range of destination as valid (initialized),
         * so that transfer_map knows it should wait for the GPU when mapping
         * that range. */
        util_range_add(&dst->valid_buffer_range, offset, offset + 8);

        assert(va % 8 == 0);

        si_need_dma_space(sctx, 4, dst, NULL);
        si_dma_emit_wait_idle(sctx);

        radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
                                        SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
                                        0));
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
}
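/* Clear a buffer using the SDMA engine.
 *
 * Falls back to the pipe_context clear_buffer path when there is no DMA CS
 * or the destination is sparse. The offset and the size must be aligned to
 * 4 bytes.
 */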
void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                          uint64_t offset, uint64_t size, unsigned clear_value)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;
        unsigned i, ncopy, csize;
        struct si_resource *sdst = si_resource(dst);

        assert(offset % 4 == 0);
        assert(size);
        assert(size % 4 == 0);

        if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
                sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
                return;
        }

        /* Mark the buffer range of destination as valid (initialized),
         * so that transfer_map knows it should wait for the GPU when mapping
         * that range. */
        util_range_add(&sdst->valid_buffer_range, offset, offset + size);

        offset += sdst->gpu_address;

        if (sctx->chip_class == SI) {
                /* the same maximum size as for copying */
                ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
                si_need_dma_space(sctx, ncopy * 4, sdst, NULL);

                for (i = 0; i < ncopy; i++) {
                        csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
                        radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
                                                      csize / 4));
                        radeon_emit(cs, offset);
                        radeon_emit(cs, clear_value);
                        radeon_emit(cs, (offset >> 32) << 16);
                        offset += csize;
                        size -= csize;
                }
                return;
        }

        /* The following code is for CI, VI, Vega/Raven, etc. */
        /* the same maximum size as for copying */
        ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
        si_need_dma_space(sctx, ncopy * 5, sdst, NULL);

        for (i = 0; i < ncopy; i++) {
                csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
                radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
                                                0x8000 /* dword copy */));
                radeon_emit(cs, offset);
                radeon_emit(cs, offset >> 32);
                radeon_emit(cs, clear_value);
                radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
                offset += csize;
                size -= csize;
        }
}
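/* Prepare the DMA CS for a packet sequence of num_dw dwords that writes dst
 * and reads src (either may be NULL).
 *
 * This flushes the GFX IB when the DMA engine would depend on its results,
 * flushes the DMA IB itself when it runs out of space or exceeds the per-IB
 * memory budget, emits a wait-for-idle when a buffer was already referenced
 * by the current DMA IB (to avoid read-after-write hazards), and adds both
 * buffers to the DMA CS buffer list.
 */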
void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
                       struct si_resource *dst, struct si_resource *src)
{
        struct radeon_winsys *ws = ctx->ws;
        uint64_t vram = ctx->dma_cs->used_vram;
        uint64_t gtt = ctx->dma_cs->used_gart;

        if (dst) {
                vram += dst->vram_usage;
                gtt += dst->gart_usage;
        }
        if (src) {
                vram += src->vram_usage;
                gtt += src->gart_usage;
        }

        /* Flush the GFX IB if DMA depends on it. */
        if (!ctx->sdma_uploads_in_progress &&
            radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
            ((dst &&
              ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                          RADEON_USAGE_READWRITE)) ||
             (src &&
              ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                          RADEON_USAGE_WRITE))))
                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

        /* Flush if there's not enough space, or if the memory usage per IB
         * is too large.
         *
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
         * It improves texture upload performance by keeping the DMA
         * engine busy while uploads are being submitted.
         */
        num_dw++; /* for emit_wait_idle below */
        if (!ctx->sdma_uploads_in_progress &&
            (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
             ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
             !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
                si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
                assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
        }

        /* Wait for idle if either buffer has been used in the IB before to
         * prevent read-after-write hazards.
         */
        if ((dst &&
             ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
                                         RADEON_USAGE_READWRITE)) ||
            (src &&
             ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
                                         RADEON_USAGE_WRITE)))
                si_dma_emit_wait_idle(ctx);

        unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
        if (dst) {
                ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
                                  dst->domains, 0);
        }
        if (src) {
                ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
                                  src->domains, 0);
        }

        /* this function is called before all DMA calls, so increment this. */
        ctx->num_dma_calls++;
}

void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence)
{
        struct radeon_cmdbuf *cs = ctx->dma_cs;
        struct radeon_saved_cs saved;
        bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;

        if (!radeon_emitted(cs, 0)) {
                if (fence)
                        ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
                return;
        }

        if (check_vm)
                si_save_cs(ctx->ws, cs, &saved, true);

        ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
        if (fence)
                ctx->ws->fence_reference(fence, ctx->last_sdma_fence);

        if (check_vm) {
                /* Use conservative timeout 800ms, after which we won't wait any
                 * longer and assume the GPU is hung.
                 */
                ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);

                si_check_vm_faults(ctx, &saved, RING_DMA);
                si_clear_saved_cs(&saved);
        }
}

void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
                            uint64_t offset, uint64_t size, unsigned value)
{
        struct si_context *ctx = (struct si_context*)sscreen->aux_context;

        mtx_lock(&sscreen->aux_context_lock);
        si_sdma_clear_buffer(ctx, dst, offset, size, value);
        sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
        mtx_unlock(&sscreen->aux_context_lock);
}