/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sid.h"
#include "si_pipe.h"

#include "util/u_format.h"

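/* Copy between two buffers on the SI async DMA ring.
 *
 * The copy is split into as many COPY packets as needed to stay below the
 * per-packet size limit. Each packet is 5 dwords: header, low dwords of the
 * destination and source addresses, then the high bits of both addresses.
 * The dword-aligned sub-command is used when the offsets and the size are
 * all dword-aligned, the byte-aligned one otherwise.
 */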
static void si_dma_copy_buffer(struct si_context *ctx,
			       struct pipe_resource *dst,
			       struct pipe_resource *src,
			       uint64_t dst_offset,
			       uint64_t src_offset,
			       uint64_t size)
{
	struct radeon_cmdbuf *cs = ctx->dma_cs;
	unsigned i, ncopy, count, max_size, sub_cmd, shift;
	struct si_resource *sdst = si_resource(dst);
	struct si_resource *ssrc = si_resource(src);

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&sdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += sdst->gpu_address;
	src_offset += ssrc->gpu_address;

	/* see whether we should use the dword-aligned or byte-aligned copy */
	if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
		sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
		shift = 2;
		max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
	} else {
		sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
		shift = 0;
		max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
	}

	ncopy = DIV_ROUND_UP(size, max_size);
	si_need_dma_space(ctx, ncopy * 5, sdst, ssrc);

	for (i = 0; i < ncopy; i++) {
		count = MIN2(size, max_size);
		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd,
					      count >> shift));
		radeon_emit(cs, dst_offset);
		radeon_emit(cs, src_offset);
		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
		dst_offset += count;
		src_offset += count;
		size -= count;
	}
}

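/* Copy a 2D region between a linear and a tiled texture using the tiled
 * COPY packet (9 dwords per packet).
 *
 * One of the two surfaces is expected to be linear and the other tiled
 * (their modes must differ, see the assert below); 'detile' selects the
 * direction. The tiling parameters (bank width/height, macro-tile aspect,
 * pipe config, ...) are taken from the tiled surface's tile mode. The copy
 * is split along the Y axis so that each packet stays below the
 * dword-aligned size limit.
 */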
static void si_dma_copy_tile(struct si_context *ctx,
			     struct pipe_resource *dst,
			     unsigned dst_level,
			     unsigned dst_x,
			     unsigned dst_y,
			     unsigned dst_z,
			     struct pipe_resource *src,
			     unsigned src_level,
			     unsigned src_x,
			     unsigned src_y,
			     unsigned src_z,
			     unsigned copy_height,
			     unsigned pitch,
			     unsigned bpp)
{
	struct radeon_cmdbuf *cs = ctx->dma_cs;
	struct si_texture *ssrc = (struct si_texture*)src;
	struct si_texture *sdst = (struct si_texture*)dst;
	unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
	bool detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
	struct si_texture *linear = detile ? sdst : ssrc;
	struct si_texture *tiled = detile ? ssrc : sdst;
	unsigned linear_lvl = detile ? dst_level : src_level;
	unsigned tiled_lvl = detile ? src_level : dst_level;
	struct radeon_info *info = &ctx->screen->info;
	unsigned index = tiled->surface.u.legacy.tiling_index[tiled_lvl];
	unsigned tile_mode = info->si_tile_mode_array[index];
	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
	unsigned ncopy, height, cheight, i;
	unsigned linear_x, linear_y, linear_z, tiled_x, tiled_y, tiled_z;
	unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
	uint64_t base, addr;
	unsigned pipe_config;

	assert(dst_mode != ssrc->surface.u.legacy.level[src_level].mode);

	sub_cmd = SI_DMA_COPY_TILED;
	lbpp = util_logbase2(bpp);
	pitch_tile_max = ((pitch / bpp) / 8) - 1;

	linear_x = detile ? dst_x : src_x;
	linear_y = detile ? dst_y : src_y;
	linear_z = detile ? dst_z : src_z;
	tiled_x = detile ? src_x : dst_x;
	tiled_y = detile ? src_y : dst_y;
	tiled_z = detile ? src_z : dst_z;

	assert(!util_format_is_depth_and_stencil(tiled->buffer.b.b.format));

	array_mode = G_009910_ARRAY_MODE(tile_mode);
	slice_tile_max = (tiled->surface.u.legacy.level[tiled_lvl].nblk_x *
			  tiled->surface.u.legacy.level[tiled_lvl].nblk_y) / (8*8) - 1;
	/* The linear height must be the same as the slice tile max height.
	 * It's OK even if the linear destination/source has a smaller height,
	 * because the DMA packet size uses copy_height, which is always
	 * smaller than or equal to the linear height.
	 */
	height = tiled->surface.u.legacy.level[tiled_lvl].nblk_y;
	base = tiled->surface.u.legacy.level[tiled_lvl].offset;
	addr = linear->surface.u.legacy.level[linear_lvl].offset;
	addr += (uint64_t)linear->surface.u.legacy.level[linear_lvl].slice_size_dw * 4 * linear_z;
	addr += linear_y * pitch + linear_x * bpp;
	bank_h = G_009910_BANK_HEIGHT(tile_mode);
	bank_w = G_009910_BANK_WIDTH(tile_mode);
	mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode);
	/* Non-depth modes don't have TILE_SPLIT set. */
	tile_split = util_logbase2(tiled->surface.u.legacy.tile_split >> 6);
	nbanks = G_009910_NUM_BANKS(tile_mode);
	base += tiled->buffer.gpu_address;
	addr += linear->buffer.gpu_address;

	pipe_config = G_009910_PIPE_CONFIG(tile_mode);
	mt = G_009910_MICRO_TILE_MODE(tile_mode);
	size = copy_height * pitch;
	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
	si_need_dma_space(ctx, ncopy * 9, &sdst->buffer, &ssrc->buffer);

	for (i = 0; i < ncopy; i++) {
		cheight = copy_height;
		if (cheight * pitch > SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE) {
			cheight = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE / pitch;
		}
		size = cheight * pitch;
		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size / 4));
		radeon_emit(cs, base >> 8);
		radeon_emit(cs, (detile << 31) | (array_mode << 27) |
				(lbpp << 24) | (bank_h << 21) |
				(bank_w << 18) | (mt_aspect << 16));
		radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
		radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
		radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
		radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
		radeon_emit(cs, addr & 0xfffffffc);
		radeon_emit(cs, (addr >> 32UL) & 0xff);
		copy_height -= cheight;
		addr += cheight * pitch;
		tiled_y += cheight;
	}
}

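/* Copy entry point installed as sctx->dma_copy (resource_copy_region-style
 * signature).
 *
 * Buffer->buffer copies go straight to si_dma_copy_buffer. Texture copies
 * are currently always routed to the si_resource_copy_region fallback
 * because of the GPU lockup issue described in the XXX comment below. The
 * checks after the unconditional goto describe what the DMA path would
 * handle if it were re-enabled: whole-level copies with matching pitches and
 * no offsets, done either as a linear copy when the tiling modes match or
 * via si_dma_copy_tile when they differ.
 */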
static void si_dma_copy(struct pipe_context *ctx,
			struct pipe_resource *dst,
			unsigned dst_level,
			unsigned dstx, unsigned dsty, unsigned dstz,
			struct pipe_resource *src,
			unsigned src_level,
			const struct pipe_box *src_box)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_texture *ssrc = (struct si_texture*)src;
	struct si_texture *sdst = (struct si_texture*)dst;
	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode;
	unsigned src_w, dst_w;
	unsigned src_x, src_y;
	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;

	if (sctx->dma_cs == NULL ||
	    src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
		goto fallback;
	}

	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
		si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
		return;
	}

	/* XXX: Using the asynchronous DMA engine for multi-dimensional
	 * operations seems to cause random GPU lockups for various people.
	 * While the root cause for this might need to be fixed in the kernel,
	 * let's disable it for now.
	 *
	 * Before re-enabling this, please make sure you can hit all newly
	 * enabled paths in your testing, preferably with both piglit and real
	 * world apps, and get in touch with people on the bug reports below
	 * for stability testing.
	 *
	 * https://bugs.freedesktop.org/show_bug.cgi?id=85647
	 * https://bugs.freedesktop.org/show_bug.cgi?id=83500
	 */
	goto fallback;

	if (src_box->depth > 1 ||
	    !si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
				     dstz, ssrc, src_level, src_box))
		goto fallback;

	src_x = util_format_get_nblocksx(src->format, src_box->x);
	dst_x = util_format_get_nblocksx(src->format, dst_x);
	src_y = util_format_get_nblocksy(src->format, src_box->y);
	dst_y = util_format_get_nblocksy(src->format, dst_y);

	bpp = sdst->surface.bpe;
	dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x * sdst->surface.bpe;
	src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x * ssrc->surface.bpe;
	src_w = u_minify(ssrc->buffer.b.b.width0, src_level);
	dst_w = u_minify(sdst->buffer.b.b.width0, dst_level);

	dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
	src_mode = ssrc->surface.u.legacy.level[src_level].mode;

	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w ||
	    src_box->width != src_w ||
	    src_box->height != u_minify(ssrc->buffer.b.b.height0, src_level) ||
	    src_box->height != u_minify(sdst->buffer.b.b.height0, dst_level) ||
	    ssrc->surface.u.legacy.level[src_level].nblk_y !=
	    sdst->surface.u.legacy.level[dst_level].nblk_y) {
		/* FIXME si can do partial blit */
		goto fallback;
	}
	/* The x tests here are currently useless (because we don't support
	 * partial blits), but keep them around so we don't forget about them.
	 */
	if ((src_pitch % 8) || (src_box->x % 8) || (dst_x % 8) ||
	    (src_box->y % 8) || (dst_y % 8) || (src_box->height % 8)) {
		goto fallback;
	}

	if (src_mode == dst_mode) {
		uint64_t dst_offset, src_offset;
		/* A simple DMA blit will do. NOTE: the code here assumes:
		 *   src_box->x/y == 0
		 *   dst_x/y == 0
		 *   dst_pitch == src_pitch
		 */
		src_offset = ssrc->surface.u.legacy.level[src_level].offset;
		src_offset += (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4 * src_box->z;
		src_offset += src_y * src_pitch + src_x * bpp;
		dst_offset = sdst->surface.u.legacy.level[dst_level].offset;
		dst_offset += (uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4 * dst_z;
		dst_offset += dst_y * dst_pitch + dst_x * bpp;
		si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset,
				   (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4);
	} else {
		si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z,
				 src, src_level, src_x, src_y, src_box->z,
				 src_box->height / ssrc->surface.blk_h,
				 dst_pitch, bpp);
	}
	return;

fallback:
	si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
				src, src_level, src_box);
}

void si_init_dma_functions(struct si_context *sctx)
{
	sctx->dma_copy = si_dma_copy;
}