1848b8605Smrg/* 2848b8605Smrg * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3b8e80941Smrg * Copyright 2018 Advanced Micro Devices, Inc. 4b8e80941Smrg * All Rights Reserved. 5848b8605Smrg * 6848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a 7848b8605Smrg * copy of this software and associated documentation files (the "Software"), 8848b8605Smrg * to deal in the Software without restriction, including without limitation 9848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub 10848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom 11848b8605Smrg * the Software is furnished to do so, subject to the following conditions: 12848b8605Smrg * 13848b8605Smrg * The above copyright notice and this permission notice (including the next 14848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the 15848b8605Smrg * Software. 16848b8605Smrg * 17848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 20848b8605Smrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 21848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 22848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 23848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE. 24848b8605Smrg */ 25848b8605Smrg 26848b8605Smrg#include "sid.h" 27848b8605Smrg#include "si_pipe.h" 28848b8605Smrg 29848b8605Smrg#include "util/u_format.h" 30848b8605Smrg 31848b8605Smrgstatic void si_dma_copy_buffer(struct si_context *ctx, 32848b8605Smrg struct pipe_resource *dst, 33848b8605Smrg struct pipe_resource *src, 34848b8605Smrg uint64_t dst_offset, 35848b8605Smrg uint64_t src_offset, 36848b8605Smrg uint64_t size) 37848b8605Smrg{ 38b8e80941Smrg struct radeon_cmdbuf *cs = ctx->dma_cs; 39b8e80941Smrg unsigned i, ncopy, count, max_size, sub_cmd, shift; 40b8e80941Smrg struct si_resource *sdst = si_resource(dst); 41b8e80941Smrg struct si_resource *ssrc = si_resource(src); 42848b8605Smrg 43848b8605Smrg /* Mark the buffer range of destination as valid (initialized), 44848b8605Smrg * so that transfer_map knows it should wait for the GPU when mapping 45848b8605Smrg * that range. */ 46b8e80941Smrg util_range_add(&sdst->valid_buffer_range, dst_offset, 47848b8605Smrg dst_offset + size); 48848b8605Smrg 49b8e80941Smrg dst_offset += sdst->gpu_address; 50b8e80941Smrg src_offset += ssrc->gpu_address; 51848b8605Smrg 52b8e80941Smrg /* see whether we should use the dword-aligned or byte-aligned copy */ 53848b8605Smrg if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { 54848b8605Smrg sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; 55848b8605Smrg shift = 2; 56b8e80941Smrg max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; 57848b8605Smrg } else { 58848b8605Smrg sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; 59848b8605Smrg shift = 0; 60b8e80941Smrg max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; 61848b8605Smrg } 62848b8605Smrg 63b8e80941Smrg ncopy = DIV_ROUND_UP(size, max_size); 64b8e80941Smrg si_need_dma_space(ctx, ncopy * 5, sdst, ssrc); 65848b8605Smrg 66848b8605Smrg for (i = 0; i < ncopy; i++) { 67b8e80941Smrg count = MIN2(size, max_size); 68b8e80941Smrg radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, 69b8e80941Smrg count >> shift)); 70b8e80941Smrg radeon_emit(cs, dst_offset); 71b8e80941Smrg radeon_emit(cs, src_offset); 72b8e80941Smrg radeon_emit(cs, (dst_offset >> 32UL) & 0xff); 73b8e80941Smrg radeon_emit(cs, (src_offset >> 32UL) & 0xff); 74b8e80941Smrg dst_offset += count; 75b8e80941Smrg src_offset += count; 76b8e80941Smrg size -= count; 77848b8605Smrg } 78848b8605Smrg} 79848b8605Smrg 80848b8605Smrgstatic void si_dma_copy_tile(struct si_context *ctx, 81848b8605Smrg struct pipe_resource *dst, 82848b8605Smrg unsigned dst_level, 83848b8605Smrg unsigned dst_x, 84848b8605Smrg unsigned dst_y, 85848b8605Smrg unsigned dst_z, 86848b8605Smrg struct pipe_resource *src, 87848b8605Smrg unsigned src_level, 88848b8605Smrg unsigned src_x, 89848b8605Smrg unsigned src_y, 90848b8605Smrg unsigned src_z, 91848b8605Smrg unsigned copy_height, 92848b8605Smrg unsigned pitch, 93848b8605Smrg unsigned bpp) 94848b8605Smrg{ 95b8e80941Smrg struct radeon_cmdbuf *cs = ctx->dma_cs; 96b8e80941Smrg struct si_texture *ssrc = (struct si_texture*)src; 97b8e80941Smrg struct si_texture *sdst = (struct si_texture*)dst; 98b8e80941Smrg unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; 99b8e80941Smrg bool detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED; 100b8e80941Smrg struct si_texture *linear = detile ? sdst : ssrc; 101b8e80941Smrg struct si_texture *tiled = detile ? ssrc : sdst; 102b8e80941Smrg unsigned linear_lvl = detile ? dst_level : src_level; 103b8e80941Smrg unsigned tiled_lvl = detile ? src_level : dst_level; 104b8e80941Smrg struct radeon_info *info = &ctx->screen->info; 105b8e80941Smrg unsigned index = tiled->surface.u.legacy.tiling_index[tiled_lvl]; 106b8e80941Smrg unsigned tile_mode = info->si_tile_mode_array[index]; 107848b8605Smrg unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; 108b8e80941Smrg unsigned ncopy, height, cheight, i; 109b8e80941Smrg unsigned linear_x, linear_y, linear_z, tiled_x, tiled_y, tiled_z; 110848b8605Smrg unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt; 111848b8605Smrg uint64_t base, addr; 112b8e80941Smrg unsigned pipe_config; 113848b8605Smrg 114b8e80941Smrg assert(dst_mode != ssrc->surface.u.legacy.level[src_level].mode); 115848b8605Smrg 116848b8605Smrg sub_cmd = SI_DMA_COPY_TILED; 117848b8605Smrg lbpp = util_logbase2(bpp); 118848b8605Smrg pitch_tile_max = ((pitch / bpp) / 8) - 1; 119848b8605Smrg 120b8e80941Smrg linear_x = detile ? dst_x : src_x; 121b8e80941Smrg linear_y = detile ? dst_y : src_y; 122b8e80941Smrg linear_z = detile ? dst_z : src_z; 123b8e80941Smrg tiled_x = detile ? src_x : dst_x; 124b8e80941Smrg tiled_y = detile ? src_y : dst_y; 125b8e80941Smrg tiled_z = detile ? src_z : dst_z; 126b8e80941Smrg 127b8e80941Smrg assert(!util_format_is_depth_and_stencil(tiled->buffer.b.b.format)); 128b8e80941Smrg 129b8e80941Smrg array_mode = G_009910_ARRAY_MODE(tile_mode); 130b8e80941Smrg slice_tile_max = (tiled->surface.u.legacy.level[tiled_lvl].nblk_x * 131b8e80941Smrg tiled->surface.u.legacy.level[tiled_lvl].nblk_y) / (8*8) - 1; 132b8e80941Smrg /* linear height must be the same as the slice tile max height, it's ok even 133b8e80941Smrg * if the linear destination/source have smaller heigh as the size of the 134b8e80941Smrg * dma packet will be using the copy_height which is always smaller or equal 135b8e80941Smrg * to the linear height 136b8e80941Smrg */ 137b8e80941Smrg height = tiled->surface.u.legacy.level[tiled_lvl].nblk_y; 138b8e80941Smrg base = tiled->surface.u.legacy.level[tiled_lvl].offset; 139b8e80941Smrg addr = linear->surface.u.legacy.level[linear_lvl].offset; 140b8e80941Smrg addr += (uint64_t)linear->surface.u.legacy.level[linear_lvl].slice_size_dw * 4 * linear_z; 141b8e80941Smrg addr += linear_y * pitch + linear_x * bpp; 142b8e80941Smrg bank_h = G_009910_BANK_HEIGHT(tile_mode); 143b8e80941Smrg bank_w = G_009910_BANK_WIDTH(tile_mode); 144b8e80941Smrg mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode); 145b8e80941Smrg /* Non-depth modes don't have TILE_SPLIT set. */ 146b8e80941Smrg tile_split = util_logbase2(tiled->surface.u.legacy.tile_split >> 6); 147b8e80941Smrg nbanks = G_009910_NUM_BANKS(tile_mode); 148b8e80941Smrg base += tiled->buffer.gpu_address; 149b8e80941Smrg addr += linear->buffer.gpu_address; 150b8e80941Smrg 151b8e80941Smrg pipe_config = G_009910_PIPE_CONFIG(tile_mode); 152b8e80941Smrg mt = G_009910_MICRO_TILE_MODE(tile_mode); 153b8e80941Smrg size = copy_height * pitch; 154b8e80941Smrg ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); 155b8e80941Smrg si_need_dma_space(ctx, ncopy * 9, &sdst->buffer, &ssrc->buffer); 156848b8605Smrg 157848b8605Smrg for (i = 0; i < ncopy; i++) { 158848b8605Smrg cheight = copy_height; 159b8e80941Smrg if (cheight * pitch > SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE) { 160b8e80941Smrg cheight = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE / pitch; 161848b8605Smrg } 162b8e80941Smrg size = cheight * pitch; 163b8e80941Smrg radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size / 4)); 164b8e80941Smrg radeon_emit(cs, base >> 8); 165b8e80941Smrg radeon_emit(cs, (detile << 31) | (array_mode << 27) | 166b8e80941Smrg (lbpp << 24) | (bank_h << 21) | 167b8e80941Smrg (bank_w << 18) | (mt_aspect << 16)); 168b8e80941Smrg radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16)); 169b8e80941Smrg radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26)); 170b8e80941Smrg radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18)); 171b8e80941Smrg radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27)); 172b8e80941Smrg radeon_emit(cs, addr & 0xfffffffc); 173b8e80941Smrg radeon_emit(cs, (addr >> 32UL) & 0xff); 174848b8605Smrg copy_height -= cheight; 175848b8605Smrg addr += cheight * pitch; 176b8e80941Smrg tiled_y += cheight; 177848b8605Smrg } 178848b8605Smrg} 179848b8605Smrg 180b8e80941Smrgstatic void si_dma_copy(struct pipe_context *ctx, 181b8e80941Smrg struct pipe_resource *dst, 182b8e80941Smrg unsigned dst_level, 183b8e80941Smrg unsigned dstx, unsigned dsty, unsigned dstz, 184b8e80941Smrg struct pipe_resource *src, 185b8e80941Smrg unsigned src_level, 186b8e80941Smrg const struct pipe_box *src_box) 187848b8605Smrg{ 188848b8605Smrg struct si_context *sctx = (struct si_context *)ctx; 189b8e80941Smrg struct si_texture *ssrc = (struct si_texture*)src; 190b8e80941Smrg struct si_texture *sdst = (struct si_texture*)dst; 191b8e80941Smrg unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode; 192848b8605Smrg unsigned src_w, dst_w; 193848b8605Smrg unsigned src_x, src_y; 194848b8605Smrg unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; 195848b8605Smrg 196b8e80941Smrg if (sctx->dma_cs == NULL || 197b8e80941Smrg src->flags & PIPE_RESOURCE_FLAG_SPARSE || 198b8e80941Smrg dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { 199848b8605Smrg goto fallback; 200848b8605Smrg } 201848b8605Smrg 202848b8605Smrg if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { 203848b8605Smrg si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width); 204848b8605Smrg return; 205848b8605Smrg } 206848b8605Smrg 207848b8605Smrg /* XXX: Using the asynchronous DMA engine for multi-dimensional 208848b8605Smrg * operations seems to cause random GPU lockups for various people. 209848b8605Smrg * While the root cause for this might need to be fixed in the kernel, 210848b8605Smrg * let's disable it for now. 211848b8605Smrg * 212848b8605Smrg * Before re-enabling this, please make sure you can hit all newly 213848b8605Smrg * enabled paths in your testing, preferably with both piglit and real 214848b8605Smrg * world apps, and get in touch with people on the bug reports below 215848b8605Smrg * for stability testing. 216848b8605Smrg * 217848b8605Smrg * https://bugs.freedesktop.org/show_bug.cgi?id=85647 218848b8605Smrg * https://bugs.freedesktop.org/show_bug.cgi?id=83500 219848b8605Smrg */ 220848b8605Smrg goto fallback; 221848b8605Smrg 222b8e80941Smrg if (src_box->depth > 1 || 223b8e80941Smrg !si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, 224b8e80941Smrg dstz, ssrc, src_level, src_box)) 225848b8605Smrg goto fallback; 226848b8605Smrg 227848b8605Smrg src_x = util_format_get_nblocksx(src->format, src_box->x); 228848b8605Smrg dst_x = util_format_get_nblocksx(src->format, dst_x); 229848b8605Smrg src_y = util_format_get_nblocksy(src->format, src_box->y); 230848b8605Smrg dst_y = util_format_get_nblocksy(src->format, dst_y); 231848b8605Smrg 232b8e80941Smrg bpp = sdst->surface.bpe; 233b8e80941Smrg dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x * sdst->surface.bpe; 234b8e80941Smrg src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x * ssrc->surface.bpe; 235b8e80941Smrg src_w = u_minify(ssrc->buffer.b.b.width0, src_level); 236b8e80941Smrg dst_w = u_minify(sdst->buffer.b.b.width0, dst_level); 237b8e80941Smrg 238b8e80941Smrg dst_mode = sdst->surface.u.legacy.level[dst_level].mode; 239b8e80941Smrg src_mode = ssrc->surface.u.legacy.level[src_level].mode; 240b8e80941Smrg 241b8e80941Smrg if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w || 242b8e80941Smrg src_box->width != src_w || 243b8e80941Smrg src_box->height != u_minify(ssrc->buffer.b.b.height0, src_level) || 244b8e80941Smrg src_box->height != u_minify(sdst->buffer.b.b.height0, dst_level) || 245b8e80941Smrg ssrc->surface.u.legacy.level[src_level].nblk_y != 246b8e80941Smrg sdst->surface.u.legacy.level[dst_level].nblk_y) { 247848b8605Smrg /* FIXME si can do partial blit */ 248848b8605Smrg goto fallback; 249848b8605Smrg } 250848b8605Smrg /* the x test here are currently useless (because we don't support partial blit) 251848b8605Smrg * but keep them around so we don't forget about those 252848b8605Smrg */ 253b8e80941Smrg if ((src_pitch % 8) || (src_box->x % 8) || (dst_x % 8) || 254b8e80941Smrg (src_box->y % 8) || (dst_y % 8) || (src_box->height % 8)) { 255848b8605Smrg goto fallback; 256848b8605Smrg } 257848b8605Smrg 258848b8605Smrg if (src_mode == dst_mode) { 259848b8605Smrg uint64_t dst_offset, src_offset; 260848b8605Smrg /* simple dma blit would do NOTE code here assume : 261848b8605Smrg * src_box.x/y == 0 262848b8605Smrg * dst_x/y == 0 263848b8605Smrg * dst_pitch == src_pitch 264848b8605Smrg */ 265b8e80941Smrg src_offset= ssrc->surface.u.legacy.level[src_level].offset; 266b8e80941Smrg src_offset += (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4 * src_box->z; 267848b8605Smrg src_offset += src_y * src_pitch + src_x * bpp; 268b8e80941Smrg dst_offset = sdst->surface.u.legacy.level[dst_level].offset; 269b8e80941Smrg dst_offset += (uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4 * dst_z; 270848b8605Smrg dst_offset += dst_y * dst_pitch + dst_x * bpp; 271848b8605Smrg si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, 272b8e80941Smrg (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4); 273848b8605Smrg } else { 274848b8605Smrg si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z, 275848b8605Smrg src, src_level, src_x, src_y, src_box->z, 276b8e80941Smrg src_box->height / ssrc->surface.blk_h, 277b8e80941Smrg dst_pitch, bpp); 278848b8605Smrg } 279848b8605Smrg return; 280848b8605Smrg 281848b8605Smrgfallback: 282b8e80941Smrg si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, 283b8e80941Smrg src, src_level, src_box); 284b8e80941Smrg} 285b8e80941Smrg 286b8e80941Smrgvoid si_init_dma_functions(struct si_context *sctx) 287b8e80941Smrg{ 288b8e80941Smrg sctx->dma_copy = si_dma_copy; 289848b8605Smrg} 290