/*	$NetBSD: amdgpu_si_dma.c,v 1.3 2021/12/19 12:21:29 riastradh Exp $	*/

/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Alex Deucher
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_si_dma.c,v 1.3 2021/12/19 12:21:29 riastradh Exp $");

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "si.h"
#include "sid.h"

const u32 sdma_offsets[SDMA_MAX_INSTANCE] =
{
	DMA0_REGISTER_OFFSET,
	DMA1_REGISTER_OFFSET
};

static void si_dma_set_ring_funcs(struct amdgpu_device *adev);
static void si_dma_set_buffer_funcs(struct amdgpu_device *adev);
static void si_dma_set_vm_pte_funcs(struct amdgpu_device *adev);
static void si_dma_set_irq_funcs(struct amdgpu_device *adev);

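/**
 * si_dma_ring_get_rptr - get the current read pointer
 *
 * @ring: amdgpu ring pointer
 *
 * Return the current read pointer from the ring's
 * writeback slot (SI).
 */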
static uint64_t si_dma_ring_get_rptr(struct amdgpu_ring *ring)
{
	return ring->adev->wb.wb[ring->rptr_offs>>2];
}

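/**
 * si_dma_ring_get_wptr - get the current write pointer
 *
 * @ring: amdgpu ring pointer
 *
 * Read DMA_RB_WPTR for this DMA instance and return the
 * write pointer in dwords (SI).
 */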
static uint64_t si_dma_ring_get_wptr(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	u32 me = (ring == &adev->sdma.instance[0].ring) ? 0 : 1;

	return (RREG32(DMA_RB_WPTR + sdma_offsets[me]) & 0x3fffc) >> 2;
}

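/**
 * si_dma_ring_set_wptr - commit the write pointer
 *
 * @ring: amdgpu ring pointer
 *
 * Write the software write pointer (converted to a byte offset)
 * into DMA_RB_WPTR so the engine starts fetching (SI).
 */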
static void si_dma_ring_set_wptr(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	u32 me = (ring == &adev->sdma.instance[0].ring) ? 0 : 1;

	WREG32(DMA_RB_WPTR + sdma_offsets[me],
	       (lower_32_bits(ring->wptr) << 2) & 0x3fffc);
}

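/**
 * si_dma_ring_emit_ib - schedule an indirect buffer on the ring
 *
 * @ring: amdgpu ring pointer
 * @job: job the IB belongs to (used for the VMID)
 * @ib: indirect buffer to schedule
 * @flags: IB flags (unused here)
 *
 * Emit an INDIRECT_BUFFER packet that points the DMA engine
 * at the given IB (SI).
 */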
static void si_dma_ring_emit_ib(struct amdgpu_ring *ring,
				struct amdgpu_job *job,
				struct amdgpu_ib *ib,
				uint32_t flags)
{
	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
	/* The indirect buffer packet must end on an 8 DW boundary in the DMA ring.
	 * Pad as necessary with NOPs.
	 */
	while ((lower_32_bits(ring->wptr) & 7) != 5)
		amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0));
	amdgpu_ring_write(ring, DMA_IB_PACKET(DMA_PACKET_INDIRECT_BUFFER, vmid, 0));
	amdgpu_ring_write(ring, (ib->gpu_addr & 0xFFFFFFE0));
	amdgpu_ring_write(ring, (ib->length_dw << 12) | (upper_32_bits(ib->gpu_addr) & 0xFF));
}

/**
 * si_dma_ring_emit_fence - emit a fence on the DMA ring
 *
 * @ring: amdgpu ring pointer
 * @addr: GPU address where the fence sequence number is written
 * @seq: fence sequence number to write
 * @flags: fence flags (e.g. AMDGPU_FENCE_FLAG_64BIT)
 *
 * Add a DMA fence packet to the ring to write
 * the fence seq number and DMA trap packet to generate
 * an interrupt if needed (SI).
 */
static void si_dma_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq,
				      unsigned flags)
{
	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
	/* write the fence */
	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_FENCE, 0, 0, 0, 0));
	amdgpu_ring_write(ring, addr & 0xfffffffc);
	amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xff));
	amdgpu_ring_write(ring, seq);
	/* optionally write high bits as well */
	if (write64bit) {
		addr += 4;
		amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_FENCE, 0, 0, 0, 0));
		amdgpu_ring_write(ring, addr & 0xfffffffc);
		amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xff));
		amdgpu_ring_write(ring, upper_32_bits(seq));
	}
	/* generate an interrupt */
	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_TRAP, 0, 0, 0, 0));
}

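/**
 * si_dma_stop - stop the async dma engines
 *
 * @adev: amdgpu_device pointer
 *
 * Disable the ring buffer on each DMA instance and mark the
 * rings as not ready (SI).
 */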
static void si_dma_stop(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring;
	u32 rb_cntl;
	unsigned i;

	for (i = 0; i < adev->sdma.num_instances; i++) {
		ring = &adev->sdma.instance[i].ring;
		/* dma0 */
		rb_cntl = RREG32(DMA_RB_CNTL + sdma_offsets[i]);
		rb_cntl &= ~DMA_RB_ENABLE;
		WREG32(DMA_RB_CNTL + sdma_offsets[i], rb_cntl);

		if (adev->mman.buffer_funcs_ring == ring)
			amdgpu_ttm_set_buffer_funcs_status(adev, false);
		ring->sched.ready = false;
	}
}

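/**
 * si_dma_start - set up and start the async dma engines
 *
 * @adev: amdgpu_device pointer
 *
 * Program the ring buffer size, base and read/write pointers,
 * enable read pointer writeback and IBs, then enable the ring
 * and run a ring test on each DMA instance (SI).
 * Returns 0 for success, error for failure.
 */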
static int si_dma_start(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring;
	u32 rb_cntl, dma_cntl, ib_cntl, rb_bufsz;
	int i, r;
	uint64_t rptr_addr;

	for (i = 0; i < adev->sdma.num_instances; i++) {
		ring = &adev->sdma.instance[i].ring;

		WREG32(DMA_SEM_INCOMPLETE_TIMER_CNTL + sdma_offsets[i], 0);
		WREG32(DMA_SEM_WAIT_FAIL_TIMER_CNTL + sdma_offsets[i], 0);

		/* Set ring buffer size in dwords */
		rb_bufsz = order_base_2(ring->ring_size / 4);
		rb_cntl = rb_bufsz << 1;
#ifdef __BIG_ENDIAN
		rb_cntl |= DMA_RB_SWAP_ENABLE | DMA_RPTR_WRITEBACK_SWAP_ENABLE;
#endif
		WREG32(DMA_RB_CNTL + sdma_offsets[i], rb_cntl);

		/* Initialize the ring buffer's read and write pointers */
		WREG32(DMA_RB_RPTR + sdma_offsets[i], 0);
		WREG32(DMA_RB_WPTR + sdma_offsets[i], 0);

		rptr_addr = adev->wb.gpu_addr + (ring->rptr_offs * 4);

		WREG32(DMA_RB_RPTR_ADDR_LO + sdma_offsets[i], lower_32_bits(rptr_addr));
		WREG32(DMA_RB_RPTR_ADDR_HI + sdma_offsets[i], upper_32_bits(rptr_addr) & 0xFF);

		rb_cntl |= DMA_RPTR_WRITEBACK_ENABLE;

		WREG32(DMA_RB_BASE + sdma_offsets[i], ring->gpu_addr >> 8);

		/* enable DMA IBs */
		ib_cntl = DMA_IB_ENABLE | CMD_VMID_FORCE;
#ifdef __BIG_ENDIAN
		ib_cntl |= DMA_IB_SWAP_ENABLE;
#endif
		WREG32(DMA_IB_CNTL + sdma_offsets[i], ib_cntl);

		dma_cntl = RREG32(DMA_CNTL + sdma_offsets[i]);
		dma_cntl &= ~CTXEMPTY_INT_ENABLE;
		WREG32(DMA_CNTL + sdma_offsets[i], dma_cntl);

		ring->wptr = 0;
		WREG32(DMA_RB_WPTR + sdma_offsets[i], lower_32_bits(ring->wptr) << 2);
		WREG32(DMA_RB_CNTL + sdma_offsets[i], rb_cntl | DMA_RB_ENABLE);

		ring->sched.ready = true;

		r = amdgpu_ring_test_helper(ring);
		if (r)
			return r;

		if (adev->mman.buffer_funcs_ring == ring)
			amdgpu_ttm_set_buffer_funcs_status(adev, true);
	}

	return 0;
}

/**
 * si_dma_ring_test_ring - simple async dma engine test
 *
 * @ring: amdgpu_ring structure holding ring information
 *
 * Test the DMA engine by using it to write a value to memory (SI).
 * Returns 0 for success, error for failure.
 */
static int si_dma_ring_test_ring(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	unsigned i;
	unsigned index;
	int r;
	u32 tmp;
	u64 gpu_addr;

	r = amdgpu_device_wb_get(adev, &index);
	if (r)
		return r;

	gpu_addr = adev->wb.gpu_addr + (index * 4);
	tmp = 0xCAFEDEAD;
	adev->wb.wb[index] = cpu_to_le32(tmp);

	r = amdgpu_ring_alloc(ring, 4);
	if (r)
		goto error_free_wb;

	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, 1));
	amdgpu_ring_write(ring, lower_32_bits(gpu_addr));
	amdgpu_ring_write(ring, upper_32_bits(gpu_addr) & 0xff);
	amdgpu_ring_write(ring, 0xDEADBEEF);
	amdgpu_ring_commit(ring);

	for (i = 0; i < adev->usec_timeout; i++) {
		tmp = le32_to_cpu(adev->wb.wb[index]);
		if (tmp == 0xDEADBEEF)
			break;
		udelay(1);
	}

	if (i >= adev->usec_timeout)
		r = -ETIMEDOUT;

error_free_wb:
	amdgpu_device_wb_free(adev, index);
	return r;
}

/**
 * si_dma_ring_test_ib - test an IB on the DMA engine
 *
 * @ring: amdgpu_ring structure holding ring information
 * @timeout: timeout (in jiffies) to wait on the IB fence
 *
 * Test a simple IB in the DMA ring (SI).
 * Returns 0 on success, error on failure.
 */
static int si_dma_ring_test_ib(struct amdgpu_ring *ring, long timeout)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ib ib;
	struct dma_fence *f = NULL;
	unsigned index;
	u32 tmp = 0;
	u64 gpu_addr;
	long r;

	r = amdgpu_device_wb_get(adev, &index);
	if (r)
		return r;

	gpu_addr = adev->wb.gpu_addr + (index * 4);
	tmp = 0xCAFEDEAD;
	adev->wb.wb[index] = cpu_to_le32(tmp);
	memset(&ib, 0, sizeof(ib));
	r = amdgpu_ib_get(adev, NULL, 256, &ib);
	if (r)
		goto err0;

	ib.ptr[0] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, 1);
	ib.ptr[1] = lower_32_bits(gpu_addr);
	ib.ptr[2] = upper_32_bits(gpu_addr) & 0xff;
	ib.ptr[3] = 0xDEADBEEF;
	ib.length_dw = 4;
	r = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
	if (r)
		goto err1;

	r = dma_fence_wait_timeout(f, false, timeout);
	if (r == 0) {
		r = -ETIMEDOUT;
		goto err1;
	} else if (r < 0) {
		goto err1;
	}
	tmp = le32_to_cpu(adev->wb.wb[index]);
	if (tmp == 0xDEADBEEF)
		r = 0;
	else
		r = -EINVAL;

err1:
	amdgpu_ib_free(adev, &ib, NULL);
	dma_fence_put(f);
err0:
	amdgpu_device_wb_free(adev, index);
	return r;
}

/**
 * si_dma_vm_copy_pte - update PTEs by copying them from the GART
 *
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @src: src addr to copy from
 * @count: number of page entries to update
 *
 * Update PTEs by copying them from the GART using DMA (SI).
 */
static void si_dma_vm_copy_pte(struct amdgpu_ib *ib,
			       uint64_t pe, uint64_t src,
			       unsigned count)
{
	unsigned bytes = count * 8;

	ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
					      1, 0, 0, bytes);
	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
	ib->ptr[ib->length_dw++] = lower_32_bits(src);
	ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
	ib->ptr[ib->length_dw++] = upper_32_bits(src) & 0xff;
}

/**
 * si_dma_vm_write_pte - update PTEs by writing them manually
 *
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @value: dst addr to write into pe
 * @count: number of page entries to update
 * @incr: increase next addr by incr bytes
 *
 * Update PTEs by writing them manually using DMA (SI).
 */
static void si_dma_vm_write_pte(struct amdgpu_ib *ib, uint64_t pe,
				uint64_t value, unsigned count,
				uint32_t incr)
{
	unsigned ndw = count * 2;

	ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
	ib->ptr[ib->length_dw++] = lower_32_bits(pe);
	ib->ptr[ib->length_dw++] = upper_32_bits(pe);
	for (; ndw > 0; ndw -= 2) {
		ib->ptr[ib->length_dw++] = lower_32_bits(value);
		ib->ptr[ib->length_dw++] = upper_32_bits(value);
		value += incr;
	}
}

/**
 * si_dma_vm_set_pte_pde - update the page tables using sDMA
 *
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @addr: dst addr to write into pe
 * @count: number of page entries to update
 * @incr: increase next addr by incr bytes
 * @flags: access flags
 *
 * Update the page tables using sDMA (SI).
 */
static void si_dma_vm_set_pte_pde(struct amdgpu_ib *ib,
				     uint64_t pe,
				     uint64_t addr, unsigned count,
				     uint32_t incr, uint64_t flags)
{
	uint64_t value;
	unsigned ndw;

	while (count) {
		ndw = count * 2;
		if (ndw > 0xFFFFE)
			ndw = 0xFFFFE;

		if (flags & AMDGPU_PTE_VALID)
			value = addr;
		else
			value = 0;

		/* for physically contiguous pages (vram) */
		ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
		ib->ptr[ib->length_dw++] = pe; /* dst addr */
		ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
		ib->ptr[ib->length_dw++] = lower_32_bits(flags); /* mask */
		ib->ptr[ib->length_dw++] = upper_32_bits(flags);
		ib->ptr[ib->length_dw++] = value; /* value */
		ib->ptr[ib->length_dw++] = upper_32_bits(value);
		ib->ptr[ib->length_dw++] = incr; /* increment size */
		ib->ptr[ib->length_dw++] = 0;
		pe += ndw * 4;
		addr += (ndw / 2) * incr;
		count -= ndw / 2;
	}
}

/**
 * si_dma_ring_pad_ib - pad the IB to the required number of dw
 *
 * @ring: amdgpu_ring pointer
 * @ib: indirect buffer to fill with padding
 *
 * Pad the IB with NOP packets to an 8 dw boundary (SI).
 */
static void si_dma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
{
	while (ib->length_dw & 0x7)
		ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
}

/**
 * si_dma_ring_emit_pipeline_sync - sync the pipeline
 *
 * @ring: amdgpu_ring pointer
 *
 * Make sure all previous operations are completed (SI).
 */
static void si_dma_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
{
	uint32_t seq = ring->fence_drv.sync_seq;
	uint64_t addr = ring->fence_drv.gpu_addr;

	/* wait for idle */
	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_POLL_REG_MEM, 0, 0, 0, 0) |
			  (1 << 27)); /* Poll memory */
	amdgpu_ring_write(ring, lower_32_bits(addr));
	amdgpu_ring_write(ring, (0xff << 16) | upper_32_bits(addr)); /* retry, addr_hi */
	amdgpu_ring_write(ring, 0xffffffff); /* mask */
	amdgpu_ring_write(ring, seq); /* value */
	amdgpu_ring_write(ring, (3 << 28) | 0x20); /* func(equal) | poll interval */
}

/**
 * si_dma_ring_emit_vm_flush - vm flush using sDMA
 *
 * @ring: amdgpu_ring pointer
 * @vmid: VMID of the VM to flush
 * @pd_addr: address of the page directory
 *
 * Update the page table base and flush the VM TLB
 * using sDMA (SI).
 */
static void si_dma_ring_emit_vm_flush(struct amdgpu_ring *ring,
				      unsigned vmid, uint64_t pd_addr)
{
	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);

	/* wait for invalidate to complete */
	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_POLL_REG_MEM, 0, 0, 0, 0));
	amdgpu_ring_write(ring, VM_INVALIDATE_REQUEST);
	amdgpu_ring_write(ring, 0xff << 16); /* retry */
	amdgpu_ring_write(ring, 1 << vmid); /* mask */
	amdgpu_ring_write(ring, 0); /* value */
	amdgpu_ring_write(ring, (0 << 28) | 0x20); /* func(always) | poll interval */
}

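/**
 * si_dma_ring_emit_wreg - emit a register write packet
 *
 * @ring: amdgpu_ring pointer
 * @reg: register offset to write
 * @val: value to write
 *
 * Write a register through the DMA ring using an SRBM_WRITE packet (SI).
 */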
static void si_dma_ring_emit_wreg(struct amdgpu_ring *ring,
				  uint32_t reg, uint32_t val)
{
	amdgpu_ring_write(ring, DMA_PACKET(DMA_PACKET_SRBM_WRITE, 0, 0, 0, 0));
	amdgpu_ring_write(ring, (0xf << 16) | reg);
	amdgpu_ring_write(ring, val);
}

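/**
 * si_dma_early_init - set up the DMA IP block callbacks
 *
 * @handle: amdgpu_device pointer
 *
 * Set the number of DMA instances and hook up the ring, buffer,
 * VM PTE and interrupt callbacks (SI).
 */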
static int si_dma_early_init(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	adev->sdma.num_instances = 2;

	si_dma_set_ring_funcs(adev);
	si_dma_set_buffer_funcs(adev);
	si_dma_set_vm_pte_funcs(adev);
	si_dma_set_irq_funcs(adev);

	return 0;
}

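/**
 * si_dma_sw_init - software init for the DMA IP block
 *
 * @handle: amdgpu_device pointer
 *
 * Register the DMA trap interrupt sources (src_ids 224 and 244)
 * and initialize a ring for each DMA instance (SI).
 * Returns 0 for success, error for failure.
 */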
static int si_dma_sw_init(void *handle)
{
	struct amdgpu_ring *ring;
	int r, i;
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	/* DMA0 trap event */
	r = amdgpu_irq_add_id(adev, AMDGPU_IRQ_CLIENTID_LEGACY, 224,
			      &adev->sdma.trap_irq);
	if (r)
		return r;

	/* DMA1 trap event */
	r = amdgpu_irq_add_id(adev, AMDGPU_IRQ_CLIENTID_LEGACY, 244,
			      &adev->sdma.trap_irq);
	if (r)
		return r;

	for (i = 0; i < adev->sdma.num_instances; i++) {
		ring = &adev->sdma.instance[i].ring;
		ring->ring_obj = NULL;
		ring->use_doorbell = false;
		snprintf(ring->name, sizeof(ring->name), "sdma%d", i);
		r = amdgpu_ring_init(adev, ring, 1024,
				     &adev->sdma.trap_irq,
				     (i == 0) ?
				     AMDGPU_SDMA_IRQ_INSTANCE0 :
				     AMDGPU_SDMA_IRQ_INSTANCE1);
		if (r)
			return r;
	}

	return r;
}

static int si_dma_sw_fini(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	int i;

	for (i = 0; i < adev->sdma.num_instances; i++)
		amdgpu_ring_fini(&adev->sdma.instance[i].ring);

	return 0;
}

static int si_dma_hw_init(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	return si_dma_start(adev);
}

static int si_dma_hw_fini(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	si_dma_stop(adev);

	return 0;
}

static int si_dma_suspend(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	return si_dma_hw_fini(adev);
}

static int si_dma_resume(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	return si_dma_hw_init(adev);
}

static bool si_dma_is_idle(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	u32 tmp = RREG32(SRBM_STATUS2);

	if (tmp & (DMA_BUSY_MASK | DMA1_BUSY_MASK))
		return false;

	return true;
}

static int si_dma_wait_for_idle(void *handle)
{
	unsigned i;
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	for (i = 0; i < adev->usec_timeout; i++) {
		if (si_dma_is_idle(handle))
			return 0;
		udelay(1);
	}
	return -ETIMEDOUT;
}

static int si_dma_soft_reset(void *handle)
{
	DRM_INFO("si_dma_soft_reset --- not implemented !!!!!!!\n");
	return 0;
}

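/**
 * si_dma_set_trap_irq_state - enable/disable DMA trap interrupts
 *
 * @adev: amdgpu_device pointer
 * @src: interrupt source
 * @type: DMA instance (AMDGPU_SDMA_IRQ_INSTANCE0/1)
 * @state: requested interrupt state
 *
 * Set or clear the TRAP_ENABLE bit in DMA_CNTL for the
 * requested instance (SI).
 */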
static int si_dma_set_trap_irq_state(struct amdgpu_device *adev,
					struct amdgpu_irq_src *src,
					unsigned type,
					enum amdgpu_interrupt_state state)
{
	u32 sdma_cntl;

	switch (type) {
	case AMDGPU_SDMA_IRQ_INSTANCE0:
		switch (state) {
		case AMDGPU_IRQ_STATE_DISABLE:
			sdma_cntl = RREG32(DMA_CNTL + DMA0_REGISTER_OFFSET);
			sdma_cntl &= ~TRAP_ENABLE;
			WREG32(DMA_CNTL + DMA0_REGISTER_OFFSET, sdma_cntl);
			break;
		case AMDGPU_IRQ_STATE_ENABLE:
			sdma_cntl = RREG32(DMA_CNTL + DMA0_REGISTER_OFFSET);
			sdma_cntl |= TRAP_ENABLE;
			WREG32(DMA_CNTL + DMA0_REGISTER_OFFSET, sdma_cntl);
			break;
		default:
			break;
		}
		break;
	case AMDGPU_SDMA_IRQ_INSTANCE1:
		switch (state) {
		case AMDGPU_IRQ_STATE_DISABLE:
			sdma_cntl = RREG32(DMA_CNTL + DMA1_REGISTER_OFFSET);
			sdma_cntl &= ~TRAP_ENABLE;
			WREG32(DMA_CNTL + DMA1_REGISTER_OFFSET, sdma_cntl);
			break;
		case AMDGPU_IRQ_STATE_ENABLE:
			sdma_cntl = RREG32(DMA_CNTL + DMA1_REGISTER_OFFSET);
			sdma_cntl |= TRAP_ENABLE;
			WREG32(DMA_CNTL + DMA1_REGISTER_OFFSET, sdma_cntl);
			break;
		default:
			break;
		}
		break;
	default:
		break;
	}
	return 0;
}

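/**
 * si_dma_process_trap_irq - process a DMA trap interrupt
 *
 * @adev: amdgpu_device pointer
 * @source: interrupt source
 * @entry: interrupt vector entry
 *
 * Run fence processing on the ring of the DMA instance
 * that raised the trap (SI).
 */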
static int si_dma_process_trap_irq(struct amdgpu_device *adev,
				      struct amdgpu_irq_src *source,
				      struct amdgpu_iv_entry *entry)
{
	if (entry->src_id == 224)
		amdgpu_fence_process(&adev->sdma.instance[0].ring);
	else
		amdgpu_fence_process(&adev->sdma.instance[1].ring);
	return 0;
}

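/**
 * si_dma_set_clockgating_state - enable/disable DMA clock gating
 *
 * @handle: amdgpu_device pointer
 * @state: clockgating state
 *
 * Program DMA_POWER_CNTL and DMA_CLK_CTRL on each instance to
 * enable or disable medium grain clock gating (SI).
 */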
static int si_dma_set_clockgating_state(void *handle,
					  enum amd_clockgating_state state)
{
	u32 orig, data, offset;
	int i;
	bool enable;
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	enable = (state == AMD_CG_STATE_GATE);

	if (enable && (adev->cg_flags & AMD_CG_SUPPORT_SDMA_MGCG)) {
		for (i = 0; i < adev->sdma.num_instances; i++) {
			if (i == 0)
				offset = DMA0_REGISTER_OFFSET;
			else
				offset = DMA1_REGISTER_OFFSET;
			orig = data = RREG32(DMA_POWER_CNTL + offset);
			data &= ~MEM_POWER_OVERRIDE;
			if (data != orig)
				WREG32(DMA_POWER_CNTL + offset, data);
			WREG32(DMA_CLK_CTRL + offset, 0x00000100);
		}
	} else {
		for (i = 0; i < adev->sdma.num_instances; i++) {
			if (i == 0)
				offset = DMA0_REGISTER_OFFSET;
			else
				offset = DMA1_REGISTER_OFFSET;
			orig = data = RREG32(DMA_POWER_CNTL + offset);
			data |= MEM_POWER_OVERRIDE;
			if (data != orig)
				WREG32(DMA_POWER_CNTL + offset, data);

			orig = data = RREG32(DMA_CLK_CTRL + offset);
			data = 0xff000000;
			if (data != orig)
				WREG32(DMA_CLK_CTRL + offset, data);
		}
	}

	return 0;
}

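/**
 * si_dma_set_powergating_state - set DMA powergating state
 *
 * @handle: amdgpu_device pointer
 * @state: powergating state (unused here)
 *
 * Program the DMA power gating state machine registers (SI).
 */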
static int si_dma_set_powergating_state(void *handle,
					  enum amd_powergating_state state)
{
	u32 tmp;

	struct amdgpu_device *adev = (struct amdgpu_device *)handle;

	WREG32(DMA_PGFSM_WRITE,  0x00002000);
	WREG32(DMA_PGFSM_CONFIG, 0x100010ff);

	for (tmp = 0; tmp < 5; tmp++)
		WREG32(DMA_PGFSM_WRITE, 0);

	return 0;
}

static const struct amd_ip_funcs si_dma_ip_funcs = {
	.name = "si_dma",
	.early_init = si_dma_early_init,
	.late_init = NULL,
	.sw_init = si_dma_sw_init,
	.sw_fini = si_dma_sw_fini,
	.hw_init = si_dma_hw_init,
	.hw_fini = si_dma_hw_fini,
	.suspend = si_dma_suspend,
	.resume = si_dma_resume,
	.is_idle = si_dma_is_idle,
	.wait_for_idle = si_dma_wait_for_idle,
	.soft_reset = si_dma_soft_reset,
	.set_clockgating_state = si_dma_set_clockgating_state,
	.set_powergating_state = si_dma_set_powergating_state,
};

static const struct amdgpu_ring_funcs si_dma_ring_funcs = {
	.type = AMDGPU_RING_TYPE_SDMA,
	.align_mask = 0xf,
	.nop = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0),
	.support_64bit_ptrs = false,
	.get_rptr = si_dma_ring_get_rptr,
	.get_wptr = si_dma_ring_get_wptr,
	.set_wptr = si_dma_ring_set_wptr,
	.emit_frame_size =
		3 + 3 + /* hdp flush / invalidate */
		6 + /* si_dma_ring_emit_pipeline_sync */
		SI_FLUSH_GPU_TLB_NUM_WREG * 3 + 6 + /* si_dma_ring_emit_vm_flush */
		9 + 9 + 9, /* si_dma_ring_emit_fence x3 for user fence, vm fence */
	.emit_ib_size = 7 + 3, /* si_dma_ring_emit_ib */
	.emit_ib = si_dma_ring_emit_ib,
	.emit_fence = si_dma_ring_emit_fence,
	.emit_pipeline_sync = si_dma_ring_emit_pipeline_sync,
	.emit_vm_flush = si_dma_ring_emit_vm_flush,
	.test_ring = si_dma_ring_test_ring,
	.test_ib = si_dma_ring_test_ib,
	.insert_nop = amdgpu_ring_insert_nop,
	.pad_ib = si_dma_ring_pad_ib,
	.emit_wreg = si_dma_ring_emit_wreg,
};

static void si_dma_set_ring_funcs(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < adev->sdma.num_instances; i++)
		adev->sdma.instance[i].ring.funcs = &si_dma_ring_funcs;
}

static const struct amdgpu_irq_src_funcs si_dma_trap_irq_funcs = {
	.set = si_dma_set_trap_irq_state,
	.process = si_dma_process_trap_irq,
};

static void si_dma_set_irq_funcs(struct amdgpu_device *adev)
{
	adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_LAST;
	adev->sdma.trap_irq.funcs = &si_dma_trap_irq_funcs;
}

/**
 * si_dma_emit_copy_buffer - copy buffer using the sDMA engine
 *
 * @ib: indirect buffer to fill with commands
 * @src_offset: src GPU address
 * @dst_offset: dst GPU address
 * @byte_count: number of bytes to xfer
 *
 * Copy GPU buffers using the DMA engine (SI).
 * Used by the amdgpu ttm implementation to move pages if
 * registered as the asic copy callback.
 */
static void si_dma_emit_copy_buffer(struct amdgpu_ib *ib,
				       uint64_t src_offset,
				       uint64_t dst_offset,
				       uint32_t byte_count)
{
	ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,
					      1, 0, 0, byte_count);
	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset);
	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) & 0xff;
	ib->ptr[ib->length_dw++] = upper_32_bits(src_offset) & 0xff;
}

/**
 * si_dma_emit_fill_buffer - fill buffer using the sDMA engine
 *
 * @ib: indirect buffer to fill with commands
 * @src_data: value to write to buffer
 * @dst_offset: dst GPU address
 * @byte_count: number of bytes to fill
 *
 * Fill GPU buffers using the DMA engine (SI).
 */
static void si_dma_emit_fill_buffer(struct amdgpu_ib *ib,
				       uint32_t src_data,
				       uint64_t dst_offset,
				       uint32_t byte_count)
{
	ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_CONSTANT_FILL,
					      0, 0, 0, byte_count / 4);
	ib->ptr[ib->length_dw++] = lower_32_bits(dst_offset);
	ib->ptr[ib->length_dw++] = src_data;
	ib->ptr[ib->length_dw++] = upper_32_bits(dst_offset) << 16;
}


static const struct amdgpu_buffer_funcs si_dma_buffer_funcs = {
	.copy_max_bytes = 0xffff8,
	.copy_num_dw = 5,
	.emit_copy_buffer = si_dma_emit_copy_buffer,

	.fill_max_bytes = 0xffff8,
	.fill_num_dw = 4,
	.emit_fill_buffer = si_dma_emit_fill_buffer,
};

static void si_dma_set_buffer_funcs(struct amdgpu_device *adev)
{
	adev->mman.buffer_funcs = &si_dma_buffer_funcs;
	adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
}

static const struct amdgpu_vm_pte_funcs si_dma_vm_pte_funcs = {
	.copy_pte_num_dw = 5,
	.copy_pte = si_dma_vm_copy_pte,

	.write_pte = si_dma_vm_write_pte,
	.set_pte_pde = si_dma_vm_set_pte_pde,
};

static void si_dma_set_vm_pte_funcs(struct amdgpu_device *adev)
{
	unsigned i;

	adev->vm_manager.vm_pte_funcs = &si_dma_vm_pte_funcs;
	for (i = 0; i < adev->sdma.num_instances; i++) {
		adev->vm_manager.vm_pte_scheds[i] =
			&adev->sdma.instance[i].ring.sched;
	}
	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
}

const struct amdgpu_ip_block_version si_dma_ip_block =
{
	.type = AMD_IP_BLOCK_TYPE_SDMA,
	.major = 1,
	.minor = 0,
	.rev = 0,
	.funcs = &si_dma_ip_funcs,
};