/*	$NetBSD: amdgpu_amdkfd_gfx_v10.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_amdkfd_gfx_v10.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_10_1_0_offset.h"
#include "gc/gc_10_1_0_sh_mask.h"
#include "navi10_enum.h"
#include "athub/athub_2_0_0_offset.h"
#include "athub/athub_2_0_0_sh_mask.h"
#include "oss/osssys_5_0_0_offset.h"
#include "oss/osssys_5_0_0_sh_mask.h"
#include "soc15_common.h"
#include "v10_structs.h"
#include "nv.h"
#include "nvd.h"
#include "gfxhub_v2_0.h"

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};
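
/*
 * Note: these enum values are written unmodified to mmCP_HQD_DEQUEUE_REQUEST
 * in kgd_hqd_destroy() below, so the numeric encoding is assumed to match the
 * hardware's dequeue request field (NO_ACTION = 0, DRAIN_PIPE = 1,
 * RESET_WAVES = 2, SAVE_WAVES = 3).
 */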

/* Because of REG_GET_FIELD() being used, we put this function in the
 * asic specific file.
 */
static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
		struct tile_config *config)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	config->gb_addr_config = adev->gfx.config.gb_addr_config;
#if 0
/* TODO - confirm REG_GET_FIELD x2, should be OK as is... but
 * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu
 * changes commented out related code, doing the same here for now but
 * need to sync with Ken et al
 */
	config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
				MC_ARB_RAMCFG, NOOFBANK);
	config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
				MC_ARB_RAMCFG, NOOFRANKS);
#endif

	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	return 0;
}
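
/*
 * For reference, a sketch (not the authoritative definition): REG_GET_FIELD()
 * expands roughly to
 *
 *	(value & reg##__##field##_MASK) >> reg##__##field##__SHIFT
 *
 * so the mask/shift symbols it relies on come from the per-ASIC *_sh_mask.h
 * headers included above, which is why this helper lives in an ASIC-specific
 * file.
 */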

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
	return (struct amdgpu_device *)kgd;
}

static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	mutex_lock(&adev->srbm_mutex);
	nv_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	nv_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, queue_id, 0);
}
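
/*
 * Worked example (illustrative): with adev->gfx.mec.num_pipe_per_mec == 4,
 * pipe_id 5 yields mec = 5 / 4 + 1 = 2 and pipe = 5 % 4 = 1, i.e. the second
 * pipe of MEC2.
 */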

static uint64_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}
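
/*
 * Worked example (illustrative): with adev->gfx.mec.num_queue_per_pipe == 8,
 * pipe 1 / queue 2 maps to bit 1 * 8 + 2 = 10, so the returned mask is
 * 1ull << 10 == 0x400.
 */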

static void release_queue(struct kgd_dev *kgd)
{
	unlock_srbm(kgd);
}

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	lock_srbm(kgd, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 and later ASICs */

	unlock_srbm(kgd);
}

static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
					unsigned int vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
			ATC_VMID0_PASID_MAPPING__VALID_MASK;

	pr_debug("pasid 0x%x vmid %d, reg value %x\n", pasid, vmid, pasid_mapping);

	pr_debug("ATHUB, reg %x\n", SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid);
	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

#if 0
	/* TODO: uncomment this code when the hardware support is ready. */
	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	pr_debug("ATHUB mapping update finished\n");
	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);
#endif

	/* Mapping vmid to pasid also for IH block */
	pr_debug("update mapping for IH block and mmhub");
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	return 0;
}
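
/*
 * Illustrative value (field layout per athub_2_0_0_sh_mask.h; treat the exact
 * bit positions as an assumption here): the mapping register carries the
 * PASID in its low bits and a VALID flag in its top bit, so e.g. pasid 0x123
 * is programmed as 0x123 | ATC_VMID0_PASID_MAPPING__VALID_MASK, while
 * pasid 0 clears the entry.
 */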

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, 0, 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(kgd);

	return 0;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base[2] = {
		SOC15_REG_OFFSET(SDMA0, 0,
				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
		/* On gfx10, mmSDMA1_xxx registers are defined NOT based
		 * on SDMA1 base address (dw 0x1860) but based on SDMA0
		 * base address (dw 0x1260). Therefore use mmSDMA0_RLC0_RB_CNTL
		 * instead of mmSDMA1_RLC0_RB_CNTL for the base address calc
		 * below
		 */
		SOC15_REG_OFFSET(SDMA1, 0,
				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL
	};

	uint32_t retval = sdma_engine_reg_base[engine_id]
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
			queue_id, retval);

	return retval;
}
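
/*
 * Worked example (illustrative): for engine_id 1 and queue_id 2 the result is
 * sdma_engine_reg_base[1] + 2 * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL),
 * i.e. the SDMA1 engine base plus two RLC queue strides; adding a register
 * name such as mmSDMA0_RLC0_RB_RPTR to the returned offset then addresses
 * that register of SDMA1 RLC2.
 */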

#if 0
static uint32_t get_watch_base_addr(struct amdgpu_device *adev)
{
	uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) -
			mmTCP_WATCH0_ADDR_H;

	pr_debug("kfd: reg watch base address: 0x%x\n", retval);

	return retval;
}
#endif

static inline struct v10_compute_mqd *get_mqd(void *mqd)
{
	return (struct v10_compute_mqd *)mqd;
}

static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v10_sdma_mqd *)mqd;
}

static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_compute_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
	acquire_queue(kgd, pipe_id, queue_id);

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM;
	 * the loop below programs them up to CP_HQD_PQ_WPTR_HI.
	 */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
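		/*
		 * Illustrative numbers: with a computed queue_size of 0x1000,
		 * a saved wptr of 0x00012345 and rptr low bits of 0x456, the
		 * guess starts at 0x456; since 0x12345 & 0xfff == 0x345 is
		 * less than 0x456 the wptr has wrapped, so add 0x1000 and
		 * then the saved high bits 0x12000, giving 0x13456.
		 */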
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uint64_t)wptr));
		pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
			 (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}

static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
	struct v10_compute_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq.ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq.ring_lock);
	release_queue(kgd);

	return r;
}
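
/*
 * The seven ring dwords reserved above correspond to the PACKET3_MAP_QUEUES
 * header plus six payload dwords (queue description, doorbell offset, MQD
 * address low/high and wptr-poll address low/high), matching the seven
 * amdgpu_ring_write() calls.
 */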

static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)
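/* DUMP_REG records the register's byte offset ((addr) << 2, the dword offset
 * times four) together with its current value, and bails out if more than
 * HQD_N_REGS entries would be written.
 */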

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
				m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)
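/* The four terms are assumed to correspond, in order, to the four register
 * ranges dumped below: RB_CNTL..DOORBELL, STATUS..CSA_ADDR_HI,
 * IB_SUB_REMAIN..MINOR_PTR_UPDATE and MIDCMD_DATA0..MIDCMD_CNTL.
 */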

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(kgd, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
		   high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(kgd);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

#if 0
	unsigned long flags;
	int retry;
#endif

	acquire_queue(kgd, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

#if 0 /* Is this still needed? */
	/* Workaround: If IQ timer is active and the wait time is close to or
	 * equal to 0, dequeueing is not safe. Wait until either the wait time
	 * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
	 * cleared before continuing. Also, ensure wait times are set to at
	 * least 0x3.
	 */
	local_irq_save(flags);
	preempt_disable();
	retry = 5000; /* wait for 500 usecs at maximum */
	while (true) {
		temp = RREG32(mmCP_HQD_IQ_TIMER);
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
			pr_debug("HW is processing IQ\n");
			goto loop;
		}
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
					== 3) /* SEM-rearm is safe */
				break;
			/* Wait time 3 is safe for CP, but our MMIO read/write
			 * time is close to 1 microsecond, so check for 10 to
			 * leave more buffer room
			 */
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
					>= 10)
				break;
			pr_debug("IQ timer is active\n");
		} else
			break;
loop:
		if (!retry) {
			pr_err("CP HQD IQ timer status time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	retry = 1000;
	while (true) {
		temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
		if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
			break;
		pr_debug("Dequeue request is pending\n");

		if (!retry) {
			pr_err("CP HQD dequeue request time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	local_irq_restore(flags);
	preempt_enable();
#endif

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}
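
/*
 * Flow summary (descriptive of the code above): the chosen dequeue request
 * type is written to CP_HQD_DEQUEUE_REQUEST and CP_HQD_ACTIVE is then polled
 * until its ACTIVE bit clears, with the caller-supplied utimeout in
 * milliseconds converted to jiffies as utimeout * HZ / 1000.
 */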

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

static bool get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}
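
/*
 * Read-back counterpart of kgd_set_pasid_vmid_mapping(): returns the PASID
 * bits of the per-VMID mapping register through *p_pasid and reports whether
 * its VALID bit is set.
 */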

static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}

static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}

static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SA_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}

static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* SDMA is on gfxhub as well for Navi1* series */
	gfxhub_v2_0_setup_vm_pt_regs(adev, vmid, page_table_base);
}

const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_interrupts = kgd_init_interrupts,
	.hqd_load = kgd_hqd_load,
	.hiq_mqd_load = kgd_hiq_mqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.address_watch_disable = kgd_address_watch_disable,
	.address_watch_execute = kgd_address_watch_execute,
	.wave_control_execute = kgd_wave_control_execute,
	.address_watch_get_offset = kgd_address_watch_get_offset,
	.get_atc_vmid_pasid_mapping_info =
			get_atc_vmid_pasid_mapping_info,
	.get_tile_config = amdgpu_amdkfd_get_tile_config,
	.set_vm_context_page_table_base = set_vm_context_page_table_base,
	.get_hive_id = amdgpu_amdkfd_get_hive_id,
};