1 /* $NetBSD: kfd_flat_memory.c,v 1.3 2021/12/18 23:44:59 riastradh Exp $ */ 2 3 /* 4 * Copyright 2014 Advanced Micro Devices, Inc. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 */ 25 26 #include <sys/cdefs.h> 27 __KERNEL_RCSID(0, "$NetBSD: kfd_flat_memory.c,v 1.3 2021/12/18 23:44:59 riastradh Exp $"); 28 29 #include <linux/device.h> 30 #include <linux/export.h> 31 #include <linux/err.h> 32 #include <linux/fs.h> 33 #include <linux/sched.h> 34 #include <linux/slab.h> 35 #include <linux/uaccess.h> 36 #include <linux/compat.h> 37 #include <uapi/linux/kfd_ioctl.h> 38 #include <linux/time.h> 39 #include "kfd_priv.h" 40 #include <linux/mm.h> 41 #include <linux/mman.h> 42 #include <asm/processor.h> 43 44 /* 45 * The primary memory I/O features being added for revisions of gfxip 46 * beyond 7.0 (Kaveri) are: 47 * 48 * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b 49 * 50 * Flat shader memory access These are new shader vector memory 51 * operations that do not reference a T#/V# so a pointer is what is 52 * sourced from the vector gprs for direct access to memory. 53 * This pointer space has the Shared(LDS) and Private(Scratch) memory 54 * mapped into this pointer space as apertures. 55 * The hardware then determines how to direct the memory request 56 * based on what apertures the request falls in. 57 * 58 * Unaligned support and alignment check 59 * 60 * 61 * System Unified Address - SUA 62 * 63 * The standard usage for GPU virtual addresses is that they are mapped by 64 * a set of page tables we call GPUVM and these page tables are managed by 65 * a combination of vidMM/driver software components. The current virtual 66 * address (VA) range for GPUVM is 40b. 67 * 68 * As of gfxip7.1 and beyond we're adding the ability for compute memory 69 * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access 70 * the same page tables used by host x86 processors and that are managed by 71 * the operating system. This is via a technique and hardware called ATC/IOMMU. 
72 * The GPU has the capability of accessing both the GPUVM and ATC address 73 * spaces for a given VMID (process) simultaneously and we call this feature 74 * system unified address (SUA). 75 * 76 * There are three fundamental address modes of operation for a given VMID 77 * (process) on the GPU: 78 * 79 * HSA64 64b pointers and the default address space is ATC 80 * HSA32 32b pointers and the default address space is ATC 81 * GPUVM 64b pointers and the default address space is GPUVM (driver 82 * model mode) 83 * 84 * 85 * HSA64 - ATC/IOMMU 64b 86 * 87 * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized 88 * by the CPU so an AMD CPU can only access the high area 89 * (VA[63:47] == 0x1FFFF) and low area (VA[63:47] == 0) of the address space 90 * so the actual VA carried to translation is 48b. There is a hole in 91 * the middle of the 64b VA space. 92 * 93 * The GPU not only has access to all of the CPU accessible address space via 94 * ATC/IOMMU, but it also has access to the GPUVM address space. The system 95 * unified address feature (SUA) is the mapping of GPUVM and ATC address 96 * spaces into a unified pointer space. The method we take for 64b mode is 97 * to map the full 40b GPUVM address space into the hole of the 64b address 98 * space. 99 * 100 * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we 101 * direct requests to be translated via GPUVM page tables instead of the 102 * IOMMU path. 103 * 104 * 105 * 64b to 49b Address conversion 106 * 107 * Note that there are still significant portions of unused regions (holes) 108 * in the 64b address space even for the GPU. There are several places in 109 * the pipeline (sw and hw), we wish to compress the 64b virtual address 110 * to a 49b address. This 49b address is constituted of an ATC bit 111 * plus a 48b virtual address. This 49b address is what is passed to the 112 * translation hardware. 
ATC==0 means the 48b address is a GPUVM address 113 * (max of 2^40 - 1) intended to be translated via GPUVM page tables. 114 * ATC==1 means the 48b address is intended to be translated via IOMMU 115 * page tables. 116 * 117 * A 64b pointer is compared to the apertures that are defined (Base/Limit), in 118 * this case the GPUVM aperture (red) is defined and if a pointer falls in this 119 * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero 120 * as part of the 64b to 49b conversion. 121 * 122 * Where this 64b to 49b conversion is done is a function of the usage. 123 * Most GPU memory access is via memory objects where the driver builds 124 * a descriptor which consists of a base address and a memory access by 125 * the GPU usually consists of some kind of an offset or Cartesian coordinate 126 * that references this memory descriptor. This is the case for shader 127 * instructions that reference the T# or V# constants, or for specified 128 * locations of assets (ex. the shader program location). In these cases 129 * the driver is what handles the 64b to 49b conversion and the base 130 * address in the descriptor (ex. V# or T# or shader program location) 131 * is defined as a 48b address w/ an ATC bit. For this usage a given 132 * memory object cannot straddle multiple apertures in the 64b address 133 * space. For example a shader program cannot jump in/out between ATC 134 * and GPUVM space. 135 * 136 * In some cases we wish to pass a 64b pointer to the GPU hardware and 137 * the GPU hw does the 64b to 49b conversion before passing memory 138 * requests to the cache/memory system. This is the case for the 139 * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers 140 * in scalar and vector GPRs respectively. 141 * 142 * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip 143 * hardware sends a 48b address along w/ an ATC bit, to the memory controller 144 * on the memory request interfaces. 
145 * 146 * <client>_MC_rdreq_atc // read request ATC bit 147 * 148 * 0 : <client>_MC_rdreq_addr is a GPUVM VA 149 * 150 * 1 : <client>_MC_rdreq_addr is an ATC VA 151 * 152 * 153 * Spare aperture (APE1) 154 * 155 * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use 156 * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the 157 * config tables for setting cache policies. The spare (APE1) aperture is 158 * motivated by getting a different Mtype from the default. 159 * The default aperture isn't an actual base/limit aperture; it is just the 160 * address space that doesn't hit any defined base/limit apertures. 161 * The following diagram is a complete picture of the gfxip7.x SUA apertures. 162 * The APE1 can be placed either below or above 163 * the hole (cannot be in the hole). 164 * 165 * 166 * General Aperture definitions and rules 167 * 168 * An aperture register definition consists of a Base, Limit, Mtype, and 169 * usually an ATC bit indicating which translation tables that aperture uses. 170 * In all cases (for SUA and DUA apertures discussed later), aperture base 171 * and limit definitions are 64KB aligned. 172 * 173 * <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 } 174 * 175 * <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF } 176 * 177 * The base and limit are considered inclusive to an aperture so being 178 * inside an aperture means (address >= Base) AND (address <= Limit). 179 * 180 * In no case is a payload that straddles multiple apertures expected to work. 181 * For example a load_dword_x4 that starts in one aperture and ends in another, 182 * does not work. For the vector FLAT_* ops we have detection capability in 183 * the shader for reporting a memory violation back to the 184 * SQ block for use in traps. 185 * A memory violation results when an op falls into the hole, 186 * or a payload straddles multiple apertures. The S_LOAD instruction 187 * does not have this detection. 
188 * 189 * Apertures cannot overlap. 190 * 191 * 192 * 193 * HSA32 - ATC/IOMMU 32b 194 * 195 * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR 196 * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b 197 * will not fit so there is only partial visibility to the GPUVM 198 * space (defined by the aperture) for S_LOAD and FLAT_* ops. 199 * There is no spare (APE1) aperture for HSA32 mode. 200 * 201 * 202 * GPUVM 64b mode (driver model) 203 * 204 * This mode is related to HSA64 in that the difference really is that 205 * the default aperture is GPUVM (ATC==0) and not ATC space. 206 * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for 207 * SUA GPUVM mode, but does not support HSA32/HSA64. 208 * 209 * 210 * Device Unified Address - DUA 211 * 212 * Device unified address (DUA) is the name of the feature that maps the 213 * Shared(LDS) memory and Private(Scratch) memory into the overall address 214 * space for use by the new FLAT_* vector memory ops. The Shared and 215 * Private memories are mapped as apertures into the address space, 216 * and the hardware detects when a FLAT_* memory request is to be redirected 217 * to the LDS or Scratch memory when it falls into one of these apertures. 218 * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and 219 * the base/limit is in the aperture. For both HSA64 and GPUVM SUA modes, 220 * the Shared/Private apertures are always placed in a limited selection of 221 * options in the hole of the 64b address space. For HSA32 mode, the 222 * Shared/Private apertures can be placed anywhere in the 32b space 223 * except at 0. 224 * 225 * 226 * HSA64 Apertures for FLAT_* vector ops 227 * 228 * For HSA64 SUA mode, the Shared and Private apertures are always placed 229 * in the hole w/ a limited selection of possible locations. 
The requests 230 * that fall in the private aperture are expanded as a function of the 231 * work-item id (tid) and redirected to the location of the 232 * hidden private memory. The hidden private can be placed in either GPUVM 233 * or ATC space. The addresses that fall in the shared aperture are 234 * re-directed to the on-chip LDS memory hardware. 235 * 236 * 237 * HSA32 Apertures for FLAT_* vector ops 238 * 239 * In HSA32 mode, the Private and Shared apertures can be placed anywhere 240 * in the 32b space except at 0 (Private or Shared Base at zero disables 241 * the apertures). If the base address of the apertures are non-zero 242 * (ie apertures exists), the size is always 64KB. 243 * 244 * 245 * GPUVM Apertures for FLAT_* vector ops 246 * 247 * In GPUVM mode, the Shared/Private apertures are specified identically 248 * to HSA64 mode where they are always in the hole at a limited selection 249 * of locations. 250 * 251 * 252 * Aperture Definitions for SUA and DUA 253 * 254 * The interpretation of the aperture register definitions for a given 255 * VMID is a function of the SUA Mode which is one of HSA64, HSA32, or 256 * GPUVM64 discussed in previous sections. The mode is first decoded, and 257 * then the remaining register decode is a function of the mode. 258 * 259 * 260 * SUA Mode Decode 261 * 262 * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from 263 * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and 264 * the SH_MEM_CONFIG:PTR32 bits. 265 * 266 * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode 267 * 268 * 1 0 HSA64 269 * 270 * 1 1 HSA32 271 * 272 * 0 X GPUVM64 273 * 274 * In general the hardware will ignore the PTR32 bit and treat 275 * as 0 whenever DATA_ATC = 0, but sw should set PTR32=0 276 * when DATA_ATC=0. 277 * 278 * The DATA_ATC bit is only set for compute dispatches. 279 * All Draw dispatches are hardcoded to GPUVM64 mode 280 * for FLAT_* / S_LOAD operations. 
281 */ 282 283 #define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ 284 (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) 285 286 #define MAKE_GPUVM_APP_LIMIT(base, size) \ 287 (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) 288 289 #define MAKE_SCRATCH_APP_BASE_VI() \ 290 (((uint64_t)(0x1UL) << 61) + 0x100000000L) 291 292 #define MAKE_SCRATCH_APP_LIMIT(base) \ 293 (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 294 295 #define MAKE_LDS_APP_BASE_VI() \ 296 (((uint64_t)(0x1UL) << 61) + 0x0) 297 #define MAKE_LDS_APP_LIMIT(base) \ 298 (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 299 300 /* On GFXv9 the LDS and scratch apertures are programmed independently 301 * using the high 16 bits of the 64-bit virtual address. They must be 302 * in the hole, which will be the case as long as the high 16 bits are 303 * not 0. 304 * 305 * The aperture sizes are still 4GB implicitly. 306 * 307 * A GPUVM aperture is not applicable on GFXv9. 308 */ 309 #define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) 310 #define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) 311 312 /* User mode manages most of the SVM aperture address space. The low 313 * 16MB are reserved for kernel use (CWSR trap handler and kernel IB 314 * for now). 315 */ 316 #define SVM_USER_BASE 0x1000000ull 317 #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) 318 #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) 319 320 static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) 321 { 322 /* 323 * node id couldn't be 0 - the three MSB bits of 324 * aperture shoudn't be 0 325 */ 326 pdd->lds_base = MAKE_LDS_APP_BASE_VI(); 327 pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 328 329 if (!pdd->dev->device_info->needs_iommu_device) { 330 /* dGPUs: SVM aperture starting at 0 331 * with small reserved space for kernel. 332 * Set them to CANONICAL addresses. 
333 */ 334 pdd->gpuvm_base = SVM_USER_BASE; 335 pdd->gpuvm_limit = 336 pdd->dev->shared_resources.gpuvm_size - 1; 337 } else { 338 /* set them to non CANONICAL addresses, and no SVM is 339 * allocated. 340 */ 341 pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); 342 pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, 343 pdd->dev->shared_resources.gpuvm_size); 344 } 345 346 pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); 347 pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 348 } 349 350 static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) 351 { 352 pdd->lds_base = MAKE_LDS_APP_BASE_V9(); 353 pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 354 355 /* Raven needs SVM to support graphic handle, etc. Leave the small 356 * reserved space before SVM on Raven as well, even though we don't 357 * have to. 358 * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they 359 * are used in Thunk to reserve SVM. 360 */ 361 pdd->gpuvm_base = SVM_USER_BASE; 362 pdd->gpuvm_limit = 363 pdd->dev->shared_resources.gpuvm_size - 1; 364 365 pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); 366 pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 367 } 368 369 int kfd_init_apertures(struct kfd_process *process) 370 { 371 uint8_t id = 0; 372 struct kfd_dev *dev; 373 struct kfd_process_device *pdd; 374 375 /*Iterating over all devices*/ 376 while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { 377 if (!dev || kfd_devcgroup_check_permission(dev)) { 378 /* Skip non GPU devices and devices to which the 379 * current process have no access to. 
Access can be 380 * limited by placing the process in a specific 381 * cgroup hierarchy 382 */ 383 id++; 384 continue; 385 } 386 387 pdd = kfd_create_process_device_data(dev, process); 388 if (!pdd) { 389 pr_err("Failed to create process device data\n"); 390 return -ENOMEM; 391 } 392 /* 393 * For 64 bit process apertures will be statically reserved in 394 * the x86_64 non canonical process address space 395 * amdkfd doesn't currently support apertures for 32 bit process 396 */ 397 if (process->is_32bit_user_mode) { 398 pdd->lds_base = pdd->lds_limit = 0; 399 pdd->gpuvm_base = pdd->gpuvm_limit = 0; 400 pdd->scratch_base = pdd->scratch_limit = 0; 401 } else { 402 switch (dev->device_info->asic_family) { 403 case CHIP_KAVERI: 404 case CHIP_HAWAII: 405 case CHIP_CARRIZO: 406 case CHIP_TONGA: 407 case CHIP_FIJI: 408 case CHIP_POLARIS10: 409 case CHIP_POLARIS11: 410 case CHIP_POLARIS12: 411 case CHIP_VEGAM: 412 kfd_init_apertures_vi(pdd, id); 413 break; 414 case CHIP_VEGA10: 415 case CHIP_VEGA12: 416 case CHIP_VEGA20: 417 case CHIP_RAVEN: 418 case CHIP_RENOIR: 419 case CHIP_ARCTURUS: 420 case CHIP_NAVI10: 421 case CHIP_NAVI12: 422 case CHIP_NAVI14: 423 kfd_init_apertures_v9(pdd, id); 424 break; 425 default: 426 WARN(1, "Unexpected ASIC family %u", 427 dev->device_info->asic_family); 428 return -EINVAL; 429 } 430 431 if (!dev->device_info->needs_iommu_device) { 432 /* dGPUs: the reserved space for kernel 433 * before SVM 434 */ 435 pdd->qpd.cwsr_base = SVM_CWSR_BASE; 436 pdd->qpd.ib_base = SVM_IB_BASE; 437 } 438 } 439 440 dev_dbg(kfd_device, "node id %u\n", id); 441 dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); 442 dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); 443 dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); 444 dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); 445 dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); 446 dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); 447 dev_dbg(kfd_device, "scratch_limit %llX\n", 
pdd->scratch_limit); 448 449 id++; 450 } 451 452 return 0; 453 } 454