1/* 2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com> 3 * Copyright © 2015 Advanced Micro Devices, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining 7 * a copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS 18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 * 23 * The above copyright notice and this permission notice (including the 24 * next paragraph) shall be included in all copies or substantial portions 25 * of the Software. 26 */ 27 28#include "amdgpu_cs.h" 29 30#include "util/os_time.h" 31#include "util/u_hash_table.h" 32#include "state_tracker/drm_driver.h" 33#include <amdgpu_drm.h> 34#include <xf86drm.h> 35#include <stdio.h> 36#include <inttypes.h> 37 38#ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID 39#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) 40#endif 41 42#ifndef AMDGPU_VA_RANGE_HIGH 43#define AMDGPU_VA_RANGE_HIGH 0x2 44#endif 45 46/* Set to 1 for verbose output showing committed sparse buffer ranges. 
*/ 47#define DEBUG_SPARSE_COMMITS 0 48 49struct amdgpu_sparse_backing_chunk { 50 uint32_t begin, end; 51}; 52 53static struct pb_buffer * 54amdgpu_bo_create(struct radeon_winsys *rws, 55 uint64_t size, 56 unsigned alignment, 57 enum radeon_bo_domain domain, 58 enum radeon_bo_flag flags); 59static void amdgpu_bo_unmap(struct pb_buffer *buf); 60 61static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, 62 enum radeon_bo_usage usage) 63{ 64 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 65 struct amdgpu_winsys *ws = bo->ws; 66 int64_t abs_timeout; 67 68 if (timeout == 0) { 69 if (p_atomic_read(&bo->num_active_ioctls)) 70 return false; 71 72 } else { 73 abs_timeout = os_time_get_absolute_timeout(timeout); 74 75 /* Wait if any ioctl is being submitted with this buffer. */ 76 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout)) 77 return false; 78 } 79 80 if (bo->is_shared) { 81 /* We can't use user fences for shared buffers, because user fences 82 * are local to this process only. If we want to wait for all buffer 83 * uses in all processes, we have to use amdgpu_bo_wait_for_idle. 84 */ 85 bool buffer_busy = true; 86 int r; 87 88 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy); 89 if (r) 90 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, 91 r); 92 return !buffer_busy; 93 } 94 95 if (timeout == 0) { 96 unsigned idle_fences; 97 bool buffer_idle; 98 99 simple_mtx_lock(&ws->bo_fence_lock); 100 101 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) { 102 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false)) 103 break; 104 } 105 106 /* Release the idle fences to avoid checking them again later. 
*/ 107 for (unsigned i = 0; i < idle_fences; ++i) 108 amdgpu_fence_reference(&bo->fences[i], NULL); 109 110 memmove(&bo->fences[0], &bo->fences[idle_fences], 111 (bo->num_fences - idle_fences) * sizeof(*bo->fences)); 112 bo->num_fences -= idle_fences; 113 114 buffer_idle = !bo->num_fences; 115 simple_mtx_unlock(&ws->bo_fence_lock); 116 117 return buffer_idle; 118 } else { 119 bool buffer_idle = true; 120 121 simple_mtx_lock(&ws->bo_fence_lock); 122 while (bo->num_fences && buffer_idle) { 123 struct pipe_fence_handle *fence = NULL; 124 bool fence_idle = false; 125 126 amdgpu_fence_reference(&fence, bo->fences[0]); 127 128 /* Wait for the fence. */ 129 simple_mtx_unlock(&ws->bo_fence_lock); 130 if (amdgpu_fence_wait(fence, abs_timeout, true)) 131 fence_idle = true; 132 else 133 buffer_idle = false; 134 simple_mtx_lock(&ws->bo_fence_lock); 135 136 /* Release an idle fence to avoid checking it again later, keeping in 137 * mind that the fence array may have been modified by other threads. 138 */ 139 if (fence_idle && bo->num_fences && bo->fences[0] == fence) { 140 amdgpu_fence_reference(&bo->fences[0], NULL); 141 memmove(&bo->fences[0], &bo->fences[1], 142 (bo->num_fences - 1) * sizeof(*bo->fences)); 143 bo->num_fences--; 144 } 145 146 amdgpu_fence_reference(&fence, NULL); 147 } 148 simple_mtx_unlock(&ws->bo_fence_lock); 149 150 return buffer_idle; 151 } 152} 153 154static enum radeon_bo_domain amdgpu_bo_get_initial_domain( 155 struct pb_buffer *buf) 156{ 157 return ((struct amdgpu_winsys_bo*)buf)->initial_domain; 158} 159 160static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo) 161{ 162 for (unsigned i = 0; i < bo->num_fences; ++i) 163 amdgpu_fence_reference(&bo->fences[i], NULL); 164 165 FREE(bo->fences); 166 bo->num_fences = 0; 167 bo->max_fences = 0; 168} 169 170void amdgpu_bo_destroy(struct pb_buffer *_buf) 171{ 172 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 173 struct amdgpu_winsys *ws = bo->ws; 174 175 assert(bo->bo && "must not be called 
for slab entries"); 176 177 if (!bo->is_user_ptr && bo->cpu_ptr) { 178 bo->cpu_ptr = NULL; 179 amdgpu_bo_unmap(&bo->base); 180 } 181 assert(bo->is_user_ptr || bo->u.real.map_count == 0); 182 183 if (ws->debug_all_bos) { 184 simple_mtx_lock(&ws->global_bo_list_lock); 185 LIST_DEL(&bo->u.real.global_list_item); 186 ws->num_buffers--; 187 simple_mtx_unlock(&ws->global_bo_list_lock); 188 } 189 190 simple_mtx_lock(&ws->bo_export_table_lock); 191 util_hash_table_remove(ws->bo_export_table, bo->bo); 192 simple_mtx_unlock(&ws->bo_export_table_lock); 193 194 if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) { 195 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); 196 amdgpu_va_range_free(bo->u.real.va_handle); 197 } 198 amdgpu_bo_free(bo->bo); 199 200 amdgpu_bo_remove_fences(bo); 201 202 if (bo->initial_domain & RADEON_DOMAIN_VRAM) 203 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size); 204 else if (bo->initial_domain & RADEON_DOMAIN_GTT) 205 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size); 206 207 simple_mtx_destroy(&bo->lock); 208 FREE(bo); 209} 210 211static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf) 212{ 213 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 214 215 assert(bo->bo); /* slab buffers have a separate vtbl */ 216 217 if (bo->u.real.use_reusable_pool) 218 pb_cache_add_buffer(&bo->u.real.cache_entry); 219 else 220 amdgpu_bo_destroy(_buf); 221} 222 223static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws) 224{ 225 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) 226 pb_slabs_reclaim(&ws->bo_slabs[i]); 227 228 pb_cache_release_all_buffers(&ws->bo_cache); 229} 230 231static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu) 232{ 233 assert(!bo->sparse && bo->bo && !bo->is_user_ptr); 234 int r = amdgpu_bo_cpu_map(bo->bo, cpu); 235 if (r) { 236 /* Clean up buffer managers and try again. 
*/ 237 amdgpu_clean_up_buffer_managers(bo->ws); 238 r = amdgpu_bo_cpu_map(bo->bo, cpu); 239 if (r) 240 return false; 241 } 242 243 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) { 244 if (bo->initial_domain & RADEON_DOMAIN_VRAM) 245 bo->ws->mapped_vram += bo->base.size; 246 else if (bo->initial_domain & RADEON_DOMAIN_GTT) 247 bo->ws->mapped_gtt += bo->base.size; 248 bo->ws->num_mapped_buffers++; 249 } 250 251 return true; 252} 253 254static void *amdgpu_bo_map(struct pb_buffer *buf, 255 struct radeon_cmdbuf *rcs, 256 enum pipe_transfer_usage usage) 257{ 258 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; 259 struct amdgpu_winsys_bo *real; 260 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; 261 262 assert(!bo->sparse); 263 264 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */ 265 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { 266 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */ 267 if (usage & PIPE_TRANSFER_DONTBLOCK) { 268 if (!(usage & PIPE_TRANSFER_WRITE)) { 269 /* Mapping for read. 270 * 271 * Since we are mapping for read, we don't need to wait 272 * if the GPU is using the buffer for read too 273 * (neither one is changing it). 274 * 275 * Only check whether the buffer is being used for write. */ 276 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, 277 RADEON_USAGE_WRITE)) { 278 cs->flush_cs(cs->flush_data, 279 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); 280 return NULL; 281 } 282 283 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, 284 RADEON_USAGE_WRITE)) { 285 return NULL; 286 } 287 } else { 288 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) { 289 cs->flush_cs(cs->flush_data, 290 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); 291 return NULL; 292 } 293 294 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, 295 RADEON_USAGE_READWRITE)) { 296 return NULL; 297 } 298 } 299 } else { 300 uint64_t time = os_time_get_nano(); 301 302 if (!(usage & PIPE_TRANSFER_WRITE)) { 303 /* Mapping for read. 
304 * 305 * Since we are mapping for read, we don't need to wait 306 * if the GPU is using the buffer for read too 307 * (neither one is changing it). 308 * 309 * Only check whether the buffer is being used for write. */ 310 if (cs) { 311 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, 312 RADEON_USAGE_WRITE)) { 313 cs->flush_cs(cs->flush_data, 314 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL); 315 } else { 316 /* Try to avoid busy-waiting in amdgpu_bo_wait. */ 317 if (p_atomic_read(&bo->num_active_ioctls)) 318 amdgpu_cs_sync_flush(rcs); 319 } 320 } 321 322 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, 323 RADEON_USAGE_WRITE); 324 } else { 325 /* Mapping for write. */ 326 if (cs) { 327 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) { 328 cs->flush_cs(cs->flush_data, 329 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL); 330 } else { 331 /* Try to avoid busy-waiting in amdgpu_bo_wait. */ 332 if (p_atomic_read(&bo->num_active_ioctls)) 333 amdgpu_cs_sync_flush(rcs); 334 } 335 } 336 337 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, 338 RADEON_USAGE_READWRITE); 339 } 340 341 bo->ws->buffer_wait_time += os_time_get_nano() - time; 342 } 343 } 344 345 /* Buffer synchronization has been checked, now actually map the buffer. */ 346 void *cpu = NULL; 347 uint64_t offset = 0; 348 349 if (bo->bo) { 350 real = bo; 351 } else { 352 real = bo->u.slab.real; 353 offset = bo->va - real->va; 354 } 355 356 if (usage & RADEON_TRANSFER_TEMPORARY) { 357 if (real->is_user_ptr) { 358 cpu = real->cpu_ptr; 359 } else { 360 if (!amdgpu_bo_do_map(real, &cpu)) 361 return NULL; 362 } 363 } else { 364 cpu = p_atomic_read(&real->cpu_ptr); 365 if (!cpu) { 366 simple_mtx_lock(&real->lock); 367 /* Must re-check due to the possibility of a race. Re-check need not 368 * be atomic thanks to the lock. 
*/ 369 cpu = real->cpu_ptr; 370 if (!cpu) { 371 if (!amdgpu_bo_do_map(real, &cpu)) { 372 simple_mtx_unlock(&real->lock); 373 return NULL; 374 } 375 p_atomic_set(&real->cpu_ptr, cpu); 376 } 377 simple_mtx_unlock(&real->lock); 378 } 379 } 380 381 return (uint8_t*)cpu + offset; 382} 383 384static void amdgpu_bo_unmap(struct pb_buffer *buf) 385{ 386 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; 387 struct amdgpu_winsys_bo *real; 388 389 assert(!bo->sparse); 390 391 if (bo->is_user_ptr) 392 return; 393 394 real = bo->bo ? bo : bo->u.slab.real; 395 assert(real->u.real.map_count != 0 && "too many unmaps"); 396 if (p_atomic_dec_zero(&real->u.real.map_count)) { 397 assert(!real->cpu_ptr && 398 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag"); 399 400 if (real->initial_domain & RADEON_DOMAIN_VRAM) 401 real->ws->mapped_vram -= real->base.size; 402 else if (real->initial_domain & RADEON_DOMAIN_GTT) 403 real->ws->mapped_gtt -= real->base.size; 404 real->ws->num_mapped_buffers--; 405 } 406 407 amdgpu_bo_cpu_unmap(real->bo); 408} 409 410static const struct pb_vtbl amdgpu_winsys_bo_vtbl = { 411 amdgpu_bo_destroy_or_cache 412 /* other functions are never called */ 413}; 414 415static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) 416{ 417 struct amdgpu_winsys *ws = bo->ws; 418 419 assert(bo->bo); 420 421 if (ws->debug_all_bos) { 422 simple_mtx_lock(&ws->global_bo_list_lock); 423 LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); 424 ws->num_buffers++; 425 simple_mtx_unlock(&ws->global_bo_list_lock); 426 } 427} 428 429static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws, 430 uint64_t size, unsigned alignment) 431{ 432 uint64_t vm_alignment = alignment; 433 434 /* Increase the VM alignment for faster address translation. 
*/ 435 if (size >= ws->info.pte_fragment_size) 436 vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size); 437 438 /* Gfx9: Increase the VM alignment to the most significant bit set 439 * in the size for faster address translation. 440 */ 441 if (ws->info.chip_class >= GFX9) { 442 unsigned msb = util_last_bit64(size); /* 0 = no bit is set */ 443 uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0; 444 445 vm_alignment = MAX2(vm_alignment, msb_alignment); 446 } 447 return vm_alignment; 448} 449 450static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, 451 uint64_t size, 452 unsigned alignment, 453 enum radeon_bo_domain initial_domain, 454 unsigned flags, 455 int heap) 456{ 457 struct amdgpu_bo_alloc_request request = {0}; 458 amdgpu_bo_handle buf_handle; 459 uint64_t va = 0; 460 struct amdgpu_winsys_bo *bo; 461 amdgpu_va_handle va_handle; 462 int r; 463 464 /* VRAM or GTT must be specified, but not both at the same time. */ 465 assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT | 466 RADEON_DOMAIN_GDS | 467 RADEON_DOMAIN_OA)) == 1); 468 469 bo = CALLOC_STRUCT(amdgpu_winsys_bo); 470 if (!bo) { 471 return NULL; 472 } 473 474 if (heap >= 0) { 475 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base, 476 heap); 477 } 478 request.alloc_size = size; 479 request.phys_alignment = alignment; 480 481 if (initial_domain & RADEON_DOMAIN_VRAM) { 482 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; 483 484 /* Since VRAM and GTT have almost the same performance on APUs, we could 485 * just set GTT. However, in order to decrease GTT(RAM) usage, which is 486 * shared with the OS, allow VRAM placements too. The idea is not to use 487 * VRAM usefully, but to use it so that it's not unused and wasted. 
488 */ 489 if (!ws->info.has_dedicated_vram) 490 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; 491 } 492 493 if (initial_domain & RADEON_DOMAIN_GTT) 494 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; 495 if (initial_domain & RADEON_DOMAIN_GDS) 496 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS; 497 if (initial_domain & RADEON_DOMAIN_OA) 498 request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA; 499 500 if (flags & RADEON_FLAG_NO_CPU_ACCESS) 501 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 502 if (flags & RADEON_FLAG_GTT_WC) 503 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; 504 if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && 505 ws->info.has_local_buffers) 506 request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID; 507 if (ws->zero_all_vram_allocs && 508 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)) 509 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED; 510 511 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle); 512 if (r) { 513 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n"); 514 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size); 515 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment); 516 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain); 517 goto error_bo_alloc; 518 } 519 520 if (initial_domain & RADEON_DOMAIN_VRAM_GTT) { 521 unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0; 522 523 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, 524 size + va_gap_size, 525 amdgpu_get_optimal_vm_alignment(ws, size, alignment), 526 0, &va, &va_handle, 527 (flags & RADEON_FLAG_32BIT ? 
AMDGPU_VA_RANGE_32_BIT : 0) | 528 AMDGPU_VA_RANGE_HIGH); 529 if (r) 530 goto error_va_alloc; 531 532 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE | 533 AMDGPU_VM_PAGE_EXECUTABLE; 534 535 if (!(flags & RADEON_FLAG_READ_ONLY)) 536 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE; 537 538 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags, 539 AMDGPU_VA_OP_MAP); 540 if (r) 541 goto error_va_map; 542 } 543 544 simple_mtx_init(&bo->lock, mtx_plain); 545 pipe_reference_init(&bo->base.reference, 1); 546 bo->base.alignment = alignment; 547 bo->base.usage = 0; 548 bo->base.size = size; 549 bo->base.vtbl = &amdgpu_winsys_bo_vtbl; 550 bo->ws = ws; 551 bo->bo = buf_handle; 552 bo->va = va; 553 bo->u.real.va_handle = va_handle; 554 bo->initial_domain = initial_domain; 555 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); 556 bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID); 557 558 if (initial_domain & RADEON_DOMAIN_VRAM) 559 ws->allocated_vram += align64(size, ws->info.gart_page_size); 560 else if (initial_domain & RADEON_DOMAIN_GTT) 561 ws->allocated_gtt += align64(size, ws->info.gart_page_size); 562 563 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle); 564 565 amdgpu_add_buffer_to_global_list(bo); 566 567 return bo; 568 569error_va_map: 570 amdgpu_va_range_free(va_handle); 571 572error_va_alloc: 573 amdgpu_bo_free(buf_handle); 574 575error_bo_alloc: 576 FREE(bo); 577 return NULL; 578} 579 580bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf) 581{ 582 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 583 584 if (amdgpu_bo_is_referenced_by_any_cs(bo)) { 585 return false; 586 } 587 588 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE); 589} 590 591bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) 592{ 593 struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */ 594 bo = container_of(entry, bo, u.slab.entry); 595 596 return amdgpu_bo_can_reclaim(&bo->base); 597} 598 599static 
struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size) 600{ 601 /* Find the correct slab allocator for the given size. */ 602 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { 603 struct pb_slabs *slabs = &ws->bo_slabs[i]; 604 605 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1)) 606 return slabs; 607 } 608 609 assert(0); 610 return NULL; 611} 612 613static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf) 614{ 615 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 616 617 assert(!bo->bo); 618 619 pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry); 620} 621 622static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = { 623 amdgpu_bo_slab_destroy 624 /* other functions are never called */ 625}; 626 627struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, 628 unsigned entry_size, 629 unsigned group_index) 630{ 631 struct amdgpu_winsys *ws = priv; 632 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab); 633 enum radeon_bo_domain domains = radeon_domain_from_heap(heap); 634 enum radeon_bo_flag flags = radeon_flags_from_heap(heap); 635 uint32_t base_id; 636 unsigned slab_size = 0; 637 638 if (!slab) 639 return NULL; 640 641 /* Determine the slab buffer size. */ 642 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { 643 struct pb_slabs *slabs = &ws->bo_slabs[i]; 644 unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1); 645 646 if (entry_size <= max_entry_size) { 647 /* The slab size is twice the size of the largest possible entry. */ 648 slab_size = max_entry_size * 2; 649 650 /* The largest slab should have the same size as the PTE fragment 651 * size to get faster address translation. 
652 */ 653 if (i == NUM_SLAB_ALLOCATORS - 1 && 654 slab_size < ws->info.pte_fragment_size) 655 slab_size = ws->info.pte_fragment_size; 656 break; 657 } 658 } 659 assert(slab_size != 0); 660 661 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base, 662 slab_size, slab_size, 663 domains, flags)); 664 if (!slab->buffer) 665 goto fail; 666 667 slab->base.num_entries = slab->buffer->base.size / entry_size; 668 slab->base.num_free = slab->base.num_entries; 669 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries)); 670 if (!slab->entries) 671 goto fail_buffer; 672 673 LIST_INITHEAD(&slab->base.free); 674 675 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries); 676 677 for (unsigned i = 0; i < slab->base.num_entries; ++i) { 678 struct amdgpu_winsys_bo *bo = &slab->entries[i]; 679 680 simple_mtx_init(&bo->lock, mtx_plain); 681 bo->base.alignment = entry_size; 682 bo->base.usage = slab->buffer->base.usage; 683 bo->base.size = entry_size; 684 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl; 685 bo->ws = ws; 686 bo->va = slab->buffer->va + i * entry_size; 687 bo->initial_domain = domains; 688 bo->unique_id = base_id + i; 689 bo->u.slab.entry.slab = &slab->base; 690 bo->u.slab.entry.group_index = group_index; 691 692 if (slab->buffer->bo) { 693 /* The slab is not suballocated. */ 694 bo->u.slab.real = slab->buffer; 695 } else { 696 /* The slab is allocated out of a bigger slab. 
*/ 697 bo->u.slab.real = slab->buffer->u.slab.real; 698 assert(bo->u.slab.real->bo); 699 } 700 701 LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); 702 } 703 704 return &slab->base; 705 706fail_buffer: 707 amdgpu_winsys_bo_reference(&slab->buffer, NULL); 708fail: 709 FREE(slab); 710 return NULL; 711} 712 713void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab) 714{ 715 struct amdgpu_slab *slab = amdgpu_slab(pslab); 716 717 for (unsigned i = 0; i < slab->base.num_entries; ++i) { 718 amdgpu_bo_remove_fences(&slab->entries[i]); 719 simple_mtx_destroy(&slab->entries[i].lock); 720 } 721 722 FREE(slab->entries); 723 amdgpu_winsys_bo_reference(&slab->buffer, NULL); 724 FREE(slab); 725} 726 727#if DEBUG_SPARSE_COMMITS 728static void 729sparse_dump(struct amdgpu_winsys_bo *bo, const char *func) 730{ 731 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n" 732 "Commitments:\n", 733 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func); 734 735 struct amdgpu_sparse_backing *span_backing = NULL; 736 uint32_t span_first_backing_page = 0; 737 uint32_t span_first_va_page = 0; 738 uint32_t va_page = 0; 739 740 for (;;) { 741 struct amdgpu_sparse_backing *backing = 0; 742 uint32_t backing_page = 0; 743 744 if (va_page < bo->u.sparse.num_va_pages) { 745 backing = bo->u.sparse.commitments[va_page].backing; 746 backing_page = bo->u.sparse.commitments[va_page].page; 747 } 748 749 if (span_backing && 750 (backing != span_backing || 751 backing_page != span_first_backing_page + (va_page - span_first_va_page))) { 752 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n", 753 span_first_va_page, va_page - 1, span_backing, 754 span_first_backing_page, 755 span_first_backing_page + (va_page - span_first_va_page) - 1); 756 757 span_backing = NULL; 758 } 759 760 if (va_page >= bo->u.sparse.num_va_pages) 761 break; 762 763 if (backing && !span_backing) { 764 span_backing = backing; 765 span_first_backing_page = backing_page; 766 span_first_va_page = va_page; 767 } 768 
769 va_page++; 770 } 771 772 fprintf(stderr, "Backing:\n"); 773 774 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { 775 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size); 776 for (unsigned i = 0; i < backing->num_chunks; ++i) 777 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end); 778 } 779} 780#endif 781 782/* 783 * Attempt to allocate the given number of backing pages. Fewer pages may be 784 * allocated (depending on the fragmentation of existing backing buffers), 785 * which will be reflected by a change to *pnum_pages. 786 */ 787static struct amdgpu_sparse_backing * 788sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages) 789{ 790 struct amdgpu_sparse_backing *best_backing; 791 unsigned best_idx; 792 uint32_t best_num_pages; 793 794 best_backing = NULL; 795 best_idx = 0; 796 best_num_pages = 0; 797 798 /* This is a very simple and inefficient best-fit algorithm. */ 799 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { 800 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) { 801 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin; 802 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) || 803 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) { 804 best_backing = backing; 805 best_idx = idx; 806 best_num_pages = cur_num_pages; 807 } 808 } 809 } 810 811 /* Allocate a new backing buffer if necessary. 
*/ 812 if (!best_backing) { 813 struct pb_buffer *buf; 814 uint64_t size; 815 uint32_t pages; 816 817 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing); 818 if (!best_backing) 819 return NULL; 820 821 best_backing->max_chunks = 4; 822 best_backing->chunks = CALLOC(best_backing->max_chunks, 823 sizeof(*best_backing->chunks)); 824 if (!best_backing->chunks) { 825 FREE(best_backing); 826 return NULL; 827 } 828 829 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE)); 830 831 size = MIN3(bo->base.size / 16, 832 8 * 1024 * 1024, 833 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE); 834 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE); 835 836 buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE, 837 bo->initial_domain, 838 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC); 839 if (!buf) { 840 FREE(best_backing->chunks); 841 FREE(best_backing); 842 return NULL; 843 } 844 845 /* We might have gotten a bigger buffer than requested via caching. 
*/ 846 pages = buf->size / RADEON_SPARSE_PAGE_SIZE; 847 848 best_backing->bo = amdgpu_winsys_bo(buf); 849 best_backing->num_chunks = 1; 850 best_backing->chunks[0].begin = 0; 851 best_backing->chunks[0].end = pages; 852 853 list_add(&best_backing->list, &bo->u.sparse.backing); 854 bo->u.sparse.num_backing_pages += pages; 855 856 best_idx = 0; 857 best_num_pages = pages; 858 } 859 860 *pnum_pages = MIN2(*pnum_pages, best_num_pages); 861 *pstart_page = best_backing->chunks[best_idx].begin; 862 best_backing->chunks[best_idx].begin += *pnum_pages; 863 864 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) { 865 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1], 866 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1)); 867 best_backing->num_chunks--; 868 } 869 870 return best_backing; 871} 872 873static void 874sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo, 875 struct amdgpu_sparse_backing *backing) 876{ 877 struct amdgpu_winsys *ws = backing->bo->ws; 878 879 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE; 880 881 simple_mtx_lock(&ws->bo_fence_lock); 882 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences); 883 simple_mtx_unlock(&ws->bo_fence_lock); 884 885 list_del(&backing->list); 886 amdgpu_winsys_bo_reference(&backing->bo, NULL); 887 FREE(backing->chunks); 888 FREE(backing); 889} 890 891/* 892 * Return a range of pages from the given backing buffer back into the 893 * free structure. 894 */ 895static bool 896sparse_backing_free(struct amdgpu_winsys_bo *bo, 897 struct amdgpu_sparse_backing *backing, 898 uint32_t start_page, uint32_t num_pages) 899{ 900 uint32_t end_page = start_page + num_pages; 901 unsigned low = 0; 902 unsigned high = backing->num_chunks; 903 904 /* Find the first chunk with begin >= start_page. 
*/ 905 while (low < high) { 906 unsigned mid = low + (high - low) / 2; 907 908 if (backing->chunks[mid].begin >= start_page) 909 high = mid; 910 else 911 low = mid + 1; 912 } 913 914 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin); 915 assert(low == 0 || backing->chunks[low - 1].end <= start_page); 916 917 if (low > 0 && backing->chunks[low - 1].end == start_page) { 918 backing->chunks[low - 1].end = end_page; 919 920 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { 921 backing->chunks[low - 1].end = backing->chunks[low].end; 922 memmove(&backing->chunks[low], &backing->chunks[low + 1], 923 sizeof(*backing->chunks) * (backing->num_chunks - low - 1)); 924 backing->num_chunks--; 925 } 926 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { 927 backing->chunks[low].begin = start_page; 928 } else { 929 if (backing->num_chunks >= backing->max_chunks) { 930 unsigned new_max_chunks = 2 * backing->max_chunks; 931 struct amdgpu_sparse_backing_chunk *new_chunks = 932 REALLOC(backing->chunks, 933 sizeof(*backing->chunks) * backing->max_chunks, 934 sizeof(*backing->chunks) * new_max_chunks); 935 if (!new_chunks) 936 return false; 937 938 backing->max_chunks = new_max_chunks; 939 backing->chunks = new_chunks; 940 } 941 942 memmove(&backing->chunks[low + 1], &backing->chunks[low], 943 sizeof(*backing->chunks) * (backing->num_chunks - low)); 944 backing->chunks[low].begin = start_page; 945 backing->chunks[low].end = end_page; 946 backing->num_chunks++; 947 } 948 949 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 && 950 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE) 951 sparse_free_backing_buffer(bo, backing); 952 953 return true; 954} 955 956static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf) 957{ 958 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 959 int r; 960 961 assert(!bo->bo && bo->sparse); 962 963 r = 
amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                    (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
                    bo->va, 0, AMDGPU_VA_OP_CLEAR);
   if (r) {
      fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
   }

   /* Release every backing buffer still attached to this sparse BO. */
   while (!list_empty(&bo->u.sparse.backing)) {
      struct amdgpu_sparse_backing *dummy = NULL;
      sparse_free_backing_buffer(bo,
                                 container_of(bo->u.sparse.backing.next,
                                              dummy, list));
   }

   amdgpu_va_range_free(bo->u.sparse.va_handle);
   FREE(bo->u.sparse.commitments);
   simple_mtx_destroy(&bo->lock);
   FREE(bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
   amdgpu_bo_sparse_destroy
   /* other functions are never called */
};

/* Create a sparse (PRT) buffer.
 *
 * Only the GPU VA range and the per-page commitment-tracking array are
 * allocated here; physical backing memory is attached/detached later through
 * amdgpu_bo_sparse_commit. The whole range is initially mapped as PRT so
 * that accesses to uncommitted pages are handled gracefully by the hardware.
 */
static struct pb_buffer *
amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
                        enum radeon_bo_domain domain,
                        enum radeon_bo_flag flags)
{
   struct amdgpu_winsys_bo *bo;
   uint64_t map_size;
   uint64_t va_gap_size;
   int r;

   /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
    * that exceed this limit. This is not really a restriction: we don't have
    * that much virtual address space anyway.
    */
   if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
      return NULL;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   simple_mtx_init(&bo->lock, mtx_plain);
   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
   bo->ws = ws;
   bo->initial_domain = domain;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->sparse = true;
   /* SPARSE itself is implied; remember the rest for backing allocations. */
   bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;

   bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
   bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
                                     sizeof(*bo->u.sparse.commitments));
   if (!bo->u.sparse.commitments)
      goto error_alloc_commitments;

   LIST_INITHEAD(&bo->u.sparse.backing);

   /* For simplicity, we always map a multiple of the page size. */
   map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
   /* Leave a guard gap after the range when VM checking is enabled. */
   va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
                             0, &bo->va, &bo->u.sparse.va_handle,
                             AMDGPU_VA_RANGE_HIGH);
   if (r)
      goto error_va_alloc;

   /* Map the entire range as PRT (no backing memory yet). */
   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
                           AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   return &bo->base;

error_va_map:
   amdgpu_va_range_free(bo->u.sparse.va_handle);
error_va_alloc:
   FREE(bo->u.sparse.commitments);
error_alloc_commitments:
   simple_mtx_destroy(&bo->lock);
   FREE(bo);
   return NULL;
}

/* Commit or de-commit physical backing memory for the page-aligned range
 * [offset, offset+size) of a sparse buffer (the final page may be partial
 * when the range reaches the end of the buffer).
 *
 * Returns false when backing memory could not be allocated/tracked or a
 * kernel VA operation failed; in that case the range may be only partially
 * (de)committed.
 */
static bool
amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
                        bool commit)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
   struct amdgpu_sparse_commitment *comm;
   uint32_t va_page, end_va_page;
   bool ok = true;
   int r;

   assert(bo->sparse);
   assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
   assert(offset <= bo->base.size);
   assert(size <= bo->base.size - offset);
   assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);

   comm = bo->u.sparse.commitments;
   va_page = offset / RADEON_SPARSE_PAGE_SIZE;
   end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);

   simple_mtx_lock(&bo->lock);

#if DEBUG_SPARSE_COMMITS
   sparse_dump(bo, __func__);
#endif

   if (commit) {
      while (va_page < end_va_page) {
         uint32_t span_va_page;

         /* Skip pages that are already committed. */
         if (comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Determine length of uncommitted span. */
         span_va_page = va_page;
         while (va_page < end_va_page && !comm[va_page].backing)
            va_page++;

         /* Fill the uncommitted span with chunks of backing memory. */
         while (span_va_page < va_page) {
            struct amdgpu_sparse_backing *backing;
            uint32_t backing_start, backing_size;

            backing_size = va_page - span_va_page;
            /* The allocator may return fewer pages than requested; keep
             * looping until the span is fully covered. */
            backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
            if (!backing) {
               ok = false;
               goto out;
            }

            r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
                                    AMDGPU_VM_PAGE_READABLE |
                                    AMDGPU_VM_PAGE_WRITEABLE |
                                    AMDGPU_VM_PAGE_EXECUTABLE,
                                    AMDGPU_VA_OP_REPLACE);
            if (r) {
               /* Mapping failed: return the pages to the backing pool. */
               ok = sparse_backing_free(bo, backing, backing_start, backing_size);
               assert(ok && "sufficient memory should already be allocated");

               ok = false;
               goto out;
            }

            /* Record the backing page for every VA page just committed. */
            while (backing_size) {
               comm[span_va_page].backing = backing;
               comm[span_va_page].page = backing_start;
               span_va_page++;
               backing_start++;
               backing_size--;
            }
         }
      }
   } else {
      /* Replace the whole range with a bare PRT mapping first. */
      r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
      if (r) {
         ok = false;
         goto out;
      }

      while (va_page < end_va_page) {
         struct amdgpu_sparse_backing *backing;
         uint32_t backing_start;
         uint32_t span_pages;

         /* Skip pages that are already uncommitted. */
         if (!comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Group contiguous spans of pages so each can be freed in one call. */
         backing = comm[va_page].backing;
         backing_start = comm[va_page].page;
         comm[va_page].backing = NULL;

         span_pages = 1;
         va_page++;

         while (va_page < end_va_page &&
                comm[va_page].backing == backing &&
                comm[va_page].page == backing_start + span_pages) {
            comm[va_page].backing = NULL;
            va_page++;
            span_pages++;
         }

         if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
            /* Couldn't allocate tracking data structures, so we have to leak */
            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
            ok = false;
         }
      }
   }
out:

   simple_mtx_unlock(&bo->lock);

   return ok;
}

/* Decode the 3-bit EG tile-split hardware field into a byte count. */
static unsigned eg_tile_split(unsigned tile_split)
{
   switch (tile_split) {
   case 0:  tile_split = 64;   break;
   case 1:  tile_split = 128;  break;
   case 2:  tile_split = 256;  break;
   case 3:  tile_split = 512;  break;
   default: /* out-of-range values decode like 4 */
   case 4:  tile_split = 1024; break;
   case 5:  tile_split = 2048; break;
   case 6:  tile_split = 4096; break;
   }
   return tile_split;
}

/* Inverse of eg_tile_split: encode a byte count back into the 3-bit field. */
static unsigned eg_tile_split_rev(unsigned eg_tile_split)
{
   switch (eg_tile_split) {
   case 64:    return 0;
   case 128:   return 1;
   case 256:   return 2;
   case 512:   return 3;
   default:    /* unrecognized values encode like 1024 */
   case 1024:  return 4;
   case 2048:  return 5;
   case 4096:  return 6;
   }
}

/* Query the kernel's tiling_info word for this BO and unpack it into the
 * winsys metadata structure. Silently leaves *md untouched if the kernel
 * query fails.
 */
static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
                                       struct radeon_bo_metadata *md)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_bo_info info = {0};
   uint64_t tiling_flags;
   int r;

   assert(bo->bo && "must not be called for slab entries");

   r = amdgpu_bo_query_info(bo->bo, &info);
   if (r)
      return;

   tiling_flags = info.metadata.tiling_info;

   if (bo->ws->info.chip_class >= GFX9) {
      md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
1231 1232 md->u.gfx9.dcc_offset_256B = AMDGPU_TILING_GET(tiling_flags, DCC_OFFSET_256B); 1233 md->u.gfx9.dcc_pitch_max = AMDGPU_TILING_GET(tiling_flags, DCC_PITCH_MAX); 1234 md->u.gfx9.dcc_independent_64B = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B); 1235 } else { 1236 md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; 1237 md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; 1238 1239 if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ 1240 md->u.legacy.macrotile = RADEON_LAYOUT_TILED; 1241 else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ 1242 md->u.legacy.microtile = RADEON_LAYOUT_TILED; 1243 1244 md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); 1245 md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); 1246 md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); 1247 md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); 1248 md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); 1249 md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); 1250 md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ 1251 } 1252 1253 md->size_metadata = info.metadata.size_metadata; 1254 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); 1255} 1256 1257static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, 1258 struct radeon_bo_metadata *md) 1259{ 1260 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); 1261 struct amdgpu_bo_metadata metadata = {0}; 1262 uint64_t tiling_flags = 0; 1263 1264 assert(bo->bo && "must not be called for slab entries"); 1265 1266 if (bo->ws->info.chip_class >= GFX9) { 1267 tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode); 1268 1269 tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256B); 1270 tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, 
md->u.gfx9.dcc_pitch_max); 1271 tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64B); 1272 } else { 1273 if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED) 1274 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ 1275 else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED) 1276 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ 1277 else 1278 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ 1279 1280 tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config); 1281 tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw)); 1282 tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh)); 1283 if (md->u.legacy.tile_split) 1284 tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split)); 1285 tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea)); 1286 tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1); 1287 1288 if (md->u.legacy.scanout) 1289 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ 1290 else 1291 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ 1292 } 1293 1294 metadata.tiling_info = tiling_flags; 1295 metadata.size_metadata = md->size_metadata; 1296 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata)); 1297 1298 amdgpu_bo_set_metadata(bo->bo, &metadata); 1299} 1300 1301static struct pb_buffer * 1302amdgpu_bo_create(struct radeon_winsys *rws, 1303 uint64_t size, 1304 unsigned alignment, 1305 enum radeon_bo_domain domain, 1306 enum radeon_bo_flag flags) 1307{ 1308 struct amdgpu_winsys *ws = amdgpu_winsys(rws); 1309 struct amdgpu_winsys_bo *bo; 1310 int heap = -1; 1311 1312 /* VRAM implies WC. This is not optional. 
*/ 1313 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC); 1314 1315 /* NO_CPU_ACCESS is valid with VRAM only. */ 1316 assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS)); 1317 1318 /* Sparse buffers must have NO_CPU_ACCESS set. */ 1319 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS); 1320 1321 struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1]; 1322 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1); 1323 1324 /* Sub-allocate small buffers from slabs. */ 1325 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) && 1326 size <= max_slab_entry_size && 1327 /* The alignment must be at most the size of the smallest slab entry or 1328 * the next power of two. */ 1329 alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) { 1330 struct pb_slab_entry *entry; 1331 int heap = radeon_get_heap_index(domain, flags); 1332 1333 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS) 1334 goto no_slab; 1335 1336 struct pb_slabs *slabs = get_slabs(ws, size); 1337 entry = pb_slab_alloc(slabs, size, heap); 1338 if (!entry) { 1339 /* Clean up buffer managers and try again. */ 1340 amdgpu_clean_up_buffer_managers(ws); 1341 1342 entry = pb_slab_alloc(slabs, size, heap); 1343 } 1344 if (!entry) 1345 return NULL; 1346 1347 bo = NULL; 1348 bo = container_of(entry, bo, u.slab.entry); 1349 1350 pipe_reference_init(&bo->base.reference, 1); 1351 1352 return &bo->base; 1353 } 1354no_slab: 1355 1356 if (flags & RADEON_FLAG_SPARSE) { 1357 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0); 1358 1359 return amdgpu_bo_sparse_create(ws, size, domain, flags); 1360 } 1361 1362 /* This flag is irrelevant for the cache. */ 1363 flags &= ~RADEON_FLAG_NO_SUBALLOC; 1364 1365 /* Align size to page size. This is the minimum alignment for normal 1366 * BOs. Aligning this here helps the cached bufmgr. 
Especially small BOs, 1367 * like constant/uniform buffers, can benefit from better and more reuse. 1368 */ 1369 if (domain & RADEON_DOMAIN_VRAM_GTT) { 1370 size = align64(size, ws->info.gart_page_size); 1371 alignment = align(alignment, ws->info.gart_page_size); 1372 } 1373 1374 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING; 1375 1376 if (use_reusable_pool) { 1377 heap = radeon_get_heap_index(domain, flags); 1378 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS); 1379 1380 /* Get a buffer from the cache. */ 1381 bo = (struct amdgpu_winsys_bo*) 1382 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap); 1383 if (bo) 1384 return &bo->base; 1385 } 1386 1387 /* Create a new one. */ 1388 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap); 1389 if (!bo) { 1390 /* Clean up buffer managers and try again. */ 1391 amdgpu_clean_up_buffer_managers(ws); 1392 1393 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap); 1394 if (!bo) 1395 return NULL; 1396 } 1397 1398 bo->u.real.use_reusable_pool = use_reusable_pool; 1399 return &bo->base; 1400} 1401 1402static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, 1403 struct winsys_handle *whandle, 1404 unsigned vm_alignment, 1405 unsigned *stride, 1406 unsigned *offset) 1407{ 1408 struct amdgpu_winsys *ws = amdgpu_winsys(rws); 1409 struct amdgpu_winsys_bo *bo = NULL; 1410 enum amdgpu_bo_handle_type type; 1411 struct amdgpu_bo_import_result result = {0}; 1412 uint64_t va; 1413 amdgpu_va_handle va_handle = NULL; 1414 struct amdgpu_bo_info info = {0}; 1415 enum radeon_bo_domain initial = 0; 1416 int r; 1417 1418 switch (whandle->type) { 1419 case WINSYS_HANDLE_TYPE_SHARED: 1420 type = amdgpu_bo_handle_type_gem_flink_name; 1421 break; 1422 case WINSYS_HANDLE_TYPE_FD: 1423 type = amdgpu_bo_handle_type_dma_buf_fd; 1424 break; 1425 default: 1426 return NULL; 1427 } 1428 1429 if (stride) 1430 *stride = whandle->stride; 1431 if (offset) 1432 *offset = 
whandle->offset;

   r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
   if (r)
      return NULL;

   simple_mtx_lock(&ws->bo_export_table_lock);
   bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);

   /* If the amdgpu_winsys_bo instance already exists, bump the reference
    * counter and return it.
    */
   if (bo) {
      p_atomic_inc(&bo->base.reference.count);
      simple_mtx_unlock(&ws->bo_export_table_lock);

      /* Release the buffer handle, because we don't need it anymore.
       * This function is returning an existing buffer, which has its own
       * handle.
       */
      amdgpu_bo_free(result.buf_handle);
      return &bo->base;
   }

   /* Get initial domains. */
   r = amdgpu_bo_query_info(result.buf_handle, &info);
   if (r)
      goto error;

   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             result.alloc_size,
                             amdgpu_get_optimal_vm_alignment(ws, result.alloc_size,
                                                             vm_alignment),
                             0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
   if (r)
      goto error;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      goto error;

   r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
   if (r)
      goto error;

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      initial |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      initial |= RADEON_DOMAIN_GTT;

   /* Initialize the structure. */
   simple_mtx_init(&bo->lock, mtx_plain);
   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = info.phys_alignment;
   bo->bo = result.buf_handle;
   bo->base.size = result.alloc_size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->ws = ws;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->initial_domain = initial;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->is_shared = true;

   /* Account the imported memory against the relevant budget. */
   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
      ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
      ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);

   amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);

   amdgpu_add_buffer_to_global_list(bo);

   /* Remember the import so a later lookup returns this same bo. */
   util_hash_table_set(ws->bo_export_table, bo->bo, bo);
   simple_mtx_unlock(&ws->bo_export_table_lock);

   return &bo->base;

error:
   simple_mtx_unlock(&ws->bo_export_table_lock);
   if (bo)
      FREE(bo);
   if (va_handle)
      amdgpu_va_range_free(va_handle);
   amdgpu_bo_free(result.buf_handle);
   return NULL;
}

/* Export a buffer as a flink name, dma-buf fd or KMS handle and fill in the
 * stride/offset fields of *whandle. Fails for slab entries and sparse
 * buffers, which have no kernel BO of their own.
 */
static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
                                 unsigned stride, unsigned offset,
                                 unsigned slice_size,
                                 struct winsys_handle *whandle)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
   struct amdgpu_winsys *ws = bo->ws;
   enum amdgpu_bo_handle_type type;
   int r;

   /* Don't allow exports of slab entries and sparse buffers. */
   if (!bo->bo)
      return false;

   /* An exported buffer must never be returned to the reusable cache. */
   bo->u.real.use_reusable_pool = false;

   switch (whandle->type) {
   case WINSYS_HANDLE_TYPE_SHARED:
      type = amdgpu_bo_handle_type_gem_flink_name;
      break;
   case WINSYS_HANDLE_TYPE_FD:
      type = amdgpu_bo_handle_type_dma_buf_fd;
      break;
   case WINSYS_HANDLE_TYPE_KMS:
      type = amdgpu_bo_handle_type_kms;
      break;
   default:
      return false;
   }

   r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
   if (r)
      return false;

   /* Remember the export so a later import returns this same bo. */
   simple_mtx_lock(&ws->bo_export_table_lock);
   util_hash_table_set(ws->bo_export_table, bo->bo, bo);
   simple_mtx_unlock(&ws->bo_export_table_lock);

   whandle->stride = stride;
   whandle->offset = offset;
   whandle->offset += slice_size * whandle->layer;
   bo->is_shared = true;
   return true;
}

/* Wrap an application-provided pointer (user memory) in a GTT buffer. */
static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
                                            void *pointer, uint64_t size)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   amdgpu_bo_handle buf_handle;
   struct amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   /* Avoid failure when the size is not page aligned */
   uint64_t aligned_size = align64(size, ws->info.gart_page_size);

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
                                      aligned_size, &buf_handle))
      goto error;

   if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             aligned_size,
                             amdgpu_get_optimal_vm_alignment(ws, aligned_size,
                                                             ws->info.gart_page_size),
                             0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
      goto error_va_alloc;

   if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
      goto error_va_map;

   /* Initialize it. */
   bo->is_user_ptr = true;
   pipe_reference_init(&bo->base.reference, 1);
   simple_mtx_init(&bo->lock, mtx_plain);
   bo->bo = buf_handle;
   bo->base.alignment = 0;
   /* The unaligned size requested by the caller, not aligned_size. */
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->ws = ws;
   bo->cpu_ptr = pointer;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->initial_domain = RADEON_DOMAIN_GTT;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);

   ws->allocated_gtt += aligned_size;

   amdgpu_add_buffer_to_global_list(bo);

   amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);

   return (struct pb_buffer*)bo;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error:
   FREE(bo);
   return NULL;
}

/* Return whether the buffer wraps user memory (created by amdgpu_bo_from_ptr). */
static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
}

/* A suballocated (slab) entry has no kernel BO of its own and isn't sparse. */
static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;

   return !bo->bo && !bo->sparse;
}

/* Return the GPU virtual address of the buffer. */
static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->va;
}

/* Plug the buffer entry points of this file into the winsys vtable. */
void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
{
   ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
   ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
   ws->base.buffer_map = amdgpu_bo_map;
   ws->base.buffer_unmap = amdgpu_bo_unmap;
   ws->base.buffer_wait = amdgpu_bo_wait;
   ws->base.buffer_create = amdgpu_bo_create;
   ws->base.buffer_from_handle = amdgpu_bo_from_handle;
   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
   ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
}