1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 */ 23 24#include <stdlib.h> 25#include <unistd.h> 26#include <limits.h> 27#include <assert.h> 28#include <linux/memfd.h> 29#include <sys/mman.h> 30 31#include "anv_private.h" 32 33#include "util/hash_table.h" 34#include "util/simple_mtx.h" 35 36#ifdef HAVE_VALGRIND 37#define VG_NOACCESS_READ(__ptr) ({ \ 38 VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \ 39 __typeof(*(__ptr)) __val = *(__ptr); \ 40 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\ 41 __val; \ 42}) 43#define VG_NOACCESS_WRITE(__ptr, __val) ({ \ 44 VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \ 45 *(__ptr) = (__val); \ 46 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \ 47}) 48#else 49#define VG_NOACCESS_READ(__ptr) (*(__ptr)) 50#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val)) 51#endif 52 53/* Design goals: 54 * 55 * - Lock free (except when resizing underlying bos) 56 * 57 * - Constant time allocation with typically only one atomic 58 * 59 * - Multiple allocation sizes without fragmentation 60 * 61 * - Can grow while keeping addresses and offset of contents stable 62 * 63 * - All allocations within one bo so we can point one of the 64 * STATE_BASE_ADDRESS pointers at it. 65 * 66 * The overall design is a two-level allocator: top level is a fixed size, big 67 * block (8k) allocator, which operates out of a bo. Allocation is done by 68 * either pulling a block from the free list or growing the used range of the 69 * bo. Growing the range may run out of space in the bo which we then need to 70 * grow. Growing the bo is tricky in a multi-threaded, lockless environment: 71 * we need to keep all pointers and contents in the old map valid. GEM bos in 72 * general can't grow, but we use a trick: we create a memfd and use ftruncate 73 * to grow it as necessary. We mmap the new size and then create a gem bo for 74 * it using the new gem userptr ioctl. 
Without heavy-handed locking around 75 * our allocation fast-path, there isn't really a way to munmap the old mmap, 76 * so we just keep it around until garbage collection time. While the block 77 * allocator is lockless for normal operations, we block other threads trying 78 * to allocate while we're growing the map. It sholdn't happen often, and 79 * growing is fast anyway. 80 * 81 * At the next level we can use various sub-allocators. The state pool is a 82 * pool of smaller, fixed size objects, which operates much like the block 83 * pool. It uses a free list for freeing objects, but when it runs out of 84 * space it just allocates a new block from the block pool. This allocator is 85 * intended for longer lived state objects such as SURFACE_STATE and most 86 * other persistent state objects in the API. We may need to track more info 87 * with these object and a pointer back to the CPU object (eg VkImage). In 88 * those cases we just allocate a slightly bigger object and put the extra 89 * state after the GPU state object. 90 * 91 * The state stream allocator works similar to how the i965 DRI driver streams 92 * all its state. Even with Vulkan, we need to emit transient state (whether 93 * surface state base or dynamic state base), and for that we can just get a 94 * block and fill it up. These cases are local to a command buffer and the 95 * sub-allocator need not be thread safe. The streaming allocator gets a new 96 * block when it runs out of space and chains them together so they can be 97 * easily freed. 98 */ 99 100/* Allocations are always at least 64 byte aligned, so 1 is an invalid value. 101 * We use it to indicate the free list is empty. 
*/ 102#define EMPTY UINT32_MAX 103 104#define PAGE_SIZE 4096 105 106struct anv_mmap_cleanup { 107 void *map; 108 size_t size; 109 uint32_t gem_handle; 110}; 111 112#define ANV_MMAP_CLEANUP_INIT ((struct anv_mmap_cleanup){0}) 113 114#ifndef HAVE_MEMFD_CREATE 115static inline int 116memfd_create(const char *name, unsigned int flags) 117{ 118 return syscall(SYS_memfd_create, name, flags); 119} 120#endif 121 122static inline uint32_t 123ilog2_round_up(uint32_t value) 124{ 125 assert(value != 0); 126 return 32 - __builtin_clz(value - 1); 127} 128 129static inline uint32_t 130round_to_power_of_two(uint32_t value) 131{ 132 return 1 << ilog2_round_up(value); 133} 134 135struct anv_state_table_cleanup { 136 void *map; 137 size_t size; 138}; 139 140#define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0}) 141#define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry)) 142 143static VkResult 144anv_state_table_expand_range(struct anv_state_table *table, uint32_t size); 145 146VkResult 147anv_state_table_init(struct anv_state_table *table, 148 struct anv_device *device, 149 uint32_t initial_entries) 150{ 151 VkResult result; 152 153 table->device = device; 154 155 table->fd = memfd_create("state table", MFD_CLOEXEC); 156 if (table->fd == -1) 157 return vk_error(VK_ERROR_INITIALIZATION_FAILED); 158 159 /* Just make it 2GB up-front. The Linux kernel won't actually back it 160 * with pages until we either map and fault on one of them or we use 161 * userptr and send a chunk of it off to the GPU. 
162 */ 163 if (ftruncate(table->fd, BLOCK_POOL_MEMFD_SIZE) == -1) { 164 result = vk_error(VK_ERROR_INITIALIZATION_FAILED); 165 goto fail_fd; 166 } 167 168 if (!u_vector_init(&table->cleanups, 169 round_to_power_of_two(sizeof(struct anv_state_table_cleanup)), 170 128)) { 171 result = vk_error(VK_ERROR_INITIALIZATION_FAILED); 172 goto fail_fd; 173 } 174 175 table->state.next = 0; 176 table->state.end = 0; 177 table->size = 0; 178 179 uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE; 180 result = anv_state_table_expand_range(table, initial_size); 181 if (result != VK_SUCCESS) 182 goto fail_cleanups; 183 184 return VK_SUCCESS; 185 186 fail_cleanups: 187 u_vector_finish(&table->cleanups); 188 fail_fd: 189 close(table->fd); 190 191 return result; 192} 193 194static VkResult 195anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) 196{ 197 void *map; 198 struct anv_state_table_cleanup *cleanup; 199 200 /* Assert that we only ever grow the pool */ 201 assert(size >= table->state.end); 202 203 /* Make sure that we don't go outside the bounds of the memfd */ 204 if (size > BLOCK_POOL_MEMFD_SIZE) 205 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); 206 207 cleanup = u_vector_add(&table->cleanups); 208 if (!cleanup) 209 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); 210 211 *cleanup = ANV_STATE_TABLE_CLEANUP_INIT; 212 213 /* Just leak the old map until we destroy the pool. We can't munmap it 214 * without races or imposing locking on the block allocate fast path. On 215 * the whole the leaked maps adds up to less than the size of the 216 * current map. MAP_POPULATE seems like the right thing to do, but we 217 * should try to get some numbers. 
218 */ 219 map = mmap(NULL, size, PROT_READ | PROT_WRITE, 220 MAP_SHARED | MAP_POPULATE, table->fd, 0); 221 if (map == MAP_FAILED) { 222 return vk_errorf(table->device->instance, table->device, 223 VK_ERROR_OUT_OF_HOST_MEMORY, "mmap failed: %m"); 224 } 225 226 cleanup->map = map; 227 cleanup->size = size; 228 229 table->map = map; 230 table->size = size; 231 232 return VK_SUCCESS; 233} 234 235static VkResult 236anv_state_table_grow(struct anv_state_table *table) 237{ 238 VkResult result = VK_SUCCESS; 239 240 uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE, 241 PAGE_SIZE); 242 uint32_t old_size = table->size; 243 244 /* The block pool is always initialized to a nonzero size and this function 245 * is always called after initialization. 246 */ 247 assert(old_size > 0); 248 249 uint32_t required = MAX2(used, old_size); 250 if (used * 2 <= required) { 251 /* If we're in this case then this isn't the firsta allocation and we 252 * already have enough space on both sides to hold double what we 253 * have allocated. There's nothing for us to do. 
254 */ 255 goto done; 256 } 257 258 uint32_t size = old_size * 2; 259 while (size < required) 260 size *= 2; 261 262 assert(size > table->size); 263 264 result = anv_state_table_expand_range(table, size); 265 266 done: 267 return result; 268} 269 270void 271anv_state_table_finish(struct anv_state_table *table) 272{ 273 struct anv_state_table_cleanup *cleanup; 274 275 u_vector_foreach(cleanup, &table->cleanups) { 276 if (cleanup->map) 277 munmap(cleanup->map, cleanup->size); 278 } 279 280 u_vector_finish(&table->cleanups); 281 282 close(table->fd); 283} 284 285VkResult 286anv_state_table_add(struct anv_state_table *table, uint32_t *idx, 287 uint32_t count) 288{ 289 struct anv_block_state state, old, new; 290 VkResult result; 291 292 assert(idx); 293 294 while(1) { 295 state.u64 = __sync_fetch_and_add(&table->state.u64, count); 296 if (state.next + count <= state.end) { 297 assert(table->map); 298 struct anv_free_entry *entry = &table->map[state.next]; 299 for (int i = 0; i < count; i++) { 300 entry[i].state.idx = state.next + i; 301 } 302 *idx = state.next; 303 return VK_SUCCESS; 304 } else if (state.next <= state.end) { 305 /* We allocated the first block outside the pool so we have to grow 306 * the pool. pool_state->next acts a mutex: threads who try to 307 * allocate now will get block indexes above the current limit and 308 * hit futex_wait below. 
309 */ 310 new.next = state.next + count; 311 do { 312 result = anv_state_table_grow(table); 313 if (result != VK_SUCCESS) 314 return result; 315 new.end = table->size / ANV_STATE_ENTRY_SIZE; 316 } while (new.end < new.next); 317 318 old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64); 319 if (old.next != state.next) 320 futex_wake(&table->state.end, INT_MAX); 321 } else { 322 futex_wait(&table->state.end, state.end, NULL); 323 continue; 324 } 325 } 326} 327 328void 329anv_free_list_push(union anv_free_list *list, 330 struct anv_state_table *table, 331 uint32_t first, uint32_t count) 332{ 333 union anv_free_list current, old, new; 334 uint32_t last = first; 335 336 for (uint32_t i = 1; i < count; i++, last++) 337 table->map[last].next = last + 1; 338 339 old = *list; 340 do { 341 current = old; 342 table->map[last].next = current.offset; 343 new.offset = first; 344 new.count = current.count + 1; 345 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); 346 } while (old.u64 != current.u64); 347} 348 349struct anv_state * 350anv_free_list_pop(union anv_free_list *list, 351 struct anv_state_table *table) 352{ 353 union anv_free_list current, new, old; 354 355 current.u64 = list->u64; 356 while (current.offset != EMPTY) { 357 __sync_synchronize(); 358 new.offset = table->map[current.offset].next; 359 new.count = current.count + 1; 360 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); 361 if (old.u64 == current.u64) { 362 struct anv_free_entry *entry = &table->map[current.offset]; 363 return &entry->state; 364 } 365 current = old; 366 } 367 368 return NULL; 369} 370 371/* All pointers in the ptr_free_list are assumed to be page-aligned. This 372 * means that the bottom 12 bits should all be zero. 
373 */ 374#define PFL_COUNT(x) ((uintptr_t)(x) & 0xfff) 375#define PFL_PTR(x) ((void *)((uintptr_t)(x) & ~(uintptr_t)0xfff)) 376#define PFL_PACK(ptr, count) ({ \ 377 (void *)(((uintptr_t)(ptr) & ~(uintptr_t)0xfff) | ((count) & 0xfff)); \ 378}) 379 380static bool 381anv_ptr_free_list_pop(void **list, void **elem) 382{ 383 void *current = *list; 384 while (PFL_PTR(current) != NULL) { 385 void **next_ptr = PFL_PTR(current); 386 void *new_ptr = VG_NOACCESS_READ(next_ptr); 387 unsigned new_count = PFL_COUNT(current) + 1; 388 void *new = PFL_PACK(new_ptr, new_count); 389 void *old = __sync_val_compare_and_swap(list, current, new); 390 if (old == current) { 391 *elem = PFL_PTR(current); 392 return true; 393 } 394 current = old; 395 } 396 397 return false; 398} 399 400static void 401anv_ptr_free_list_push(void **list, void *elem) 402{ 403 void *old, *current; 404 void **next_ptr = elem; 405 406 /* The pointer-based free list requires that the pointer be 407 * page-aligned. This is because we use the bottom 12 bits of the 408 * pointer to store a counter to solve the ABA concurrency problem. 
409 */ 410 assert(((uintptr_t)elem & 0xfff) == 0); 411 412 old = *list; 413 do { 414 current = old; 415 VG_NOACCESS_WRITE(next_ptr, PFL_PTR(current)); 416 unsigned new_count = PFL_COUNT(current) + 1; 417 void *new = PFL_PACK(elem, new_count); 418 old = __sync_val_compare_and_swap(list, current, new); 419 } while (old != current); 420} 421 422static VkResult 423anv_block_pool_expand_range(struct anv_block_pool *pool, 424 uint32_t center_bo_offset, uint32_t size); 425 426VkResult 427anv_block_pool_init(struct anv_block_pool *pool, 428 struct anv_device *device, 429 uint64_t start_address, 430 uint32_t initial_size, 431 uint64_t bo_flags) 432{ 433 VkResult result; 434 435 pool->device = device; 436 pool->bo_flags = bo_flags; 437 pool->nbos = 0; 438 pool->size = 0; 439 pool->center_bo_offset = 0; 440 pool->start_address = gen_canonical_address(start_address); 441 pool->map = NULL; 442 443 /* This pointer will always point to the first BO in the list */ 444 pool->bo = &pool->bos[0]; 445 446 anv_bo_init(pool->bo, 0, 0); 447 448 if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) { 449 pool->fd = memfd_create("block pool", MFD_CLOEXEC); 450 if (pool->fd == -1) 451 return vk_error(VK_ERROR_INITIALIZATION_FAILED); 452 453 /* Just make it 2GB up-front. The Linux kernel won't actually back it 454 * with pages until we either map and fault on one of them or we use 455 * userptr and send a chunk of it off to the GPU. 
456 */ 457 if (ftruncate(pool->fd, BLOCK_POOL_MEMFD_SIZE) == -1) { 458 result = vk_error(VK_ERROR_INITIALIZATION_FAILED); 459 goto fail_fd; 460 } 461 } else { 462 pool->fd = -1; 463 } 464 465 if (!u_vector_init(&pool->mmap_cleanups, 466 round_to_power_of_two(sizeof(struct anv_mmap_cleanup)), 467 128)) { 468 result = vk_error(VK_ERROR_INITIALIZATION_FAILED); 469 goto fail_fd; 470 } 471 472 pool->state.next = 0; 473 pool->state.end = 0; 474 pool->back_state.next = 0; 475 pool->back_state.end = 0; 476 477 result = anv_block_pool_expand_range(pool, 0, initial_size); 478 if (result != VK_SUCCESS) 479 goto fail_mmap_cleanups; 480 481 /* Make the entire pool available in the front of the pool. If back 482 * allocation needs to use this space, the "ends" will be re-arranged. 483 */ 484 pool->state.end = pool->size; 485 486 return VK_SUCCESS; 487 488 fail_mmap_cleanups: 489 u_vector_finish(&pool->mmap_cleanups); 490 fail_fd: 491 if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) 492 close(pool->fd); 493 494 return result; 495} 496 497void 498anv_block_pool_finish(struct anv_block_pool *pool) 499{ 500 struct anv_mmap_cleanup *cleanup; 501 const bool use_softpin = !!(pool->bo_flags & EXEC_OBJECT_PINNED); 502 503 u_vector_foreach(cleanup, &pool->mmap_cleanups) { 504 if (use_softpin) 505 anv_gem_munmap(cleanup->map, cleanup->size); 506 else 507 munmap(cleanup->map, cleanup->size); 508 509 if (cleanup->gem_handle) 510 anv_gem_close(pool->device, cleanup->gem_handle); 511 } 512 513 u_vector_finish(&pool->mmap_cleanups); 514 if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) 515 close(pool->fd); 516} 517 518static VkResult 519anv_block_pool_expand_range(struct anv_block_pool *pool, 520 uint32_t center_bo_offset, uint32_t size) 521{ 522 void *map; 523 uint32_t gem_handle; 524 struct anv_mmap_cleanup *cleanup; 525 const bool use_softpin = !!(pool->bo_flags & EXEC_OBJECT_PINNED); 526 527 /* Assert that we only ever grow the pool */ 528 assert(center_bo_offset >= pool->back_state.end); 529 
assert(size - center_bo_offset >= pool->state.end); 530 531 /* Assert that we don't go outside the bounds of the memfd */ 532 assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER); 533 assert(use_softpin || 534 size - center_bo_offset <= 535 BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER); 536 537 cleanup = u_vector_add(&pool->mmap_cleanups); 538 if (!cleanup) 539 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); 540 541 *cleanup = ANV_MMAP_CLEANUP_INIT; 542 543 uint32_t newbo_size = size - pool->size; 544 if (use_softpin) { 545 gem_handle = anv_gem_create(pool->device, newbo_size); 546 map = anv_gem_mmap(pool->device, gem_handle, 0, newbo_size, 0); 547 if (map == MAP_FAILED) 548 return vk_errorf(pool->device->instance, pool->device, 549 VK_ERROR_MEMORY_MAP_FAILED, "gem mmap failed: %m"); 550 assert(center_bo_offset == 0); 551 } else { 552 /* Just leak the old map until we destroy the pool. We can't munmap it 553 * without races or imposing locking on the block allocate fast path. On 554 * the whole the leaked maps adds up to less than the size of the 555 * current map. MAP_POPULATE seems like the right thing to do, but we 556 * should try to get some numbers. 557 */ 558 map = mmap(NULL, size, PROT_READ | PROT_WRITE, 559 MAP_SHARED | MAP_POPULATE, pool->fd, 560 BLOCK_POOL_MEMFD_CENTER - center_bo_offset); 561 if (map == MAP_FAILED) 562 return vk_errorf(pool->device->instance, pool->device, 563 VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m"); 564 565 /* Now that we mapped the new memory, we can write the new 566 * center_bo_offset back into pool and update pool->map. */ 567 pool->center_bo_offset = center_bo_offset; 568 pool->map = map + center_bo_offset; 569 gem_handle = anv_gem_userptr(pool->device, map, size); 570 if (gem_handle == 0) { 571 munmap(map, size); 572 return vk_errorf(pool->device->instance, pool->device, 573 VK_ERROR_TOO_MANY_OBJECTS, "userptr failed: %m"); 574 } 575 } 576 577 cleanup->map = map; 578 cleanup->size = use_softpin ? 
newbo_size : size; 579 cleanup->gem_handle = gem_handle; 580 581 /* Regular objects are created I915_CACHING_CACHED on LLC platforms and 582 * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are 583 * always created as I915_CACHING_CACHED, which on non-LLC means 584 * snooped. 585 * 586 * On platforms that support softpin, we are not going to use userptr 587 * anymore, but we still want to rely on the snooped states. So make sure 588 * everything is set to I915_CACHING_CACHED. 589 */ 590 if (!pool->device->info.has_llc) 591 anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_CACHED); 592 593 /* For block pool BOs we have to be a bit careful about where we place them 594 * in the GTT. There are two documented workarounds for state base address 595 * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset 596 * which state that those two base addresses do not support 48-bit 597 * addresses and need to be placed in the bottom 32-bit range. 598 * Unfortunately, this is not quite accurate. 599 * 600 * The real problem is that we always set the size of our state pools in 601 * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most 602 * likely significantly smaller. We do this because we do not no at the 603 * time we emit STATE_BASE_ADDRESS whether or not we will need to expand 604 * the pool during command buffer building so we don't actually have a 605 * valid final size. If the address + size, as seen by STATE_BASE_ADDRESS 606 * overflows 48 bits, the GPU appears to treat all accesses to the buffer 607 * as being out of bounds and returns zero. For dynamic state, this 608 * usually just leads to rendering corruptions, but shaders that are all 609 * zero hang the GPU immediately. 610 * 611 * The easiest solution to do is exactly what the bogus workarounds say to 612 * do: restrict these buffers to 32-bit addresses. 
We could also pin the 613 * BO to some particular location of our choosing, but that's significantly 614 * more work than just not setting a flag. So, we explicitly DO NOT set 615 * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the 616 * hard work for us. 617 */ 618 struct anv_bo *bo; 619 uint32_t bo_size; 620 uint64_t bo_offset; 621 622 assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS); 623 624 if (use_softpin) { 625 /* With softpin, we add a new BO to the pool, and set its offset to right 626 * where the previous BO ends (the end of the pool). 627 */ 628 bo = &pool->bos[pool->nbos++]; 629 bo_size = newbo_size; 630 bo_offset = pool->start_address + pool->size; 631 } else { 632 /* Without softpin, we just need one BO, and we already have a pointer to 633 * it. Simply "allocate" it from our array if we didn't do it before. 634 * The offset doesn't matter since we are not pinning the BO anyway. 635 */ 636 if (pool->nbos == 0) 637 pool->nbos++; 638 bo = pool->bo; 639 bo_size = size; 640 bo_offset = 0; 641 } 642 643 anv_bo_init(bo, gem_handle, bo_size); 644 bo->offset = bo_offset; 645 bo->flags = pool->bo_flags; 646 bo->map = map; 647 pool->size = size; 648 649 return VK_SUCCESS; 650} 651 652static struct anv_bo * 653anv_block_pool_get_bo(struct anv_block_pool *pool, int32_t *offset) 654{ 655 struct anv_bo *bo, *bo_found = NULL; 656 int32_t cur_offset = 0; 657 658 assert(offset); 659 660 if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) 661 return pool->bo; 662 663 anv_block_pool_foreach_bo(bo, pool) { 664 if (*offset < cur_offset + bo->size) { 665 bo_found = bo; 666 break; 667 } 668 cur_offset += bo->size; 669 } 670 671 assert(bo_found != NULL); 672 *offset -= cur_offset; 673 674 return bo_found; 675} 676 677/** Returns current memory map of the block pool. 678 * 679 * The returned pointer points to the map for the memory at the specified 680 * offset. 
The offset parameter is relative to the "center" of the block pool 681 * rather than the start of the block pool BO map. 682 */ 683void* 684anv_block_pool_map(struct anv_block_pool *pool, int32_t offset) 685{ 686 if (pool->bo_flags & EXEC_OBJECT_PINNED) { 687 struct anv_bo *bo = anv_block_pool_get_bo(pool, &offset); 688 return bo->map + offset; 689 } else { 690 return pool->map + offset; 691 } 692} 693 694/** Grows and re-centers the block pool. 695 * 696 * We grow the block pool in one or both directions in such a way that the 697 * following conditions are met: 698 * 699 * 1) The size of the entire pool is always a power of two. 700 * 701 * 2) The pool only grows on both ends. Neither end can get 702 * shortened. 703 * 704 * 3) At the end of the allocation, we have about twice as much space 705 * allocated for each end as we have used. This way the pool doesn't 706 * grow too far in one direction or the other. 707 * 708 * 4) If the _alloc_back() has never been called, then the back portion of 709 * the pool retains a size of zero. (This makes it easier for users of 710 * the block pool that only want a one-sided pool.) 711 * 712 * 5) We have enough space allocated for at least one more block in 713 * whichever side `state` points to. 714 * 715 * 6) The center of the pool is always aligned to both the block_size of 716 * the pool and a 4K CPU page. 717 */ 718static uint32_t 719anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state) 720{ 721 VkResult result = VK_SUCCESS; 722 723 pthread_mutex_lock(&pool->device->mutex); 724 725 assert(state == &pool->state || state == &pool->back_state); 726 727 /* Gather a little usage information on the pool. Since we may have 728 * threadsd waiting in queue to get some storage while we resize, it's 729 * actually possible that total_used will be larger than old_size. 
In 730 * particular, block_pool_alloc() increments state->next prior to 731 * calling block_pool_grow, so this ensures that we get enough space for 732 * which ever side tries to grow the pool. 733 * 734 * We align to a page size because it makes it easier to do our 735 * calculations later in such a way that we state page-aigned. 736 */ 737 uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE); 738 uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE); 739 uint32_t total_used = front_used + back_used; 740 741 assert(state == &pool->state || back_used > 0); 742 743 uint32_t old_size = pool->size; 744 745 /* The block pool is always initialized to a nonzero size and this function 746 * is always called after initialization. 747 */ 748 assert(old_size > 0); 749 750 /* The back_used and front_used may actually be smaller than the actual 751 * requirement because they are based on the next pointers which are 752 * updated prior to calling this function. 753 */ 754 uint32_t back_required = MAX2(back_used, pool->center_bo_offset); 755 uint32_t front_required = MAX2(front_used, old_size - pool->center_bo_offset); 756 757 if (back_used * 2 <= back_required && front_used * 2 <= front_required) { 758 /* If we're in this case then this isn't the firsta allocation and we 759 * already have enough space on both sides to hold double what we 760 * have allocated. There's nothing for us to do. 761 */ 762 goto done; 763 } 764 765 uint32_t size = old_size * 2; 766 while (size < back_required + front_required) 767 size *= 2; 768 769 assert(size > pool->size); 770 771 /* We compute a new center_bo_offset such that, when we double the size 772 * of the pool, we maintain the ratio of how much is used by each side. 773 * This way things should remain more-or-less balanced. 774 */ 775 uint32_t center_bo_offset; 776 if (back_used == 0) { 777 /* If we're in this case then we have never called alloc_back(). 
In 778 * this case, we want keep the offset at 0 to make things as simple 779 * as possible for users that don't care about back allocations. 780 */ 781 center_bo_offset = 0; 782 } else { 783 /* Try to "center" the allocation based on how much is currently in 784 * use on each side of the center line. 785 */ 786 center_bo_offset = ((uint64_t)size * back_used) / total_used; 787 788 /* Align down to a multiple of the page size */ 789 center_bo_offset &= ~(PAGE_SIZE - 1); 790 791 assert(center_bo_offset >= back_used); 792 793 /* Make sure we don't shrink the back end of the pool */ 794 if (center_bo_offset < back_required) 795 center_bo_offset = back_required; 796 797 /* Make sure that we don't shrink the front end of the pool */ 798 if (size - center_bo_offset < front_required) 799 center_bo_offset = size - front_required; 800 } 801 802 assert(center_bo_offset % PAGE_SIZE == 0); 803 804 result = anv_block_pool_expand_range(pool, center_bo_offset, size); 805 806 pool->bo->flags = pool->bo_flags; 807 808done: 809 pthread_mutex_unlock(&pool->device->mutex); 810 811 if (result == VK_SUCCESS) { 812 /* Return the appropriate new size. This function never actually 813 * updates state->next. Instead, we let the caller do that because it 814 * needs to do so in order to maintain its concurrency model. 
815 */ 816 if (state == &pool->state) { 817 return pool->size - pool->center_bo_offset; 818 } else { 819 assert(pool->center_bo_offset > 0); 820 return pool->center_bo_offset; 821 } 822 } else { 823 return 0; 824 } 825} 826 827static uint32_t 828anv_block_pool_alloc_new(struct anv_block_pool *pool, 829 struct anv_block_state *pool_state, 830 uint32_t block_size, uint32_t *padding) 831{ 832 struct anv_block_state state, old, new; 833 834 /* Most allocations won't generate any padding */ 835 if (padding) 836 *padding = 0; 837 838 while (1) { 839 state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size); 840 if (state.next + block_size <= state.end) { 841 return state.next; 842 } else if (state.next <= state.end) { 843 if (pool->bo_flags & EXEC_OBJECT_PINNED && state.next < state.end) { 844 /* We need to grow the block pool, but still have some leftover 845 * space that can't be used by that particular allocation. So we 846 * add that as a "padding", and return it. 847 */ 848 uint32_t leftover = state.end - state.next; 849 850 /* If there is some leftover space in the pool, the caller must 851 * deal with it. 852 */ 853 assert(leftover == 0 || padding); 854 if (padding) 855 *padding = leftover; 856 state.next += leftover; 857 } 858 859 /* We allocated the first block outside the pool so we have to grow 860 * the pool. pool_state->next acts a mutex: threads who try to 861 * allocate now will get block indexes above the current limit and 862 * hit futex_wait below. 
863 */ 864 new.next = state.next + block_size; 865 do { 866 new.end = anv_block_pool_grow(pool, pool_state); 867 } while (new.end < new.next); 868 869 old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64); 870 if (old.next != state.next) 871 futex_wake(&pool_state->end, INT_MAX); 872 return state.next; 873 } else { 874 futex_wait(&pool_state->end, state.end, NULL); 875 continue; 876 } 877 } 878} 879 880int32_t 881anv_block_pool_alloc(struct anv_block_pool *pool, 882 uint32_t block_size, uint32_t *padding) 883{ 884 uint32_t offset; 885 886 offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding); 887 888 return offset; 889} 890 891/* Allocates a block out of the back of the block pool. 892 * 893 * This will allocated a block earlier than the "start" of the block pool. 894 * The offsets returned from this function will be negative but will still 895 * be correct relative to the block pool's map pointer. 896 * 897 * If you ever use anv_block_pool_alloc_back, then you will have to do 898 * gymnastics with the block pool's BO when doing relocations. 899 */ 900int32_t 901anv_block_pool_alloc_back(struct anv_block_pool *pool, 902 uint32_t block_size) 903{ 904 int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state, 905 block_size, NULL); 906 907 /* The offset we get out of anv_block_pool_alloc_new() is actually the 908 * number of bytes downwards from the middle to the end of the block. 909 * We need to turn it into a (negative) offset from the middle to the 910 * start of the block. 
    */
   assert(offset >= 0);
   return -(offset + block_size);
}

/* Initialize a state pool that sub-allocates states out of an underlying
 * block pool sized at 16 blocks.  Returns the result of the block-pool /
 * state-table initialization; on failure nothing is left allocated.
 */
VkResult
anv_state_pool_init(struct anv_state_pool *pool,
                    struct anv_device *device,
                    uint64_t start_address,
                    uint32_t block_size,
                    uint64_t bo_flags)
{
   VkResult result = anv_block_pool_init(&pool->block_pool, device,
                                         start_address,
                                         block_size * 16,
                                         bo_flags);
   if (result != VK_SUCCESS)
      return result;

   result = anv_state_table_init(&pool->table, device, 64);
   if (result != VK_SUCCESS) {
      /* Unwind the block pool so the caller gets a clean failure. */
      anv_block_pool_finish(&pool->block_pool);
      return result;
   }

   /* The bucket math below (power-of-two size classes) requires this. */
   assert(util_is_power_of_two_or_zero(block_size));
   pool->block_size = block_size;
   pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
      pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
      pool->buckets[i].block.next = 0;
      pool->buckets[i].block.end = 0;
   }
   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));

   return VK_SUCCESS;
}

/* Tear down a state pool initialized with anv_state_pool_init(). */
void
anv_state_pool_finish(struct anv_state_pool *pool)
{
   VG(VALGRIND_DESTROY_MEMPOOL(pool));
   anv_state_table_finish(&pool->table);
   anv_block_pool_finish(&pool->block_pool);
}

/* Carve a state_size-sized state out of the fixed-size pool, pulling a fresh
 * block from the block pool when the current one is exhausted.  Lock-free:
 * the thread whose fetch-and-add lands exactly on block.end is the one that
 * refills the block; everyone who overshoots waits on the futex and retries.
 */
static uint32_t
anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
                                    struct anv_block_pool *block_pool,
                                    uint32_t state_size,
                                    uint32_t block_size,
                                    uint32_t *padding)
{
   struct anv_block_state block, old, new;
   uint32_t offset;

   /* We don't always use anv_block_pool_alloc(), which would set *padding to
    * zero for us. So if we have a pointer to padding, we must zero it out
    * ourselves here, to make sure we always return some sensible value.
    */
   if (padding)
      *padding = 0;

   /* If our state is large, we don't need any sub-allocation from a block.
    * Instead, we just grab whole (potentially large) blocks.
    */
   if (state_size >= block_size)
      return anv_block_pool_alloc(block_pool, state_size, padding);

 restart:
   block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);

   if (block.next < block.end) {
      /* Common case: the state fits in the current block. */
      return block.next;
   } else if (block.next == block.end) {
      /* We hit the end exactly, so it's our job to grab a new block. */
      offset = anv_block_pool_alloc(block_pool, block_size, padding);
      new.next = offset + state_size;
      new.end = offset + block_size;
      old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
      if (old.next != block.next)
         futex_wake(&pool->block.end, INT_MAX);
      return offset;
   } else {
      /* Someone else is refilling the block; wait and try again. */
      futex_wait(&pool->block.end, block.end, NULL);
      goto restart;
   }
}

/* Map a state size (in bytes) to its power-of-two bucket index.  Sizes below
 * the minimum size class share bucket 0.
 */
static uint32_t
anv_state_pool_get_bucket(uint32_t size)
{
   unsigned size_log2 = ilog2_round_up(size);
   assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
   if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
      size_log2 = ANV_MIN_STATE_SIZE_LOG2;
   return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
}

/* Inverse of anv_state_pool_get_bucket(): bucket index -> size in bytes. */
static uint32_t
anv_state_pool_get_bucket_size(uint32_t bucket)
{
   uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
   return 1 << size_log2;
}

/** Helper to push a chunk into the state table.
 *
 * It creates 'count' entries into the state table and updates their sizes,
 * offsets and maps, also pushing them as "free" states.
 */
static void
anv_state_pool_return_blocks(struct anv_state_pool *pool,
                             uint32_t chunk_offset, uint32_t count,
                             uint32_t block_size)
{
   /* Disallow returning 0 chunks */
   assert(count != 0);

   /* Make sure we always return chunks aligned to the block_size */
   assert(chunk_offset % block_size == 0);

   uint32_t st_idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
   assert(result == VK_SUCCESS);
   for (int i = 0; i < count; i++) {
      /* update states that were added back to the state table */
      struct anv_state *state_i = anv_state_table_get(&pool->table,
                                                      st_idx + i);
      state_i->alloc_size = block_size;
      state_i->offset = chunk_offset + block_size * i;
      state_i->map = anv_block_pool_map(&pool->block_pool, state_i->offset);
   }

   uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
   anv_free_list_push(&pool->buckets[block_bucket].free_list,
                      &pool->table, st_idx, count);
}

/** Returns a chunk of memory back to the state pool.
 *
 * Do a two-level split. If chunk_size is bigger than divisor
 * (pool->block_size), we return as many divisor sized blocks as we can, from
 * the end of the chunk.
 *
 * The remaining is then split into smaller blocks (starting at small_size if
 * it is non-zero), with larger blocks always being taken from the end of the
 * chunk.
 */
static void
anv_state_pool_return_chunk(struct anv_state_pool *pool,
                            uint32_t chunk_offset, uint32_t chunk_size,
                            uint32_t small_size)
{
   uint32_t divisor = pool->block_size;
   uint32_t nblocks = chunk_size / divisor;
   uint32_t rest = chunk_size - nblocks * divisor;

   if (nblocks > 0) {
      /* First return divisor aligned and sized chunks. We start returning
       * larger blocks from the end of the chunk, since they should already be
       * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
       * aligned chunks.
       */
      uint32_t offset = chunk_offset + rest;
      anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
   }

   chunk_size = rest;
   divisor /= 2;

   if (small_size > 0 && small_size < divisor)
      divisor = small_size;

   uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;

   /* Just as before, return larger divisor aligned blocks from the end of the
    * chunk first.
    */
   while (chunk_size > 0 && divisor >= min_size) {
      nblocks = chunk_size / divisor;
      rest = chunk_size - nblocks * divisor;
      if (nblocks > 0) {
         anv_state_pool_return_blocks(pool, chunk_offset + rest,
                                      nblocks, divisor);
         chunk_size = rest;
      }
      divisor /= 2;
   }
}

/* Core state allocation: free list first, then splitting a larger free
 * chunk, then falling back to carving a fresh block from the block pool.
 * Valgrind bookkeeping is done by the callers.
 */
static struct anv_state
anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
                           uint32_t size, uint32_t align)
{
   uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));

   struct anv_state *state;
   uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
   int32_t offset;

   /* Try free list first. */
   state = anv_free_list_pop(&pool->buckets[bucket].free_list,
                             &pool->table);
   if (state) {
      assert(state->offset >= 0);
      goto done;
   }

   /* Try to grab a chunk from some larger bucket and split it up */
   for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
      state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
      if (state) {
         unsigned chunk_size = anv_state_pool_get_bucket_size(b);
         int32_t chunk_offset = state->offset;

         /* First lets update the state we got to its new size. offset and map
          * remain the same.
          */
         state->alloc_size = alloc_size;

         /* Now return the unused part of the chunk back to the pool as free
          * blocks
          *
          * There are a couple of options as to what we do with it:
          *
          *    1) We could fully split the chunk into state.alloc_size sized
          *       pieces.  However, this would mean that allocating a 16B
          *       state could potentially split a 2MB chunk into 512K smaller
          *       chunks.  This would lead to unnecessary fragmentation.
          *
          *    2) The classic "buddy allocator" method would have us split the
          *       chunk in half and return one half.  Then we would split the
          *       remaining half in half and return one half, and repeat as
          *       needed until we get down to the size we want.  However, if
          *       you are allocating a bunch of the same size state (which is
          *       the common case), this means that every other allocation has
          *       to go up a level and every fourth goes up two levels, etc.
          *       This is not nearly as efficient as it could be if we did a
          *       little more work up-front.
          *
          *    3) Split the difference between (1) and (2) by doing a
          *       two-level split.  If it's bigger than some fixed block_size,
          *       we split it into block_size sized chunks and return all but
          *       one of them.  Then we split what remains into
          *       state.alloc_size sized chunks and return them.
          *
          * We choose something close to option (3), which is implemented with
          * anv_state_pool_return_chunk(). That is done by returning the
          * remaining of the chunk, with alloc_size as a hint of the size that
          * we want the smaller chunk split into.
          */
         anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
                                     chunk_size - alloc_size, alloc_size);
         goto done;
      }
   }

   uint32_t padding;
   offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
                                                &pool->block_pool,
                                                alloc_size,
                                                pool->block_size,
                                                &padding);
   /* Every time we allocate a new state, add it to the state pool */
   uint32_t idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
   assert(result == VK_SUCCESS);

   state = anv_state_table_get(&pool->table, idx);
   state->offset = offset;
   state->alloc_size = alloc_size;
   state->map = anv_block_pool_map(&pool->block_pool, offset);

   if (padding > 0) {
      /* The allocation needed padding; hand the padding bytes back to the
       * pool as free states (small_size hint of 0 = let the pool decide).
       */
      uint32_t return_offset = offset - padding;
      anv_state_pool_return_chunk(pool, return_offset, padding, 0);
   }

done:
   return *state;
}

/* Public entry point: size 0 yields ANV_STATE_NULL; otherwise allocate and
 * register the allocation with Valgrind.
 */
struct anv_state
anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
{
   if (size == 0)
      return ANV_STATE_NULL;

   struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
   VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
   return state;
}

/* Allocate one block-sized state from the back (negative-offset) side of the
 * state pool, reusing a previously freed back allocation when possible.
 */
struct anv_state
anv_state_pool_alloc_back(struct anv_state_pool *pool)
{
   struct anv_state *state;
   uint32_t alloc_size = pool->block_size;

   state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table);
   if (state) {
      /* Back allocations always carry negative offsets. */
      assert(state->offset < 0);
      goto done;
   }

   int32_t offset;
   offset = anv_block_pool_alloc_back(&pool->block_pool,
                                      pool->block_size);
   uint32_t idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
   assert(result == VK_SUCCESS);

   state = anv_state_table_get(&pool->table, idx);
   state->offset = offset;
   state->alloc_size = alloc_size;
   state->map = anv_block_pool_map(&pool->block_pool, state->offset);

done:
   VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size));
   return *state;
}

/* Return a state to its size bucket's free list (or to the dedicated
 * back-allocation free list for negative offsets).  No Valgrind bookkeeping.
 */
static void
anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
{
   assert(util_is_power_of_two_or_zero(state.alloc_size));
   unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);

   if (state.offset < 0) {
      /* Back allocations are always exactly one block. */
      assert(state.alloc_size == pool->block_size);
      anv_free_list_push(&pool->back_alloc_free_list,
                         &pool->table, state.idx, 1);
   } else {
      anv_free_list_push(&pool->buckets[bucket].free_list,
                         &pool->table, state.idx, 1);
   }
}

/* Public free: ANV_STATE_NULL (alloc_size == 0) is a no-op. */
void
anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
{
   if (state.alloc_size == 0)
      return;

   VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
   anv_state_pool_free_no_vg(pool, state);
}

/* Header stored at the start of every block a state stream pulls from the
 * state pool, forming a singly linked list used to free them all at once.
 */
struct anv_state_stream_block {
   struct anv_state block;

   /* The next block */
   struct anv_state_stream_block *next;

#ifdef HAVE_VALGRIND
   /* A pointer to the first user-allocated thing in this block.  This is
    * what valgrind sees as the start of the block.
    */
   void *_vg_ptr;
#endif
};

/* The state stream allocator is a one-shot, single threaded allocator for
 * variable sized blocks.  We use it for allocating dynamic state.
 */
void
anv_state_stream_init(struct anv_state_stream *stream,
                      struct anv_state_pool *state_pool,
                      uint32_t block_size)
{
   stream->state_pool = state_pool;
   stream->block_size = block_size;

   stream->block = ANV_STATE_NULL;

   stream->block_list = NULL;

   /* Ensure that next + whatever > block_size.  This way the first call to
    * state_stream_alloc fetches a new block.
    */
   stream->next = block_size;

   VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
}

/* Walk the block list and return every block to the state pool. */
void
anv_state_stream_finish(struct anv_state_stream *stream)
{
   struct anv_state_stream_block *next = stream->block_list;
   while (next != NULL) {
      /* Copy the header out before freeing: the block memory is marked
       * NOACCESS under Valgrind and goes away once freed.
       */
      struct anv_state_stream_block sb = VG_NOACCESS_READ(next);
      VG(VALGRIND_MEMPOOL_FREE(stream, sb._vg_ptr));
      VG(VALGRIND_MAKE_MEM_UNDEFINED(next, stream->block_size));
      anv_state_pool_free_no_vg(stream->state_pool, sb.block);
      next = sb.next;
   }

   VG(VALGRIND_DESTROY_MEMPOOL(stream));
}

/* Bump-allocate `size` bytes (aligned to `alignment`) from the stream's
 * current block, fetching a new page-aligned block from the state pool when
 * the current one can't fit the request.
 */
struct anv_state
anv_state_stream_alloc(struct anv_state_stream *stream,
                       uint32_t size, uint32_t alignment)
{
   if (size == 0)
      return ANV_STATE_NULL;

   assert(alignment <= PAGE_SIZE);

   uint32_t offset = align_u32(stream->next, alignment);
   if (offset + size > stream->block.alloc_size) {
      uint32_t block_size = stream->block_size;
      if (block_size < size)
         block_size = round_to_power_of_two(size);

      stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
                                                 block_size, PAGE_SIZE);

      /* Thread the new block onto the stream's block list via the header
       * stored at its start.
       */
      struct anv_state_stream_block *sb = stream->block.map;
      VG_NOACCESS_WRITE(&sb->block, stream->block);
      VG_NOACCESS_WRITE(&sb->next, stream->block_list);
      stream->block_list = sb;
      VG(VG_NOACCESS_WRITE(&sb->_vg_ptr, NULL));

      VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, stream->block_size));

      /* Reset back to the start plus space for the header */
      stream->next = sizeof(*sb);

      offset = align_u32(stream->next, alignment);
      assert(offset + size <= stream->block.alloc_size);
   }

   struct anv_state state = stream->block;
   state.offset += offset;
   state.alloc_size = size;
   state.map += offset;

   stream->next = offset + size;

#ifdef HAVE_VALGRIND
   struct anv_state_stream_block *sb = stream->block_list;
   void *vg_ptr =
VG_NOACCESS_READ(&sb->_vg_ptr); 1354 if (vg_ptr == NULL) { 1355 vg_ptr = state.map; 1356 VG_NOACCESS_WRITE(&sb->_vg_ptr, vg_ptr); 1357 VALGRIND_MEMPOOL_ALLOC(stream, vg_ptr, size); 1358 } else { 1359 void *state_end = state.map + state.alloc_size; 1360 /* This only updates the mempool. The newly allocated chunk is still 1361 * marked as NOACCESS. */ 1362 VALGRIND_MEMPOOL_CHANGE(stream, vg_ptr, vg_ptr, state_end - vg_ptr); 1363 /* Mark the newly allocated chunk as undefined */ 1364 VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size); 1365 } 1366#endif 1367 1368 return state; 1369} 1370 1371struct bo_pool_bo_link { 1372 struct bo_pool_bo_link *next; 1373 struct anv_bo bo; 1374}; 1375 1376void 1377anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, 1378 uint64_t bo_flags) 1379{ 1380 pool->device = device; 1381 pool->bo_flags = bo_flags; 1382 memset(pool->free_list, 0, sizeof(pool->free_list)); 1383 1384 VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); 1385} 1386 1387void 1388anv_bo_pool_finish(struct anv_bo_pool *pool) 1389{ 1390 for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { 1391 struct bo_pool_bo_link *link = PFL_PTR(pool->free_list[i]); 1392 while (link != NULL) { 1393 struct bo_pool_bo_link link_copy = VG_NOACCESS_READ(link); 1394 1395 anv_gem_munmap(link_copy.bo.map, link_copy.bo.size); 1396 anv_vma_free(pool->device, &link_copy.bo); 1397 anv_gem_close(pool->device, link_copy.bo.gem_handle); 1398 link = link_copy.next; 1399 } 1400 } 1401 1402 VG(VALGRIND_DESTROY_MEMPOOL(pool)); 1403} 1404 1405VkResult 1406anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo, uint32_t size) 1407{ 1408 VkResult result; 1409 1410 const unsigned size_log2 = size < 4096 ? 
12 : ilog2_round_up(size); 1411 const unsigned pow2_size = 1 << size_log2; 1412 const unsigned bucket = size_log2 - 12; 1413 assert(bucket < ARRAY_SIZE(pool->free_list)); 1414 1415 void *next_free_void; 1416 if (anv_ptr_free_list_pop(&pool->free_list[bucket], &next_free_void)) { 1417 struct bo_pool_bo_link *next_free = next_free_void; 1418 *bo = VG_NOACCESS_READ(&next_free->bo); 1419 assert(bo->gem_handle); 1420 assert(bo->map == next_free); 1421 assert(size <= bo->size); 1422 1423 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); 1424 1425 return VK_SUCCESS; 1426 } 1427 1428 struct anv_bo new_bo; 1429 1430 result = anv_bo_init_new(&new_bo, pool->device, pow2_size); 1431 if (result != VK_SUCCESS) 1432 return result; 1433 1434 new_bo.flags = pool->bo_flags; 1435 1436 if (!anv_vma_alloc(pool->device, &new_bo)) 1437 return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); 1438 1439 assert(new_bo.size == pow2_size); 1440 1441 new_bo.map = anv_gem_mmap(pool->device, new_bo.gem_handle, 0, pow2_size, 0); 1442 if (new_bo.map == MAP_FAILED) { 1443 anv_gem_close(pool->device, new_bo.gem_handle); 1444 anv_vma_free(pool->device, &new_bo); 1445 return vk_error(VK_ERROR_MEMORY_MAP_FAILED); 1446 } 1447 1448 /* We are removing the state flushes, so lets make sure that these buffers 1449 * are cached/snooped. 
1450 */ 1451 if (!pool->device->info.has_llc) { 1452 anv_gem_set_caching(pool->device, new_bo.gem_handle, 1453 I915_CACHING_CACHED); 1454 } 1455 1456 *bo = new_bo; 1457 1458 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); 1459 1460 return VK_SUCCESS; 1461} 1462 1463void 1464anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo_in) 1465{ 1466 /* Make a copy in case the anv_bo happens to be storred in the BO */ 1467 struct anv_bo bo = *bo_in; 1468 1469 VG(VALGRIND_MEMPOOL_FREE(pool, bo.map)); 1470 1471 struct bo_pool_bo_link *link = bo.map; 1472 VG_NOACCESS_WRITE(&link->bo, bo); 1473 1474 assert(util_is_power_of_two_or_zero(bo.size)); 1475 const unsigned size_log2 = ilog2_round_up(bo.size); 1476 const unsigned bucket = size_log2 - 12; 1477 assert(bucket < ARRAY_SIZE(pool->free_list)); 1478 1479 anv_ptr_free_list_push(&pool->free_list[bucket], link); 1480} 1481 1482// Scratch pool 1483 1484void 1485anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool) 1486{ 1487 memset(pool, 0, sizeof(*pool)); 1488} 1489 1490void 1491anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool) 1492{ 1493 for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { 1494 for (unsigned i = 0; i < 16; i++) { 1495 struct anv_scratch_bo *bo = &pool->bos[i][s]; 1496 if (bo->exists > 0) { 1497 anv_vma_free(device, &bo->bo); 1498 anv_gem_close(device, bo->bo.gem_handle); 1499 } 1500 } 1501 } 1502} 1503 1504struct anv_bo * 1505anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool, 1506 gl_shader_stage stage, unsigned per_thread_scratch) 1507{ 1508 if (per_thread_scratch == 0) 1509 return NULL; 1510 1511 unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); 1512 assert(scratch_size_log2 < 16); 1513 1514 struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage]; 1515 1516 /* We can use "exists" to shortcut and ignore the critical section */ 1517 if (bo->exists) 1518 return &bo->bo; 1519 1520 
   pthread_mutex_lock(&device->mutex);

   /* Re-check under the lock: another thread may have created the BO between
    * our unlocked "exists" check and taking the mutex.
    */
   __sync_synchronize();
   if (bo->exists) {
      pthread_mutex_unlock(&device->mutex);
      return &bo->bo;
   }

   const struct anv_physical_device *physical_device =
      &device->instance->physicalDevice;
   const struct gen_device_info *devinfo = &physical_device->info;

   const unsigned subslices = MAX2(physical_device->subslice_total, 1);

   unsigned scratch_ids_per_subslice;
   if (devinfo->is_haswell) {
      /* WaCSScratchSize:hsw
       *
       * Haswell's scratch space address calculation appears to be sparse
       * rather than tightly packed. The Thread ID has bits indicating
       * which subslice, EU within a subslice, and thread within an EU it
       * is. There's a maximum of two slices and two subslices, so these
       * can be stored with a single bit. Even though there are only 10 EUs
       * per subslice, this is stored in 4 bits, so there's an effective
       * maximum value of 16 EUs. Similarly, although there are only 7
       * threads per EU, this is stored in a 3 bit number, giving an
       * effective maximum value of 8 threads per EU.
       *
       * This means that we need to use 16 * 8 instead of 10 * 7 for the
       * number of threads per subslice.
       */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->is_cherryview) {
      /* Cherryview devices have either 6 or 8 EUs per subslice, and each EU
       * has 7 threads. The 6 EU devices appear to calculate thread IDs as if
       * it had 8 EUs.
       */
      scratch_ids_per_subslice = 8 * 7;
   } else {
      scratch_ids_per_subslice = devinfo->max_cs_threads;
   }

   uint32_t max_threads[] = {
      [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
      [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
      [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
      [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
      [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
      [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslices,
   };

   uint32_t size = per_thread_scratch * max_threads[stage];

   /* NOTE(review): the VkResult from anv_bo_init_new() is ignored here, so an
    * allocation failure would leave bo->bo zeroed — confirm this is intended.
    */
   anv_bo_init_new(&bo->bo, device, size);

   /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
    * are still relative to the general state base address. When we emit
    * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
    * to the maximum (1 page under 4GB). This allows us to just place the
    * scratch buffers anywhere we wish in the bottom 32 bits of address space
    * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
    * However, in order to do so, we need to ensure that the kernel does not
    * place the scratch BO above the 32-bit boundary.
    *
    * NOTE: Technically, it can't go "anywhere" because the top page is off
    * limits. However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
    * kernel allocates space using
    *
    *    end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
    *
    * so nothing will ever touch the top page.
    */
   assert(!(bo->bo.flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS));

   if (device->instance->physicalDevice.has_exec_async)
      bo->bo.flags |= EXEC_OBJECT_ASYNC;

   if (device->instance->physicalDevice.use_softpin)
      bo->bo.flags |= EXEC_OBJECT_PINNED;

   anv_vma_alloc(device, &bo->bo);

   /* Set the exists last because it may be read by other threads */
   __sync_synchronize();
   bo->exists = true;

   pthread_mutex_unlock(&device->mutex);

   return &bo->bo;
}

/* Refcounted wrapper around anv_bo used by the BO cache; callers only ever
 * see the embedded anv_bo (which must stay the first member so the pointer
 * casts in lookup/export/release are valid).
 */
struct anv_cached_bo {
   struct anv_bo bo;

   uint32_t refcount;
};

VkResult
anv_bo_cache_init(struct anv_bo_cache *cache)
{
   cache->bo_map = _mesa_pointer_hash_table_create(NULL);
   if (!cache->bo_map)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pthread_mutex_init(&cache->mutex, NULL)) {
      _mesa_hash_table_destroy(cache->bo_map, NULL);
      return vk_errorf(NULL, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "pthread_mutex_init failed: %m");
   }

   return VK_SUCCESS;
}

void
anv_bo_cache_finish(struct anv_bo_cache *cache)
{
   _mesa_hash_table_destroy(cache->bo_map, NULL);
   pthread_mutex_destroy(&cache->mutex);
}

/* Look up a cached BO by GEM handle.  Caller must hold cache->mutex. */
static struct anv_cached_bo *
anv_bo_cache_lookup_locked(struct anv_bo_cache *cache, uint32_t gem_handle)
{
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->bo_map,
                              (const void *)(uintptr_t)gem_handle);
   if (!entry)
      return NULL;

   struct anv_cached_bo *bo = (struct anv_cached_bo *)entry->data;
   assert(bo->bo.gem_handle == gem_handle);

   return bo;
}

/* Locking wrapper around anv_bo_cache_lookup_locked(). */
UNUSED static struct anv_bo *
anv_bo_cache_lookup(struct anv_bo_cache *cache, uint32_t gem_handle)
{
   pthread_mutex_lock(&cache->mutex);

   struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle);

   pthread_mutex_unlock(&cache->mutex);

   return bo ?
      &bo->bo : NULL;
}

#define ANV_BO_CACHE_SUPPORTED_FLAGS \
   (EXEC_OBJECT_WRITE | \
    EXEC_OBJECT_ASYNC | \
    EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
    EXEC_OBJECT_PINNED | \
    ANV_BO_EXTERNAL)

/* Allocate a fresh BO and register it in the cache keyed by GEM handle.
 * The returned anv_bo is owned by the cache (refcount 1); release with
 * anv_bo_cache_release().
 */
VkResult
anv_bo_cache_alloc(struct anv_device *device,
                   struct anv_bo_cache *cache,
                   uint64_t size, uint64_t bo_flags,
                   struct anv_bo **bo_out)
{
   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));

   struct anv_cached_bo *bo =
      vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!bo)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   bo->refcount = 1;

   /* The kernel is going to give us whole pages anyway */
   size = align_u64(size, 4096);

   VkResult result = anv_bo_init_new(&bo->bo, device, size);
   if (result != VK_SUCCESS) {
      vk_free(&device->alloc, bo);
      return result;
   }

   bo->bo.flags = bo_flags;

   if (!anv_vma_alloc(device, &bo->bo)) {
      anv_gem_close(device, bo->bo.gem_handle);
      vk_free(&device->alloc, bo);
      return vk_errorf(device->instance, NULL,
                       VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "failed to allocate virtual address for BO");
   }

   assert(bo->bo.gem_handle);

   pthread_mutex_lock(&cache->mutex);

   _mesa_hash_table_insert(cache->bo_map,
                           (void *)(uintptr_t)bo->bo.gem_handle, bo);

   pthread_mutex_unlock(&cache->mutex);

   *bo_out = &bo->bo;

   return VK_SUCCESS;
}

/* Wrap a userptr allocation for host memory in a cached BO, bumping the
 * refcount if the kernel hands back a GEM handle we already track.
 */
VkResult
anv_bo_cache_import_host_ptr(struct anv_device *device,
                             struct anv_bo_cache *cache,
                             void *host_ptr, uint32_t size,
                             uint64_t bo_flags, struct anv_bo **bo_out)
{
   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
   /* Host-pointer BOs are deliberately not flagged external here. */
   assert((bo_flags & ANV_BO_EXTERNAL) == 0);

   uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size);
   if (!gem_handle)
      return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);

   pthread_mutex_lock(&cache->mutex);

   struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle);
   if (bo) {
      /* VK_EXT_external_memory_host doesn't require handling importing the
       * same pointer twice at the same time, but we don't get in the way.  If
       * kernel gives us the same gem_handle, only succeed if the flags match.
       */
      if (bo_flags != bo->bo.flags) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device->instance, NULL,
                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "same host pointer imported two different ways");
      }
      __sync_fetch_and_add(&bo->refcount, 1);
   } else {
      bo = vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (!bo) {
         anv_gem_close(device, gem_handle);
         pthread_mutex_unlock(&cache->mutex);
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      bo->refcount = 1;

      anv_bo_init(&bo->bo, gem_handle, size);
      bo->bo.flags = bo_flags;

      if (!anv_vma_alloc(device, &bo->bo)) {
         anv_gem_close(device, bo->bo.gem_handle);
         pthread_mutex_unlock(&cache->mutex);
         vk_free(&device->alloc, bo);
         return vk_errorf(device->instance, NULL,
                          VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "failed to allocate virtual address for BO");
      }

      _mesa_hash_table_insert(cache->bo_map, (void *)(uintptr_t)gem_handle, bo);
   }

   pthread_mutex_unlock(&cache->mutex);
   *bo_out = &bo->bo;

   return VK_SUCCESS;
}

/* Import a dma-buf fd, deduplicating against already-imported BOs by GEM
 * handle and reconciling the requested flags with the existing ones.
 */
VkResult
anv_bo_cache_import(struct anv_device *device,
                    struct anv_bo_cache *cache,
                    int fd, uint64_t bo_flags,
                    struct anv_bo **bo_out)
{
   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
   assert(bo_flags & ANV_BO_EXTERNAL);

   pthread_mutex_lock(&cache->mutex);

   uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
   if
      (!gem_handle) {
      pthread_mutex_unlock(&cache->mutex);
      return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle);
   if (bo) {
      /* We have to be careful how we combine flags so that it makes sense.
       * Really, though, if we get to this case and it actually matters, the
       * client has imported a BO twice in different ways and they get what
       * they have coming.
       */
      uint64_t new_flags = ANV_BO_EXTERNAL;
      new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_WRITE;
      new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_ASYNC;
      new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
      new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_PINNED;

      /* It's theoretically possible for a BO to get imported such that it's
       * both pinned and not pinned.  The only way this can happen is if it
       * gets imported as both a semaphore and a memory object and that would
       * be an application error.  Just fail out in that case.
       */
      if ((bo->bo.flags & EXEC_OBJECT_PINNED) !=
          (bo_flags & EXEC_OBJECT_PINNED)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device->instance, NULL,
                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported two different ways");
      }

      /* It's also theoretically possible that someone could export a BO from
       * one heap and import it into another or to import the same BO into two
       * different heaps.  If this happens, we could potentially end up both
       * allowing and disallowing 48-bit addresses.  There's not much we can
       * do about it if we're pinning so we just throw an error and hope no
       * app is actually that stupid.
       */
      if ((new_flags & EXEC_OBJECT_PINNED) &&
          (bo->bo.flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
          (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) {
         pthread_mutex_unlock(&cache->mutex);
         return vk_errorf(device->instance, NULL,
                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
                          "The same BO was imported on two different heaps");
      }

      bo->bo.flags = new_flags;

      __sync_fetch_and_add(&bo->refcount, 1);
   } else {
      /* The dma-buf fd doesn't report a size directly; lseek to the end to
       * discover it.
       */
      off_t size = lseek(fd, 0, SEEK_END);
      if (size == (off_t)-1) {
         anv_gem_close(device, gem_handle);
         pthread_mutex_unlock(&cache->mutex);
         return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
      }

      bo = vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (!bo) {
         anv_gem_close(device, gem_handle);
         pthread_mutex_unlock(&cache->mutex);
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      bo->refcount = 1;

      anv_bo_init(&bo->bo, gem_handle, size);
      bo->bo.flags = bo_flags;

      if (!anv_vma_alloc(device, &bo->bo)) {
         anv_gem_close(device, bo->bo.gem_handle);
         pthread_mutex_unlock(&cache->mutex);
         vk_free(&device->alloc, bo);
         return vk_errorf(device->instance, NULL,
                          VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "failed to allocate virtual address for BO");
      }

      _mesa_hash_table_insert(cache->bo_map, (void *)(uintptr_t)gem_handle, bo);
   }

   pthread_mutex_unlock(&cache->mutex);
   *bo_out = &bo->bo;

   return VK_SUCCESS;
}

/* Export a cached BO as a dma-buf fd.  The BO must have been created with
 * ANV_BO_EXTERNAL.  Does not affect the refcount; the fd holds its own
 * kernel-side reference.
 */
VkResult
anv_bo_cache_export(struct anv_device *device,
                    struct anv_bo_cache *cache,
                    struct anv_bo *bo_in, int *fd_out)
{
   assert(anv_bo_cache_lookup(cache, bo_in->gem_handle) == bo_in);
   struct anv_cached_bo *bo = (struct anv_cached_bo *)bo_in;

   /* This BO must have been flagged external in order for us to be able
    * to export it.  This is done based on external options passed into
    * anv_AllocateMemory.
    */
   assert(bo->bo.flags & ANV_BO_EXTERNAL);

   int fd = anv_gem_handle_to_fd(device, bo->bo.gem_handle);
   if (fd < 0)
      return vk_error(VK_ERROR_TOO_MANY_OBJECTS);

   *fd_out = fd;

   return VK_SUCCESS;
}

/* Atomically decrement *counter unless it is 1.  Returns true if the
 * decrement happened (we were not the last reference), false if the counter
 * was 1 and is left untouched.
 */
static bool
atomic_dec_not_one(uint32_t *counter)
{
   uint32_t old, val;

   val = *counter;
   while (1) {
      if (val == 1)
         return false;

      old = __sync_val_compare_and_swap(counter, val, val - 1);
      if (old == val)
         return true;

      val = old;
   }
}

/* Drop a reference on a cached BO, destroying it (unmap, free VMA, close GEM
 * handle, remove from the cache) when the last reference goes away.
 */
void
anv_bo_cache_release(struct anv_device *device,
                     struct anv_bo_cache *cache,
                     struct anv_bo *bo_in)
{
   assert(anv_bo_cache_lookup(cache, bo_in->gem_handle) == bo_in);
   struct anv_cached_bo *bo = (struct anv_cached_bo *)bo_in;

   /* Try to decrement the counter but don't go below one.  If this succeeds
    * then the refcount has been decremented and we are not the last
    * reference.
    */
   if (atomic_dec_not_one(&bo->refcount))
      return;

   pthread_mutex_lock(&cache->mutex);

   /* We are probably the last reference since our attempt to decrement above
    * failed.  However, we can't actually know until we are inside the mutex.
    * Otherwise, someone could import the BO between the decrement and our
    * taking the mutex.
    */
   if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
      /* Turns out we're not the last reference.  Unlock and bail. */
      pthread_mutex_unlock(&cache->mutex);
      return;
   }

   struct hash_entry *entry =
      _mesa_hash_table_search(cache->bo_map,
                              (const void *)(uintptr_t)bo->bo.gem_handle);
   assert(entry);
   _mesa_hash_table_remove(cache->bo_map, entry);

   if (bo->bo.map)
      anv_gem_munmap(bo->bo.map, bo->bo.size);

   anv_vma_free(device, &bo->bo);

   anv_gem_close(device, bo->bo.gem_handle);

   /* Don't unlock until we've actually closed the BO.  The whole point of
    * the BO cache is to ensure that we correctly handle races with creating
    * and releasing GEM handles and we don't want to let someone import the BO
    * again between mutex unlock and closing the GEM handle.
    */
   pthread_mutex_unlock(&cache->mutex);

   vk_free(&device->alloc, bo);
}