/*	$NetBSD: i915_gem_execbuffer.c,v 1.5 2022/09/13 10:14:43 riastradh Exp $	*/

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright 2008,2010 Intel Corporation
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: i915_gem_execbuffer.c,v 1.5 2022/09/13 10:14:43 riastradh Exp $");

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>
#include <drm/i915_drm.h>

#ifdef __NetBSD__
#include <sys/filedesc.h>
#endif

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pool.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_sw_fence_work.h"
#include "i915_trace.h"

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

#define __EXEC_OBJECT_HAS_REF		BIT(31)
#define __EXEC_OBJECT_HAS_PIN		BIT(30)
#define __EXEC_OBJECT_HAS_FENCE		BIT(29)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC	BIT(31)
#define __EXEC_VALIDATED	BIT(30)
#define __EXEC_INTERNAL_FLAGS	(~0u << 30)
#define UPDATE			PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute is to add content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer. The memory write writes the
 *    global sequence number of the request, ``i915_request::global_seqno``;
 *    the i915 driver uses the current value in the register to determine
 *    if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset, which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
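 *
 * As an illustration only (a hypothetical userspace sketch, not part of this
 * driver; the handles, addresses, offsets and the fd below are placeholders
 * and error handling is omitted), a submission honouring these rules could be
 * assembled roughly like so::
 *
 *	struct drm_i915_gem_relocation_entry reloc = {
 *		.target_handle = rt_bo,
 *		.offset = patch_offset,         /* location in the batch to patch */
 *		.presumed_offset = rt_gpu_addr, /* where we last saw the target */
 *		.read_domains = I915_GEM_DOMAIN_RENDER,
 *		.write_domain = I915_GEM_DOMAIN_RENDER,
 *	};
 *	struct drm_i915_gem_exec_object2 objects[2] = {
 *		{ .handle = rt_bo, .offset = rt_gpu_addr,
 *		  .flags = EXEC_OBJECT_WRITE },
 *		{ .handle = batch_bo, .offset = batch_gpu_addr,
 *		  .relocation_count = 1,
 *		  .relocs_ptr = (uintptr_t)&reloc },
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)objects,
 *		.buffer_count = 2,
 *		.batch_len = batch_len,
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *
 * As long as each execobject.offset and reloc.presumed_offset still match
 * where the kernel last placed the objects, neither side has to rewrite a
 * single relocation entry.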
 *
 * The reservation is done in multiple phases. First we try and keep any
 * object already bound in its current location, as long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. These objects are
 * simpler to place (the location is known a priori); all we have to do is make
 * sure the space is available.
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fall back
 * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
 * bells yet?) Dropping the mutex means that we lose all the state we have
 * built up so far for the execbuf and we must reset any global data. However,
 * we do leave the objects pinned in their final locations - which is a
 * potential issue for concurrent execbufs. Once we have left the mutex, we can
 * allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before;
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest approach is to wait on the CPU
 * until all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing it
 * to the hardware (well, leaving it in a queue to be executed). However, we
 * also offer the ability for batchbuffers to be run with elevated privileges
 * so that they access otherwise hidden registers. (Used to adjust L3 cache
 * etc.) Before any batch is given extra privileges we first must check that it
 * contains no nefarious instructions: we check that each instruction is from
 * our whitelist and that all registers are also from an allowed list. We first
 * copy the user's batchbuffer to a shadow (so that the user doesn't have
 * access to it, either by the CPU or GPU as we scan it) and then parse each
 * instruction. If everything is ok, we set a flag telling the hardware to run
 * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
 */

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct i915_vma **vma;
	unsigned int *flags;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct i915_vma *batch; /** identity of the batch obj/vma */
	struct i915_vma *trampoline; /** trampoline used for chaining */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
	} reloc_cache;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_len; /** Length of batch within object */
	u32 batch_flags; /** Flags composed for emit_bb_start() */

	/**
	 * Indicate either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
283 */ 284 int lut_size; 285 struct hlist_head *buckets; /** ht for relocation handles */ 286 }; 287 288 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags]) 289 290 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 291 { 292 return intel_engine_requires_cmd_parser(eb->engine) || 293 (intel_engine_using_cmd_parser(eb->engine) && 294 eb->args->batch_len); 295 } 296 297 static int eb_create(struct i915_execbuffer *eb) 298 { 299 if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { 300 unsigned int size = 1 + ilog2(eb->buffer_count); 301 302 /* 303 * Without a 1:1 association between relocation handles and 304 * the execobject[] index, we instead create a hashtable. 305 * We size it dynamically based on available memory, starting 306 * first with 1:1 assocative hash and scaling back until 307 * the allocation succeeds. 308 * 309 * Later on we use a positive lut_size to indicate we are 310 * using this hashtable, and a negative value to indicate a 311 * direct lookup. 312 */ 313 do { 314 gfp_t flags; 315 316 /* While we can still reduce the allocation size, don't 317 * raise a warning and allow the allocation to fail. 318 * On the last pass though, we want to try as hard 319 * as possible to perform the allocation and warn 320 * if it fails. 321 */ 322 flags = GFP_KERNEL; 323 if (size > 1) 324 flags |= __GFP_NORETRY | __GFP_NOWARN; 325 326 eb->buckets = kzalloc(sizeof(struct hlist_head) << size, 327 flags); 328 if (eb->buckets) 329 break; 330 } while (--size); 331 332 if (unlikely(!size)) 333 return -ENOMEM; 334 335 eb->lut_size = size; 336 } else { 337 eb->lut_size = -eb->buffer_count; 338 } 339 340 return 0; 341 } 342 343 static bool 344 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry, 345 const struct i915_vma *vma, 346 unsigned int flags) 347 { 348 if (vma->node.size < entry->pad_to_size) 349 return true; 350 351 if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment)) 352 return true; 353 354 if (flags & EXEC_OBJECT_PINNED && 355 vma->node.start != entry->offset) 356 return true; 357 358 if (flags & __EXEC_OBJECT_NEEDS_BIAS && 359 vma->node.start < BATCH_OFFSET_BIAS) 360 return true; 361 362 if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) && 363 (vma->node.start + vma->node.size - 1) >> 32) 364 return true; 365 366 if (flags & __EXEC_OBJECT_NEEDS_MAP && 367 !i915_vma_is_map_and_fenceable(vma)) 368 return true; 369 370 return false; 371 } 372 373 static inline bool 374 eb_pin_vma(struct i915_execbuffer *eb, 375 const struct drm_i915_gem_exec_object2 *entry, 376 struct i915_vma *vma) 377 { 378 unsigned int exec_flags = *vma->exec_flags; 379 u64 pin_flags; 380 381 if (vma->node.size) 382 pin_flags = vma->node.start; 383 else 384 pin_flags = entry->offset & PIN_OFFSET_MASK; 385 386 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 387 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT)) 388 pin_flags |= PIN_GLOBAL; 389 390 if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) 391 return false; 392 393 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 394 if (unlikely(i915_vma_pin_fence(vma))) { 395 i915_vma_unpin(vma); 396 return false; 397 } 398 399 if (vma->fence) 400 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 401 } 402 403 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 404 return !eb_vma_misplaced(entry, vma, exec_flags); 405 } 406 407 static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags) 408 { 409 GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN)); 410 411 if (unlikely(flags & 
__EXEC_OBJECT_HAS_FENCE)) 412 __i915_vma_unpin_fence(vma); 413 414 __i915_vma_unpin(vma); 415 } 416 417 static inline void 418 eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags) 419 { 420 if (!(*flags & __EXEC_OBJECT_HAS_PIN)) 421 return; 422 423 __eb_unreserve_vma(vma, *flags); 424 *flags &= ~__EXEC_OBJECT_RESERVED; 425 } 426 427 static int 428 eb_validate_vma(struct i915_execbuffer *eb, 429 struct drm_i915_gem_exec_object2 *entry, 430 struct i915_vma *vma) 431 { 432 if (unlikely(entry->flags & eb->invalid_flags)) 433 return -EINVAL; 434 435 if (unlikely(entry->alignment && !is_power_of_2(entry->alignment))) 436 return -EINVAL; 437 438 /* 439 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 440 * any non-page-aligned or non-canonical addresses. 441 */ 442 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 443 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 444 return -EINVAL; 445 446 /* pad_to_size was once a reserved field, so sanitize it */ 447 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 448 if (unlikely(offset_in_page(entry->pad_to_size))) 449 return -EINVAL; 450 } else { 451 entry->pad_to_size = 0; 452 } 453 454 if (unlikely(vma->exec_flags)) { 455 DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n", 456 entry->handle, (int)(entry - eb->exec)); 457 return -EINVAL; 458 } 459 460 /* 461 * From drm_mm perspective address space is continuous, 462 * so from this point we're always using non-canonical 463 * form internally. 464 */ 465 entry->offset = gen8_noncanonical_addr(entry->offset); 466 467 if (!eb->reloc_cache.has_fence) { 468 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 469 } else { 470 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 471 eb->reloc_cache.needs_unfenced) && 472 i915_gem_object_is_tiled(vma->obj)) 473 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 474 } 475 476 if (!(entry->flags & EXEC_OBJECT_PINNED)) 477 entry->flags |= eb->context_flags; 478 479 return 0; 480 } 481 482 static int 483 eb_add_vma(struct i915_execbuffer *eb, 484 unsigned int i, unsigned batch_idx, 485 struct i915_vma *vma) 486 { 487 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 488 int err; 489 490 GEM_BUG_ON(i915_vma_is_closed(vma)); 491 492 if (!(eb->args->flags & __EXEC_VALIDATED)) { 493 err = eb_validate_vma(eb, entry, vma); 494 if (unlikely(err)) 495 return err; 496 } 497 498 if (eb->lut_size > 0) { 499 vma->exec_handle = entry->handle; 500 hlist_add_head(&vma->exec_node, 501 &eb->buckets[hash_32(entry->handle, 502 eb->lut_size)]); 503 } 504 505 if (entry->relocation_count) 506 list_add_tail(&vma->reloc_link, &eb->relocs); 507 508 /* 509 * Stash a pointer from the vma to execobj, so we can query its flags, 510 * size, alignment etc as provided by the user. Also we stash a pointer 511 * to the vma inside the execobj so that we can use a direct lookup 512 * to find the right target VMA when doing relocations. 513 */ 514 eb->vma[i] = vma; 515 eb->flags[i] = entry->flags; 516 vma->exec_flags = &eb->flags[i]; 517 518 /* 519 * SNA is doing fancy tricks with compressing batch buffers, which leads 520 * to negative relocation deltas. Usually that works out ok since the 521 * relocate address is still positive, except when the batch is placed 522 * very low in the GTT. Ensure this doesn't happen. 523 * 524 * Note that actual hangs have only been observed on gen7, but for 525 * paranoia do it everywhere. 
526 */ 527 if (i == batch_idx) { 528 if (entry->relocation_count && 529 !(eb->flags[i] & EXEC_OBJECT_PINNED)) 530 eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; 531 if (eb->reloc_cache.has_fence) 532 eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; 533 534 eb->batch = vma; 535 } 536 537 err = 0; 538 if (eb_pin_vma(eb, entry, vma)) { 539 if (entry->offset != vma->node.start) { 540 entry->offset = vma->node.start | UPDATE; 541 eb->args->flags |= __EXEC_HAS_RELOC; 542 } 543 } else { 544 eb_unreserve_vma(vma, vma->exec_flags); 545 546 list_add_tail(&vma->exec_link, &eb->unbound); 547 if (drm_mm_node_allocated(&vma->node)) 548 err = i915_vma_unbind(vma); 549 if (unlikely(err)) { 550 vma->exec_flags = NULL; 551 if (i == batch_idx) 552 eb->batch = NULL; 553 eb->vma[i] = NULL; 554 } 555 } 556 return err; 557 } 558 559 static inline int use_cpu_reloc(const struct reloc_cache *cache, 560 const struct drm_i915_gem_object *obj) 561 { 562 if (!i915_gem_object_has_struct_page(obj)) 563 return false; 564 565 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 566 return true; 567 568 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 569 return false; 570 571 return (cache->has_llc || 572 obj->cache_dirty || 573 obj->cache_level != I915_CACHE_NONE); 574 } 575 576 static int eb_reserve_vma(const struct i915_execbuffer *eb, 577 struct i915_vma *vma) 578 { 579 struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 580 unsigned int exec_flags = *vma->exec_flags; 581 u64 pin_flags; 582 int err; 583 584 pin_flags = PIN_USER | PIN_NONBLOCK; 585 if (exec_flags & EXEC_OBJECT_NEEDS_GTT) 586 pin_flags |= PIN_GLOBAL; 587 588 /* 589 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, 590 * limit address to the first 4GBs for unflagged objects. 591 */ 592 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 593 pin_flags |= PIN_ZONE_4G; 594 595 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 596 pin_flags |= PIN_MAPPABLE; 597 598 if (exec_flags & EXEC_OBJECT_PINNED) { 599 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 600 pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */ 601 } else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) { 602 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 603 } 604 605 err = i915_vma_pin(vma, 606 entry->pad_to_size, entry->alignment, 607 pin_flags); 608 if (err) 609 return err; 610 611 if (entry->offset != vma->node.start) { 612 entry->offset = vma->node.start | UPDATE; 613 eb->args->flags |= __EXEC_HAS_RELOC; 614 } 615 616 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 617 err = i915_vma_pin_fence(vma); 618 if (unlikely(err)) { 619 i915_vma_unpin(vma); 620 return err; 621 } 622 623 if (vma->fence) 624 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 625 } 626 627 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 628 GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags)); 629 630 return 0; 631 } 632 633 static int eb_reserve(struct i915_execbuffer *eb) 634 { 635 const unsigned int count = eb->buffer_count; 636 struct list_head last; 637 struct i915_vma *vma; 638 unsigned int i, pass; 639 int err; 640 641 /* 642 * Attempt to pin all of the buffers into the GTT. 643 * This is done in 3 phases: 644 * 645 * 1a. Unbind all objects that do not match the GTT constraints for 646 * the execbuffer (fenceable, mappable, alignment etc). 647 * 1b. Increment pin count for already bound objects. 648 * 2. Bind new objects. 649 * 3. Decrement pin count. 650 * 651 * This avoid unnecessary unbinding of later objects in order to make 652 * room for the earlier objects *unless* we need to defragment. 
653 */ 654 655 pass = 0; 656 err = 0; 657 do { 658 list_for_each_entry(vma, &eb->unbound, exec_link) { 659 err = eb_reserve_vma(eb, vma); 660 if (err) 661 break; 662 } 663 if (err != -ENOSPC) 664 return err; 665 666 /* Resort *all* the objects into priority order */ 667 INIT_LIST_HEAD(&eb->unbound); 668 INIT_LIST_HEAD(&last); 669 for (i = 0; i < count; i++) { 670 unsigned int flags = eb->flags[i]; 671 struct i915_vma *vma = eb->vma[i]; 672 673 if (flags & EXEC_OBJECT_PINNED && 674 flags & __EXEC_OBJECT_HAS_PIN) 675 continue; 676 677 eb_unreserve_vma(vma, &eb->flags[i]); 678 679 if (flags & EXEC_OBJECT_PINNED) 680 /* Pinned must have their slot */ 681 list_add(&vma->exec_link, &eb->unbound); 682 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 683 /* Map require the lowest 256MiB (aperture) */ 684 list_add_tail(&vma->exec_link, &eb->unbound); 685 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 686 /* Prioritise 4GiB region for restricted bo */ 687 list_add(&vma->exec_link, &last); 688 else 689 list_add_tail(&vma->exec_link, &last); 690 } 691 list_splice_tail(&last, &eb->unbound); 692 693 switch (pass++) { 694 case 0: 695 break; 696 697 case 1: 698 /* Too fragmented, unbind everything and retry */ 699 mutex_lock(&eb->context->vm->mutex); 700 err = i915_gem_evict_vm(eb->context->vm); 701 mutex_unlock(&eb->context->vm->mutex); 702 if (err) 703 return err; 704 break; 705 706 default: 707 return -ENOSPC; 708 } 709 } while (1); 710 } 711 712 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 713 { 714 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 715 return 0; 716 else 717 return eb->buffer_count - 1; 718 } 719 720 static int eb_select_context(struct i915_execbuffer *eb) 721 { 722 struct i915_gem_context *ctx; 723 724 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 725 if (unlikely(!ctx)) 726 return -ENOENT; 727 728 eb->gem_context = ctx; 729 if (rcu_access_pointer(ctx->vm)) 730 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 731 732 eb->context_flags = 0; 733 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 734 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 735 736 return 0; 737 } 738 739 static int eb_lookup_vmas(struct i915_execbuffer *eb) 740 { 741 struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma; 742 struct drm_i915_gem_object *obj; 743 unsigned int i, batch; 744 int err; 745 746 INIT_LIST_HEAD(&eb->relocs); 747 INIT_LIST_HEAD(&eb->unbound); 748 749 batch = eb_batch_index(eb); 750 751 mutex_lock(&eb->gem_context->mutex); 752 if (unlikely(i915_gem_context_is_closed(eb->gem_context))) { 753 err = -ENOENT; 754 goto err_ctx; 755 } 756 757 for (i = 0; i < eb->buffer_count; i++) { 758 u32 handle = eb->exec[i].handle; 759 struct i915_lut_handle *lut; 760 struct i915_vma *vma; 761 762 vma = radix_tree_lookup(handles_vma, handle); 763 if (likely(vma)) 764 goto add_vma; 765 766 obj = i915_gem_object_lookup(eb->file, handle); 767 if (unlikely(!obj)) { 768 err = -ENOENT; 769 goto err_vma; 770 } 771 772 vma = i915_vma_instance(obj, eb->context->vm, NULL); 773 if (IS_ERR(vma)) { 774 err = PTR_ERR(vma); 775 goto err_obj; 776 } 777 778 lut = i915_lut_handle_alloc(); 779 if (unlikely(!lut)) { 780 err = -ENOMEM; 781 goto err_obj; 782 } 783 784 err = radix_tree_insert(handles_vma, handle, vma); 785 if (unlikely(err)) { 786 i915_lut_handle_free(lut); 787 goto err_obj; 788 } 789 790 /* transfer ref to lut */ 791 if (!atomic_fetch_inc(&vma->open_count)) 792 i915_vma_reopen(vma); 793 lut->handle = handle; 794 lut->ctx = eb->gem_context; 795 796 
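		/*
		 * Link the new lut entry into the object's lut_list so that
		 * closing either the object or the owning context can find
		 * this handle -> vma binding again and tear it down.
		 */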
i915_gem_object_lock(obj); 797 list_add(&lut->obj_link, &obj->lut_list); 798 i915_gem_object_unlock(obj); 799 800 add_vma: 801 err = eb_add_vma(eb, i, batch, vma); 802 if (unlikely(err)) 803 goto err_vma; 804 805 GEM_BUG_ON(vma != eb->vma[i]); 806 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 807 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 808 eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); 809 } 810 811 mutex_unlock(&eb->gem_context->mutex); 812 813 eb->args->flags |= __EXEC_VALIDATED; 814 return eb_reserve(eb); 815 816 err_obj: 817 i915_gem_object_put(obj); 818 err_vma: 819 eb->vma[i] = NULL; 820 err_ctx: 821 mutex_unlock(&eb->gem_context->mutex); 822 return err; 823 } 824 825 static struct i915_vma * 826 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 827 { 828 if (eb->lut_size < 0) { 829 if (handle >= -eb->lut_size) 830 return NULL; 831 return eb->vma[handle]; 832 } else { 833 struct hlist_head *head; 834 struct i915_vma *vma; 835 836 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 837 hlist_for_each_entry(vma, head, exec_node) { 838 if (vma->exec_handle == handle) 839 return vma; 840 } 841 return NULL; 842 } 843 } 844 845 static void eb_release_vmas(const struct i915_execbuffer *eb) 846 { 847 const unsigned int count = eb->buffer_count; 848 unsigned int i; 849 850 for (i = 0; i < count; i++) { 851 struct i915_vma *vma = eb->vma[i]; 852 unsigned int flags = eb->flags[i]; 853 854 if (!vma) 855 break; 856 857 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 858 vma->exec_flags = NULL; 859 eb->vma[i] = NULL; 860 861 if (flags & __EXEC_OBJECT_HAS_PIN) 862 __eb_unreserve_vma(vma, flags); 863 864 if (flags & __EXEC_OBJECT_HAS_REF) 865 i915_vma_put(vma); 866 } 867 } 868 869 static void eb_reset_vmas(const struct i915_execbuffer *eb) 870 { 871 eb_release_vmas(eb); 872 if (eb->lut_size > 0) 873 memset(eb->buckets, 0, 874 sizeof(struct hlist_head) << eb->lut_size); 875 } 876 877 static void eb_destroy(const struct i915_execbuffer *eb) 878 { 879 GEM_BUG_ON(eb->reloc_cache.rq); 880 881 if (eb->lut_size > 0) 882 kfree(eb->buckets); 883 } 884 885 static inline u64 886 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 887 const struct i915_vma *target) 888 { 889 return gen8_canonical_addr((int)reloc->delta + target->node.start); 890 } 891 892 static void reloc_cache_init(struct reloc_cache *cache, 893 struct drm_i915_private *i915) 894 { 895 cache->page = -1; 896 cache->vaddr = 0; 897 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 898 cache->gen = INTEL_GEN(i915); 899 cache->has_llc = HAS_LLC(i915); 900 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 901 cache->has_fence = cache->gen < 4; 902 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 903 cache->node.flags = 0; 904 cache->rq = NULL; 905 cache->rq_size = 0; 906 } 907 908 static inline void *unmask_page(unsigned long p) 909 { 910 return (void *)(uintptr_t)(p & PAGE_MASK); 911 } 912 913 static inline unsigned int unmask_flags(unsigned long p) 914 { 915 return p & ~PAGE_MASK; 916 } 917 918 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 919 920 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 921 { 922 struct drm_i915_private *i915 = 923 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 924 return &i915->ggtt; 925 } 926 927 static void reloc_gpu_flush(struct reloc_cache *cache) 928 { 929 GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32)); 930 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 931 932 __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size); 933 i915_gem_object_unpin_map(cache->rq->batch->obj); 934 935 intel_gt_chipset_flush(cache->rq->engine->gt); 936 937 i915_request_add(cache->rq); 938 cache->rq = NULL; 939 } 940 941 static void reloc_cache_reset(struct reloc_cache *cache) 942 { 943 void *vaddr; 944 945 if (cache->rq) 946 reloc_gpu_flush(cache); 947 948 if (!cache->vaddr) 949 return; 950 951 vaddr = unmask_page(cache->vaddr); 952 if (cache->vaddr & KMAP) { 953 if (cache->vaddr & CLFLUSH_AFTER) 954 mb(); 955 956 kunmap_atomic(vaddr); 957 i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 958 } else { 959 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 960 961 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 962 #ifdef __NetBSD__ 963 io_mapping_unmap_atomic(&ggtt->iomap, vaddr); 964 #else 965 io_mapping_unmap_atomic((void __iomem *)vaddr); 966 #endif 967 968 if (drm_mm_node_allocated(&cache->node)) { 969 ggtt->vm.clear_range(&ggtt->vm, 970 cache->node.start, 971 cache->node.size); 972 mutex_lock(&ggtt->vm.mutex); 973 drm_mm_remove_node(&cache->node); 974 mutex_unlock(&ggtt->vm.mutex); 975 } else { 976 i915_vma_unpin((struct i915_vma *)cache->node.mm); 977 } 978 } 979 980 cache->vaddr = 0; 981 cache->page = -1; 982 } 983 984 static void *reloc_kmap(struct drm_i915_gem_object *obj, 985 struct reloc_cache *cache, 986 unsigned long page) 987 { 988 void *vaddr; 989 990 if (cache->vaddr) { 991 kunmap_atomic(unmask_page(cache->vaddr)); 992 } else { 993 unsigned int flushes; 994 int err; 995 996 err = i915_gem_object_prepare_write(obj, &flushes); 997 if (err) 998 return ERR_PTR(err); 999 1000 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 1001 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 1002 1003 cache->vaddr = flushes | KMAP; 1004 cache->node.mm = (void *)obj; 1005 if (flushes) 1006 mb(); 1007 } 1008 1009 vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 1010 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 1011 cache->page = page; 1012 1013 return vaddr; 1014 } 1015 1016 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1017 struct reloc_cache *cache, 1018 unsigned long page) 1019 { 1020 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1021 unsigned long offset; 1022 void *vaddr; 1023 1024 if (cache->vaddr) { 1025 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1026 #ifdef __NetBSD__ 1027 io_mapping_unmap_atomic(&ggtt->iomap, 1028 unmask_page(cache->vaddr)); 1029 #else 1030 io_mapping_unmap_atomic((void __force 
__iomem *) unmask_page(cache->vaddr)); 1031 #endif 1032 } else { 1033 struct i915_vma *vma; 1034 int err; 1035 1036 if (i915_gem_object_is_tiled(obj)) 1037 return ERR_PTR(-EINVAL); 1038 1039 if (use_cpu_reloc(cache, obj)) 1040 return NULL; 1041 1042 i915_gem_object_lock(obj); 1043 err = i915_gem_object_set_to_gtt_domain(obj, true); 1044 i915_gem_object_unlock(obj); 1045 if (err) 1046 return ERR_PTR(err); 1047 1048 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1049 PIN_MAPPABLE | 1050 PIN_NONBLOCK /* NOWARN */ | 1051 PIN_NOEVICT); 1052 if (IS_ERR(vma)) { 1053 memset(&cache->node, 0, sizeof(cache->node)); 1054 mutex_lock(&ggtt->vm.mutex); 1055 err = drm_mm_insert_node_in_range 1056 (&ggtt->vm.mm, &cache->node, 1057 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1058 0, ggtt->mappable_end, 1059 DRM_MM_INSERT_LOW); 1060 mutex_unlock(&ggtt->vm.mutex); 1061 if (err) /* no inactive aperture space, use cpu reloc */ 1062 return NULL; 1063 } else { 1064 cache->node.start = vma->node.start; 1065 cache->node.mm = (void *)vma; 1066 } 1067 } 1068 1069 offset = cache->node.start; 1070 if (drm_mm_node_allocated(&cache->node)) { 1071 ggtt->vm.insert_page(&ggtt->vm, 1072 i915_gem_object_get_dma_address(obj, page), 1073 offset, I915_CACHE_NONE, 0); 1074 } else { 1075 offset += page << PAGE_SHIFT; 1076 } 1077 1078 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1079 offset); 1080 cache->page = page; 1081 cache->vaddr = (unsigned long)vaddr; 1082 1083 return vaddr; 1084 } 1085 1086 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1087 struct reloc_cache *cache, 1088 unsigned long page) 1089 { 1090 void *vaddr; 1091 1092 if (cache->page == page) { 1093 vaddr = unmask_page(cache->vaddr); 1094 } else { 1095 vaddr = NULL; 1096 if ((cache->vaddr & KMAP) == 0) 1097 vaddr = reloc_iomap(obj, cache, page); 1098 if (!vaddr) 1099 vaddr = reloc_kmap(obj, cache, page); 1100 } 1101 1102 return vaddr; 1103 } 1104 1105 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1106 { 1107 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1108 if (flushes & CLFLUSH_BEFORE) { 1109 clflushopt(addr); 1110 mb(); 1111 } 1112 1113 *addr = value; 1114 1115 /* 1116 * Writes to the same cacheline are serialised by the CPU 1117 * (including clflush). On the write path, we only require 1118 * that it hits memory in an orderly fashion and place 1119 * mb barriers at the start and end of the relocation phase 1120 * to ensure ordering of clflush wrt to the system. 
1121 */ 1122 if (flushes & CLFLUSH_AFTER) 1123 clflushopt(addr); 1124 } else 1125 *addr = value; 1126 } 1127 1128 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1129 { 1130 struct drm_i915_gem_object *obj = vma->obj; 1131 int err; 1132 1133 i915_vma_lock(vma); 1134 1135 if (obj->cache_dirty & ~obj->cache_coherent) 1136 i915_gem_clflush_object(obj, 0); 1137 obj->write_domain = 0; 1138 1139 err = i915_request_await_object(rq, vma->obj, true); 1140 if (err == 0) 1141 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1142 1143 i915_vma_unlock(vma); 1144 1145 return err; 1146 } 1147 1148 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1149 struct i915_vma *vma, 1150 unsigned int len) 1151 { 1152 struct reloc_cache *cache = &eb->reloc_cache; 1153 struct intel_engine_pool_node *pool; 1154 struct i915_request *rq; 1155 struct i915_vma *batch; 1156 u32 *cmd; 1157 int err; 1158 1159 pool = intel_engine_get_pool(eb->engine, PAGE_SIZE); 1160 if (IS_ERR(pool)) 1161 return PTR_ERR(pool); 1162 1163 cmd = i915_gem_object_pin_map(pool->obj, 1164 cache->has_llc ? 1165 I915_MAP_FORCE_WB : 1166 I915_MAP_FORCE_WC); 1167 if (IS_ERR(cmd)) { 1168 err = PTR_ERR(cmd); 1169 goto out_pool; 1170 } 1171 1172 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1173 if (IS_ERR(batch)) { 1174 err = PTR_ERR(batch); 1175 goto err_unmap; 1176 } 1177 1178 err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); 1179 if (err) 1180 goto err_unmap; 1181 1182 rq = i915_request_create(eb->context); 1183 if (IS_ERR(rq)) { 1184 err = PTR_ERR(rq); 1185 goto err_unpin; 1186 } 1187 1188 err = intel_engine_pool_mark_active(pool, rq); 1189 if (err) 1190 goto err_request; 1191 1192 err = reloc_move_to_gpu(rq, vma); 1193 if (err) 1194 goto err_request; 1195 1196 err = eb->engine->emit_bb_start(rq, 1197 batch->node.start, PAGE_SIZE, 1198 cache->gen > 5 ? 
0 : I915_DISPATCH_SECURE); 1199 if (err) 1200 goto skip_request; 1201 1202 i915_vma_lock(batch); 1203 err = i915_request_await_object(rq, batch->obj, false); 1204 if (err == 0) 1205 err = i915_vma_move_to_active(batch, rq, 0); 1206 i915_vma_unlock(batch); 1207 if (err) 1208 goto skip_request; 1209 1210 rq->batch = batch; 1211 i915_vma_unpin(batch); 1212 1213 cache->rq = rq; 1214 cache->rq_cmd = cmd; 1215 cache->rq_size = 0; 1216 1217 /* Return with batch mapping (cmd) still pinned */ 1218 goto out_pool; 1219 1220 skip_request: 1221 i915_request_skip(rq, err); 1222 err_request: 1223 i915_request_add(rq); 1224 err_unpin: 1225 i915_vma_unpin(batch); 1226 err_unmap: 1227 i915_gem_object_unpin_map(pool->obj); 1228 out_pool: 1229 intel_engine_pool_put(pool); 1230 return err; 1231 } 1232 1233 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1234 struct i915_vma *vma, 1235 unsigned int len) 1236 { 1237 struct reloc_cache *cache = &eb->reloc_cache; 1238 u32 *cmd; 1239 1240 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1241 reloc_gpu_flush(cache); 1242 1243 if (unlikely(!cache->rq)) { 1244 int err; 1245 1246 if (!intel_engine_can_store_dword(eb->engine)) 1247 return ERR_PTR(-ENODEV); 1248 1249 err = __reloc_gpu_alloc(eb, vma, len); 1250 if (unlikely(err)) 1251 return ERR_PTR(err); 1252 } 1253 1254 cmd = cache->rq_cmd + cache->rq_size; 1255 cache->rq_size += len; 1256 1257 return cmd; 1258 } 1259 1260 static u64 1261 relocate_entry(struct i915_vma *vma, 1262 const struct drm_i915_gem_relocation_entry *reloc, 1263 struct i915_execbuffer *eb, 1264 const struct i915_vma *target) 1265 { 1266 u64 offset = reloc->offset; 1267 u64 target_offset = relocation_target(reloc, target); 1268 bool wide = eb->reloc_cache.use_64bit_reloc; 1269 void *vaddr; 1270 1271 if (!eb->reloc_cache.vaddr && 1272 (DBG_FORCE_RELOC == FORCE_GPU_RELOC || 1273 !dma_resv_test_signaled_rcu(vma->resv, true))) { 1274 const unsigned int gen = eb->reloc_cache.gen; 1275 unsigned int len; 1276 u32 *batch; 1277 u64 addr; 1278 1279 if (wide) 1280 len = offset & 7 ? 
8 : 5; 1281 else if (gen >= 4) 1282 len = 4; 1283 else 1284 len = 3; 1285 1286 batch = reloc_gpu(eb, vma, len); 1287 if (IS_ERR(batch)) 1288 goto repeat; 1289 1290 addr = gen8_canonical_addr(vma->node.start + offset); 1291 if (wide) { 1292 if (offset & 7) { 1293 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1294 *batch++ = lower_32_bits(addr); 1295 *batch++ = upper_32_bits(addr); 1296 *batch++ = lower_32_bits(target_offset); 1297 1298 addr = gen8_canonical_addr(addr + 4); 1299 1300 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1301 *batch++ = lower_32_bits(addr); 1302 *batch++ = upper_32_bits(addr); 1303 *batch++ = upper_32_bits(target_offset); 1304 } else { 1305 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1306 *batch++ = lower_32_bits(addr); 1307 *batch++ = upper_32_bits(addr); 1308 *batch++ = lower_32_bits(target_offset); 1309 *batch++ = upper_32_bits(target_offset); 1310 } 1311 } else if (gen >= 6) { 1312 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1313 *batch++ = 0; 1314 *batch++ = addr; 1315 *batch++ = target_offset; 1316 } else if (gen >= 4) { 1317 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1318 *batch++ = 0; 1319 *batch++ = addr; 1320 *batch++ = target_offset; 1321 } else { 1322 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1323 *batch++ = addr; 1324 *batch++ = target_offset; 1325 } 1326 1327 goto out; 1328 } 1329 1330 repeat: 1331 vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1332 if (IS_ERR(vaddr)) 1333 return PTR_ERR(vaddr); 1334 1335 clflush_write32(vaddr + offset_in_page(offset), 1336 lower_32_bits(target_offset), 1337 eb->reloc_cache.vaddr); 1338 1339 if (wide) { 1340 offset += sizeof(u32); 1341 target_offset >>= 32; 1342 wide = false; 1343 goto repeat; 1344 } 1345 1346 out: 1347 return target->node.start | UPDATE; 1348 } 1349 1350 static u64 1351 eb_relocate_entry(struct i915_execbuffer *eb, 1352 struct i915_vma *vma, 1353 const struct drm_i915_gem_relocation_entry *reloc) 1354 { 1355 struct i915_vma *target; 1356 int err; 1357 1358 /* we've already hold a reference to all valid objects */ 1359 target = eb_get_vma(eb, reloc->target_handle); 1360 if (unlikely(!target)) 1361 return -ENOENT; 1362 1363 /* Validate that the target is in a valid r/w GPU domain */ 1364 if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1365 DRM_DEBUG("reloc with multiple write domains: " 1366 "target %d offset %d " 1367 "read %08x write %08x", 1368 reloc->target_handle, 1369 (int) reloc->offset, 1370 reloc->read_domains, 1371 reloc->write_domain); 1372 return -EINVAL; 1373 } 1374 if (unlikely((reloc->write_domain | reloc->read_domains) 1375 & ~I915_GEM_GPU_DOMAINS)) { 1376 DRM_DEBUG("reloc with read/write non-GPU domains: " 1377 "target %d offset %d " 1378 "read %08x write %08x", 1379 reloc->target_handle, 1380 (int) reloc->offset, 1381 reloc->read_domains, 1382 reloc->write_domain); 1383 return -EINVAL; 1384 } 1385 1386 if (reloc->write_domain) { 1387 *target->exec_flags |= EXEC_OBJECT_WRITE; 1388 1389 /* 1390 * Sandybridge PPGTT errata: We need a global gtt mapping 1391 * for MI and pipe_control writes because the gpu doesn't 1392 * properly redirect them through the ppgtt for non_secure 1393 * batchbuffers. 
1394 */ 1395 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1396 IS_GEN(eb->i915, 6)) { 1397 err = i915_vma_bind(target, target->obj->cache_level, 1398 PIN_GLOBAL, NULL); 1399 if (WARN_ONCE(err, 1400 "Unexpected failure to bind target VMA!")) 1401 return err; 1402 } 1403 } 1404 1405 /* 1406 * If the relocation already has the right value in it, no 1407 * more work needs to be done. 1408 */ 1409 if (!DBG_FORCE_RELOC && 1410 gen8_canonical_addr(target->node.start) == reloc->presumed_offset) 1411 return 0; 1412 1413 /* Check that the relocation address is valid... */ 1414 if (unlikely(reloc->offset > 1415 vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1416 DRM_DEBUG("Relocation beyond object bounds: " 1417 "target %d offset %d size %d.\n", 1418 reloc->target_handle, 1419 (int)reloc->offset, 1420 (int)vma->size); 1421 return -EINVAL; 1422 } 1423 if (unlikely(reloc->offset & 3)) { 1424 DRM_DEBUG("Relocation not 4-byte aligned: " 1425 "target %d offset %d.\n", 1426 reloc->target_handle, 1427 (int)reloc->offset); 1428 return -EINVAL; 1429 } 1430 1431 /* 1432 * If we write into the object, we need to force the synchronisation 1433 * barrier, either with an asynchronous clflush or if we executed the 1434 * patching using the GPU (though that should be serialised by the 1435 * timeline). To be completely sure, and since we are required to 1436 * do relocations we are already stalling, disable the user's opt 1437 * out of our synchronisation. 1438 */ 1439 *vma->exec_flags &= ~EXEC_OBJECT_ASYNC; 1440 1441 /* and update the user's relocation entry */ 1442 return relocate_entry(vma, reloc, eb, target); 1443 } 1444 1445 static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) 1446 { 1447 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1448 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1449 struct drm_i915_gem_relocation_entry __user *urelocs; 1450 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1451 unsigned int remain; 1452 1453 urelocs = u64_to_user_ptr(entry->relocs_ptr); 1454 remain = entry->relocation_count; 1455 #ifndef _LP64 /* XXX why, gcc, do you make it hard to be safe */ 1456 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1457 return -EINVAL; 1458 #endif 1459 1460 /* 1461 * We must check that the entire relocation array is safe 1462 * to read. However, if the array is not writable the user loses 1463 * the updated relocation values. 1464 */ 1465 if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) 1466 return -EFAULT; 1467 1468 do { 1469 struct drm_i915_gem_relocation_entry *r = stack; 1470 unsigned int count = 1471 min_t(unsigned int, remain, ARRAY_SIZE(stack)); 1472 unsigned int copied; 1473 1474 /* 1475 * This is the fast path and we cannot handle a pagefault 1476 * whilst holding the struct mutex lest the user pass in the 1477 * relocations contained within a mmaped bo. For in such a case 1478 * we, the page fault handler would call i915_gem_fault() and 1479 * we would try to acquire the struct mutex again. Obviously 1480 * this is bad and so lockdep complains vehemently. 
1481 */ 1482 #ifdef __NetBSD__ /* XXX copy fastpath */ 1483 copied = 1; 1484 #else 1485 pagefault_disable(); 1486 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1487 pagefault_enable(); 1488 #endif 1489 if (unlikely(copied)) { 1490 remain = -EFAULT; 1491 goto out; 1492 } 1493 1494 remain -= count; 1495 do { 1496 u64 offset = eb_relocate_entry(eb, vma, r); 1497 1498 if (likely(offset == 0)) { 1499 } else if ((s64)offset < 0) { 1500 remain = (int)offset; 1501 goto out; 1502 } else { 1503 /* 1504 * Note that reporting an error now 1505 * leaves everything in an inconsistent 1506 * state as we have *already* changed 1507 * the relocation value inside the 1508 * object. As we have not changed the 1509 * reloc.presumed_offset or will not 1510 * change the execobject.offset, on the 1511 * call we may not rewrite the value 1512 * inside the object, leaving it 1513 * dangling and causing a GPU hang. Unless 1514 * userspace dynamically rebuilds the 1515 * relocations on each execbuf rather than 1516 * presume a static tree. 1517 * 1518 * We did previously check if the relocations 1519 * were writable (access_ok), an error now 1520 * would be a strange race with mprotect, 1521 * having already demonstrated that we 1522 * can read from this userspace address. 1523 */ 1524 offset = gen8_canonical_addr(offset & ~UPDATE); 1525 if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { 1526 remain = -EFAULT; 1527 goto out; 1528 } 1529 } 1530 } while (r++, --count); 1531 urelocs += ARRAY_SIZE(stack); 1532 } while (remain); 1533 out: 1534 reloc_cache_reset(&eb->reloc_cache); 1535 return remain; 1536 } 1537 1538 static int 1539 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma) 1540 { 1541 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1542 struct drm_i915_gem_relocation_entry *relocs = 1543 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1544 unsigned int i; 1545 int err; 1546 1547 for (i = 0; i < entry->relocation_count; i++) { 1548 u64 offset = eb_relocate_entry(eb, vma, &relocs[i]); 1549 1550 if ((s64)offset < 0) { 1551 err = (int)offset; 1552 goto err; 1553 } 1554 } 1555 err = 0; 1556 err: 1557 reloc_cache_reset(&eb->reloc_cache); 1558 return err; 1559 } 1560 1561 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1562 { 1563 const char __user *addr, *end; 1564 unsigned long size; 1565 char __maybe_unused c; 1566 1567 size = entry->relocation_count; 1568 if (size == 0) 1569 return 0; 1570 1571 if (size > N_RELOC(ULONG_MAX)) 1572 return -EINVAL; 1573 1574 addr = u64_to_user_ptr(entry->relocs_ptr); 1575 size *= sizeof(struct drm_i915_gem_relocation_entry); 1576 if (!access_ok(addr, size)) 1577 return -EFAULT; 1578 1579 end = addr + size; 1580 for (; addr < end; addr += PAGE_SIZE) { 1581 int err = __get_user(c, addr); 1582 if (err) 1583 return err; 1584 } 1585 return __get_user(c, end - 1); 1586 } 1587 1588 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1589 { 1590 struct drm_i915_gem_relocation_entry *relocs; 1591 const unsigned int count = eb->buffer_count; 1592 unsigned int i; 1593 int err; 1594 1595 for (i = 0; i < count; i++) { 1596 const unsigned int nreloc = eb->exec[i].relocation_count; 1597 struct drm_i915_gem_relocation_entry __user *urelocs; 1598 unsigned long size; 1599 unsigned long copied; 1600 1601 if (nreloc == 0) 1602 continue; 1603 1604 err = check_relocations(&eb->exec[i]); 1605 if (err) 1606 goto err; 1607 1608 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1609 
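		/*
		 * Build a kernel-resident copy of this object's relocation
		 * array so that the slow path can process the entries later
		 * without faulting on user memory.
		 */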
size = nreloc * sizeof(*relocs); 1610 1611 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 1612 if (!relocs) { 1613 err = -ENOMEM; 1614 goto err; 1615 } 1616 1617 /* copy_from_user is limited to < 4GiB */ 1618 copied = 0; 1619 do { 1620 unsigned int len = 1621 min_t(u64, BIT_ULL(31), size - copied); 1622 1623 if (__copy_from_user((char *)relocs + copied, 1624 (char __user *)urelocs + copied, 1625 len)) 1626 goto end; 1627 1628 copied += len; 1629 } while (copied < size); 1630 1631 /* 1632 * As we do not update the known relocation offsets after 1633 * relocating (due to the complexities in lock handling), 1634 * we need to mark them as invalid now so that we force the 1635 * relocation processing next time. Just in case the target 1636 * object is evicted and then rebound into its old 1637 * presumed_offset before the next execbuffer - if that 1638 * happened we would make the mistake of assuming that the 1639 * relocations were valid. 1640 */ 1641 if (!user_access_begin(urelocs, size)) 1642 goto end; 1643 1644 for (copied = 0; copied < nreloc; copied++) 1645 unsafe_put_user(-1, 1646 &urelocs[copied].presumed_offset, 1647 end_user); 1648 user_access_end(); 1649 1650 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1651 } 1652 1653 return 0; 1654 1655 end_user: 1656 user_access_end(); 1657 end: 1658 kvfree(relocs); 1659 err = -EFAULT; 1660 err: 1661 while (i--) { 1662 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1663 if (eb->exec[i].relocation_count) 1664 kvfree(relocs); 1665 } 1666 return err; 1667 } 1668 1669 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1670 { 1671 const unsigned int count = eb->buffer_count; 1672 unsigned int i; 1673 1674 if (unlikely(i915_modparams.prefault_disable)) 1675 return 0; 1676 1677 for (i = 0; i < count; i++) { 1678 int err; 1679 1680 err = check_relocations(&eb->exec[i]); 1681 if (err) 1682 return err; 1683 } 1684 1685 return 0; 1686 } 1687 1688 static noinline int eb_relocate_slow(struct i915_execbuffer *eb) 1689 { 1690 struct drm_device *dev = &eb->i915->drm; 1691 bool have_copy = false; 1692 struct i915_vma *vma; 1693 int err = 0; 1694 1695 repeat: 1696 #ifdef __NetBSD__ 1697 if (sigispending(curlwp, 0)) { 1698 err = -ERESTARTSYS; 1699 goto out; 1700 } 1701 #else 1702 if (signal_pending(current)) { 1703 err = -ERESTARTSYS; 1704 goto out; 1705 } 1706 #endif 1707 1708 /* We may process another execbuffer during the unlock... */ 1709 eb_reset_vmas(eb); 1710 mutex_unlock(&dev->struct_mutex); 1711 1712 /* 1713 * We take 3 passes through the slowpatch. 1714 * 1715 * 1 - we try to just prefault all the user relocation entries and 1716 * then attempt to reuse the atomic pagefault disabled fast path again. 1717 * 1718 * 2 - we copy the user entries to a local buffer here outside of the 1719 * local and allow ourselves to wait upon any rendering before 1720 * relocations 1721 * 1722 * 3 - we already have a local copy of the relocation entries, but 1723 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
1724 */ 1725 if (!err) { 1726 err = eb_prefault_relocations(eb); 1727 } else if (!have_copy) { 1728 err = eb_copy_relocations(eb); 1729 have_copy = err == 0; 1730 } else { 1731 cond_resched(); 1732 err = 0; 1733 } 1734 if (err) { 1735 mutex_lock(&dev->struct_mutex); 1736 goto out; 1737 } 1738 1739 /* A frequent cause for EAGAIN are currently unavailable client pages */ 1740 flush_workqueue(eb->i915->mm.userptr_wq); 1741 1742 err = i915_mutex_lock_interruptible(dev); 1743 if (err) { 1744 mutex_lock(&dev->struct_mutex); 1745 goto out; 1746 } 1747 1748 /* reacquire the objects */ 1749 err = eb_lookup_vmas(eb); 1750 if (err) 1751 goto err; 1752 1753 GEM_BUG_ON(!eb->batch); 1754 1755 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1756 if (!have_copy) { 1757 #ifdef __NetBSD__ 1758 err = -EFAULT; 1759 #else 1760 pagefault_disable(); 1761 err = eb_relocate_vma(eb, vma); 1762 pagefault_enable(); 1763 #endif 1764 if (err) 1765 goto repeat; 1766 } else { 1767 err = eb_relocate_vma_slow(eb, vma); 1768 if (err) 1769 goto err; 1770 } 1771 } 1772 1773 /* 1774 * Leave the user relocations as are, this is the painfully slow path, 1775 * and we want to avoid the complication of dropping the lock whilst 1776 * having buffers reserved in the aperture and so causing spurious 1777 * ENOSPC for random operations. 1778 */ 1779 1780 err: 1781 if (err == -EAGAIN) 1782 goto repeat; 1783 1784 out: 1785 if (have_copy) { 1786 const unsigned int count = eb->buffer_count; 1787 unsigned int i; 1788 1789 for (i = 0; i < count; i++) { 1790 const struct drm_i915_gem_exec_object2 *entry = 1791 &eb->exec[i]; 1792 struct drm_i915_gem_relocation_entry *relocs; 1793 1794 if (!entry->relocation_count) 1795 continue; 1796 1797 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1798 kvfree(relocs); 1799 } 1800 } 1801 1802 return err; 1803 } 1804 1805 static int eb_relocate(struct i915_execbuffer *eb) 1806 { 1807 if (eb_lookup_vmas(eb)) 1808 goto slow; 1809 1810 /* The objects are in their final locations, apply the relocations. 
*/ 1811 if (eb->args->flags & __EXEC_HAS_RELOC) { 1812 struct i915_vma *vma; 1813 1814 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1815 if (eb_relocate_vma(eb, vma)) 1816 goto slow; 1817 } 1818 } 1819 1820 return 0; 1821 1822 slow: 1823 return eb_relocate_slow(eb); 1824 } 1825 1826 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1827 { 1828 const unsigned int count = eb->buffer_count; 1829 struct ww_acquire_ctx acquire; 1830 unsigned int i; 1831 int err = 0; 1832 1833 ww_acquire_init(&acquire, &reservation_ww_class); 1834 1835 for (i = 0; i < count; i++) { 1836 struct i915_vma *vma = eb->vma[i]; 1837 1838 err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); 1839 if (!err) 1840 continue; 1841 1842 GEM_BUG_ON(err == -EALREADY); /* No duplicate vma */ 1843 1844 if (err == -EDEADLK) { 1845 GEM_BUG_ON(i == 0); 1846 do { 1847 int j = i - 1; 1848 1849 ww_mutex_unlock(&eb->vma[j]->resv->lock); 1850 1851 swap(eb->flags[i], eb->flags[j]); 1852 swap(eb->vma[i], eb->vma[j]); 1853 eb->vma[i]->exec_flags = &eb->flags[i]; 1854 } while (--i); 1855 GEM_BUG_ON(vma != eb->vma[0]); 1856 vma->exec_flags = &eb->flags[0]; 1857 1858 err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, 1859 &acquire); 1860 } 1861 if (err) 1862 break; 1863 } 1864 ww_acquire_done(&acquire); 1865 1866 while (i--) { 1867 unsigned int flags = eb->flags[i]; 1868 struct i915_vma *vma = eb->vma[i]; 1869 struct drm_i915_gem_object *obj = vma->obj; 1870 1871 assert_vma_held(vma); 1872 1873 if (flags & EXEC_OBJECT_CAPTURE) { 1874 struct i915_capture_list *capture; 1875 1876 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1877 if (capture) { 1878 capture->next = eb->request->capture_list; 1879 capture->vma = vma; 1880 eb->request->capture_list = capture; 1881 } 1882 } 1883 1884 /* 1885 * If the GPU is not _reading_ through the CPU cache, we need 1886 * to make sure that any writes (both previous GPU writes from 1887 * before a change in snooping levels and normal CPU writes) 1888 * caught in that cache are flushed to main memory. 1889 * 1890 * We want to say 1891 * obj->cache_dirty && 1892 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1893 * but gcc's optimiser doesn't handle that as well and emits 1894 * two jumps instead of one. Maybe one day... 1895 */ 1896 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1897 if (i915_gem_clflush_object(obj, 0)) 1898 flags &= ~EXEC_OBJECT_ASYNC; 1899 } 1900 1901 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1902 err = i915_request_await_object 1903 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1904 } 1905 1906 if (err == 0) 1907 err = i915_vma_move_to_active(vma, eb->request, flags); 1908 1909 i915_vma_unlock(vma); 1910 1911 __eb_unreserve_vma(vma, flags); 1912 vma->exec_flags = NULL; 1913 if (err) 1914 eb->vma[i] = NULL; 1915 1916 if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) 1917 i915_vma_put(vma); 1918 } 1919 ww_acquire_fini(&acquire); 1920 1921 if (unlikely(err)) 1922 goto err_skip; 1923 1924 eb->exec = NULL; 1925 1926 /* Unconditionally flush any chipset caches (for streaming writes). 
*/ 1927 intel_gt_chipset_flush(eb->engine->gt); 1928 return 0; 1929 1930 err_skip: 1931 i915_request_skip(eb->request, err); 1932 return err; 1933 } 1934 1935 static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 1936 { 1937 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 1938 return -EINVAL; 1939 1940 /* Kernel clipping was a DRI1 misfeature */ 1941 if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) { 1942 if (exec->num_cliprects || exec->cliprects_ptr) 1943 return -EINVAL; 1944 } 1945 1946 if (exec->DR4 == 0xffffffff) { 1947 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 1948 exec->DR4 = 0; 1949 } 1950 if (exec->DR1 || exec->DR4) 1951 return -EINVAL; 1952 1953 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 1954 return -EINVAL; 1955 1956 return 0; 1957 } 1958 1959 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 1960 { 1961 u32 *cs; 1962 int i; 1963 1964 if (!IS_GEN(rq->i915, 7) || rq->engine->id != RCS0) { 1965 DRM_DEBUG("sol reset is gen7/rcs only\n"); 1966 return -EINVAL; 1967 } 1968 1969 cs = intel_ring_begin(rq, 4 * 2 + 2); 1970 if (IS_ERR(cs)) 1971 return PTR_ERR(cs); 1972 1973 *cs++ = MI_LOAD_REGISTER_IMM(4); 1974 for (i = 0; i < 4; i++) { 1975 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 1976 *cs++ = 0; 1977 } 1978 *cs++ = MI_NOOP; 1979 intel_ring_advance(rq, cs); 1980 1981 return 0; 1982 } 1983 1984 static struct i915_vma * 1985 shadow_batch_pin(struct drm_i915_gem_object *obj, 1986 struct i915_address_space *vm, 1987 unsigned int flags) 1988 { 1989 struct i915_vma *vma; 1990 int err; 1991 1992 vma = i915_vma_instance(obj, vm, NULL); 1993 if (IS_ERR(vma)) 1994 return vma; 1995 1996 err = i915_vma_pin(vma, 0, 0, flags); 1997 if (err) 1998 return ERR_PTR(err); 1999 2000 return vma; 2001 } 2002 2003 struct eb_parse_work { 2004 struct dma_fence_work base; 2005 struct intel_engine_cs *engine; 2006 struct i915_vma *batch; 2007 struct i915_vma *shadow; 2008 struct i915_vma *trampoline; 2009 unsigned int batch_offset; 2010 unsigned int batch_length; 2011 }; 2012 2013 static int __eb_parse(struct dma_fence_work *work) 2014 { 2015 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2016 2017 return intel_engine_cmd_parser(pw->engine, 2018 pw->batch, 2019 pw->batch_offset, 2020 pw->batch_length, 2021 pw->shadow, 2022 pw->trampoline); 2023 } 2024 2025 static void __eb_parse_release(struct dma_fence_work *work) 2026 { 2027 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2028 2029 if (pw->trampoline) 2030 i915_active_release(&pw->trampoline->active); 2031 i915_active_release(&pw->shadow->active); 2032 i915_active_release(&pw->batch->active); 2033 } 2034 2035 static const struct dma_fence_work_ops eb_parse_ops = { 2036 .name = "eb_parse", 2037 .work = __eb_parse, 2038 .release = __eb_parse_release, 2039 }; 2040 2041 static int eb_parse_pipeline(struct i915_execbuffer *eb, 2042 struct i915_vma *shadow, 2043 struct i915_vma *trampoline) 2044 { 2045 struct eb_parse_work *pw; 2046 int err; 2047 2048 pw = kzalloc(sizeof(*pw), GFP_KERNEL); 2049 if (!pw) 2050 return -ENOMEM; 2051 2052 err = i915_active_acquire(&eb->batch->active); 2053 if (err) 2054 goto err_free; 2055 2056 err = i915_active_acquire(&shadow->active); 2057 if (err) 2058 goto err_batch; 2059 2060 if (trampoline) { 2061 err = i915_active_acquire(&trampoline->active); 2062 if (err) 2063 goto err_shadow; 2064 } 2065 2066 dma_fence_work_init(&pw->base, &eb_parse_ops); 2067 2068 pw->engine = eb->engine; 2069 pw->batch = eb->batch; 2070 pw->batch_offset = 
eb->batch_start_offset; 2071 pw->batch_length = eb->batch_len; 2072 pw->shadow = shadow; 2073 pw->trampoline = trampoline; 2074 2075 err = dma_resv_lock_interruptible(pw->batch->resv, NULL); 2076 if (err) 2077 goto err_trampoline; 2078 2079 err = dma_resv_reserve_shared(pw->batch->resv, 1); 2080 if (err) 2081 goto err_batch_unlock; 2082 2083 /* Wait for all writes (and relocs) into the batch to complete */ 2084 err = i915_sw_fence_await_reservation(&pw->base.chain, 2085 pw->batch->resv, NULL, false, 2086 0, I915_FENCE_GFP); 2087 if (err < 0) 2088 goto err_batch_unlock; 2089 2090 /* Keep the batch alive and unwritten as we parse */ 2091 dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); 2092 2093 dma_resv_unlock(pw->batch->resv); 2094 2095 /* Force execution to wait for completion of the parser */ 2096 dma_resv_lock(shadow->resv, NULL); 2097 dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); 2098 dma_resv_unlock(shadow->resv); 2099 2100 dma_fence_work_commit(&pw->base); 2101 return 0; 2102 2103 err_batch_unlock: 2104 dma_resv_unlock(pw->batch->resv); 2105 err_trampoline: 2106 if (trampoline) 2107 i915_active_release(&trampoline->active); 2108 err_shadow: 2109 i915_active_release(&shadow->active); 2110 err_batch: 2111 i915_active_release(&eb->batch->active); 2112 err_free: 2113 kfree(pw); 2114 return err; 2115 } 2116 2117 static int eb_parse(struct i915_execbuffer *eb) 2118 { 2119 struct intel_engine_pool_node *pool; 2120 struct i915_vma *shadow, *trampoline; 2121 unsigned int len; 2122 int err; 2123 2124 if (!eb_use_cmdparser(eb)) 2125 return 0; 2126 2127 len = eb->batch_len; 2128 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2129 /* 2130 * ppGTT backed shadow buffers must be mapped RO, to prevent 2131 * post-scan tampering 2132 */ 2133 if (!eb->context->vm->has_read_only) { 2134 DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); 2135 return -EINVAL; 2136 } 2137 } else { 2138 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2139 } 2140 2141 pool = intel_engine_get_pool(eb->engine, len); 2142 if (IS_ERR(pool)) 2143 return PTR_ERR(pool); 2144 2145 shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER); 2146 if (IS_ERR(shadow)) { 2147 err = PTR_ERR(shadow); 2148 goto err; 2149 } 2150 i915_gem_object_set_readonly(shadow->obj); 2151 2152 trampoline = NULL; 2153 if (CMDPARSER_USES_GGTT(eb->i915)) { 2154 trampoline = shadow; 2155 2156 shadow = shadow_batch_pin(pool->obj, 2157 &eb->engine->gt->ggtt->vm, 2158 PIN_GLOBAL); 2159 if (IS_ERR(shadow)) { 2160 err = PTR_ERR(shadow); 2161 shadow = trampoline; 2162 goto err_shadow; 2163 } 2164 2165 eb->batch_flags |= I915_DISPATCH_SECURE; 2166 } 2167 2168 err = eb_parse_pipeline(eb, shadow, trampoline); 2169 if (err) 2170 goto err_trampoline; 2171 2172 eb->vma[eb->buffer_count] = i915_vma_get(shadow); 2173 eb->flags[eb->buffer_count] = 2174 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2175 shadow->exec_flags = &eb->flags[eb->buffer_count]; 2176 eb->buffer_count++; 2177 2178 eb->trampoline = trampoline; 2179 eb->batch_start_offset = 0; 2180 eb->batch = shadow; 2181 2182 shadow->private = pool; 2183 return 0; 2184 2185 err_trampoline: 2186 if (trampoline) 2187 i915_vma_unpin(trampoline); 2188 err_shadow: 2189 i915_vma_unpin(shadow); 2190 err: 2191 intel_engine_pool_put(pool); 2192 return err; 2193 } 2194 2195 static void 2196 add_to_client(struct i915_request *rq, struct drm_file *file) 2197 { 2198 struct drm_i915_file_private *file_priv = file->driver_priv; 2199 2200 rq->file_priv = file_priv; 2201 2202 
spin_lock(&file_priv->mm.lock); 2203 list_add_tail(&rq->client_link, &file_priv->mm.request_list); 2204 spin_unlock(&file_priv->mm.lock); 2205 } 2206 2207 static int eb_submit(struct i915_execbuffer *eb) 2208 { 2209 int err; 2210 2211 err = eb_move_to_gpu(eb); 2212 if (err) 2213 return err; 2214 2215 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2216 err = i915_reset_gen7_sol_offsets(eb->request); 2217 if (err) 2218 return err; 2219 } 2220 2221 /* 2222 * After we completed waiting for other engines (using HW semaphores) 2223 * then we can signal that this request/batch is ready to run. This 2224 * allows us to determine if the batch is still waiting on the GPU 2225 * or actually running by checking the breadcrumb. 2226 */ 2227 if (eb->engine->emit_init_breadcrumb) { 2228 err = eb->engine->emit_init_breadcrumb(eb->request); 2229 if (err) 2230 return err; 2231 } 2232 2233 err = eb->engine->emit_bb_start(eb->request, 2234 eb->batch->node.start + 2235 eb->batch_start_offset, 2236 eb->batch_len, 2237 eb->batch_flags); 2238 if (err) 2239 return err; 2240 2241 if (eb->trampoline) { 2242 GEM_BUG_ON(eb->batch_start_offset); 2243 err = eb->engine->emit_bb_start(eb->request, 2244 eb->trampoline->node.start + 2245 eb->batch_len, 2246 0, 0); 2247 if (err) 2248 return err; 2249 } 2250 2251 if (intel_context_nopreempt(eb->context)) 2252 __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2253 2254 return 0; 2255 } 2256 2257 static int num_vcs_engines(const struct drm_i915_private *i915) 2258 { 2259 return hweight64(INTEL_INFO(i915)->engine_mask & 2260 GENMASK_ULL(VCS0 + I915_MAX_VCS - 1, VCS0)); 2261 } 2262 2263 /* 2264 * Find one BSD ring to dispatch the corresponding BSD command. 2265 * The engine index is returned. 2266 */ 2267 static unsigned int 2268 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2269 struct drm_file *file) 2270 { 2271 struct drm_i915_file_private *file_priv = file->driver_priv; 2272 2273 /* Check whether the file_priv has already selected one ring. */ 2274 if ((int)file_priv->bsd_engine < 0) 2275 file_priv->bsd_engine = 2276 get_random_int() % num_vcs_engines(dev_priv); 2277 2278 return file_priv->bsd_engine; 2279 } 2280 2281 static const enum intel_engine_id user_ring_map[] = { 2282 [I915_EXEC_DEFAULT] = RCS0, 2283 [I915_EXEC_RENDER] = RCS0, 2284 [I915_EXEC_BLT] = BCS0, 2285 [I915_EXEC_BSD] = VCS0, 2286 [I915_EXEC_VEBOX] = VECS0 2287 }; 2288 2289 static struct i915_request *eb_throttle(struct intel_context *ce) 2290 { 2291 struct intel_ring *ring = ce->ring; 2292 struct intel_timeline *tl = ce->timeline; 2293 struct i915_request *rq; 2294 2295 /* 2296 * Completely unscientific finger-in-the-air estimates for suitable 2297 * maximum user request size (to avoid blocking) and then backoff. 2298 */ 2299 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2300 return NULL; 2301 2302 /* 2303 * Find a request that after waiting upon, there will be at least half 2304 * the ring available. The hysteresis allows us to compete for the 2305 * shared ring and should mean that we sleep less often prior to 2306 * claiming our resources, but not so long that the ring completely 2307 * drains before we can submit our next request. 
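         *
         * Illustrative example (numbers invented for this comment, not taken
         * from the code): with a 16 KiB ring that currently reports less than
         * PAGE_SIZE of free space, we walk the timeline until we find the
         * first request on this ring whose completion would leave more than
         * 8 KiB free, and wait on that one rather than on the very next
         * request to retire.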
2308 */ 2309 list_for_each_entry(rq, &tl->requests, link) { 2310 if (rq->ring != ring) 2311 continue; 2312 2313 if (__intel_ring_space(rq->postfix, 2314 ring->emit, ring->size) > ring->size / 2) 2315 break; 2316 } 2317 if (&rq->link == &tl->requests) 2318 return NULL; /* weird, we will check again later for real */ 2319 2320 return i915_request_get(rq); 2321 } 2322 2323 static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) 2324 { 2325 struct intel_timeline *tl; 2326 struct i915_request *rq; 2327 int err; 2328 2329 /* 2330 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2331 * EIO if the GPU is already wedged. 2332 */ 2333 err = intel_gt_terminally_wedged(ce->engine->gt); 2334 if (err) 2335 return err; 2336 2337 if (unlikely(intel_context_is_banned(ce))) 2338 return -EIO; 2339 2340 /* 2341 * Pinning the contexts may generate requests in order to acquire 2342 * GGTT space, so do this first before we reserve a seqno for 2343 * ourselves. 2344 */ 2345 err = intel_context_pin(ce); 2346 if (err) 2347 return err; 2348 2349 /* 2350 * Take a local wakeref for preparing to dispatch the execbuf as 2351 * we expect to access the hardware fairly frequently in the 2352 * process, and require the engine to be kept awake between accesses. 2353 * Upon dispatch, we acquire another prolonged wakeref that we hold 2354 * until the timeline is idle, which in turn releases the wakeref 2355 * taken on the engine, and the parent device. 2356 */ 2357 tl = intel_context_timeline_lock(ce); 2358 if (IS_ERR(tl)) { 2359 err = PTR_ERR(tl); 2360 goto err_unpin; 2361 } 2362 2363 intel_context_enter(ce); 2364 rq = eb_throttle(ce); 2365 2366 intel_context_timeline_unlock(tl); 2367 2368 if (rq) { 2369 if (i915_request_wait(rq, 2370 I915_WAIT_INTERRUPTIBLE, 2371 MAX_SCHEDULE_TIMEOUT) < 0) { 2372 i915_request_put(rq); 2373 err = -EINTR; 2374 goto err_exit; 2375 } 2376 2377 i915_request_put(rq); 2378 } 2379 2380 eb->engine = ce->engine; 2381 eb->context = ce; 2382 return 0; 2383 2384 err_exit: 2385 mutex_lock(&tl->mutex); 2386 intel_context_exit(ce); 2387 intel_context_timeline_unlock(tl); 2388 err_unpin: 2389 intel_context_unpin(ce); 2390 return err; 2391 } 2392 2393 static void eb_unpin_engine(struct i915_execbuffer *eb) 2394 { 2395 struct intel_context *ce = eb->context; 2396 struct intel_timeline *tl = ce->timeline; 2397 2398 mutex_lock(&tl->mutex); 2399 intel_context_exit(ce); 2400 mutex_unlock(&tl->mutex); 2401 2402 intel_context_unpin(ce); 2403 } 2404 2405 static unsigned int 2406 eb_select_legacy_ring(struct i915_execbuffer *eb, 2407 struct drm_file *file, 2408 struct drm_i915_gem_execbuffer2 *args) 2409 { 2410 struct drm_i915_private *i915 = eb->i915; 2411 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2412 2413 if (user_ring_id != I915_EXEC_BSD && 2414 (args->flags & I915_EXEC_BSD_MASK)) { 2415 DRM_DEBUG("execbuf with non bsd ring but with invalid " 2416 "bsd dispatch flags: %d\n", (int)(args->flags)); 2417 return -1; 2418 } 2419 2420 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2421 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2422 2423 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2424 bsd_idx = gen8_dispatch_bsd_engine(i915, file); 2425 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2426 bsd_idx <= I915_EXEC_BSD_RING2) { 2427 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2428 bsd_idx--; 2429 } else { 2430 DRM_DEBUG("execbuf with unknown bsd ring: %u\n", 2431 bsd_idx); 2432 return -1; 2433 } 2434 2435 return _VCS(bsd_idx); 2436 } 2437 2438 if 
(user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2439 DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id); 2440 return -1; 2441 } 2442 2443 return user_ring_map[user_ring_id]; 2444 } 2445 2446 static int 2447 eb_pin_engine(struct i915_execbuffer *eb, 2448 struct drm_file *file, 2449 struct drm_i915_gem_execbuffer2 *args) 2450 { 2451 struct intel_context *ce; 2452 unsigned int idx; 2453 int err; 2454 2455 if (i915_gem_context_user_engines(eb->gem_context)) 2456 idx = args->flags & I915_EXEC_RING_MASK; 2457 else 2458 idx = eb_select_legacy_ring(eb, file, args); 2459 2460 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2461 if (IS_ERR(ce)) 2462 return PTR_ERR(ce); 2463 2464 err = __eb_pin_engine(eb, ce); 2465 intel_context_put(ce); 2466 2467 return err; 2468 } 2469 2470 static void 2471 __free_fence_array(struct drm_syncobj **fences, unsigned int n) 2472 { 2473 while (n--) 2474 drm_syncobj_put(ptr_mask_bits(fences[n], 2)); 2475 kvfree(fences); 2476 } 2477 2478 static struct drm_syncobj ** 2479 get_fence_array(struct drm_i915_gem_execbuffer2 *args, 2480 struct drm_file *file) 2481 { 2482 const unsigned long nfences = args->num_cliprects; 2483 struct drm_i915_gem_exec_fence __user *user; 2484 struct drm_syncobj **fences; 2485 unsigned long n; 2486 int err; 2487 2488 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 2489 return NULL; 2490 2491 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 2492 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 2493 if (nfences > min_t(unsigned long, 2494 ULONG_MAX / sizeof(*user), 2495 SIZE_MAX / sizeof(*fences))) 2496 return ERR_PTR(-EINVAL); 2497 2498 user = u64_to_user_ptr(args->cliprects_ptr); 2499 if (!access_ok(user, nfences * sizeof(*user))) 2500 return ERR_PTR(-EFAULT); 2501 2502 fences = kvmalloc_array(nfences, sizeof(*fences), 2503 __GFP_NOWARN | GFP_KERNEL); 2504 if (!fences) 2505 return ERR_PTR(-ENOMEM); 2506 2507 for (n = 0; n < nfences; n++) { 2508 struct drm_i915_gem_exec_fence fence; 2509 struct drm_syncobj *syncobj; 2510 2511 if (__copy_from_user(&fence, user++, sizeof(fence))) { 2512 err = -EFAULT; 2513 goto err; 2514 } 2515 2516 if (fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) { 2517 err = -EINVAL; 2518 goto err; 2519 } 2520 2521 syncobj = drm_syncobj_find(file, fence.handle); 2522 if (!syncobj) { 2523 DRM_DEBUG("Invalid syncobj handle provided\n"); 2524 err = -ENOENT; 2525 goto err; 2526 } 2527 2528 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 2529 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 2530 2531 fences[n] = ptr_pack_bits(syncobj, fence.flags, 2); 2532 } 2533 2534 return fences; 2535 2536 err: 2537 __free_fence_array(fences, n); 2538 return ERR_PTR(err); 2539 } 2540 2541 static void 2542 put_fence_array(struct drm_i915_gem_execbuffer2 *args, 2543 struct drm_syncobj **fences) 2544 { 2545 if (fences) 2546 __free_fence_array(fences, args->num_cliprects); 2547 } 2548 2549 static int 2550 await_fence_array(struct i915_execbuffer *eb, 2551 struct drm_syncobj **fences) 2552 { 2553 const unsigned int nfences = eb->args->num_cliprects; 2554 unsigned int n; 2555 int err; 2556 2557 for (n = 0; n < nfences; n++) { 2558 struct drm_syncobj *syncobj; 2559 struct dma_fence *fence; 2560 unsigned int flags; 2561 2562 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2563 if (!(flags & I915_EXEC_FENCE_WAIT)) 2564 continue; 2565 2566 fence = drm_syncobj_fence_get(syncobj); 2567 if (!fence) 2568 return -EINVAL; 2569 2570 err = i915_request_await_dma_fence(eb->request, fence); 2571 dma_fence_put(fence); 2572 if (err < 0) 2573 return 
err; 2574 } 2575 2576 return 0; 2577 } 2578 2579 static void 2580 signal_fence_array(struct i915_execbuffer *eb, 2581 struct drm_syncobj **fences) 2582 { 2583 const unsigned int nfences = eb->args->num_cliprects; 2584 struct dma_fence * const fence = &eb->request->fence; 2585 unsigned int n; 2586 2587 for (n = 0; n < nfences; n++) { 2588 struct drm_syncobj *syncobj; 2589 unsigned int flags; 2590 2591 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2592 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 2593 continue; 2594 2595 drm_syncobj_replace_fence(syncobj, fence); 2596 } 2597 } 2598 2599 static int 2600 i915_gem_do_execbuffer(struct drm_device *dev, 2601 struct drm_file *file, 2602 struct drm_i915_gem_execbuffer2 *args, 2603 struct drm_i915_gem_exec_object2 *exec, 2604 struct drm_syncobj **fences) 2605 { 2606 struct drm_i915_private *i915 = to_i915(dev); 2607 struct i915_execbuffer eb; 2608 struct dma_fence *in_fence = NULL; 2609 struct dma_fence *exec_fence = NULL; 2610 struct sync_file *out_fence = NULL; 2611 int out_fence_fd = -1; 2612 #ifdef __NetBSD__ 2613 struct file *fp = NULL; 2614 #endif 2615 int err; 2616 2617 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 2618 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2619 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2620 2621 eb.i915 = i915; 2622 eb.file = file; 2623 eb.args = args; 2624 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2625 args->flags |= __EXEC_HAS_RELOC; 2626 2627 eb.exec = exec; 2628 eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1); 2629 eb.vma[0] = NULL; 2630 eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1); 2631 2632 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 2633 reloc_cache_init(&eb.reloc_cache, eb.i915); 2634 2635 eb.buffer_count = args->buffer_count; 2636 eb.batch_start_offset = args->batch_start_offset; 2637 eb.batch_len = args->batch_len; 2638 eb.trampoline = NULL; 2639 2640 eb.batch_flags = 0; 2641 if (args->flags & I915_EXEC_SECURE) { 2642 if (INTEL_GEN(i915) >= 11) 2643 return -ENODEV; 2644 2645 /* Return -EPERM to trigger fallback code on old binaries. 
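                 * Secure dispatch is also refused below unless the caller is
                 * the current DRM master and holds CAP_SYS_ADMIN.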
*/ 2646 if (!HAS_SECURE_BATCHES(i915)) 2647 return -EPERM; 2648 2649 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2650 return -EPERM; 2651 2652 eb.batch_flags |= I915_DISPATCH_SECURE; 2653 } 2654 if (args->flags & I915_EXEC_IS_PINNED) 2655 eb.batch_flags |= I915_DISPATCH_PINNED; 2656 2657 if (args->flags & I915_EXEC_FENCE_IN) { 2658 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2659 if (!in_fence) 2660 return -EINVAL; 2661 } 2662 2663 if (args->flags & I915_EXEC_FENCE_SUBMIT) { 2664 if (in_fence) { 2665 err = -EINVAL; 2666 goto err_in_fence; 2667 } 2668 2669 exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2670 if (!exec_fence) { 2671 err = -EINVAL; 2672 goto err_in_fence; 2673 } 2674 } 2675 2676 if (args->flags & I915_EXEC_FENCE_OUT) { 2677 #ifdef __NetBSD__ 2678 err = -fd_allocfile(&fp, &out_fence_fd); 2679 if (err) 2680 goto err_exec_fence; 2681 #else 2682 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 2683 if (out_fence_fd < 0) { 2684 err = out_fence_fd; 2685 goto err_exec_fence; 2686 } 2687 #endif 2688 } 2689 2690 err = eb_create(&eb); 2691 if (err) 2692 goto err_out_fence; 2693 2694 GEM_BUG_ON(!eb.lut_size); 2695 2696 err = eb_select_context(&eb); 2697 if (unlikely(err)) 2698 goto err_destroy; 2699 2700 err = eb_pin_engine(&eb, file, args); 2701 if (unlikely(err)) 2702 goto err_context; 2703 2704 err = i915_mutex_lock_interruptible(dev); 2705 if (err) 2706 goto err_engine; 2707 2708 err = eb_relocate(&eb); 2709 if (err) { 2710 /* 2711 * If the user expects the execobject.offset and 2712 * reloc.presumed_offset to be an exact match, 2713 * as for using NO_RELOC, then we cannot update 2714 * the execobject.offset until we have completed 2715 * relocation. 2716 */ 2717 args->flags &= ~__EXEC_HAS_RELOC; 2718 goto err_vma; 2719 } 2720 2721 if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) { 2722 DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); 2723 err = -EINVAL; 2724 goto err_vma; 2725 } 2726 if (eb.batch_start_offset > eb.batch->size || 2727 eb.batch_len > eb.batch->size - eb.batch_start_offset) { 2728 DRM_DEBUG("Attempting to use out-of-bounds batch\n"); 2729 err = -EINVAL; 2730 goto err_vma; 2731 } 2732 2733 if (eb.batch_len == 0) 2734 eb.batch_len = eb.batch->size - eb.batch_start_offset; 2735 2736 err = eb_parse(&eb); 2737 if (err) 2738 goto err_vma; 2739 2740 /* 2741 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2742 * batch" bit. Hence we need to pin secure batches into the global gtt. 2743 * hsw should have this fixed, but bdw mucks it up again. */ 2744 if (eb.batch_flags & I915_DISPATCH_SECURE) { 2745 struct i915_vma *vma; 2746 2747 /* 2748 * So on first glance it looks freaky that we pin the batch here 2749 * outside of the reservation loop. But: 2750 * - The batch is already pinned into the relevant ppgtt, so we 2751 * already have the backing storage fully allocated. 2752 * - No other BO uses the global gtt (well contexts, but meh), 2753 * so we don't really have issues with multiple objects not 2754 * fitting due to fragmentation. 2755 * So this is actually safe. 2756 */ 2757 vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0); 2758 if (IS_ERR(vma)) { 2759 err = PTR_ERR(vma); 2760 goto err_vma; 2761 } 2762 2763 eb.batch = vma; 2764 } 2765 2766 /* All GPU relocation batches must be submitted prior to the user rq */ 2767 GEM_BUG_ON(eb.reloc_cache.rq); 2768 2769 /* Allocate a request for this batch buffer nice and early. 
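         * Once the request exists, the incoming fences (FENCE_IN,
         * FENCE_SUBMIT and any syncobj waits from the fence array) are
         * attached to it below, before it is queued.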
         */
        eb.request = i915_request_create(eb.context);
        if (IS_ERR(eb.request)) {
                err = PTR_ERR(eb.request);
                goto err_batch_unpin;
        }

        if (in_fence) {
                err = i915_request_await_dma_fence(eb.request, in_fence);
                if (err < 0)
                        goto err_request;
        }

        if (exec_fence) {
                err = i915_request_await_execution(eb.request, exec_fence,
                                                   eb.engine->bond_execute);
                if (err < 0)
                        goto err_request;
        }

        if (fences) {
                err = await_fence_array(&eb, fences);
                if (err)
                        goto err_request;
        }

        if (out_fence_fd != -1) {
#ifdef __NetBSD__
                out_fence = sync_file_create(&eb.request->fence, fp);
#else
                out_fence = sync_file_create(&eb.request->fence);
#endif
                if (!out_fence) {
                        err = -ENOMEM;
                        goto err_request;
                }
        }

        /*
         * Whilst this request exists, batch_obj will be on the
         * active_list, and so will hold the active reference. Only when this
         * request is retired will the batch_obj be moved onto the
         * inactive_list and lose its active reference. Hence we do not need
         * to explicitly hold another reference here.
         */
        eb.request->batch = eb.batch;
        if (eb.batch->private)
                intel_engine_pool_mark_active(eb.batch->private, eb.request);

        trace_i915_request_queue(eb.request, eb.batch_flags);
        err = eb_submit(&eb);
err_request:
        add_to_client(eb.request, file);
        i915_request_get(eb.request);
        i915_request_add(eb.request);

        if (fences)
                signal_fence_array(&eb, fences);

        if (out_fence) {
                if (err == 0) {
#ifdef __NetBSD__
                        fd_affix(curproc, fp, out_fence_fd);
#else
                        fd_install(out_fence_fd, out_fence->file);
#endif
                        args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
                        args->rsvd2 |= (u64)out_fence_fd << 32;
                        out_fence_fd = -1;
                } else {
#ifdef __NetBSD__
                        fd_abort(curproc, fp, out_fence_fd);
                        out_fence_fd = -1;
                        fp = NULL;
#else
                        fput(out_fence->file);
#endif
                }
        }
        i915_request_put(eb.request);

err_batch_unpin:
        if (eb.batch_flags & I915_DISPATCH_SECURE)
                i915_vma_unpin(eb.batch);
        if (eb.batch->private)
                intel_engine_pool_put(eb.batch->private);
err_vma:
        if (eb.exec)
                eb_release_vmas(&eb);
        if (eb.trampoline)
                i915_vma_unpin(eb.trampoline);
        mutex_unlock(&dev->struct_mutex);
err_engine:
        eb_unpin_engine(&eb);
err_context:
        i915_gem_context_put(eb.gem_context);
err_destroy:
        eb_destroy(&eb);
err_out_fence:
        if (out_fence_fd != -1)
#ifdef __NetBSD__
                fd_abort(curproc, fp, out_fence_fd);
#else
                put_unused_fd(out_fence_fd);
#endif
err_exec_fence:
        dma_fence_put(exec_fence);
err_in_fence:
        dma_fence_put(in_fence);
        return err;
}

static size_t eb_element_size(void)
{
        return (sizeof(struct drm_i915_gem_exec_object2) +
                sizeof(struct i915_vma *) +
                sizeof(unsigned int));
}

static bool check_buffer_count(size_t count)
{
        const size_t sz = eb_element_size();

        /*
         * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup
         * array size (see eb_create()). Otherwise, we can accept an array as
         * large as can be addressed (though use large arrays at your peril)!
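         *
         * Rough worked example (assuming a typical LP64 build; the exact
         * numbers are not guaranteed): eb_element_size() is
         * sizeof(struct drm_i915_gem_exec_object2) plus a pointer and an
         * unsigned int, i.e. on the order of 64-72 bytes, so SIZE_MAX / sz - 1
         * is astronomically large and the INT_MAX bound is the one that
         * matters in practice.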
2896 */ 2897 2898 return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1); 2899 } 2900 2901 /* 2902 * Legacy execbuffer just creates an exec2 list from the original exec object 2903 * list array and passes it to the real function. 2904 */ 2905 int 2906 i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, 2907 struct drm_file *file) 2908 { 2909 struct drm_i915_gem_execbuffer *args = data; 2910 struct drm_i915_gem_execbuffer2 exec2; 2911 struct drm_i915_gem_exec_object *exec_list = NULL; 2912 struct drm_i915_gem_exec_object2 *exec2_list = NULL; 2913 const size_t count = args->buffer_count; 2914 unsigned int i; 2915 int err; 2916 2917 if (!check_buffer_count(count)) { 2918 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2919 return -EINVAL; 2920 } 2921 2922 exec2.buffers_ptr = args->buffers_ptr; 2923 exec2.buffer_count = args->buffer_count; 2924 exec2.batch_start_offset = args->batch_start_offset; 2925 exec2.batch_len = args->batch_len; 2926 exec2.DR1 = args->DR1; 2927 exec2.DR4 = args->DR4; 2928 exec2.num_cliprects = args->num_cliprects; 2929 exec2.cliprects_ptr = args->cliprects_ptr; 2930 exec2.flags = I915_EXEC_RENDER; 2931 i915_execbuffer2_set_context_id(exec2, 0); 2932 2933 err = i915_gem_check_execbuffer(&exec2); 2934 if (err) 2935 return err; 2936 2937 /* Copy in the exec list from userland */ 2938 exec_list = kvmalloc_array(count, sizeof(*exec_list), 2939 __GFP_NOWARN | GFP_KERNEL); 2940 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2941 __GFP_NOWARN | GFP_KERNEL); 2942 if (exec_list == NULL || exec2_list == NULL) { 2943 DRM_DEBUG("Failed to allocate exec list for %d buffers\n", 2944 args->buffer_count); 2945 kvfree(exec_list); 2946 kvfree(exec2_list); 2947 return -ENOMEM; 2948 } 2949 err = copy_from_user(exec_list, 2950 u64_to_user_ptr(args->buffers_ptr), 2951 sizeof(*exec_list) * count); 2952 if (err) { 2953 DRM_DEBUG("copy %d exec entries failed %d\n", 2954 args->buffer_count, err); 2955 kvfree(exec_list); 2956 kvfree(exec2_list); 2957 return -EFAULT; 2958 } 2959 2960 for (i = 0; i < args->buffer_count; i++) { 2961 exec2_list[i].handle = exec_list[i].handle; 2962 exec2_list[i].relocation_count = exec_list[i].relocation_count; 2963 exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr; 2964 exec2_list[i].alignment = exec_list[i].alignment; 2965 exec2_list[i].offset = exec_list[i].offset; 2966 if (INTEL_GEN(to_i915(dev)) < 4) 2967 exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE; 2968 else 2969 exec2_list[i].flags = 0; 2970 } 2971 2972 err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL); 2973 if (exec2.flags & __EXEC_HAS_RELOC) { 2974 struct drm_i915_gem_exec_object __user *user_exec_list = 2975 u64_to_user_ptr(args->buffers_ptr); 2976 2977 /* Copy the new buffer offsets back to the user's exec list. 
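                 * Only entries whose offset carries the UPDATE bit are
                 * written back, and the bit is masked off before the value
                 * reaches userspace.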
*/ 2978 for (i = 0; i < args->buffer_count; i++) { 2979 if (!(exec2_list[i].offset & UPDATE)) 2980 continue; 2981 2982 exec2_list[i].offset = 2983 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2984 exec2_list[i].offset &= PIN_OFFSET_MASK; 2985 if (__copy_to_user(&user_exec_list[i].offset, 2986 &exec2_list[i].offset, 2987 sizeof(user_exec_list[i].offset))) 2988 break; 2989 } 2990 } 2991 2992 kvfree(exec_list); 2993 kvfree(exec2_list); 2994 return err; 2995 } 2996 2997 int 2998 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 2999 struct drm_file *file) 3000 { 3001 struct drm_i915_gem_execbuffer2 *args = data; 3002 struct drm_i915_gem_exec_object2 *exec2_list; 3003 struct drm_syncobj **fences = NULL; 3004 const size_t count = args->buffer_count; 3005 int err; 3006 3007 if (!check_buffer_count(count)) { 3008 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 3009 return -EINVAL; 3010 } 3011 3012 err = i915_gem_check_execbuffer(args); 3013 if (err) 3014 return err; 3015 3016 /* Allocate an extra slot for use by the command parser */ 3017 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 3018 __GFP_NOWARN | GFP_KERNEL); 3019 if (exec2_list == NULL) { 3020 DRM_DEBUG("Failed to allocate exec list for %zd buffers\n", 3021 count); 3022 return -ENOMEM; 3023 } 3024 if (copy_from_user(exec2_list, 3025 u64_to_user_ptr(args->buffers_ptr), 3026 sizeof(*exec2_list) * count)) { 3027 DRM_DEBUG("copy %zd exec entries failed\n", count); 3028 kvfree(exec2_list); 3029 return -EFAULT; 3030 } 3031 3032 if (args->flags & I915_EXEC_FENCE_ARRAY) { 3033 fences = get_fence_array(args, file); 3034 if (IS_ERR(fences)) { 3035 kvfree(exec2_list); 3036 return PTR_ERR(fences); 3037 } 3038 } 3039 3040 err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences); 3041 3042 /* 3043 * Now that we have begun execution of the batchbuffer, we ignore 3044 * any new error after this point. Also given that we have already 3045 * updated the associated relocations, we try to write out the current 3046 * object locations irrespective of any error. 3047 */ 3048 if (args->flags & __EXEC_HAS_RELOC) { 3049 struct drm_i915_gem_exec_object2 __user *user_exec_list = 3050 u64_to_user_ptr(args->buffers_ptr); 3051 unsigned int i; 3052 3053 /* Copy the new buffer offsets back to the user's exec list. */ 3054 /* 3055 * Note: count * sizeof(*user_exec_list) does not overflow, 3056 * because we checked 'count' in check_buffer_count(). 3057 * 3058 * And this range already got effectively checked earlier 3059 * when we did the "copy_from_user()" above. 3060 */ 3061 if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) 3062 goto end; 3063 3064 for (i = 0; i < args->buffer_count; i++) { 3065 if (!(exec2_list[i].offset & UPDATE)) 3066 continue; 3067 3068 exec2_list[i].offset = 3069 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 3070 unsafe_put_user(exec2_list[i].offset, 3071 &user_exec_list[i].offset, 3072 end_user); 3073 } 3074 end_user: 3075 user_access_end(); 3076 end:; 3077 } 3078 3079 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 3080 put_fence_array(args, fences); 3081 kvfree(exec2_list); 3082 return err; 3083 } 3084
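
/*
 * Illustrative userspace sketch (not part of this driver).  A minimal
 * drm_i915_gem_execbuffer2 submission serviced by this file might look like
 * the snippet below.  It assumes libdrm's drmIoctl(), an open i915 fd, a GEM
 * handle for the batch, and a context id and presumed offset obtained from an
 * earlier submission; those names are placeholders, not values defined here.
 * The I915_EXEC_NO_RELOC and I915_EXEC_FENCE_OUT usage mirrors what
 * i915_gem_do_execbuffer() above expects.
 *
 *	struct drm_i915_gem_exec_object2 obj = {
 *		.handle = batch_handle,
 *		.offset = presumed_offset,	// from a previous execbuf
 *	};
 *	struct drm_i915_gem_execbuffer2 eb = {
 *		.buffers_ptr = (__u64)(uintptr_t)&obj,
 *		.buffer_count = 1,
 *		.batch_len = 0,			// 0: kernel uses the whole object
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC |
 *			 I915_EXEC_FENCE_OUT,
 *	};
 *	int out_fence_fd = -1;
 *
 *	i915_execbuffer2_set_context_id(eb, ctx_id);
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &eb) == 0)
 *		out_fence_fd = eb.rsvd2 >> 32;	// FENCE_OUT fd, see above
 */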