/*	$NetBSD: i915_request.c,v 1.17 2022/07/11 18:56:00 riastradh Exp $	*/

/*
 * Copyright 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: i915_request.c,v 1.17 2022/07/11 18:56:00 riastradh Exp $");

#include <linux/dma-fence-array.h>
#include <linux/irq_work.h>
#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"
#include "gt/intel_ring.h"
#include "gt/intel_rps.h"

#include "i915_active.h"
#include "i915_drv.h"
#include "i915_globals.h"
#include "i915_trace.h"
#include "intel_pm.h"

struct execute_cb {
	struct list_head link;
	struct irq_work work;
	struct i915_sw_fence *fence;
	void (*hook)(struct i915_request *rq, struct dma_fence *signal);
	struct i915_request *signal;
};

static struct i915_global_request {
	struct i915_global base;
	struct kmem_cache *slab_requests;
	struct kmem_cache *slab_dependencies;
	struct kmem_cache *slab_execute_cbs;
} global;

static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return dev_name(to_request(fence)->i915->drm.dev);
}

static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	const struct i915_gem_context *ctx;

	/*
	 * The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	ctx = i915_request_gem_context(to_request(fence));
	if (!ctx)
		return "[" DRIVER_NAME "]";

	return ctx->name;
}

static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_request_completed(to_request(fence));
}

static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	return i915_request_enable_breadcrumb(to_request(fence));
}

static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_request_wait(to_request(fence),
				 interruptible | I915_WAIT_PRIORITY,
				 timeout);
}

static void i915_fence_release(struct dma_fence *fence)
{
	struct i915_request *rq = to_request(fence);

	/*
	 * The request is put onto a RCU freelist (i.e. the address
	 * is immediately reused), mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
#ifndef __NetBSD__
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);
#endif

	kmem_cache_free(global.slab_requests, rq);
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};

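/*
 * Execute-callback plumbing: irq_execute_cb() runs from irq_work context
 * once the signaler has reached hardware, completing the waiter's submit
 * fence and freeing the execute_cb.  irq_execute_cb_hook() additionally
 * invokes the caller-supplied hook with the signaling fence before
 * dropping its request reference.
 */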
static void irq_execute_cb(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	i915_sw_fence_complete(cb->fence);
	kmem_cache_free(global.slab_execute_cbs, cb);
}

static void irq_execute_cb_hook(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	cb->hook(container_of(cb->fence, struct i915_request, submit),
		 &cb->signal->fence);
	i915_request_put(cb->signal);

	irq_execute_cb(wrk);
}

static void __notify_execute_cb(struct i915_request *rq)
{
	struct execute_cb *cb;

	lockdep_assert_held(&rq->lock);

	if (list_empty(&rq->execute_cb))
		return;

	list_for_each_entry(cb, &rq->execute_cb, link)
		irq_work_queue(&cb->work);

	/*
	 * XXX Rollback on __i915_request_unsubmit()
	 *
	 * In the future, perhaps when we have an active time-slicing scheduler,
	 * it will be interesting to unsubmit parallel execution and remove
	 * busywaits from the GPU until their master is restarted. This is
	 * quite hairy, we have to carefully rollback the fence and do a
	 * preempt-to-idle cycle on the target engine, all the while the
	 * master execute_cb may refire.
	 */
	INIT_LIST_HEAD(&rq->execute_cb);
}

static inline void
remove_from_client(struct i915_request *request)
{
	struct drm_i915_file_private *file_priv;

	if (!READ_ONCE(request->file_priv))
		return;

	rcu_read_lock();
	file_priv = xchg(&request->file_priv, NULL);
	if (file_priv) {
		spin_lock(&file_priv->mm.lock);
		list_del(&request->client_link);
		spin_unlock(&file_priv->mm.lock);
	}
	rcu_read_unlock();
}

static void free_capture_list(struct i915_request *request)
{
	struct i915_capture_list *capture;

	capture = fetch_and_zero(&request->capture_list);
	while (capture) {
		struct i915_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

static void remove_from_engine(struct i915_request *rq)
{
	struct intel_engine_cs *engine, *locked;

	/*
	 * Virtual engines complicate acquiring the engine timeline lock,
	 * as their rq->engine pointer is not stable until under that
	 * engine lock. The simple ploy we use is to take the lock then
	 * check that the rq still belongs to the newly locked engine.
	 */
	locked = READ_ONCE(rq->engine);
	spin_lock_irq(&locked->active.lock);
	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
		spin_unlock(&locked->active.lock);
		spin_lock(&engine->active.lock);
		locked = engine;
	}
	list_del_init(&rq->sched.link);
	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
	spin_unlock_irq(&locked->active.lock);
}

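/*
 * Retire a completed request: unhook it from its engine, signal its fence,
 * drop the context pin and release the request's reference.  Returns false
 * if the request has not yet completed.  Callers walk the timeline in
 * order, oldest first (see the list_is_first assertion below).
 */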
bool i915_request_retire(struct i915_request *rq)
{
	if (!i915_request_completed(rq))
		return false;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
	trace_i915_request_retire(rq);

	/*
	 * We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of the tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	GEM_BUG_ON(!list_is_first(&rq->link,
				  &i915_request_timeline(rq)->requests));
	rq->ring->head = rq->postfix;

	/*
	 * We only loosely track inflight requests across preemption,
	 * and so we may find ourselves attempting to retire a _completed_
	 * request that we have removed from the HW and put back on a run
	 * queue.
	 */
	remove_from_engine(rq);

	spin_lock_irq(&rq->lock);
	i915_request_mark_complete(rq);
	if (!i915_request_signaled(rq))
		dma_fence_signal_locked(&rq->fence);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);
	if (i915_request_has_waitboost(rq)) {
		GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
		atomic_dec(&rq->engine->gt->rps.num_waiters);
	}
	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
		set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
		__notify_execute_cb(rq);
	}
	GEM_BUG_ON(!list_empty(&rq->execute_cb));
	spin_unlock_irq(&rq->lock);

	remove_from_client(rq);
	list_del(&rq->link);

	intel_context_exit(rq->context);
	intel_context_unpin(rq->context);

	free_capture_list(rq);
	i915_sched_node_fini(&rq->sched);
	i915_request_put(rq);

	return true;
}

void i915_request_retire_upto(struct i915_request *rq)
{
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_request *tmp;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_request_completed(rq));

	do {
		tmp = list_first_entry(&tl->requests, typeof(*tmp), link);
	} while (i915_request_retire(tmp) && tmp != rq);
}

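/*
 * Arrange for rq's submit fence to wait until @signal is actually executing
 * on hardware (not merely submitted): if @signal is already active the
 * optional hook runs immediately, otherwise an execute_cb is queued on
 * @signal and fired from __notify_execute_cb().
 */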
static int
__await_execution(struct i915_request *rq,
		  struct i915_request *signal,
		  void (*hook)(struct i915_request *rq,
			       struct dma_fence *signal),
		  gfp_t gfp)
{
	struct execute_cb *cb;

	if (i915_request_is_active(signal)) {
		if (hook)
			hook(rq, &signal->fence);
		return 0;
	}

	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
	if (!cb)
		return -ENOMEM;

	cb->fence = &rq->submit;
	i915_sw_fence_await(cb->fence);
	init_irq_work(&cb->work, irq_execute_cb);

	if (hook) {
		cb->hook = hook;
		cb->signal = i915_request_get(signal);
		cb->work.func = irq_execute_cb_hook;
	}

	spin_lock_irq(&signal->lock);
	if (i915_request_is_active(signal)) {
		if (hook) {
			hook(rq, &signal->fence);
			i915_request_put(signal);
		}
		i915_sw_fence_complete(cb->fence);
		kmem_cache_free(global.slab_execute_cbs, cb);
	} else {
		list_add_tail(&cb->link, &signal->execute_cb);
	}
	spin_unlock_irq(&signal->lock);

	/* Copy across semaphore status as we need the same behaviour */
	rq->sched.flags |= signal->sched.flags;
	return 0;
}

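/*
 * __i915_request_submit() is called with irqs disabled and the engine's
 * active.lock held; it writes the final breadcrumb into the ring and moves
 * the request onto engine->active.requests.  Returns true if the payload
 * was emitted, false if the request had already completed (preempt-to-busy
 * resubmission, see below).
 */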
bool __i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	bool result = false;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * With the advent of preempt-to-busy, we frequently encounter
	 * requests that we have unsubmitted from HW, but left running
	 * until the next ack and so have completed in the meantime. On
	 * resubmission of that completed request, we can skip
	 * updating the payload, and execlists can even skip submitting
	 * the request.
	 *
	 * We must remove the request from the caller's priority queue,
	 * and the caller must only call us when the request is in their
	 * priority queue, under the active.lock. This ensures that the
	 * request has *not* yet been retired and we can safely move
	 * the request into the engine->active.list where it will be
	 * dropped upon retiring. (Otherwise, if we resubmitted a *retired*
	 * request, this would be a horrible use-after-free.)
	 */
	if (i915_request_completed(request))
		goto xfer;

	if (intel_context_is_banned(request->context))
		i915_request_skip(request, -EIO);

	/*
	 * Are we using semaphores when the gpu is already saturated?
	 *
	 * Using semaphores incurs a cost in having the GPU poll a
	 * memory location, busywaiting for it to change. The continual
	 * memory reads can have a noticeable impact on the rest of the
	 * system with the extra bus traffic, stalling the cpu as it too
	 * tries to access memory across the bus (perf stat -e bus-cycles).
	 *
	 * If we installed a semaphore on this request and we only submit
	 * the request after the signaler completed, that indicates the
	 * system is overloaded and using semaphores at this time only
	 * increases the amount of work we are doing. If so, we disable
	 * further use of semaphores until we are idle again, whence we
	 * optimistically try again.
	 */
	if (request->sched.semaphores &&
	    i915_sw_fence_signaled(&request->semaphore))
		engine->saturated |= request->sched.semaphores;

	engine->emit_fini_breadcrumb(request,
				     request->ring->vaddr + request->postfix);

	trace_i915_request_execute(request);
	engine->serial++;
	result = true;

xfer:	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
		list_move_tail(&request->sched.link, &engine->active.requests);
		clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
	}

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
	    !i915_request_enable_breadcrumb(request))
		intel_engine_signal_breadcrumbs(engine);

	__notify_execute_cb(request);

	spin_unlock(&request->lock);

	return result;
}

void i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_submit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void __i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine __lockdep_used = request->engine;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * Only unwind in reverse order, required so that the per-context list
	 * is kept in seqno/ring order.
	 */

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
		i915_request_cancel_breadcrumb(request);

	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	spin_unlock(&request->lock);

	/* We've already spun, don't charge on resubmitting. */
	if (request->sched.semaphores && i915_request_started(request)) {
		request->sched.attr.priority |= I915_PRIORITY_NOSEMAPHORE;
		request->sched.semaphores = 0;
	}

	/*
	 * We don't need to wake_up any waiters on request->execute, they
	 * will get woken by any other event or us re-adding this request
	 * to the engine timeline (__i915_request_submit()). The waiters
	 * should be quite adept at finding that the request now has a new
	 * global_seqno compared to the one they went to sleep on.
	 */
}

void i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_unsubmit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

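/*
 * Notification callbacks for the two i915_sw_fences embedded in each
 * request: on FENCE_COMPLETE the submit fence hands the request to the
 * backend's submit_request() (under rcu_read_lock, see below) and the
 * semaphore fence bumps the priority; on FENCE_FREE both drop their
 * request reference.
 */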
static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), submit);

	switch (state) {
	case FENCE_COMPLETE:
		trace_i915_request_submit(request);

		if (unlikely(fence->error))
			i915_request_skip(request, fence->error);

		/*
		 * We need to serialize use of the submit_request() callback
		 * with its hotplugging performed during an emergency
		 * i915_gem_set_wedged(). We use the RCU mechanism to mark the
		 * critical section in order to force i915_gem_set_wedged() to
		 * wait until the submit_request() is completed before
		 * proceeding.
		 */
		rcu_read_lock();
		request->engine->submit_request(request);
		rcu_read_unlock();
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static int __i915_sw_fence_call
semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), semaphore);

	switch (state) {
	case FENCE_COMPLETE:
		i915_schedule_bump_priority(request, I915_PRIORITY_NOSEMAPHORE);
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static void retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
}

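/*
 * Slow path for request allocation: when the fast kmem_cache_alloc() has
 * failed, retire the oldest request on this timeline and retry, then as a
 * last resort wait for an RCU grace period (rate-limiting malicious
 * clients) and retire everything before allocating with the caller's gfp
 * flags.
 */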
static noinline struct i915_request *
request_alloc_slow(struct intel_timeline *tl, gfp_t gfp)
{
	struct i915_request *rq;

	if (list_empty(&tl->requests))
		goto out;

	if (!gfpflags_allow_blocking(gfp))
		goto out;

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	i915_request_retire(rq);

	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (rq)
		return rq;

	/* Ratelimit ourselves to prevent oom from malicious clients */
	rq = list_last_entry(&tl->requests, typeof(*rq), link);
	cond_synchronize_rcu(rq->rcustate);

	/* Retire our old requests in the hope that we free some */
	retire_requests(tl);

out:
	return kmem_cache_alloc(global.slab_requests, gfp);
}

static void __i915_request_ctor(void *arg)
{
	struct i915_request *rq = arg;

	spin_lock_init(&rq->lock);
	i915_sched_node_init(&rq->sched);
	i915_sw_fence_init(&rq->submit, submit_notify);
	i915_sw_fence_init(&rq->semaphore, semaphore_notify);

	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0);

	rq->file_priv = NULL;
	rq->capture_list = NULL;

	INIT_LIST_HEAD(&rq->execute_cb);
}

static void __i915_request_dtor(void *arg)
{
	struct i915_request *rq = arg;

	dma_fence_destroy(&rq->fence);
#ifdef __NetBSD__
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);
#endif
	spin_lock_destroy(&rq->lock);
}

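/*
 * Allocate and initialise a request on the pinned context @ce.  Callers
 * such as i915_request_create() below take the context's timeline lock
 * before calling in; on success the request is returned with ring space
 * reserved for the final breadcrumb.
 */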
struct i915_request *
__i915_request_create(struct intel_context *ce, gfp_t gfp)
{
	struct intel_timeline *tl = ce->timeline;
	struct i915_request *rq;
	u32 seqno;
	int ret;

	might_sleep_if(gfpflags_allow_blocking(gfp));

	/* Check that the caller provided an already pinned context */
	__intel_context_pin(ce);

	/*
	 * Beware: Dragons be flying overhead.
	 *
	 * We use RCU to look up requests in flight. The lookups may
	 * race with the request being allocated from the slab freelist.
	 * That is, the request we are writing to here may be in the process
	 * of being read by __i915_active_request_get_rcu(). As such,
	 * we have to be very careful when overwriting the contents. During
	 * the RCU lookup, we chase the request->engine pointer,
	 * read the request->global_seqno and increment the reference count.
	 *
	 * The reference count is incremented atomically. If it is zero,
	 * the lookup knows the request is unallocated and complete. Otherwise,
	 * it is either still in use, or has been reallocated and reset
	 * with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to matches the active
	 * request.
	 *
	 * Before we increment the refcount, we chase the request->engine
	 * pointer. We must not call kmem_cache_zalloc() or else we set
	 * that pointer to NULL and cause a crash during the lookup. If
	 * we see the request is completed (based on the value of the
	 * old engine and seqno), the lookup is complete and reports NULL.
	 * If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it won't be, and restart the lookup.
	 *
	 * Do not use kmem_cache_zalloc() here!
	 */
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (unlikely(!rq)) {
		rq = request_alloc_slow(tl, gfp);
		if (!rq) {
			ret = -ENOMEM;
			goto err_unreserve;
		}
	}

	rq->i915 = ce->engine->i915;
	rq->context = ce;
	rq->engine = ce->engine;
	rq->ring = ce->ring;
	rq->execution_mask = ce->engine->mask;

#ifdef __NetBSD__
	dma_fence_reset(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0);
#else
	kref_init(&rq->fence.refcount);
	rq->fence.flags = 0;
	rq->fence.error = 0;
	INIT_LIST_HEAD(&rq->fence.cb_list);
#endif

	ret = intel_timeline_get_seqno(tl, rq, &seqno);
	if (ret)
		goto err_free;

	rq->fence.context = tl->fence_context;
	rq->fence.seqno = seqno;

	RCU_INIT_POINTER(rq->timeline, tl);
	RCU_INIT_POINTER(rq->hwsp_cacheline, tl->hwsp_cacheline);
	rq->hwsp_seqno = tl->hwsp_seqno;

	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */

	/* We bump the ref for the fence chain */
	i915_sw_fence_reinit(&i915_request_get(rq)->submit);
	i915_sw_fence_reinit(&i915_request_get(rq)->semaphore);

	i915_sched_node_reinit(&rq->sched);

	/* No zalloc, everything must be cleared after use */
	rq->batch = NULL;
	GEM_BUG_ON(rq->file_priv);
	GEM_BUG_ON(rq->capture_list);
	GEM_BUG_ON(!list_empty(&rq->execute_cb));

	/*
	 * Reserve space in the ring buffer for all the commands required to
	 * eventually emit this request. This is to guarantee that the
	 * i915_request_add() call can't fail. Note that the reserve may need
	 * to be redone if the request is not actually submitted straight
	 * away, e.g. because a GPU scheduler has deferred it.
	 *
	 * Note that due to how we add reserved_space to intel_ring_begin()
	 * we need to double our request to ensure that if we need to wrap
	 * around inside i915_request_add() there is sufficient space at
	 * the beginning of the ring as well.
	 */
	rq->reserved_space =
		2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32);

	/*
	 * Record the position of the start of the request so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the head.
	 */
	rq->head = rq->ring->emit;

	ret = rq->engine->request_alloc(rq);
	if (ret)
		goto err_unwind;

	rq->infix = rq->ring->emit; /* end of header; start of user payload */

	intel_context_mark_active(ce);
	return rq;

err_unwind:
	ce->ring->emit = rq->head;

	/* Make sure we didn't add ourselves to external state before freeing */
	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));

err_free:
	kmem_cache_free(global.slab_requests, rq);
err_unreserve:
	intel_context_unpin(ce);
	return ERR_PTR(ret);
}

struct i915_request *
i915_request_create(struct intel_context *ce)
{
	struct i915_request *rq;
	struct intel_timeline *tl;

	tl = intel_context_timeline_lock(ce);
	if (IS_ERR(tl))
		return ERR_CAST(tl);

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	if (!list_is_last(&rq->link, &tl->requests))
		i915_request_retire(rq);

	intel_context_enter(ce);
	rq = __i915_request_create(ce, GFP_KERNEL);
	intel_context_exit(ce); /* active reference transferred to request */
	if (IS_ERR(rq))
		goto err_unlock;

	/* Check that we do not interrupt ourselves with a new request */
	rq->cookie = lockdep_pin_lock(&tl->mutex);

	return rq;

err_unlock:
	intel_context_timeline_unlock(tl);
	return rq;
}

static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
	struct dma_fence *fence;
	int err;

	GEM_BUG_ON(i915_request_timeline(rq) ==
		   rcu_access_pointer(signal->timeline));

	fence = NULL;
	rcu_read_lock();
	spin_lock_irq(&signal->lock);
	if (!i915_request_started(signal) &&
	    !list_is_first(&signal->link,
			   &rcu_dereference(signal->timeline)->requests)) {
		struct i915_request *prev = list_prev_entry(signal, link);

		/*
		 * Peek at the request before us in the timeline. That
		 * request will only be valid before it is retired, so
		 * after acquiring a reference to it, confirm that it is
		 * still part of the signaler's timeline.
		 */
		if (i915_request_get_rcu(prev)) {
			if (list_next_entry(prev, link) == signal)
				fence = &prev->fence;
			else
				i915_request_put(prev);
		}
	}
	spin_unlock_irq(&signal->lock);
	rcu_read_unlock();
	if (!fence)
		return 0;

	err = 0;
	if (intel_timeline_sync_is_later(i915_request_timeline(rq), fence))
		err = i915_sw_fence_await_dma_fence(&rq->submit,
						    fence, 0,
						    I915_FENCE_GFP);
	dma_fence_put(fence);

	return err;
}

static intel_engine_mask_t
already_busywaiting(struct i915_request *rq)
{
	/*
	 * Polling a semaphore causes bus traffic, delaying other users of
	 * both the GPU and CPU. We want to limit the impact on others,
	 * while taking advantage of early submission to reduce GPU
	 * latency. Therefore we restrict ourselves to not using more
	 * than one semaphore from each source, and not using a semaphore
	 * if we have detected the engine is saturated (i.e. would not be
	 * submitted early and cause bus traffic reading an already passed
	 * semaphore).
	 *
	 * See the are-we-too-late? check in __i915_request_submit().
	 */
	return rq->sched.semaphores | rq->engine->saturated;
}

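/*
 * Emit an MI_SEMAPHORE_WAIT into @to's ring that busywaits on the
 * signaler's HWSP until the value there is >= @seqno (gen8+ only; gen12
 * adds a token dword).  The greater-or-equal comparison is what forces
 * the seqno wraparound handling described below.
 */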
static int
__emit_semaphore_wait(struct i915_request *to,
		      struct i915_request *from,
		      u32 seqno)
{
	const int has_token = INTEL_GEN(to->i915) >= 12;
	u32 hwsp_offset;
	int len, err;
	u32 *cs;

	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);

	/* We need to pin the signaler's HWSP until we are finished reading. */
	err = intel_timeline_read_hwsp(from, to, &hwsp_offset);
	if (err)
		return err;

	len = 4;
	if (has_token)
		len += 2;

	cs = intel_ring_begin(to, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Using greater-than-or-equal here means we have to worry
	 * about seqno wraparound. To side step that issue, we swap
	 * the timeline HWSP upon wrapping, so that everyone listening
	 * for the old (pre-wrap) values do not see the much smaller
	 * (post-wrap) values than they were expecting (and so wait
	 * forever).
	 */
	*cs++ = (MI_SEMAPHORE_WAIT |
		 MI_SEMAPHORE_GLOBAL_GTT |
		 MI_SEMAPHORE_POLL |
		 MI_SEMAPHORE_SAD_GTE_SDD) +
		has_token;
	*cs++ = seqno;
	*cs++ = hwsp_offset;
	*cs++ = 0;
	if (has_token) {
		*cs++ = 0;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(to, cs);
	return 0;
}

static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	/* Just emit the first semaphore we see as request space is limited. */
	if (already_busywaiting(to) & from->engine->mask)
		goto await_fence;

	if (i915_request_await_start(to, from) < 0)
		goto await_fence;

	/* Only submit our spinner after the signaler is running! */
	if (__await_execution(to, from, NULL, gfp))
		goto await_fence;

	if (__emit_semaphore_wait(to, from, from->fence.seqno))
		goto await_fence;

	to->sched.semaphores |= from->engine->mask;
	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
	return 0;

await_fence:
	return i915_sw_fence_await_dma_fence(&to->submit,
					     &from->fence, 0,
					     I915_FENCE_GFP);
}

static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
	int ret;

	GEM_BUG_ON(to == from);
	GEM_BUG_ON(to->timeline == from->timeline);

	if (i915_request_completed(from))
		return 0;

	if (to->engine->schedule) {
		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (ret < 0)
			return ret;
	}

	if (to->engine == from->engine)
		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
						       &from->submit,
						       I915_FENCE_GFP);
	else if (intel_context_use_semaphores(to->context))
		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
	else
		ret = i915_sw_fence_await_dma_fence(&to->submit,
						    &from->fence, 0,
						    I915_FENCE_GFP);
	if (ret < 0)
		return ret;

	if (to->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN) {
		ret = i915_sw_fence_await_dma_fence(&to->semaphore,
						    &from->fence, 0,
						    I915_FENCE_GFP);
		if (ret < 0)
			return ret;
	}

	return 0;
}

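/*
 * Make @rq wait (asynchronously) upon an arbitrary dma_fence: fence arrays
 * are decomposed into their children, already-signaled fences and fences
 * from our own timeline are skipped, repeated waits on the same timeline
 * are squashed via the timeline sync map, and i915-native fences take the
 * i915_request_await_request() path.
 */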
int
i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	/*
	 * Note that if the fence-array was created in signal-on-any mode,
	 * we should *not* decompose it into its individual fences. However,
	 * we don't currently store which mode the fence-array is operating
	 * in. Fortunately, the only user of signal-on-any is private to
	 * amdgpu and we should not see any incoming fence-array from
	 * sync-file being in signal-on-any mode.
	 */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			i915_sw_fence_set_error_once(&rq->submit, fence->error);
			continue;
		}

		/*
		 * Requests on the same timeline are explicitly ordered, along
		 * with their dependencies, by i915_request_add() which ensures
		 * that requests are submitted in-order through each ring.
		 */
		if (fence->context == rq->fence.context)
			continue;

		/* Squash repeated waits to the same timelines */
		if (fence->context &&
		    intel_timeline_sync_is_later(i915_request_timeline(rq),
						 fence))
			continue;

		if (dma_fence_is_i915(fence))
			ret = i915_request_await_request(rq, to_request(fence));
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    fence->context ? I915_FENCE_TIMEOUT : 0,
							    I915_FENCE_GFP);
		if (ret < 0)
			return ret;

		/* Record the latest fence used against each timeline */
		if (fence->context)
			intel_timeline_sync_set(i915_request_timeline(rq),
						fence);
	} while (--nchild);

	return 0;
}

static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
					  struct dma_fence *fence)
{
	return __intel_timeline_sync_is_later(tl,
					      fence->context,
					      fence->seqno - 1);
}

static int intel_timeline_sync_set_start(struct intel_timeline *tl,
					 const struct dma_fence *fence)
{
	return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
}

static int
__i915_request_await_execution(struct i915_request *to,
			       struct i915_request *from,
			       void (*hook)(struct i915_request *rq,
					    struct dma_fence *signal))
{
	int err;

	/* Submit both requests at the same time */
	err = __await_execution(to, from, hook, I915_FENCE_GFP);
	if (err)
		return err;

	/* Squash repeated dependencies to the same timelines */
	if (intel_timeline_sync_has_start(i915_request_timeline(to),
					  &from->fence))
		return 0;

	/* Ensure both start together [after all semaphores in signal] */
	if (intel_engine_has_semaphores(to->engine))
		err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
	else
		err = i915_request_await_start(to, from);
	if (err < 0)
		return err;

	/* Couple the dependency tree for PI on this exposed to->fence */
	if (to->engine->schedule) {
		err = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (err < 0)
			return err;
	}

	return intel_timeline_sync_set_start(i915_request_timeline(to),
					     &from->fence);
}

int
i915_request_await_execution(struct i915_request *rq,
			     struct dma_fence *fence,
			     void (*hook)(struct i915_request *rq,
					  struct dma_fence *signal))
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		/* XXX Error for signal-on-any fence arrays */

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			i915_sw_fence_set_error_once(&rq->submit, fence->error);
			continue;
		}

		/*
		 * We don't squash repeated fence dependencies here as we
		 * want to run our callback in all cases.
		 */

		if (dma_fence_is_i915(fence))
			ret = __i915_request_await_execution(rq,
							     to_request(fence),
							     hook);
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    I915_FENCE_TIMEOUT,
							    GFP_KERNEL);
		if (ret < 0)
			return ret;
	} while (--nchild);

	return 0;
}

/**
 * i915_request_await_object - set this request to (async) wait upon a bo
 * @to: request we are wishing to use
 * @obj: object which may be in use on another ring.
 * @write: whether the wait is on behalf of a writer
 *
 * This code is meant to abstract object synchronization with the GPU.
 * Conceptually we serialise writes between engines inside the GPU.
 * We only allow one engine to write into a buffer at any time, but
 * multiple readers. To ensure each has a coherent view of memory, we must:
 *
 * - If there is an outstanding write request to the object, the new
 *   request must wait for it to complete (either CPU or in hw, requests
 *   on the same ring will be naturally ordered).
 *
 * - If we are a write request (pending_write_domain is set), the new
 *   request must wait for outstanding read requests to complete.
 *
 * Returns 0 if successful, else propagates up the lower layer error.
 */
int
i915_request_await_object(struct i915_request *to,
			  struct drm_i915_gem_object *obj,
			  bool write)
{
	struct dma_fence *excl;
	int ret = 0;

	if (write) {
		struct dma_fence **shared;
		unsigned int count, i;

		ret = dma_resv_get_fences_rcu(obj->base.resv,
					      &excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			ret = i915_request_await_dma_fence(to, shared[i]);
			if (ret)
				break;

			dma_fence_put(shared[i]);
		}

		for (; i < count; i++)
			dma_fence_put(shared[i]);
		kfree(shared);
	} else {
		excl = dma_resv_get_excl_rcu(obj->base.resv);
	}

	if (excl) {
		if (ret == 0)
			ret = i915_request_await_dma_fence(to, excl);

		dma_fence_put(excl);
	}

	return ret;
}

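/*
 * Mark @rq as failed with @error and cancel its user payload by zeroing
 * the ring contents between rq->infix and rq->postfix, leaving only the
 * breadcrumb so that the fence is still signaled.
 */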
void i915_request_skip(struct i915_request *rq, int error)
{
	void *vaddr = rq->ring->vaddr;
	u32 head;

	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
	dma_fence_set_error(&rq->fence, error);

	if (rq->infix == rq->postfix)
		return;

	/*
	 * As this request likely depends on state from the lost
	 * context, clear out all the user operations leaving the
	 * breadcrumb at the end (so we get the fence notifications).
	 */
	head = rq->infix;
	if (rq->postfix < head) {
		memset(vaddr + head, 0, rq->ring->size - head);
		head = 0;
	}
	memset(vaddr + head, 0, rq->postfix - head);
	rq->infix = rq->postfix;
}

static struct i915_request *
__i915_request_add_to_timeline(struct i915_request *rq)
{
	struct intel_timeline *timeline = i915_request_timeline(rq);
	struct i915_request *prev;

	/*
	 * Dependency tracking and request ordering along the timeline
	 * is special cased so that we can eliminate redundant ordering
	 * operations while building the request (we know that the timeline
	 * itself is ordered, and here we guarantee it).
	 *
	 * As we know we will need to emit tracking along the timeline,
	 * we embed the hooks into our request struct -- at the cost of
	 * having to have specialised no-allocation interfaces (which will
	 * be beneficial elsewhere).
	 *
	 * A second benefit to open-coding i915_request_await_request is
	 * that we can apply a slight variant of the rules specialised
	 * for timelines that jump between engines (such as virtual engines).
	 * If we consider the case of a virtual engine, we must emit a dma-fence
	 * to prevent scheduling of the second request until the first is
	 * complete (to maximise our greedy late load balancing) and this
	 * precludes optimising to use semaphore serialisation of a single
	 * timeline across engines.
	 */
	prev = to_request(__i915_active_fence_set(&timeline->last_request,
						  &rq->fence));
	if (prev && !i915_request_completed(prev)) {
		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
			i915_sw_fence_await_sw_fence(&rq->submit,
						     &prev->submit,
						     &rq->submitq);
		else
			__i915_sw_fence_await_dma_fence(&rq->submit,
							&prev->fence,
							&rq->dmaq);
		if (rq->engine->schedule)
			__i915_sched_node_add_dependency(&rq->sched,
							 &prev->sched,
							 &rq->dep,
							 0);
	}

	list_add_tail(&rq->link, &timeline->requests);

	/*
	 * Make sure that no request gazumped us - if it was allocated after
	 * our i915_request_alloc() and called __i915_request_add() before
	 * us, the timeline will hold its seqno which is later than ours.
	 */
	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);

	return prev;
}

/*
 * NB: This function is not allowed to fail. Doing so would mean the
 * request is not being tracked for completion but the work itself is
 * going to happen on the hardware. This would be a Bad Thing(tm).
 */
struct i915_request *__i915_request_commit(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_ring *ring = rq->ring;
	u32 *cs;

	RQ_TRACE(rq, "\n");

	/*
	 * To ensure that this call will not fail, space for its emissions
	 * should already have been reserved in the ring buffer. Let the ring
	 * know that it is time to use that space up.
	 */
	GEM_BUG_ON(rq->reserved_space > ring->space);
	rq->reserved_space = 0;
	rq->emitted_jiffies = jiffies;

	/*
	 * Record the position of the start of the breadcrumb so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the ring's HEAD.
	 */
	cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw);
	GEM_BUG_ON(IS_ERR(cs));
	rq->postfix = intel_ring_offset(rq, cs);

	return __i915_request_add_to_timeline(rq);
}

1280 1.1 riastrad /*
1281 1.1 riastrad  * NB: This function is not allowed to fail. Doing so would mean that the
1282 1.1 riastrad  * request is not being tracked for completion but the work itself is
1283 1.1 riastrad  * going to happen on the hardware. This would be a Bad Thing(tm).
1284 1.1 riastrad  */
1285 1.1 riastrad struct i915_request *__i915_request_commit(struct i915_request *rq)
1286 1.1 riastrad {
1287 1.1 riastrad struct intel_engine_cs *engine = rq->engine;
1288 1.1 riastrad struct intel_ring *ring = rq->ring;
1289 1.1 riastrad u32 *cs;
1290 1.1 riastrad
1291 1.1 riastrad RQ_TRACE(rq, "\n");
1292 1.1 riastrad
1293 1.1 riastrad /*
1294 1.1 riastrad  * To ensure that this call will not fail, space for its emission
1295 1.1 riastrad  * should already have been reserved in the ring buffer. Let the ring
1296 1.1 riastrad  * know that it is time to use that space up.
1297 1.1 riastrad  */
1298 1.1 riastrad GEM_BUG_ON(rq->reserved_space > ring->space);
1299 1.1 riastrad rq->reserved_space = 0;
1300 1.1 riastrad rq->emitted_jiffies = jiffies;
1301 1.1 riastrad
1302 1.1 riastrad /*
1303 1.1 riastrad  * Record the position of the start of the breadcrumb so that
1304 1.1 riastrad  * should we detect the updated seqno part-way through the
1305 1.1 riastrad  * GPU processing the request, we never over-estimate the
1306 1.1 riastrad  * position of the ring's HEAD.
1307 1.1 riastrad  */
1308 1.1 riastrad cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw);
1309 1.1 riastrad GEM_BUG_ON(IS_ERR(cs));
1310 1.1 riastrad rq->postfix = intel_ring_offset(rq, cs);
1311 1.1 riastrad
1312 1.1 riastrad return __i915_request_add_to_timeline(rq);
1313 1.1 riastrad }
1314 1.1 riastrad
1315 1.1 riastrad void __i915_request_queue(struct i915_request *rq,
1316 1.1 riastrad const struct i915_sched_attr *attr)
1317 1.1 riastrad {
1318 1.1 riastrad /*
1319 1.1 riastrad  * Let the backend know a new request has arrived that may need
1320 1.1 riastrad  * to adjust the existing execution schedule due to a high priority
1321 1.1 riastrad  * request - i.e. we may want to preempt the current request in order
1322 1.1 riastrad  * to run a high priority dependency chain *before* we can execute this
1323 1.1 riastrad  * request.
1324 1.1 riastrad  *
1325 1.1 riastrad  * This is called before the request is ready to run so that we can
1326 1.1 riastrad  * decide whether to preempt the entire chain so that it is ready to
1327 1.1 riastrad  * run at the earliest possible convenience.
1328 1.1 riastrad  */
1329 1.1 riastrad i915_sw_fence_commit(&rq->semaphore);
1330 1.1 riastrad if (attr && rq->engine->schedule)
1331 1.1 riastrad rq->engine->schedule(rq, attr);
1332 1.1 riastrad i915_sw_fence_commit(&rq->submit);
1333 1.1 riastrad }
1334 1.1 riastrad
1335 1.1 riastrad void i915_request_add(struct i915_request *rq)
1336 1.1 riastrad {
1337 1.1 riastrad struct intel_timeline * const tl = i915_request_timeline(rq);
1338 1.1 riastrad struct i915_sched_attr attr = {};
1339 1.1 riastrad struct i915_request *prev;
1340 1.1 riastrad
1341 1.1 riastrad lockdep_assert_held(&tl->mutex);
1342 1.1 riastrad lockdep_unpin_lock(&tl->mutex, rq->cookie);
1343 1.1 riastrad
1344 1.1 riastrad trace_i915_request_add(rq);
1345 1.1 riastrad
1346 1.1 riastrad prev = __i915_request_commit(rq);
1347 1.1 riastrad
1348 1.1 riastrad if (rcu_access_pointer(rq->context->gem_context))
1349 1.1 riastrad attr = i915_request_gem_context(rq)->sched;
1350 1.1 riastrad
1351 1.1 riastrad /*
1352 1.1 riastrad  * Boost actual workloads past semaphores!
1353 1.1 riastrad  *
1354 1.1 riastrad  * With semaphores we spin on one engine waiting for another,
1355 1.1 riastrad  * simply to reduce the latency of starting our work when
1356 1.1 riastrad  * the signaler completes. However, if there is any other
1357 1.1 riastrad  * work that we could be doing on this engine instead, that
1358 1.1 riastrad  * is better utilisation and will reduce the overall duration
1359 1.1 riastrad  * of the current work. To avoid PI boosting a semaphore
1360 1.1 riastrad  * far in the future over more useful work, we keep a history
1361 1.1 riastrad  * of any semaphore use along our dependency chain.
1362 1.1 riastrad  */
1363 1.1 riastrad if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
1364 1.1 riastrad attr.priority |= I915_PRIORITY_NOSEMAPHORE;
1365 1.1 riastrad
1366 1.1 riastrad /*
1367 1.1 riastrad  * Boost priorities to new clients (new request flows).
1368 1.1 riastrad  *
1369 1.1 riastrad  * Allow interactive/synchronous clients to jump ahead of
1370 1.1 riastrad  * the bulk clients. (FQ_CODEL)
1371 1.1 riastrad  */
1372 1.1 riastrad if (list_empty(&rq->sched.signalers_list))
1373 1.1 riastrad attr.priority |= I915_PRIORITY_WAIT;
1374 1.1 riastrad
1375 1.4 riastrad #ifdef __NetBSD__
1376 1.4 riastrad int s = splsoftserial();
1377 1.4 riastrad #else
1378 1.1 riastrad local_bh_disable();
1379 1.4 riastrad #endif
1380 1.1 riastrad __i915_request_queue(rq, &attr);
1381 1.4 riastrad #ifdef __NetBSD__
1382 1.4 riastrad splx(s);
1383 1.4 riastrad #else
1384 1.1 riastrad local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
1385 1.4 riastrad #endif
1386 1.1 riastrad
1387 1.1 riastrad /*
1388 1.1 riastrad  * In typical scenarios, we do not expect the previous request on
1389 1.1 riastrad  * the timeline to be still tracked by timeline->last_request if it
1390 1.1 riastrad  * has been completed. If the completed request is still here, that
1391 1.1 riastrad  * implies that request retirement is a long way behind submission,
1392 1.1 riastrad  * suggesting that we haven't been retiring frequently enough from
1393 1.1 riastrad  * the combination of retire-before-alloc, waiters and the background
1394 1.1 riastrad  * retirement worker. So if the last request on this timeline was
1395 1.1 riastrad  * already completed, do a catch-up pass, flushing the retirement queue
1396 1.1 riastrad  * up to this client. Since we have now moved the heaviest operations
1397 1.1 riastrad  * during retirement onto secondary workers, such as freeing objects
1398 1.1 riastrad  * or contexts, retiring a bunch of requests is mostly list management
1399 1.1 riastrad  * (and cache misses), and so we should not be overly penalizing this
1400 1.1 riastrad  * client by performing excess work, though we may still be performing
1401 1.1 riastrad  * work on behalf of others -- but instead we should benefit from
1402 1.1 riastrad  * improved resource management. (Well, that's the theory at least.)
1403 1.1 riastrad  */
1404 1.1 riastrad if (prev &&
1405 1.1 riastrad i915_request_completed(prev) &&
1406 1.1 riastrad rcu_access_pointer(prev->timeline) == tl)
1407 1.1 riastrad i915_request_retire_upto(prev);
1408 1.1 riastrad
1409 1.1 riastrad mutex_unlock(&tl->mutex);
1410 1.1 riastrad }
1411 1.1 riastrad
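/*
 * [Editor's note -- illustrative sketch, not part of i915_request.c.]
 * i915_request_add() is the tail end of the usual construction pattern:
 * create a request on an intel_context (which takes the timeline mutex),
 * write commands into its ring, then add it, which commits the breadcrumb
 * and queues it for execution while dropping the mutex.  The sketch below
 * shows that shape; the function name is hypothetical and error handling
 * is simplified.
 */
#if 0	/* example only */
static int example_emit_nops(struct intel_context *ce)
{
        struct i915_request *rq;
        u32 *cs;

        rq = i915_request_create(ce);   /* acquires ce's timeline mutex */
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs)) {
                i915_request_add(rq);   /* still emit the breadcrumb */
                return PTR_ERR(cs);
        }
        *cs++ = MI_NOOP;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        i915_request_add(rq);           /* commit, queue, drop the mutex */
        return 0;
}
#endif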
1412 1.1 riastrad static unsigned long local_clock_us(unsigned int *cpu)
1413 1.1 riastrad {
1414 1.1 riastrad unsigned long t;
1415 1.1 riastrad
1416 1.1 riastrad /*
1417 1.1 riastrad  * Cheaply and approximately convert from nanoseconds to microseconds.
1418 1.1 riastrad  * The result and subsequent calculations are also defined in the same
1419 1.1 riastrad  * approximate microseconds units. The principal source of timing
1420 1.1 riastrad  * error here is from the simple truncation.
1421 1.1 riastrad  *
1422 1.1 riastrad  * Note that local_clock() is only defined with respect to the current CPU;
1423 1.1 riastrad  * the comparisons are no longer valid if we switch CPUs. Instead of
1424 1.1 riastrad  * blocking preemption for the entire busywait, we can detect the CPU
1425 1.1 riastrad  * switch and use that as an indicator of system load and a reason to
1426 1.1 riastrad  * stop busywaiting, see busywait_stop().
1427 1.1 riastrad  */
1428 1.1 riastrad *cpu = get_cpu();
1429 1.1 riastrad t = local_clock() >> 10;
1430 1.1 riastrad put_cpu();
1431 1.1 riastrad
1432 1.1 riastrad return t;
1433 1.1 riastrad }
1434 1.1 riastrad
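/*
 * [Editor's note -- worked example of the approximation above.]
 * local_clock() returns nanoseconds and t >> 10 divides by 1024 rather
 * than 1000, so each reported "microsecond" really spans 1.024us; a
 * timeout measured in these units therefore runs about 2.4% longer than
 * nominal, on top of the truncation error.  For a busywait bound of a few
 * microseconds that systematic error is negligible and far cheaper than a
 * 64-bit division.
 */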
1435 1.1 riastrad static bool busywait_stop(unsigned long timeout, unsigned int cpu)
1436 1.1 riastrad {
1437 1.1 riastrad unsigned int this_cpu;
1438 1.1 riastrad
1439 1.1 riastrad if (time_after(local_clock_us(&this_cpu), timeout))
1440 1.1 riastrad return true;
1441 1.1 riastrad
1442 1.1 riastrad return this_cpu != cpu;
1443 1.1 riastrad }
1444 1.1 riastrad
1445 1.1 riastrad static bool __i915_spin_request(const struct i915_request * const rq,
1446 1.1 riastrad int state, unsigned long timeout_us)
1447 1.1 riastrad {
1448 1.1 riastrad unsigned int cpu;
1449 1.1 riastrad
1450 1.1 riastrad /*
1451 1.1 riastrad  * Only wait for the request if we know it is likely to complete.
1452 1.1 riastrad  *
1453 1.1 riastrad  * We don't track the timestamps around requests, nor the average
1454 1.1 riastrad  * request length, so we do not have a good indicator that this
1455 1.1 riastrad  * request will complete within the timeout. What we do know is the
1456 1.1 riastrad  * order in which requests are executed by the context and so we can
1457 1.1 riastrad  * tell if the request has been started. If the request is not even
1458 1.1 riastrad  * running yet, it is a fair assumption that it will not complete
1459 1.1 riastrad  * within our relatively short timeout.
1460 1.1 riastrad  */
1461 1.1 riastrad if (!i915_request_is_running(rq))
1462 1.1 riastrad return false;
1463 1.1 riastrad
1464 1.1 riastrad /*
1465 1.1 riastrad  * When waiting for high frequency requests, e.g. during synchronous
1466 1.1 riastrad  * rendering split between the CPU and GPU, the finite amount of time
1467 1.1 riastrad  * required to set up the irq and wait upon it limits the response
1468 1.1 riastrad  * rate. By busywaiting on the request completion for a short while we
1469 1.1 riastrad  * can service the high frequency waits as quickly as possible. However,
1470 1.1 riastrad  * if it is a slow request, we want to sleep as quickly as possible.
1471 1.1 riastrad  * The tradeoff between waiting and sleeping is roughly the time it
1472 1.1 riastrad  * takes to sleep on a request, on the order of a microsecond.
1473 1.1 riastrad  */
1474 1.1 riastrad
1475 1.1 riastrad timeout_us += local_clock_us(&cpu);
1476 1.1 riastrad do {
1477 1.1 riastrad if (i915_request_completed(rq))
1478 1.1 riastrad return true;
1479 1.1 riastrad
1480 1.1 riastrad if (signal_pending_state(state, current))
1481 1.1 riastrad break;
1482 1.1 riastrad
1483 1.1 riastrad if (busywait_stop(timeout_us, cpu))
1484 1.1 riastrad break;
1485 1.1 riastrad
1486 1.1 riastrad cpu_relax();
1487 1.1 riastrad } while (!need_resched());
1488 1.1 riastrad
1489 1.1 riastrad return false;
1490 1.1 riastrad }
1491 1.1 riastrad
1492 1.1 riastrad struct request_wait {
1493 1.1 riastrad struct dma_fence_cb cb;
1494 1.3 riastrad #ifdef __NetBSD__
1495 1.5 riastrad drm_waitqueue_t wq;
1496 1.3 riastrad #else
1497 1.1 riastrad struct task_struct *tsk;
1498 1.3 riastrad #endif
1499 1.1 riastrad };
1500 1.1 riastrad
1501 1.1 riastrad static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb)
1502 1.1 riastrad {
1503 1.1 riastrad struct request_wait *wait = container_of(cb, typeof(*wait), cb);
1504 1.1 riastrad
1505 1.5 riastrad #ifdef __NetBSD__
1506 1.5 riastrad DRM_SPIN_WAKEUP_ALL(&wait->wq, fence->lock);
1507 1.5 riastrad #else
1508 1.1 riastrad wake_up_process(wait->tsk);
1509 1.5 riastrad #endif
1510 1.1 riastrad }
1511 1.1 riastrad
1512 1.1 riastrad /**
1513 1.1 riastrad  * i915_request_wait - wait until execution of request has finished
1514 1.1 riastrad  * @rq: the request to wait upon
1515 1.1 riastrad  * @flags: how to wait
1516 1.1 riastrad  * @timeout: how long to wait in jiffies
1517 1.1 riastrad  *
1518 1.1 riastrad  * i915_request_wait() waits for the request to be completed, for a
1519 1.1 riastrad  * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
1520 1.1 riastrad  * unbounded wait).
1521 1.1 riastrad  *
1522 1.1 riastrad  * Returns the remaining time (in jiffies) if the request completed, which may
1523 1.1 riastrad  * be zero or -ETIME if the request is unfinished after the timeout expires.
1524 1.1 riastrad  * May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is
1525 1.1 riastrad  * pending before the request completes.
1526 1.1 riastrad  */
1527 1.1 riastrad long i915_request_wait(struct i915_request *rq,
1528 1.1 riastrad unsigned int flags,
1529 1.1 riastrad long timeout)
1530 1.1 riastrad {
1531 1.1 riastrad const int state = flags & I915_WAIT_INTERRUPTIBLE ?
1532 1.1 riastrad TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
1533 1.1 riastrad struct request_wait wait;
1534 1.1 riastrad
1535 1.1 riastrad might_sleep();
1536 1.1 riastrad GEM_BUG_ON(timeout < 0);
1537 1.1 riastrad
1538 1.1 riastrad if (dma_fence_is_signaled(&rq->fence))
1539 1.1 riastrad return timeout;
1540 1.1 riastrad
1541 1.1 riastrad if (!timeout)
1542 1.1 riastrad return -ETIME;
1543 1.1 riastrad
1544 1.1 riastrad trace_i915_request_wait_begin(rq, flags);
1545 1.1 riastrad
1546 1.1 riastrad /*
1547 1.1 riastrad  * We must never wait on the GPU while holding a lock as we
1548 1.1 riastrad  * may need to perform a GPU reset. So while we don't need to
1549 1.1 riastrad  * serialise wait/reset with an explicit lock, we do want
1550 1.1 riastrad  * lockdep to detect potential dependency cycles.
1551 1.1 riastrad  */
1552 1.1 riastrad mutex_acquire(&rq->engine->gt->reset.mutex.dep_map, 0, 0, _THIS_IP_);
1553 1.1 riastrad
1554 1.1 riastrad /*
1555 1.1 riastrad  * Optimistic spin before touching IRQs.
1556 1.1 riastrad  *
1557 1.1 riastrad  * We may use a rather large value here to offset the penalty of
1558 1.1 riastrad  * switching away from the active task. Frequently, the client will
1559 1.1 riastrad  * wait upon an old swapbuffer to throttle itself to remain within a
1560 1.1 riastrad  * frame of the gpu. If the client is running in lockstep with the gpu,
1561 1.1 riastrad  * then it should not be waiting long at all, and a sleep now will incur
1562 1.1 riastrad  * extra scheduler latency in producing the next frame. To try to
1563 1.1 riastrad  * avoid adding the cost of enabling/disabling the interrupt to the
1564 1.1 riastrad  * short wait, we first spin to see if the request would have completed
1565 1.1 riastrad  * in the time taken to set up the interrupt.
1566 1.1 riastrad  *
1567 1.1 riastrad  * We need up to 5us to enable the irq, and up to 20us to hide the
1568 1.1 riastrad  * scheduler latency of a context switch, ignoring the secondary
1569 1.1 riastrad  * impacts from a context switch such as cache eviction.
1570 1.1 riastrad  *
1571 1.1 riastrad  * The scheme used for low-latency IO is called "hybrid interrupt
1572 1.1 riastrad  * polling". The suggestion there is to sleep until just before you
1573 1.1 riastrad  * expect to be woken by the device interrupt and then poll for its
1574 1.1 riastrad  * completion. That requires having a good predictor for the request
1575 1.1 riastrad  * duration, which we currently lack.
1576 1.1 riastrad  */
1577 1.1 riastrad if (IS_ACTIVE(CONFIG_DRM_I915_SPIN_REQUEST) &&
1578 1.1 riastrad __i915_spin_request(rq, state, CONFIG_DRM_I915_SPIN_REQUEST)) {
1579 1.1 riastrad dma_fence_signal(&rq->fence);
1580 1.1 riastrad goto out;
1581 1.1 riastrad }
1582 1.1 riastrad
1583 1.1 riastrad /*
1584 1.1 riastrad  * This client is about to stall waiting for the GPU. In many cases
1585 1.1 riastrad  * this is undesirable and limits the throughput of the system, as
1586 1.1 riastrad  * many clients cannot continue processing user input/output whilst
1587 1.1 riastrad  * blocked. RPS autotuning may take tens of milliseconds to respond
1588 1.1 riastrad  * to the GPU load and thus incurs additional latency for the client.
1589 1.1 riastrad  * We can circumvent that by promoting the GPU frequency to maximum
1590 1.1 riastrad  * before we sleep. This makes the GPU throttle up much more quickly
1591 1.1 riastrad  * (good for benchmarks and user experience, e.g. window animations),
1592 1.1 riastrad  * but at a cost of spending more power processing the workload
1593 1.1 riastrad  * (bad for battery).
1594 1.1 riastrad  */
1595 1.1 riastrad if (flags & I915_WAIT_PRIORITY) {
1596 1.1 riastrad if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
1597 1.1 riastrad intel_rps_boost(rq);
1598 1.1 riastrad i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
1599 1.1 riastrad }
1600 1.1 riastrad
1601 1.5 riastrad #ifdef __NetBSD__
1602 1.5 riastrad DRM_INIT_WAITQUEUE(&wait.wq, "i915req");
1603 1.15 riastrad #else
1604 1.15 riastrad wait.tsk = current;
1605 1.15 riastrad #endif
1606 1.5 riastrad if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
1607 1.5 riastrad goto out;
1608 1.15 riastrad
1609 1.15 riastrad #ifdef __NetBSD__
1610 1.5 riastrad spin_lock(rq->fence.lock);
1611 1.7 riastrad #define C (i915_request_completed(rq) ? 1 : \
1612 1.12 riastrad (spin_unlock(rq->fence.lock), \
1613 1.12 riastrad intel_engine_flush_submission(rq->engine), \
1614 1.12 riastrad spin_lock(rq->fence.lock), \
1615 1.12 riastrad i915_request_completed(rq)))
1616 1.5 riastrad if (flags & I915_WAIT_INTERRUPTIBLE) {
1617 1.5 riastrad DRM_SPIN_TIMED_WAIT_UNTIL(timeout, &wait.wq,
1618 1.5 riastrad rq->fence.lock, timeout,
1619 1.7 riastrad C);
1620 1.5 riastrad } else {
1621 1.5 riastrad DRM_SPIN_TIMED_WAIT_NOINTR_UNTIL(timeout, &wait.wq,
1622 1.5 riastrad rq->fence.lock, timeout,
1623 1.7 riastrad C);
1624 1.5 riastrad }
1625 1.7 riastrad #undef C
1626 1.16 riastrad if (timeout > 0) { /* succeeded before timeout */
1627 1.16 riastrad KASSERT(i915_request_completed(rq));
1628 1.6 riastrad dma_fence_signal_locked(&rq->fence);
1629 1.16 riastrad } else if (timeout == 0) { /* timed out */
1630 1.16 riastrad timeout = -ETIME;
1631 1.16 riastrad }
1632 1.5 riastrad spin_unlock(rq->fence.lock);
1633 1.5 riastrad #else
1634 1.1 riastrad for (;;) {
1635 1.1 riastrad set_current_state(state);
1636 1.1 riastrad
1637 1.1 riastrad if (i915_request_completed(rq)) {
1638 1.1 riastrad dma_fence_signal(&rq->fence);
1639 1.1 riastrad break;
1640 1.1 riastrad }
1641 1.1 riastrad
1642 1.1 riastrad if (signal_pending_state(state, current)) {
1643 1.1 riastrad timeout = -ERESTARTSYS;
1644 1.1 riastrad break;
1645 1.1 riastrad }
1646 1.1 riastrad
1647 1.1 riastrad if (!timeout) {
1648 1.1 riastrad timeout = -ETIME;
1649 1.1 riastrad break;
1650 1.1 riastrad }
1651 1.1 riastrad
1652 1.1 riastrad intel_engine_flush_submission(rq->engine);
1653 1.1 riastrad timeout = io_schedule_timeout(timeout);
1654 1.1 riastrad }
1655 1.1 riastrad __set_current_state(TASK_RUNNING);
1656 1.5 riastrad #endif
1657 1.1 riastrad
1658 1.1 riastrad dma_fence_remove_callback(&rq->fence, &wait.cb);
1659 1.17 riastrad #ifdef __NetBSD__
1660 1.17 riastrad DRM_DESTROY_WAITQUEUE(&wait.wq);
1661 1.17 riastrad #endif
1662 1.1 riastrad
1663 1.1 riastrad out:
1664 1.1 riastrad mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_);
1665 1.1 riastrad trace_i915_request_wait_end(rq);
1666 1.1 riastrad return timeout;
1667 1.1 riastrad }
1668 1.1 riastrad
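/*
 * [Editor's note -- illustrative sketch, not part of i915_request.c.]
 * Calling convention of i915_request_wait() from a user's point of view:
 * pass a timeout in jiffies and treat a negative return as an error, a
 * non-negative return as the time left over.  The wrapper name below is
 * hypothetical.
 */
#if 0	/* example only */
static long example_wait_for_request(struct i915_request *rq)
{
        long timeout;

        timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
                                    msecs_to_jiffies(100));
        if (timeout < 0)
                return timeout; /* -ETIME on timeout, -EINTR/-ERESTARTSYS on signal */

        return 0;               /* completed, with `timeout' jiffies to spare */
}
#endif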
1669 1.1 riastrad #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1670 1.1 riastrad #include "selftests/mock_request.c"
1671 1.1 riastrad #include "selftests/i915_request.c"
1672 1.1 riastrad #endif
1673 1.1 riastrad
1674 1.1 riastrad static void i915_global_request_shrink(void)
1675 1.1 riastrad {
1676 1.1 riastrad kmem_cache_shrink(global.slab_dependencies);
1677 1.1 riastrad kmem_cache_shrink(global.slab_execute_cbs);
1678 1.1 riastrad kmem_cache_shrink(global.slab_requests);
1679 1.1 riastrad }
1680 1.1 riastrad
1681 1.1 riastrad static void i915_global_request_exit(void)
1682 1.1 riastrad {
1683 1.1 riastrad kmem_cache_destroy(global.slab_dependencies);
1684 1.1 riastrad kmem_cache_destroy(global.slab_execute_cbs);
1685 1.1 riastrad kmem_cache_destroy(global.slab_requests);
1686 1.1 riastrad }
1687 1.1 riastrad
1688 1.1 riastrad static struct i915_global_request global = { {
1689 1.1 riastrad .shrink = i915_global_request_shrink,
1690 1.1 riastrad .exit = i915_global_request_exit,
1691 1.1 riastrad } };
1692 1.1 riastrad
1693 1.1 riastrad int __init i915_global_request_init(void)
1694 1.1 riastrad {
1695 1.1 riastrad global.slab_requests =
1696 1.9 riastrad kmem_cache_create_dtor("i915_request",
1697 1.1 riastrad sizeof(struct i915_request),
1698 1.1 riastrad __alignof__(struct i915_request),
1699 1.1 riastrad SLAB_HWCACHE_ALIGN |
1700 1.1 riastrad SLAB_RECLAIM_ACCOUNT |
1701 1.1 riastrad SLAB_TYPESAFE_BY_RCU,
1702 1.9 riastrad __i915_request_ctor,
1703 1.9 riastrad __i915_request_dtor);
1704 1.1 riastrad if (!global.slab_requests)
1705 1.1 riastrad return -ENOMEM;
1706 1.1 riastrad
1707 1.1 riastrad global.slab_execute_cbs = KMEM_CACHE(execute_cb,
1708 1.1 riastrad SLAB_HWCACHE_ALIGN |
1709 1.1 riastrad SLAB_RECLAIM_ACCOUNT |
1710 1.1 riastrad SLAB_TYPESAFE_BY_RCU);
1711 1.1 riastrad if (!global.slab_execute_cbs)
1712 1.1 riastrad goto err_requests;
1713 1.1 riastrad
1714 1.1 riastrad global.slab_dependencies = KMEM_CACHE(i915_dependency,
1715 1.1 riastrad SLAB_HWCACHE_ALIGN |
1716 1.1 riastrad SLAB_RECLAIM_ACCOUNT);
1717 1.1 riastrad if (!global.slab_dependencies)
1718 1.1 riastrad goto err_execute_cbs;
1719 1.1 riastrad
1720 1.1 riastrad i915_global_register(&global.base);
1721 1.1 riastrad return 0;
1722 1.1 riastrad
1723 1.1 riastrad err_execute_cbs:
1724 1.1 riastrad kmem_cache_destroy(global.slab_execute_cbs);
1725 1.1 riastrad err_requests:
1726 1.1 riastrad kmem_cache_destroy(global.slab_requests);
1727 1.1 riastrad return -ENOMEM;
1728 1.1 riastrad }
1729