1 /* $NetBSD: intel_lrc.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $ */
2
3 /*
4 * Copyright 2014 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 *
25 * Authors:
26 * Ben Widawsky <ben (at) bwidawsk.net>
27 * Michel Thierry <michel.thierry (at) intel.com>
28 * Thomas Daniel <thomas.daniel (at) intel.com>
29 * Oscar Mateo <oscar.mateo (at) intel.com>
30 *
31 */
32
33 /**
34 * DOC: Logical Rings, Logical Ring Contexts and Execlists
35 *
36 * Motivation:
37 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
38 * These expanded contexts enable a number of new abilities, especially
39 * "Execlists" (also implemented in this file).
40 *
41 * One of the main differences with the legacy HW contexts is that logical
42 * ring contexts incorporate many more things into the context's state, like
43 * PDPs or ringbuffer control registers:
44 *
45 * The reason why PDPs are included in the context is straightforward: as
46 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
47 * contained there means you don't need to do a ppgtt->switch_mm yourself;
48 * instead, the GPU will do it for you on the context switch.
49 *
50 * But, what about the ringbuffer control registers (head, tail, etc.)?
51 * Shouldn't a single set of those per engine command streamer be enough? This is
52 * where the name "Logical Rings" starts to make sense: by virtualizing the
53 * rings, the engine cs shifts to a new "ring buffer" with every context
54 * switch. When you want to submit a workload to the GPU you: A) choose your
55 * context, B) find its appropriate virtualized ring, C) write commands to it
56 * and then, finally, D) tell the GPU to switch to that context.
57 *
58 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
59 * to a context is via a context execution list, ergo "Execlists".
60 *
61 * LRC implementation:
62 * Regarding the creation of contexts, we have:
63 *
64 * - One global default context.
65 * - One local default context for each opened fd.
66 * - One local extra context for each context create ioctl call.
67 *
68 * Now that ringbuffers belong per-context (and not per-engine, like before)
69 * and that contexts are uniquely tied to a given engine (and not reusable,
70 * like before) we need:
71 *
72 * - One ringbuffer per-engine inside each context.
73 * - One backing object per-engine inside each context.
74 *
75 * The global default context starts its life with these new objects fully
76 * allocated and populated. The local default context for each opened fd is
77 * more complex, because we don't know at creation time which engine is going
78 * to use them. To handle this, we have implemented a deferred creation of LR
79 * contexts:
80 *
81 * The local context starts its life as a hollow or blank holder, that only
82 * gets populated for a given engine once we receive an execbuffer. If later
83 * on we receive another execbuffer ioctl for the same context but a different
84 * engine, we allocate/populate a new ringbuffer and context backing object and
85 * so on.
86 *
87 * Finally, regarding local contexts created using the ioctl call: as they are
88 * only allowed with the render ring, we can allocate & populate them right
89 * away (no need to defer anything, at least for now).
90 *
91 * Execlists implementation:
92 * Execlists are the new method by which, on gen8+ hardware, workloads are
93 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
94 * This method works as follows:
95 *
96 * When a request is committed, its commands (the BB start and any leading or
97 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
98 * for the appropriate context. The tail pointer in the hardware context is not
99 * updated at this time, but instead, kept by the driver in the ringbuffer
100 * structure. A structure representing this request is added to a request queue
101 * for the appropriate engine: this structure contains a copy of the context's
102 * tail after the request was written to the ring buffer and a pointer to the
103 * context itself.
104 *
105 * If the engine's request queue was empty before the request was added, the
106 * queue is processed immediately. Otherwise the queue will be processed during
107 * a context switch interrupt. In any case, elements on the queue will get sent
108 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
109 * globally unique 20-bit submission ID.
110 *
111 * When execution of a request completes, the GPU updates the context status
112 * buffer with a context complete event and generates a context switch interrupt.
113 * During the interrupt handling, the driver examines the events in the buffer:
114 * for each context complete event, if the announced ID matches that on the head
115 * of the request queue, then that request is retired and removed from the queue.
116 *
117 * After processing, if any requests were retired and the queue is not empty
118 * then a new execution list can be submitted. The two requests at the front of
119 * the queue are next to be submitted but since a context may not occur twice in
120 * an execution list, if subsequent requests have the same ID as the first then
121 * the two requests must be combined. This is done simply by discarding requests
122 * at the head of the queue until either only one request is left (in which case
123 * we use a NULL second context) or the first two requests have unique IDs.
124 *
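 * As a rough sketch only (the helpers here are hypothetical, not the
 * driver's actual submission code), building an execution list from the
 * head of the queue under the "no context twice" rule might look like:
 *
 *	elsp[0] = dequeue(queue);
 *	while (!queue_is_empty(queue) && peek(queue)->ctx == elsp[0]->ctx)
 *		elsp[0] = dequeue(queue);	(coalesce same-context requests)
 *	elsp[1] = queue_is_empty(queue) ? NULL : dequeue(queue);
 *	submit_execlist(elsp);
 *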
125 * By always executing the first two requests in the queue the driver ensures
126 * that the GPU is kept as busy as possible. In the case where a single context
127 * completes but a second context is still executing, the request for this second
128 * context will be at the head of the queue when we remove the first one. This
129 * request will then be resubmitted along with a new request for a different context,
130 * which will cause the hardware to continue executing the second request and queue
131 * the new request (the GPU detects the condition of a context getting preempted
132 * with the same context and optimizes the context switch flow by not doing
133 * preemption, but just sampling the new tail pointer).
134 *
135 */
136 #include <sys/cdefs.h>
137 __KERNEL_RCSID(0, "$NetBSD: intel_lrc.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $");
138
139 #include <linux/interrupt.h>
140
141 #include "i915_drv.h"
142 #include "i915_perf.h"
143 #include "i915_trace.h"
144 #include "i915_vgpu.h"
145 #include "intel_context.h"
146 #include "intel_engine_pm.h"
147 #include "intel_gt.h"
148 #include "intel_gt_pm.h"
149 #include "intel_gt_requests.h"
150 #include "intel_lrc_reg.h"
151 #include "intel_mocs.h"
152 #include "intel_reset.h"
153 #include "intel_ring.h"
154 #include "intel_workarounds.h"
155
156 #define RING_EXECLIST_QFULL (1 << 0x2)
157 #define RING_EXECLIST1_VALID (1 << 0x3)
158 #define RING_EXECLIST0_VALID (1 << 0x4)
159 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
160 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
161 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
162
163 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
164 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
165 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
166 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
167 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
168 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
169
170 #define GEN8_CTX_STATUS_COMPLETED_MASK \
171 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
172
173 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
174
175 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
176 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
177 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
178 #define GEN12_IDLE_CTX_ID 0x7FF
179 #define GEN12_CSB_CTX_VALID(csb_dw) \
180 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
181
182 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
183 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
184 #define WA_TAIL_DWORDS 2
185 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
186
187 struct virtual_engine {
188 struct intel_engine_cs base;
189 struct intel_context context;
190
191 /*
192 * We allow only a single request through the virtual engine at a time
193 * (each request in the timeline waits for the completion fence of
194 * the previous before being submitted). By restricting ourselves to
195 * only submitting a single request, each request is placed onto a
196 * physical engine to maximise load spreading (by virtue of the late greedy
197 * scheduling -- each real engine takes the next available request
198 * upon idling).
199 */
200 struct i915_request *request;
201
202 /*
203 * We keep a rbtree of available virtual engines inside each physical
204 * engine, sorted by priority. Here we preallocate the nodes we need
205 * for the virtual engine, indexed by physical_engine->id.
206 */
207 struct ve_node {
208 struct rb_node rb;
209 int prio;
210 } nodes[I915_NUM_ENGINES];
211
212 /*
213 * Keep track of bonded pairs -- restrictions upon our selection
214 * of physical engines any particular request may be submitted to.
215 * If we receive a submit-fence from a master engine, we will only
216 * use one of the physical engines in sibling_mask.
217 */
218 struct ve_bond {
219 const struct intel_engine_cs *master;
220 intel_engine_mask_t sibling_mask;
221 } *bonds;
222 unsigned int num_bonds;
223
224 /* And finally, which physical engines this virtual engine maps onto. */
225 unsigned int num_siblings;
226 struct intel_engine_cs *siblings[0];
227 };
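
/*
 * For illustration only (a hypothetical helper, not necessarily the one
 * the driver uses): resolving the bond restriction for a given master
 * engine amounts to a linear scan of ve->bonds[], e.g.
 *
 *	static const struct ve_bond *
 *	lookup_bond(const struct virtual_engine *ve,
 *		    const struct intel_engine_cs *master)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < ve->num_bonds; i++) {
 *			if (ve->bonds[i].master == master)
 *				return &ve->bonds[i];
 *		}
 *
 *		return NULL;
 *	}
 *
 * with the returned sibling_mask then used to restrict the request's
 * execution_mask before submission.
 */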
228
229 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
230 {
231 GEM_BUG_ON(!intel_engine_is_virtual(engine));
232 return container_of(engine, struct virtual_engine, base);
233 }
234
235 static int __execlists_context_alloc(struct intel_context *ce,
236 struct intel_engine_cs *engine);
237
238 static void execlists_init_reg_state(u32 *reg_state,
239 const struct intel_context *ce,
240 const struct intel_engine_cs *engine,
241 const struct intel_ring *ring,
242 bool close);
243 static void
244 __execlists_update_reg_state(const struct intel_context *ce,
245 const struct intel_engine_cs *engine,
246 u32 head);
247
248 static void mark_eio(struct i915_request *rq)
249 {
250 if (i915_request_completed(rq))
251 return;
252
253 GEM_BUG_ON(i915_request_signaled(rq));
254
255 dma_fence_set_error(&rq->fence, -EIO);
256 i915_request_mark_complete(rq);
257 }
258
259 static struct i915_request *
260 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
261 {
262 struct i915_request *active = rq;
263
264 rcu_read_lock();
265 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
266 if (i915_request_completed(rq))
267 break;
268
269 active = rq;
270 }
271 rcu_read_unlock();
272
273 return active;
274 }
275
276 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
277 {
278 return (i915_ggtt_offset(engine->status_page.vma) +
279 I915_GEM_HWS_PREEMPT_ADDR);
280 }
281
282 static inline void
283 ring_set_paused(const struct intel_engine_cs *engine, int state)
284 {
285 /*
286 * We inspect HWS_PREEMPT with a semaphore inside
287 * engine->emit_fini_breadcrumb. If the dword is true,
288 * the ring is paused as the semaphore will busywait
289 * until the dword is false.
290 */
291 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
292 if (state)
293 wmb();
294 }
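
/*
 * For reference, the busywait on HWS_PREEMPT referred to above is a
 * MI_SEMAPHORE_WAIT polling for the dword to read back as zero, emitted
 * by the engine's fini-breadcrumb path roughly as follows (a sketch, not
 * a verbatim copy of that code):
 *
 *	*cs++ = MI_SEMAPHORE_WAIT |
 *		MI_SEMAPHORE_GLOBAL_GTT |
 *		MI_SEMAPHORE_POLL |
 *		MI_SEMAPHORE_SAD_EQ_SDD;
 *	*cs++ = 0;
 *	*cs++ = intel_hws_preempt_address(engine);
 *	*cs++ = 0;
 */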
295
296 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
297 {
298 return rb_entry(rb, struct i915_priolist, node);
299 }
300
301 static inline int rq_prio(const struct i915_request *rq)
302 {
303 return rq->sched.attr.priority;
304 }
305
306 static int effective_prio(const struct i915_request *rq)
307 {
308 int prio = rq_prio(rq);
309
310 /*
311 * If this request is special and must not be interrupted at any
312 * cost, so be it. Note we are only checking the most recent request
313 * in the context and so may be masking an earlier vip request. It
314 * is hoped that under the conditions where nopreempt is used, this
315 * will not matter (i.e. all requests to that context will be
316 * nopreempt for as long as desired).
317 */
318 if (i915_request_has_nopreempt(rq))
319 prio = I915_PRIORITY_UNPREEMPTABLE;
320
321 /*
322 * On unwinding the active request, we give it a priority bump
323 * if it has completed waiting on any semaphore. If we know that
324 * the request has already started, we can prevent an unwanted
325 * preempt-to-idle cycle by taking that into account now.
326 */
327 if (__i915_request_has_started(rq))
328 prio |= I915_PRIORITY_NOSEMAPHORE;
329
330 /* Restrict mere WAIT boosts from triggering preemption */
331 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
332 return prio | __NO_PREEMPTION;
333 }
334
335 static int queue_prio(const struct intel_engine_execlists *execlists)
336 {
337 struct i915_priolist *p;
338 struct rb_node *rb;
339
340 rb = rb_first_cached(&execlists->queue);
341 if (!rb)
342 return INT_MIN;
343
344 /*
345 * As the priolist[] is inverted, with the highest priority in [0],
346 * we have to flip the index value to become priority.
347 */
348 p = to_priolist(rb);
349 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
350 }
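
/*
 * A worked example of the decode above, assuming I915_USER_PRIORITY_SHIFT
 * is 2 (i.e. four internal sub-levels per user level, with the highest
 * sub-level tracked in bit 0 of p->used): for p->priority = 3 and
 * p->used = 0b0110, ffs() returns 2, so queue_prio() yields
 * ((3 + 1) << 2) - 2 = 14, which is user level 3 combined with the
 * highest occupied internal sub-level.
 */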
351
352 static inline bool need_preempt(const struct intel_engine_cs *engine,
353 const struct i915_request *rq,
354 struct rb_node *rb)
355 {
356 int last_prio;
357
358 if (!intel_engine_has_semaphores(engine))
359 return false;
360
361 /*
362 * Check if the current priority hint merits a preemption attempt.
363 *
364 * We record the highest value priority we saw during rescheduling
365 * prior to this dequeue, therefore we know that if it is strictly
366 * less than the current tail of ELSP[0], we do not need to force
367 * a preempt-to-idle cycle.
368 *
369 * However, the priority hint is a mere hint that we may need to
370 * preempt. If that hint is stale or we may be trying to preempt
371 * ourselves, ignore the request.
372 *
373 * More naturally we would write
374 * prio >= max(0, last);
375 * except that we wish to prevent triggering preemption at the same
376 * priority level: the task that is running should remain running
377 * to preserve FIFO ordering of dependencies.
378 */
379 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
380 if (engine->execlists.queue_priority_hint <= last_prio)
381 return false;
382
383 /*
384 * Check against the first request in ELSP[1]: it will, thanks to the
385 * power of PI, be the highest priority of that context.
386 */
387 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
388 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
389 return true;
390
391 if (rb) {
392 struct virtual_engine *ve =
393 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
394 bool preempt = false;
395
396 if (engine == ve->siblings[0]) { /* only preempt one sibling */
397 struct i915_request *next;
398
399 rcu_read_lock();
400 next = READ_ONCE(ve->request);
401 if (next)
402 preempt = rq_prio(next) > last_prio;
403 rcu_read_unlock();
404 }
405
406 if (preempt)
407 return preempt;
408 }
409
410 /*
411 * If the inflight context did not trigger the preemption, then maybe
412 * it was the set of queued requests? Pick the highest priority in
413 * the queue (the first active priolist) and see if it deserves to be
414 * running instead of ELSP[0].
415 *
416 * The highest priority request in the queue cannot be either
417 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
418 * context, its priority would not exceed ELSP[0] aka last_prio.
419 */
420 return queue_prio(&engine->execlists) > last_prio;
421 }
422
423 __maybe_unused static inline bool
424 assert_priority_queue(const struct i915_request *prev,
425 const struct i915_request *next)
426 {
427 /*
428 * Without preemption, the prev may refer to the still active element
429 * which we refuse to let go.
430 *
431 * Even with preemption, there are times when we think it is better not
432 * to preempt and leave an ostensibly lower priority request in flight.
433 */
434 if (i915_request_is_active(prev))
435 return true;
436
437 return rq_prio(prev) >= rq_prio(next);
438 }
439
440 /*
441 * The context descriptor encodes various attributes of a context,
442 * including its GTT address and some flags. Because it's fairly
443 * expensive to calculate, we'll just do it once and cache the result,
444 * which remains valid until the context is unpinned.
445 *
446 * This is what a descriptor looks like, from LSB to MSB::
447 *
448 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
449 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
450 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
451 * bits 53-54: mbz, reserved for use by hardware
452 * bits 55-63: group ID, currently unused and set to 0
453 *
454 * Starting from Gen11, the upper dword of the descriptor has a new format:
455 *
456 * bits 32-36: reserved
457 * bits 37-47: SW context ID
458 * bits 48-53: engine instance
459 * bit 54: mbz, reserved for use by hardware
460 * bits 55-60: SW counter
461 * bits 61-63: engine class
462 *
463 * engine info, SW context ID and SW counter need to form a unique number
464 * (Context ID) per lrc.
465 */
466 static u64
467 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
468 {
469 u64 desc;
470
471 desc = INTEL_LEGACY_32B_CONTEXT;
472 if (i915_vm_is_4lvl(ce->vm))
473 desc = INTEL_LEGACY_64B_CONTEXT;
474 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
475
476 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
477 if (IS_GEN(engine->i915, 8))
478 desc |= GEN8_CTX_L3LLC_COHERENT;
479
480 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
481 /*
482 * The following 32bits are copied into the OA reports (dword 2).
483 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
484 * anything below.
485 */
486 if (INTEL_GEN(engine->i915) >= 11) {
487 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
488 /* bits 48-53 */
489
490 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
491 /* bits 61-63 */
492 }
493
494 return desc;
495 }
496
497 static inline unsigned int dword_in_page(void *addr)
498 {
499 return offset_in_page(addr) / sizeof(u32);
500 }
501
502 static void set_offsets(u32 *regs,
503 const u8 *data,
504 const struct intel_engine_cs *engine,
505 bool clear)
506 #define NOP(x) (BIT(7) | (x))
507 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
508 #define POSTED BIT(0)
509 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
510 #define REG16(x) \
511 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
512 (((x) >> 2) & 0x7f)
513 #define END(x) 0, (x)
514 {
515 const u32 base = engine->mmio_base;
516
517 while (*data) {
518 u8 count, flags;
519
520 if (*data & BIT(7)) { /* skip */
521 count = *data++ & ~BIT(7);
522 if (clear)
523 memset32(regs, MI_NOOP, count);
524 regs += count;
525 continue;
526 }
527
528 count = *data & 0x3f;
529 flags = *data >> 6;
530 data++;
531
532 *regs = MI_LOAD_REGISTER_IMM(count);
533 if (flags & POSTED)
534 *regs |= MI_LRI_FORCE_POSTED;
535 if (INTEL_GEN(engine->i915) >= 11)
536 *regs |= MI_LRI_CS_MMIO;
537 regs++;
538
539 GEM_BUG_ON(!count);
540 do {
541 u32 offset = 0;
542 u8 v;
543
544 do {
545 v = *data++;
546 offset <<= 7;
547 offset |= v & ~BIT(7);
548 } while (v & BIT(7));
549
550 regs[0] = base + (offset << 2);
551 if (clear)
552 regs[1] = 0;
553 regs += 2;
554 } while (--count);
555 }
556
557 if (clear) {
558 u8 count = *++data;
559
560 /* Clear past the tail for HW access */
561 GEM_BUG_ON(dword_in_page(regs) > count);
562 memset32(regs, MI_NOOP, count - dword_in_page(regs));
563
564 /* Close the batch; used mainly by live_lrc_layout() */
565 *regs = MI_BATCH_BUFFER_END;
566 if (INTEL_GEN(engine->i915) >= 10)
567 *regs |= BIT(0);
568 }
569 }
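
/*
 * To illustrate the encoding consumed by set_offsets() (decoded by hand
 * here, so treat the register names as informative rather than
 * authoritative): the start of gen8_xcs_offsets below,
 *
 *	NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ...
 *
 * expands in the context image to
 *
 *	regs[0]		skipped (MI_NOOP when clearing)
 *	regs[1] = MI_LOAD_REGISTER_IMM(11)
 *	regs[2] = mmio_base + 0x244	(RING_CONTEXT_CONTROL)
 *	regs[3] = 0			(value, when clearing)
 *	regs[4] = mmio_base + 0x034	(RING_HEAD)
 *	regs[5] = 0
 *	...
 *
 * REG16() is needed for offsets of 0x200 and above, which take two bytes
 * in the 7-bit varint-style offset encoding.
 */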
570
571 static const u8 gen8_xcs_offsets[] = {
572 NOP(1),
573 LRI(11, 0),
574 REG16(0x244),
575 REG(0x034),
576 REG(0x030),
577 REG(0x038),
578 REG(0x03c),
579 REG(0x168),
580 REG(0x140),
581 REG(0x110),
582 REG(0x11c),
583 REG(0x114),
584 REG(0x118),
585
586 NOP(9),
587 LRI(9, 0),
588 REG16(0x3a8),
589 REG16(0x28c),
590 REG16(0x288),
591 REG16(0x284),
592 REG16(0x280),
593 REG16(0x27c),
594 REG16(0x278),
595 REG16(0x274),
596 REG16(0x270),
597
598 NOP(13),
599 LRI(2, 0),
600 REG16(0x200),
601 REG(0x028),
602
603 END(80)
604 };
605
606 static const u8 gen9_xcs_offsets[] = {
607 NOP(1),
608 LRI(14, POSTED),
609 REG16(0x244),
610 REG(0x034),
611 REG(0x030),
612 REG(0x038),
613 REG(0x03c),
614 REG(0x168),
615 REG(0x140),
616 REG(0x110),
617 REG(0x11c),
618 REG(0x114),
619 REG(0x118),
620 REG(0x1c0),
621 REG(0x1c4),
622 REG(0x1c8),
623
624 NOP(3),
625 LRI(9, POSTED),
626 REG16(0x3a8),
627 REG16(0x28c),
628 REG16(0x288),
629 REG16(0x284),
630 REG16(0x280),
631 REG16(0x27c),
632 REG16(0x278),
633 REG16(0x274),
634 REG16(0x270),
635
636 NOP(13),
637 LRI(1, POSTED),
638 REG16(0x200),
639
640 NOP(13),
641 LRI(44, POSTED),
642 REG(0x028),
643 REG(0x09c),
644 REG(0x0c0),
645 REG(0x178),
646 REG(0x17c),
647 REG16(0x358),
648 REG(0x170),
649 REG(0x150),
650 REG(0x154),
651 REG(0x158),
652 REG16(0x41c),
653 REG16(0x600),
654 REG16(0x604),
655 REG16(0x608),
656 REG16(0x60c),
657 REG16(0x610),
658 REG16(0x614),
659 REG16(0x618),
660 REG16(0x61c),
661 REG16(0x620),
662 REG16(0x624),
663 REG16(0x628),
664 REG16(0x62c),
665 REG16(0x630),
666 REG16(0x634),
667 REG16(0x638),
668 REG16(0x63c),
669 REG16(0x640),
670 REG16(0x644),
671 REG16(0x648),
672 REG16(0x64c),
673 REG16(0x650),
674 REG16(0x654),
675 REG16(0x658),
676 REG16(0x65c),
677 REG16(0x660),
678 REG16(0x664),
679 REG16(0x668),
680 REG16(0x66c),
681 REG16(0x670),
682 REG16(0x674),
683 REG16(0x678),
684 REG16(0x67c),
685 REG(0x068),
686
687 END(176)
688 };
689
690 static const u8 gen12_xcs_offsets[] = {
691 NOP(1),
692 LRI(13, POSTED),
693 REG16(0x244),
694 REG(0x034),
695 REG(0x030),
696 REG(0x038),
697 REG(0x03c),
698 REG(0x168),
699 REG(0x140),
700 REG(0x110),
701 REG(0x1c0),
702 REG(0x1c4),
703 REG(0x1c8),
704 REG(0x180),
705 REG16(0x2b4),
706
707 NOP(5),
708 LRI(9, POSTED),
709 REG16(0x3a8),
710 REG16(0x28c),
711 REG16(0x288),
712 REG16(0x284),
713 REG16(0x280),
714 REG16(0x27c),
715 REG16(0x278),
716 REG16(0x274),
717 REG16(0x270),
718
719 END(80)
720 };
721
722 static const u8 gen8_rcs_offsets[] = {
723 NOP(1),
724 LRI(14, POSTED),
725 REG16(0x244),
726 REG(0x034),
727 REG(0x030),
728 REG(0x038),
729 REG(0x03c),
730 REG(0x168),
731 REG(0x140),
732 REG(0x110),
733 REG(0x11c),
734 REG(0x114),
735 REG(0x118),
736 REG(0x1c0),
737 REG(0x1c4),
738 REG(0x1c8),
739
740 NOP(3),
741 LRI(9, POSTED),
742 REG16(0x3a8),
743 REG16(0x28c),
744 REG16(0x288),
745 REG16(0x284),
746 REG16(0x280),
747 REG16(0x27c),
748 REG16(0x278),
749 REG16(0x274),
750 REG16(0x270),
751
752 NOP(13),
753 LRI(1, 0),
754 REG(0x0c8),
755
756 END(80)
757 };
758
759 static const u8 gen9_rcs_offsets[] = {
760 NOP(1),
761 LRI(14, POSTED),
762 REG16(0x244),
763 REG(0x34),
764 REG(0x30),
765 REG(0x38),
766 REG(0x3c),
767 REG(0x168),
768 REG(0x140),
769 REG(0x110),
770 REG(0x11c),
771 REG(0x114),
772 REG(0x118),
773 REG(0x1c0),
774 REG(0x1c4),
775 REG(0x1c8),
776
777 NOP(3),
778 LRI(9, POSTED),
779 REG16(0x3a8),
780 REG16(0x28c),
781 REG16(0x288),
782 REG16(0x284),
783 REG16(0x280),
784 REG16(0x27c),
785 REG16(0x278),
786 REG16(0x274),
787 REG16(0x270),
788
789 NOP(13),
790 LRI(1, 0),
791 REG(0xc8),
792
793 NOP(13),
794 LRI(44, POSTED),
795 REG(0x28),
796 REG(0x9c),
797 REG(0xc0),
798 REG(0x178),
799 REG(0x17c),
800 REG16(0x358),
801 REG(0x170),
802 REG(0x150),
803 REG(0x154),
804 REG(0x158),
805 REG16(0x41c),
806 REG16(0x600),
807 REG16(0x604),
808 REG16(0x608),
809 REG16(0x60c),
810 REG16(0x610),
811 REG16(0x614),
812 REG16(0x618),
813 REG16(0x61c),
814 REG16(0x620),
815 REG16(0x624),
816 REG16(0x628),
817 REG16(0x62c),
818 REG16(0x630),
819 REG16(0x634),
820 REG16(0x638),
821 REG16(0x63c),
822 REG16(0x640),
823 REG16(0x644),
824 REG16(0x648),
825 REG16(0x64c),
826 REG16(0x650),
827 REG16(0x654),
828 REG16(0x658),
829 REG16(0x65c),
830 REG16(0x660),
831 REG16(0x664),
832 REG16(0x668),
833 REG16(0x66c),
834 REG16(0x670),
835 REG16(0x674),
836 REG16(0x678),
837 REG16(0x67c),
838 REG(0x68),
839
840 END(176)
841 };
842
843 static const u8 gen11_rcs_offsets[] = {
844 NOP(1),
845 LRI(15, POSTED),
846 REG16(0x244),
847 REG(0x034),
848 REG(0x030),
849 REG(0x038),
850 REG(0x03c),
851 REG(0x168),
852 REG(0x140),
853 REG(0x110),
854 REG(0x11c),
855 REG(0x114),
856 REG(0x118),
857 REG(0x1c0),
858 REG(0x1c4),
859 REG(0x1c8),
860 REG(0x180),
861
862 NOP(1),
863 LRI(9, POSTED),
864 REG16(0x3a8),
865 REG16(0x28c),
866 REG16(0x288),
867 REG16(0x284),
868 REG16(0x280),
869 REG16(0x27c),
870 REG16(0x278),
871 REG16(0x274),
872 REG16(0x270),
873
874 LRI(1, POSTED),
875 REG(0x1b0),
876
877 NOP(10),
878 LRI(1, 0),
879 REG(0x0c8),
880
881 END(80)
882 };
883
884 static const u8 gen12_rcs_offsets[] = {
885 NOP(1),
886 LRI(13, POSTED),
887 REG16(0x244),
888 REG(0x034),
889 REG(0x030),
890 REG(0x038),
891 REG(0x03c),
892 REG(0x168),
893 REG(0x140),
894 REG(0x110),
895 REG(0x1c0),
896 REG(0x1c4),
897 REG(0x1c8),
898 REG(0x180),
899 REG16(0x2b4),
900
901 NOP(5),
902 LRI(9, POSTED),
903 REG16(0x3a8),
904 REG16(0x28c),
905 REG16(0x288),
906 REG16(0x284),
907 REG16(0x280),
908 REG16(0x27c),
909 REG16(0x278),
910 REG16(0x274),
911 REG16(0x270),
912
913 LRI(3, POSTED),
914 REG(0x1b0),
915 REG16(0x5a8),
916 REG16(0x5ac),
917
918 NOP(6),
919 LRI(1, 0),
920 REG(0x0c8),
921
922 END(80)
923 };
924
925 #undef END
926 #undef REG16
927 #undef REG
928 #undef LRI
929 #undef NOP
930
931 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
932 {
933 /*
934 * The gen12+ lists only have the registers we program in the basic
935 * default state. We rely on the context image using relative
936 * addressing to automatically fix up the register state between the
937 * physical engines backing a virtual engine.
938 */
939 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
940 !intel_engine_has_relative_mmio(engine));
941
942 if (engine->class == RENDER_CLASS) {
943 if (INTEL_GEN(engine->i915) >= 12)
944 return gen12_rcs_offsets;
945 else if (INTEL_GEN(engine->i915) >= 11)
946 return gen11_rcs_offsets;
947 else if (INTEL_GEN(engine->i915) >= 9)
948 return gen9_rcs_offsets;
949 else
950 return gen8_rcs_offsets;
951 } else {
952 if (INTEL_GEN(engine->i915) >= 12)
953 return gen12_xcs_offsets;
954 else if (INTEL_GEN(engine->i915) >= 9)
955 return gen9_xcs_offsets;
956 else
957 return gen8_xcs_offsets;
958 }
959 }
960
961 static struct i915_request *
962 __unwind_incomplete_requests(struct intel_engine_cs *engine)
963 {
964 struct i915_request *rq, *rn, *active = NULL;
965 struct list_head *uninitialized_var(pl);
966 int prio = I915_PRIORITY_INVALID;
967
968 lockdep_assert_held(&engine->active.lock);
969
970 list_for_each_entry_safe_reverse(rq, rn,
971 &engine->active.requests,
972 sched.link) {
973 if (i915_request_completed(rq))
974 continue; /* XXX */
975
976 __i915_request_unsubmit(rq);
977
978 /*
979 * Push the request back into the queue for later resubmission.
980 * If this request is not native to this physical engine (i.e.
981 * it came from a virtual source), push it back onto the virtual
982 * engine so that it can be moved across onto another physical
983 * engine as load dictates.
984 */
985 if (likely(rq->execution_mask == engine->mask)) {
986 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
987 if (rq_prio(rq) != prio) {
988 prio = rq_prio(rq);
989 pl = i915_sched_lookup_priolist(engine, prio);
990 }
991 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
992
993 list_move(&rq->sched.link, pl);
994 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
995
996 active = rq;
997 } else {
998 struct intel_engine_cs *owner = rq->context->engine;
999
1000 /*
1001 * Decouple the virtual breadcrumb before moving it
1002 * back to the virtual engine -- we don't want the
1003 * request to complete in the background and try
1004 * and cancel the breadcrumb on the virtual engine
1005 * (instead of the old engine where it is linked)!
1006 */
1007 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1008 &rq->fence.flags)) {
1009 spin_lock_nested(&rq->lock,
1010 SINGLE_DEPTH_NESTING);
1011 i915_request_cancel_breadcrumb(rq);
1012 spin_unlock(&rq->lock);
1013 }
1014 rq->engine = owner;
1015 owner->submit_request(rq);
1016 active = NULL;
1017 }
1018 }
1019
1020 return active;
1021 }
1022
1023 struct i915_request *
1024 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1025 {
1026 struct intel_engine_cs *engine =
1027 container_of(execlists, typeof(*engine), execlists);
1028
1029 return __unwind_incomplete_requests(engine);
1030 }
1031
1032 static inline void
1033 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1034 {
1035 /*
1036 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1037 * the compiler should eliminate this function as dead code.
1038 */
1039 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1040 return;
1041
1042 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1043 status, rq);
1044 }
1045
1046 static void intel_engine_context_in(struct intel_engine_cs *engine)
1047 {
1048 unsigned long flags;
1049
1050 if (READ_ONCE(engine->stats.enabled) == 0)
1051 return;
1052
1053 write_seqlock_irqsave(&engine->stats.lock, flags);
1054
1055 if (engine->stats.enabled > 0) {
1056 if (engine->stats.active++ == 0)
1057 engine->stats.start = ktime_get();
1058 GEM_BUG_ON(engine->stats.active == 0);
1059 }
1060
1061 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1062 }
1063
1064 static void intel_engine_context_out(struct intel_engine_cs *engine)
1065 {
1066 unsigned long flags;
1067
1068 if (READ_ONCE(engine->stats.enabled) == 0)
1069 return;
1070
1071 write_seqlock_irqsave(&engine->stats.lock, flags);
1072
1073 if (engine->stats.enabled > 0) {
1074 ktime_t last;
1075
1076 if (engine->stats.active && --engine->stats.active == 0) {
1077 /*
1078 * Decrement the active context count and, in case the
1079 * GPU is now idle, add the elapsed time to the running total.
1080 */
1081 last = ktime_sub(ktime_get(), engine->stats.start);
1082
1083 engine->stats.total = ktime_add(engine->stats.total,
1084 last);
1085 } else if (engine->stats.active == 0) {
1086 /*
1087 * After turning on engine stats, context out might be
1088 * the first event, in which case we account from the
1089 * time stats gathering was turned on.
1090 */
1091 last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1092
1093 engine->stats.total = ktime_add(engine->stats.total,
1094 last);
1095 }
1096 }
1097
1098 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1099 }
1100
1101 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1102 {
1103 if (INTEL_GEN(engine->i915) >= 12)
1104 return 0x60;
1105 else if (INTEL_GEN(engine->i915) >= 9)
1106 return 0x54;
1107 else if (engine->class == RENDER_CLASS)
1108 return 0x58;
1109 else
1110 return -1;
1111 }
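
/*
 * The value returned above is the dword index within the context image of
 * the MI_MODE entry in its LRI list: regs[x] holds the register offset and
 * regs[x + 1] the value to be loaded. Like most masked registers, MI_MODE
 * keeps its write-enable mask in the upper 16 bits of the value, which is
 * why execlists_check_context() below tests
 * regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING before clearing the bit.
 */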
1112
1113 static void
1114 execlists_check_context(const struct intel_context *ce,
1115 const struct intel_engine_cs *engine)
1116 {
1117 const struct intel_ring *ring = ce->ring;
1118 u32 *regs = ce->lrc_reg_state;
1119 bool valid = true;
1120 int x;
1121
1122 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1123 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1124 engine->name,
1125 regs[CTX_RING_START],
1126 i915_ggtt_offset(ring->vma));
1127 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1128 valid = false;
1129 }
1130
1131 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1132 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1133 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1134 engine->name,
1135 regs[CTX_RING_CTL],
1136 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1137 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1138 valid = false;
1139 }
1140
1141 x = lrc_ring_mi_mode(engine);
1142 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1143 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1144 engine->name, regs[x + 1]);
1145 regs[x + 1] &= ~STOP_RING;
1146 regs[x + 1] |= STOP_RING << 16;
1147 valid = false;
1148 }
1149
1150 WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1151 }
1152
1153 static void restore_default_state(struct intel_context *ce,
1154 struct intel_engine_cs *engine)
1155 {
1156 u32 *regs = ce->lrc_reg_state;
1157
1158 if (engine->pinned_default_state)
1159 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1160 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1161 engine->context_size - PAGE_SIZE);
1162
1163 execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1164 }
1165
1166 static void reset_active(struct i915_request *rq,
1167 struct intel_engine_cs *engine)
1168 {
1169 struct intel_context * const ce = rq->context;
1170 u32 head;
1171
1172 /*
1173 * The executing context has been cancelled. We want to prevent
1174 * further execution along this context and propagate the error on
1175 * to anything depending on its results.
1176 *
1177 * In __i915_request_submit(), we apply the -EIO and remove the
1178 * requests' payloads for any banned requests. But first, we must
1179 * rewind the context back to the start of the incomplete request so
1180 * that we do not jump back into the middle of the batch.
1181 *
1182 * We preserve the breadcrumbs and semaphores of the incomplete
1183 * requests so that inter-timeline dependencies (i.e other timelines)
1184 * remain correctly ordered. And we defer to __i915_request_submit()
1185 * so that all asynchronous waits are correctly handled.
1186 */
1187 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1188 rq->fence.context, rq->fence.seqno);
1189
1190 /* On resubmission of the active request, payload will be scrubbed */
1191 if (i915_request_completed(rq))
1192 head = rq->tail;
1193 else
1194 head = active_request(ce->timeline, rq)->head;
1195 head = intel_ring_wrap(ce->ring, head);
1196
1197 /* Scrub the context image to prevent replaying the previous batch */
1198 restore_default_state(ce, engine);
1199 __execlists_update_reg_state(ce, engine, head);
1200
1201 /* We've switched away, so this should be a no-op, but intent matters */
1202 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1203 }
1204
1205 static inline struct intel_engine_cs *
1206 __execlists_schedule_in(struct i915_request *rq)
1207 {
1208 struct intel_engine_cs * const engine = rq->engine;
1209 struct intel_context * const ce = rq->context;
1210
1211 intel_context_get(ce);
1212
1213 if (unlikely(intel_context_is_banned(ce)))
1214 reset_active(rq, engine);
1215
1216 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1217 execlists_check_context(ce, engine);
1218
1219 if (ce->tag) {
1220 /* Use a fixed tag for OA and friends */
1221 ce->lrc_desc |= (u64)ce->tag << 32;
1222 } else {
1223 /* We don't need a strict matching tag, just different values */
1224 ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1225 ce->lrc_desc |=
1226 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1227 GEN11_SW_CTX_ID_SHIFT;
1228 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1229 }
1230
1231 __intel_gt_pm_get(engine->gt);
1232 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1233 intel_engine_context_in(engine);
1234
1235 return engine;
1236 }
1237
1238 static inline struct i915_request *
1239 execlists_schedule_in(struct i915_request *rq, int idx)
1240 {
1241 struct intel_context * const ce = rq->context;
1242 struct intel_engine_cs *old;
1243
1244 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1245 trace_i915_request_in(rq, idx);
1246
1247 old = READ_ONCE(ce->inflight);
1248 do {
1249 if (!old) {
1250 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1251 break;
1252 }
1253 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1254
1255 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1256 return i915_request_get(rq);
1257 }
1258
1259 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1260 {
1261 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1262 struct i915_request *next = READ_ONCE(ve->request);
1263
1264 if (next && next->execution_mask & ~rq->execution_mask)
1265 tasklet_schedule(&ve->base.execlists.tasklet);
1266 }
1267
1268 static inline void
1269 __execlists_schedule_out(struct i915_request *rq,
1270 struct intel_engine_cs * const engine)
1271 {
1272 struct intel_context * const ce = rq->context;
1273
1274 /*
1275 * NB process_csb() is not under the engine->active.lock and hence
1276 * schedule_out can race with schedule_in meaning that we should
1277 * refrain from doing non-trivial work here.
1278 */
1279
1280 /*
1281 * If we have just completed this context, the engine may now be
1282 * idle and we want to re-enter powersaving.
1283 */
1284 if (list_is_last(&rq->link, &ce->timeline->requests) &&
1285 i915_request_completed(rq))
1286 intel_engine_add_retire(engine, ce->timeline);
1287
1288 intel_engine_context_out(engine);
1289 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1290 intel_gt_pm_put_async(engine->gt);
1291
1292 /*
1293 * If this is part of a virtual engine, its next request may
1294 * have been blocked waiting for access to the active context.
1295 * We have to kick all the siblings again in case we need to
1296 * switch (e.g. the next request is not runnable on this
1297 * engine). Hopefully, we will already have submitted the next
1298 * request before the tasklet runs and do not need to rebuild
1299 * each virtual tree and kick everyone again.
1300 */
1301 if (ce->engine != engine)
1302 kick_siblings(rq, ce);
1303
1304 intel_context_put(ce);
1305 }
1306
1307 static inline void
1308 execlists_schedule_out(struct i915_request *rq)
1309 {
1310 struct intel_context * const ce = rq->context;
1311 struct intel_engine_cs *cur, *old;
1312
1313 trace_i915_request_out(rq);
1314
1315 old = READ_ONCE(ce->inflight);
1316 do
1317 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1318 while (!try_cmpxchg(&ce->inflight, &old, cur));
1319 if (!cur)
1320 __execlists_schedule_out(rq, old);
1321
1322 i915_request_put(rq);
1323 }
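
/*
 * A note on the ce->inflight juggling above: ce->inflight packs the engine
 * on which the context is executing together with a small submission count
 * in the low bits of the pointer (a context may legitimately occupy both
 * ELSP ports at once). execlists_schedule_in() bumps that count with
 * ptr_inc() for every port after the first, and execlists_schedule_out()
 * drops it with ptr_dec(); only when the last port reference goes away
 * (the count is already zero, so ce->inflight becomes NULL) do we perform
 * the real __execlists_schedule_out().
 */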
1324
1325 static u64 execlists_update_context(struct i915_request *rq)
1326 {
1327 struct intel_context *ce = rq->context;
1328 u64 desc = ce->lrc_desc;
1329 u32 tail, prev;
1330
1331 /*
1332 * WaIdleLiteRestore:bdw,skl
1333 *
1334 * We should never submit the context with the same RING_TAIL twice
1335 * just in case we submit an empty ring, which confuses the HW.
1336 *
1337 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1338 * the normal request to be able to always advance the RING_TAIL on
1339 * subsequent resubmissions (for lite restore). Should that fail us,
1340 * and we try and submit the same tail again, force the context
1341 * reload.
1342 *
1343 * If we need to return to a preempted context, we need to skip the
1344 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1345 * HW has a tendency to ignore us rewinding the TAIL to the end of
1346 * an earlier request.
1347 */
1348 tail = intel_ring_set_tail(rq->ring, rq->tail);
1349 prev = ce->lrc_reg_state[CTX_RING_TAIL];
1350 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1351 desc |= CTX_DESC_FORCE_RESTORE;
1352 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1353 rq->tail = rq->wa_tail;
1354
1355 /*
1356 * Make sure the context image is complete before we submit it to HW.
1357 *
1358 * Ostensibly, writes (including the WCB) should be flushed prior to
1359 * an uncached write such as our mmio register access, the empirical
1360 * evidence (esp. on Braswell) suggests that the WC write into memory
1361 * may not be visible to the HW prior to the completion of the UC
1362 * register write and that we may begin execution from the context
1363 * before its image is complete leading to invalid PD chasing.
1364 */
1365 wmb();
1366
1367 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1368 return desc;
1369 }
1370
1371 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1372 {
1373 if (execlists->ctrl_reg) {
1374 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1375 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1376 } else {
1377 writel(upper_32_bits(desc), execlists->submit_reg);
1378 writel(lower_32_bits(desc), execlists->submit_reg);
1379 }
1380 }
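
/*
 * Note the write order in the legacy (no ctrl_reg) path above: the ELSP
 * expects the upper dword of each element first, with the final lower
 * dword write triggering the actual submission. On Gen11+ the descriptors
 * are instead staged in the submit queue (ELSQ) and the load is kicked
 * explicitly by writing EL_CTRL_LOAD to the control register, see
 * execlists_submit_ports().
 */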
1381
1382 static __maybe_unused void
1383 trace_ports(const struct intel_engine_execlists *execlists,
1384 const char *msg,
1385 struct i915_request * const *ports)
1386 {
1387 const struct intel_engine_cs *engine =
1388 container_of(execlists, typeof(*engine), execlists);
1389
1390 if (!ports[0])
1391 return;
1392
1393 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1394 ports[0]->fence.context,
1395 ports[0]->fence.seqno,
1396 i915_request_completed(ports[0]) ? "!" :
1397 i915_request_started(ports[0]) ? "*" :
1398 "",
1399 ports[1] ? ports[1]->fence.context : 0,
1400 ports[1] ? ports[1]->fence.seqno : 0);
1401 }
1402
1403 static __maybe_unused bool
1404 assert_pending_valid(const struct intel_engine_execlists *execlists,
1405 const char *msg)
1406 {
1407 struct i915_request * const *port, *rq;
1408 struct intel_context *ce = NULL;
1409
1410 trace_ports(execlists, msg, execlists->pending);
1411
1412 if (!execlists->pending[0]) {
1413 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1414 return false;
1415 }
1416
1417 if (execlists->pending[execlists_num_ports(execlists)]) {
1418 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1419 execlists_num_ports(execlists));
1420 return false;
1421 }
1422
1423 for (port = execlists->pending; (rq = *port); port++) {
1424 unsigned long flags;
1425 bool ok = true;
1426
1427 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1428 GEM_BUG_ON(!i915_request_is_active(rq));
1429
1430 if (ce == rq->context) {
1431 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1432 ce->timeline->fence_context,
1433 port - execlists->pending);
1434 return false;
1435 }
1436 ce = rq->context;
1437
1438 /* Hold tightly onto the lock to prevent concurrent retires! */
1439 if (!spin_trylock_irqsave(&rq->lock, flags))
1440 continue;
1441
1442 if (i915_request_completed(rq))
1443 goto unlock;
1444
1445 if (i915_active_is_idle(&ce->active) &&
1446 !intel_context_is_barrier(ce)) {
1447 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1448 ce->timeline->fence_context,
1449 port - execlists->pending);
1450 ok = false;
1451 goto unlock;
1452 }
1453
1454 if (!i915_vma_is_pinned(ce->state)) {
1455 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1456 ce->timeline->fence_context,
1457 port - execlists->pending);
1458 ok = false;
1459 goto unlock;
1460 }
1461
1462 if (!i915_vma_is_pinned(ce->ring->vma)) {
1463 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1464 ce->timeline->fence_context,
1465 port - execlists->pending);
1466 ok = false;
1467 goto unlock;
1468 }
1469
1470 unlock:
1471 spin_unlock_irqrestore(&rq->lock, flags);
1472 if (!ok)
1473 return false;
1474 }
1475
1476 return ce;
1477 }
1478
1479 static void execlists_submit_ports(struct intel_engine_cs *engine)
1480 {
1481 struct intel_engine_execlists *execlists = &engine->execlists;
1482 unsigned int n;
1483
1484 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1485
1486 /*
1487 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1488 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1489 * not be relinquished until the device is idle (see
1490 * i915_gem_idle_work_handler()). As a precaution, we make sure
1491 * that all ELSP are drained i.e. we have processed the CSB,
1492 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1493 */
1494 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1495
1496 /*
1497 * ELSQ note: the submit queue is not cleared after being submitted
1498 * to the HW so we need to make sure we always clean it up. This is
1499 * currently ensured by the fact that we always write the same number
1500 * of elsq entries, keep this in mind before changing the loop below.
1501 */
1502 for (n = execlists_num_ports(execlists); n--; ) {
1503 struct i915_request *rq = execlists->pending[n];
1504
1505 write_desc(execlists,
1506 rq ? execlists_update_context(rq) : 0,
1507 n);
1508 }
1509
1510 /* we need to manually load the submit queue */
1511 if (execlists->ctrl_reg)
1512 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1513 }
1514
1515 static bool ctx_single_port_submission(const struct intel_context *ce)
1516 {
1517 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1518 intel_context_force_single_submission(ce));
1519 }
1520
1521 static bool can_merge_ctx(const struct intel_context *prev,
1522 const struct intel_context *next)
1523 {
1524 if (prev != next)
1525 return false;
1526
1527 if (ctx_single_port_submission(prev))
1528 return false;
1529
1530 return true;
1531 }
1532
1533 static bool can_merge_rq(const struct i915_request *prev,
1534 const struct i915_request *next)
1535 {
1536 GEM_BUG_ON(prev == next);
1537 GEM_BUG_ON(!assert_priority_queue(prev, next));
1538
1539 /*
1540 * We do not submit known completed requests. Therefore if the next
1541 * request is already completed, we can pretend to merge it in
1542 * with the previous context (and we will skip updating the ELSP
1543 * and tracking). Thus hopefully keeping the ELSP full with active
1544 * contexts, despite the best efforts of preempt-to-busy to confuse
1545 * us.
1546 */
1547 if (i915_request_completed(next))
1548 return true;
1549
1550 if (unlikely((prev->fence.flags ^ next->fence.flags) &
1551 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1552 BIT(I915_FENCE_FLAG_SENTINEL))))
1553 return false;
1554
1555 if (!can_merge_ctx(prev->context, next->context))
1556 return false;
1557
1558 return true;
1559 }
1560
1561 static void virtual_update_register_offsets(u32 *regs,
1562 struct intel_engine_cs *engine)
1563 {
1564 set_offsets(regs, reg_offsets(engine), engine, false);
1565 }
1566
1567 static bool virtual_matches(const struct virtual_engine *ve,
1568 const struct i915_request *rq,
1569 const struct intel_engine_cs *engine)
1570 {
1571 const struct intel_engine_cs *inflight;
1572
1573 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1574 return false;
1575
1576 /*
1577 * We track when the HW has completed saving the context image
1578 * (i.e. when we have seen the final CS event switching out of
1579 * the context) and must not overwrite the context image before
1580 * then. This restricts us to only using the active engine
1581 * while the previous virtualized request is inflight (so
1582 * we reuse the register offsets). This is a very small
1583 * hysteresis on the greedy selection algorithm.
1584 */
1585 inflight = intel_context_inflight(&ve->context);
1586 if (inflight && inflight != engine)
1587 return false;
1588
1589 return true;
1590 }
1591
1592 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1593 struct intel_engine_cs *engine)
1594 {
1595 struct intel_engine_cs *old = ve->siblings[0];
1596
1597 /* All unattached (rq->engine == old) must already be completed */
1598
1599 spin_lock(&old->breadcrumbs.irq_lock);
1600 if (!list_empty(&ve->context.signal_link)) {
1601 list_move_tail(&ve->context.signal_link,
1602 &engine->breadcrumbs.signalers);
1603 intel_engine_signal_breadcrumbs(engine);
1604 }
1605 spin_unlock(&old->breadcrumbs.irq_lock);
1606 }
1607
1608 static struct i915_request *
1609 last_active(const struct intel_engine_execlists *execlists)
1610 {
1611 struct i915_request * const *last = READ_ONCE(execlists->active);
1612
1613 while (*last && i915_request_completed(*last))
1614 last++;
1615
1616 return *last;
1617 }
1618
1619 #define for_each_waiter(p__, rq__) \
1620 list_for_each_entry_lockless(p__, \
1621 &(rq__)->sched.waiters_list, \
1622 wait_link)
1623
1624 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1625 {
1626 LIST_HEAD(list);
1627
1628 /*
1629 * We want to move the interrupted request to the back of
1630 * the round-robin list (i.e. its priority level), but
1631 * in doing so, we must then also move all requests that were in
1632 * flight and waiting on the interrupted request, so that they
1633 * are run after it again.
1634 */
1635 do {
1636 struct i915_dependency *p;
1637
1638 GEM_BUG_ON(i915_request_is_active(rq));
1639 list_move_tail(&rq->sched.link, pl);
1640
1641 for_each_waiter(p, rq) {
1642 struct i915_request *w =
1643 container_of(p->waiter, typeof(*w), sched);
1644
1645 /* Leave semaphores spinning on the other engines */
1646 if (w->engine != rq->engine)
1647 continue;
1648
1649 /* No waiter should start before its signaler */
1650 GEM_BUG_ON(i915_request_started(w) &&
1651 !i915_request_completed(rq));
1652
1653 GEM_BUG_ON(i915_request_is_active(w));
1654 if (!i915_request_is_ready(w))
1655 continue;
1656
1657 if (rq_prio(w) < rq_prio(rq))
1658 continue;
1659
1660 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1661 list_move_tail(&w->sched.link, &list);
1662 }
1663
1664 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1665 } while (rq);
1666 }
1667
1668 static void defer_active(struct intel_engine_cs *engine)
1669 {
1670 struct i915_request *rq;
1671
1672 rq = __unwind_incomplete_requests(engine);
1673 if (!rq)
1674 return;
1675
1676 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1677 }
1678
1679 static bool
1680 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1681 {
1682 int hint;
1683
1684 if (!intel_engine_has_timeslices(engine))
1685 return false;
1686
1687 if (list_is_last(&rq->sched.link, &engine->active.requests))
1688 return false;
1689
1690 hint = max(rq_prio(list_next_entry(rq, sched.link)),
1691 engine->execlists.queue_priority_hint);
1692
1693 return hint >= effective_prio(rq);
1694 }
1695
1696 static int
1697 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1698 {
1699 if (list_is_last(&rq->sched.link, &engine->active.requests))
1700 return INT_MIN;
1701
1702 return rq_prio(list_next_entry(rq, sched.link));
1703 }
1704
1705 static inline unsigned long
1706 timeslice(const struct intel_engine_cs *engine)
1707 {
1708 return READ_ONCE(engine->props.timeslice_duration_ms);
1709 }
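
/*
 * The duration above comes from the per-engine properties; it is expected
 * to default to CONFIG_DRM_I915_TIMESLICE_DURATION (nominally a
 * millisecond) and may be retuned at runtime, hence the READ_ONCE().
 */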
1710
1711 static unsigned long
1712 active_timeslice(const struct intel_engine_cs *engine)
1713 {
1714 const struct i915_request *rq = *engine->execlists.active;
1715
1716 if (!rq || i915_request_completed(rq))
1717 return 0;
1718
1719 if (engine->execlists.switch_priority_hint < effective_prio(rq))
1720 return 0;
1721
1722 return timeslice(engine);
1723 }
1724
1725 static void set_timeslice(struct intel_engine_cs *engine)
1726 {
1727 if (!intel_engine_has_timeslices(engine))
1728 return;
1729
1730 set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1731 }
1732
1733 static void record_preemption(struct intel_engine_execlists *execlists)
1734 {
1735 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1736 }
1737
1738 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1739 {
1740 struct i915_request *rq;
1741
1742 rq = last_active(&engine->execlists);
1743 if (!rq)
1744 return 0;
1745
1746 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1747 if (unlikely(intel_context_is_banned(rq->context)))
1748 return 1;
1749
1750 return READ_ONCE(engine->props.preempt_timeout_ms);
1751 }
1752
1753 static void set_preempt_timeout(struct intel_engine_cs *engine)
1754 {
1755 if (!intel_engine_has_preempt_reset(engine))
1756 return;
1757
1758 set_timer_ms(&engine->execlists.preempt,
1759 active_preempt_timeout(engine));
1760 }
1761
1762 static inline void clear_ports(struct i915_request **ports, int count)
1763 {
1764 memset_p((void **)ports, NULL, count);
1765 }
1766
1767 static void execlists_dequeue(struct intel_engine_cs *engine)
1768 {
1769 struct intel_engine_execlists * const execlists = &engine->execlists;
1770 struct i915_request **port = execlists->pending;
1771 struct i915_request ** const last_port = port + execlists->port_mask;
1772 struct i915_request *last;
1773 struct rb_node *rb;
1774 bool submit = false;
1775
1776 /*
1777 * Hardware submission is through 2 ports. Conceptually each port
1778 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1779 * static for a context, and unique to each, so we only execute
1780 * requests belonging to a single context from each ring. RING_HEAD
1781 * is maintained by the CS in the context image, it marks the place
1782 * where it got up to last time, and through RING_TAIL we tell the CS
1783 * where we want to execute up to this time.
1784 *
1785 * In this list the requests are in order of execution. Consecutive
1786 * requests from the same context are adjacent in the ringbuffer. We
1787 * can combine these requests into a single RING_TAIL update:
1788 *
1789 * RING_HEAD...req1...req2
1790 * ^- RING_TAIL
1791 * since to execute req2 the CS must first execute req1.
1792 *
1793 * Our goal then is to point each port to the end of a consecutive
1794 * sequence of requests as being the most optimal (fewest wake ups
1795 * and context switches) submission.
1796 */
1797
1798 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1799 struct virtual_engine *ve =
1800 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1801 struct i915_request *rq = READ_ONCE(ve->request);
1802
1803 if (!rq) { /* lazily cleanup after another engine handled rq */
1804 rb_erase_cached(rb, &execlists->virtual);
1805 RB_CLEAR_NODE(rb);
1806 rb = rb_first_cached(&execlists->virtual);
1807 continue;
1808 }
1809
1810 if (!virtual_matches(ve, rq, engine)) {
1811 rb = rb_next(rb);
1812 continue;
1813 }
1814
1815 break;
1816 }
1817
1818 /*
1819 * If the queue is higher priority than the last
1820 * request in the currently active context, submit afresh.
1821 * We will resubmit again afterwards in case we need to split
1822 * the active context to interject the preemption request,
1823 * i.e. we will retrigger preemption following the ack in case
1824 * of trouble.
1825 */
1826 last = last_active(execlists);
1827 if (last) {
1828 if (need_preempt(engine, last, rb)) {
1829 ENGINE_TRACE(engine,
1830 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1831 last->fence.context,
1832 last->fence.seqno,
1833 last->sched.attr.priority,
1834 execlists->queue_priority_hint);
1835 record_preemption(execlists);
1836
1837 /*
1838 * Don't let the RING_HEAD advance past the breadcrumb
1839 * as we unwind (and until we resubmit) so that we do
1840 * not accidentally tell it to go backwards.
1841 */
1842 ring_set_paused(engine, 1);
1843
1844 /*
1845 * Note that we have not stopped the GPU at this point,
1846 * so we are unwinding the incomplete requests as they
1847 * remain inflight and so by the time we do complete
1848 * the preemption, some of the unwound requests may
1849 * complete!
1850 */
1851 __unwind_incomplete_requests(engine);
1852
1853 last = NULL;
1854 } else if (need_timeslice(engine, last) &&
1855 timer_expired(&engine->execlists.timer)) {
1856 ENGINE_TRACE(engine,
1857 "expired last=%llx:%lld, prio=%d, hint=%d\n",
1858 last->fence.context,
1859 last->fence.seqno,
1860 last->sched.attr.priority,
1861 execlists->queue_priority_hint);
1862
1863 ring_set_paused(engine, 1);
1864 defer_active(engine);
1865
1866 /*
1867 * Unlike for preemption, if we rewind and continue
1868 * executing the same context as previously active,
1869 * the order of execution will remain the same and
1870 * the tail will only advance. We do not need to
1871 * force a full context restore, as a lite-restore
1872 * is sufficient to resample the monotonic TAIL.
1873 *
1874 * If we switch to any other context, similarly we
1875 * will not rewind TAIL of current context, and
1876 * normal save/restore will preserve state and allow
1877 * us to later continue executing the same request.
1878 */
1879 last = NULL;
1880 } else {
1881 /*
1882 * Otherwise if we already have a request pending
1883 * for execution after the current one, we can
1884 * just wait until the next CS event before
1885 * queuing more. In either case we will force a
1886 * lite-restore preemption event, but if we wait
1887 * we hopefully coalesce several updates into a single
1888 * submission.
1889 */
1890 if (!list_is_last(&last->sched.link,
1891 &engine->active.requests)) {
1892 /*
1893 * Even if ELSP[1] is occupied and not worthy
1894 * of timeslices, our queue might be.
1895 */
1896 if (!execlists->timer.expires &&
1897 need_timeslice(engine, last))
1898 set_timer_ms(&execlists->timer,
1899 timeslice(engine));
1900
1901 return;
1902 }
1903 }
1904 }
1905
1906 while (rb) { /* XXX virtual is always taking precedence */
1907 struct virtual_engine *ve =
1908 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1909 struct i915_request *rq;
1910
1911 spin_lock(&ve->base.active.lock);
1912
1913 rq = ve->request;
1914 if (unlikely(!rq)) { /* lost the race to a sibling */
1915 spin_unlock(&ve->base.active.lock);
1916 rb_erase_cached(rb, &execlists->virtual);
1917 RB_CLEAR_NODE(rb);
1918 rb = rb_first_cached(&execlists->virtual);
1919 continue;
1920 }
1921
1922 GEM_BUG_ON(rq != ve->request);
1923 GEM_BUG_ON(rq->engine != &ve->base);
1924 GEM_BUG_ON(rq->context != &ve->context);
1925
1926 if (rq_prio(rq) >= queue_prio(execlists)) {
1927 if (!virtual_matches(ve, rq, engine)) {
1928 spin_unlock(&ve->base.active.lock);
1929 rb = rb_next(rb);
1930 continue;
1931 }
1932
1933 if (last && !can_merge_rq(last, rq)) {
1934 spin_unlock(&ve->base.active.lock);
1935 return; /* leave this for another */
1936 }
1937
1938 ENGINE_TRACE(engine,
1939 "virtual rq=%llx:%lld%s, new engine? %s\n",
1940 rq->fence.context,
1941 rq->fence.seqno,
1942 i915_request_completed(rq) ? "!" :
1943 i915_request_started(rq) ? "*" :
1944 "",
1945 yesno(engine != ve->siblings[0]));
1946
1947 ve->request = NULL;
1948 ve->base.execlists.queue_priority_hint = INT_MIN;
1949 rb_erase_cached(rb, &execlists->virtual);
1950 RB_CLEAR_NODE(rb);
1951
1952 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1953 rq->engine = engine;
1954
1955 if (engine != ve->siblings[0]) {
1956 u32 *regs = ve->context.lrc_reg_state;
1957 unsigned int n;
1958
1959 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1960
1961 if (!intel_engine_has_relative_mmio(engine))
1962 virtual_update_register_offsets(regs,
1963 engine);
1964
1965 if (!list_empty(&ve->context.signals))
1966 virtual_xfer_breadcrumbs(ve, engine);
1967
1968 /*
1969 * Move the bound engine to the top of the list
1970 * for future execution. We then kick this
1971 * tasklet first before checking others, so that
1972 * we preferentially reuse this set of bound
1973 * registers.
1974 */
1975 for (n = 1; n < ve->num_siblings; n++) {
1976 if (ve->siblings[n] == engine) {
1977 swap(ve->siblings[n],
1978 ve->siblings[0]);
1979 break;
1980 }
1981 }
1982
1983 GEM_BUG_ON(ve->siblings[0] != engine);
1984 }
1985
1986 if (__i915_request_submit(rq)) {
1987 submit = true;
1988 last = rq;
1989 }
1990 i915_request_put(rq);
1991
1992 /*
1993 * Hmm, we have a bunch of virtual engine requests,
1994 * but the first one was already completed (thanks
1995 * preempt-to-busy!). Keep looking at the veng queue
1996 * until we have no more relevant requests (i.e.
1997 * the normal submit queue has higher priority).
1998 */
1999 if (!submit) {
2000 spin_unlock(&ve->base.active.lock);
2001 rb = rb_first_cached(&execlists->virtual);
2002 continue;
2003 }
2004 }
2005
2006 spin_unlock(&ve->base.active.lock);
2007 break;
2008 }
2009
2010 while ((rb = rb_first_cached(&execlists->queue))) {
2011 struct i915_priolist *p = to_priolist(rb);
2012 struct i915_request *rq, *rn;
2013 int i;
2014
2015 priolist_for_each_request_consume(rq, rn, p, i) {
2016 bool merge = true;
2017
2018 /*
2019 * Can we combine this request with the current port?
2020 * It has to be the same context/ringbuffer and not
2021 * have any exceptions (e.g. GVT saying never to
2022 * combine contexts).
2023 *
2024 * If we can combine the requests, we can execute both
2025 * by updating the RING_TAIL to point to the end of the
2026 * second request, and so we never need to tell the
2027 * hardware about the first.
2028 */
2029 if (last && !can_merge_rq(last, rq)) {
2030 /*
2031 * If we are on the second port and cannot
2032 * combine this request with the last, then we
2033 * are done.
2034 */
2035 if (port == last_port)
2036 goto done;
2037
2038 /*
2039 * We must not populate both ELSP[] with the
2040 * same LRCA, i.e. we must submit 2 different
2041 * contexts if we submit 2 ELSP.
2042 */
2043 if (last->context == rq->context)
2044 goto done;
2045
2046 if (i915_request_has_sentinel(last))
2047 goto done;
2048
2049 /*
2050 * If GVT overrides us we only ever submit
2051 * port[0], leaving port[1] empty. Note that we
2052 * also have to be careful that we don't queue
2053 * the same context (even though a different
2054 * request) to the second port.
2055 */
2056 if (ctx_single_port_submission(last->context) ||
2057 ctx_single_port_submission(rq->context))
2058 goto done;
2059
2060 merge = false;
2061 }
2062
2063 if (__i915_request_submit(rq)) {
2064 if (!merge) {
2065 *port = execlists_schedule_in(last, port - execlists->pending);
2066 port++;
2067 last = NULL;
2068 }
2069
2070 GEM_BUG_ON(last &&
2071 !can_merge_ctx(last->context,
2072 rq->context));
2073
2074 submit = true;
2075 last = rq;
2076 }
2077 }
2078
2079 rb_erase_cached(&p->node, &execlists->queue);
2080 i915_priolist_free(p);
2081 }
2082
2083 done:
2084 /*
2085 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2086 *
2087 * We choose the priority hint such that if we add a request of greater
2088 * priority than this, we kick the submission tasklet to decide on
2089 * the right order of submitting the requests to hardware. We must
2090 * also be prepared to reorder requests as they are in-flight on the
2091 * HW. We derive the priority hint then as the first "hole" in
2092 * the HW submission ports and if there are no available slots,
2093 * the priority of the lowest executing request, i.e. last.
2094 *
2095 * When we do receive a higher priority request ready to run from the
2096 * user, see queue_request(), the priority hint is bumped to that
2097 * request triggering preemption on the next dequeue (or subsequent
2098 * interrupt for secondary ports).
2099 */
2100 execlists->queue_priority_hint = queue_prio(execlists);
2101
2102 if (submit) {
2103 *port = execlists_schedule_in(last, port - execlists->pending);
2104 execlists->switch_priority_hint =
2105 switch_prio(engine, *execlists->pending);
2106
2107 /*
2108 * Skip if we ended up with exactly the same set of requests,
2109 * e.g. trying to timeslice a pair of ordered contexts
2110 */
2111 if (!memcmp(execlists->active, execlists->pending,
2112 (port - execlists->pending + 1) * sizeof(*port))) {
2113 do
2114 execlists_schedule_out(fetch_and_zero(port));
2115 while (port-- != execlists->pending);
2116
2117 goto skip_submit;
2118 }
2119 clear_ports(port + 1, last_port - port);
2120
2121 execlists_submit_ports(engine);
2122 set_preempt_timeout(engine);
2123 } else {
2124 skip_submit:
2125 ring_set_paused(engine, 0);
2126 }
2127 }
2128
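/*
 * Release every request still held in the pending[] and inflight[]
 * ports, leaving both arrays cleared and execlists->active pointing
 * back at the (now empty) inflight[] array.
 */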
2129 static void
2130 cancel_port_requests(struct intel_engine_execlists * const execlists)
2131 {
2132 struct i915_request * const *port;
2133
2134 for (port = execlists->pending; *port; port++)
2135 execlists_schedule_out(*port);
2136 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2137
2138 /* Mark the end of active before we overwrite *active */
2139 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2140 execlists_schedule_out(*port);
2141 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2142
2143 WRITE_ONCE(execlists->active, execlists->inflight);
2144 }
2145
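/*
 * Flush the cachelines covering the CSB entries so that we do not read
 * back stale values after the HW has updated them.
 */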
2146 static inline void
2147 invalidate_csb_entries(const u32 *first, const u32 *last)
2148 {
2149 clflush((void *)first);
2150 clflush((void *)last);
2151 }
2152
2153 static inline bool
2154 reset_in_progress(const struct intel_engine_execlists *execlists)
2155 {
2156 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2157 }
2158
2159 /*
2160 * Starting with Gen12, the status has a new format:
2161 *
2162 * bit 0: switched to new queue
2163 * bit 1: reserved
2164 * bit 2: semaphore wait mode (poll or signal), only valid when
2165 * switch detail is set to "wait on semaphore"
2166 * bits 3-5: engine class
2167 * bits 6-11: engine instance
2168 * bits 12-14: reserved
2169 * bits 15-25: sw context id of the lrc the GT switched to
2170 * bits 26-31: sw counter of the lrc the GT switched to
2171 * bits 32-35: context switch detail
2172 * - 0: ctx complete
2173 * - 1: wait on sync flip
2174 * - 2: wait on vblank
2175 * - 3: wait on scanline
2176 * - 4: wait on semaphore
2177 * - 5: context preempted (not on SEMAPHORE_WAIT or
2178 * WAIT_FOR_EVENT)
2179 * bit 36: reserved
2180 * bits 37-43: wait detail (for switch detail 1 to 4)
2181 * bits 44-46: reserved
2182 * bits 47-57: sw context id of the lrc the GT switched away from
2183 * bits 58-63: sw counter of the lrc the GT switched away from
2184 */
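/*
 * All the parser below has to decide is whether this event promotes the
 * pending ELSP submission: it does so when there is no valid outbound
 * context, or when the "switched to new queue" bit is set.
 */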
2185 static inline bool
2186 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2187 {
2188 u32 lower_dw = csb[0];
2189 u32 upper_dw = csb[1];
2190 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2191 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2192 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2193
2194 /*
2195 * The context switch detail is not guaranteed to be 5 when a preemption
2196 * occurs, so we can't just check for that. The check below works for
2197 * all the cases we care about, including preemptions of WAIT
2198 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2199 * would require some extra handling, but we don't support that.
2200 */
2201 if (!ctx_away_valid || new_queue) {
2202 GEM_BUG_ON(!ctx_to_valid);
2203 return true;
2204 }
2205
2206 /*
2207 * switch detail = 5 is covered by the case above and we do not expect a
2208 * context switch on an unsuccessful wait instruction since we always
2209 * use polling mode.
2210 */
2211 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2212 return false;
2213 }
2214
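/*
 * Pre-Gen12 the status is a simple bitmask: both an idle->active
 * transition and a preemption event tell us the pending ELSP
 * submission has been accepted by the HW.
 */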
2215 static inline bool
2216 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2217 {
2218 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2219 }
2220
2221 static void process_csb(struct intel_engine_cs *engine)
2222 {
2223 struct intel_engine_execlists * const execlists = &engine->execlists;
2224 const u32 * const buf = execlists->csb_status;
2225 const u8 num_entries = execlists->csb_size;
2226 u8 head, tail;
2227
2228 /*
2229 * As we modify our execlists state tracking we require exclusive
2230 * access. Either we are inside the tasklet, or the tasklet is disabled
2231 * and we assume that is only inside the reset paths and so serialised.
2232 */
2233 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2234 !reset_in_progress(execlists));
2235 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2236
2237 /*
2238 * Note that csb_write, csb_status may be either in HWSP or mmio.
2239 * When reading from the csb_write mmio register, we have to be
2240 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2241 * the low 4bits. As it happens we know the next 4bits are always
2242 * zero and so we can simply mask off the low u8 of the register
2243 * and treat it identically to reading from the HWSP (without having
2244 * to use explicit shifting and masking, and probably bifurcating
2245 * the code to handle the legacy mmio read).
2246 */
2247 head = execlists->csb_head;
2248 tail = READ_ONCE(*execlists->csb_write);
2249 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2250 if (unlikely(head == tail))
2251 return;
2252
2253 /*
2254 * Hopefully paired with a wmb() in HW!
2255 *
2256 * We must complete the read of the write pointer before any reads
2257 * from the CSB, so that we do not see stale values. Without an rmb
2258 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2259 * we perform the READ_ONCE(*csb_write).
2260 */
2261 rmb();
2262
2263 do {
2264 bool promote;
2265
2266 if (++head == num_entries)
2267 head = 0;
2268
2269 /*
2270 * We are flying near dragons again.
2271 *
2272 * We hold a reference to the request in execlist_port[]
2273 * but no more than that. We are operating in softirq
2274 * context and so cannot hold any mutex or sleep. That
2275 * prevents us stopping the requests we are processing
2276 * in port[] from being retired simultaneously (the
2277 * breadcrumb will be complete before we see the
2278 * context-switch). As we only hold the reference to the
2279 * request, any pointer chasing underneath the request
2280 * is subject to a potential use-after-free. Thus we
2281 * store all of the bookkeeping within port[] as
2282 * required, and avoid using unguarded pointers beneath
2283 * request itself. The same applies to the atomic
2284 * status notifier.
2285 */
2286
2287 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2288 head, buf[2 * head + 0], buf[2 * head + 1]);
2289
2290 if (INTEL_GEN(engine->i915) >= 12)
2291 promote = gen12_csb_parse(execlists, buf + 2 * head);
2292 else
2293 promote = gen8_csb_parse(execlists, buf + 2 * head);
2294 if (promote) {
2295 struct i915_request * const *old = execlists->active;
2296
2297 /* Point active to the new ELSP; prevent overwriting */
2298 WRITE_ONCE(execlists->active, execlists->pending);
2299
2300 if (!inject_preempt_hang(execlists))
2301 ring_set_paused(engine, 0);
2302
2303 /* cancel old inflight, prepare for switch */
2304 trace_ports(execlists, "preempted", old);
2305 while (*old)
2306 execlists_schedule_out(*old++);
2307
2308 /* switch pending to inflight */
2309 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2310 WRITE_ONCE(execlists->active,
2311 memcpy(execlists->inflight,
2312 execlists->pending,
2313 execlists_num_ports(execlists) *
2314 sizeof(*execlists->pending)));
2315
2316 WRITE_ONCE(execlists->pending[0], NULL);
2317 } else {
2318 GEM_BUG_ON(!*execlists->active);
2319
2320 /* port0 completed, advanced to port1 */
2321 trace_ports(execlists, "completed", execlists->active);
2322
2323 /*
2324 * We rely on the hardware being strongly
2325 * ordered, that the breadcrumb write is
2326 * coherent (visible from the CPU) before the
2327 * user interrupt and CSB is processed.
2328 */
2329 GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2330 !reset_in_progress(execlists));
2331 execlists_schedule_out(*execlists->active++);
2332
2333 GEM_BUG_ON(execlists->active - execlists->inflight >
2334 execlists_num_ports(execlists));
2335 }
2336 } while (head != tail);
2337
2338 execlists->csb_head = head;
2339 set_timeslice(engine);
2340
2341 /*
2342 * Gen11 has proven to fail wrt global observation point between
2343 * entry and tail update, failing on the ordering and thus
2344 * we see an old entry in the context status buffer.
2345 *
2346 * Forcibly evict stale entries before the next gpu csb update,
2347 * to increase the odds that we get fresh entries from
2348 * non-working hardware. The cost of doing so comes out mostly in
2349 * the wash, as the hardware, working or not, will need to do the
2350 * invalidation before its next write.
2351 */
2352 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2353 }
2354
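/*
 * Only dequeue fresh work once the previous ELSP write has been
 * consumed (pending[0] cleared by process_csb()); otherwise we could
 * overwrite a submission the HW has not yet acknowledged.
 */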
2355 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2356 {
2357 lockdep_assert_held(&engine->active.lock);
2358 if (!engine->execlists.pending[0]) {
2359 rcu_read_lock(); /* protect peeking at execlists->active */
2360 execlists_dequeue(engine);
2361 rcu_read_unlock();
2362 }
2363 }
2364
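/*
 * Pull a request, and transitively any of its ready waiters on this
 * engine, off the execution queues and park them on the engine's hold
 * list until execlists_unhold() is called.
 */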
2365 static void __execlists_hold(struct i915_request *rq)
2366 {
2367 LIST_HEAD(list);
2368
2369 do {
2370 struct i915_dependency *p;
2371
2372 if (i915_request_is_active(rq))
2373 __i915_request_unsubmit(rq);
2374
2375 RQ_TRACE(rq, "on hold\n");
2376 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2377 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2378 i915_request_set_hold(rq);
2379
2380 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2381 struct i915_request *w =
2382 container_of(p->waiter, typeof(*w), sched);
2383
2384 /* Leave semaphores spinning on the other engines */
2385 if (w->engine != rq->engine)
2386 continue;
2387
2388 if (!i915_request_is_ready(w))
2389 continue;
2390
2391 if (i915_request_completed(w))
2392 continue;
2393
2394 if (i915_request_on_hold(w))
2395 continue;
2396
2397 list_move_tail(&w->sched.link, &list);
2398 }
2399
2400 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2401 } while (rq);
2402 }
2403
2404 static bool execlists_hold(struct intel_engine_cs *engine,
2405 struct i915_request *rq)
2406 {
2407 spin_lock_irq(&engine->active.lock);
2408
2409 if (i915_request_completed(rq)) { /* too late! */
2410 rq = NULL;
2411 goto unlock;
2412 }
2413
2414 if (rq->engine != engine) { /* preempted virtual engine */
2415 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2416
2417 /*
2418 * intel_context_inflight() is only protected by virtue
2419 * of process_csb() being called only by the tasklet (or
2420 * directly from inside reset while the tasklet is suspended).
2421 * Assert that neither of those are allowed to run while we
2422 * poke at the request queues.
2423 */
2424 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2425
2426 /*
2427 * An unsubmitted request along a virtual engine will
2428 * remain on the active (this) engine until we are able
2429 * to process the context switch away (and so mark the
2430 * context as no longer in flight). That cannot have happened
2431 * yet, otherwise we would not be hanging!
2432 */
2433 spin_lock(&ve->base.active.lock);
2434 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2435 GEM_BUG_ON(ve->request != rq);
2436 ve->request = NULL;
2437 spin_unlock(&ve->base.active.lock);
2438 i915_request_put(rq);
2439
2440 rq->engine = engine;
2441 }
2442
2443 /*
2444 * Transfer this request onto the hold queue to prevent it
2445 * being resubmitted to HW (and potentially completed) before we have
2446 * released it. Since we may have already submitted following
2447 * requests, we need to remove those as well.
2448 */
2449 GEM_BUG_ON(i915_request_on_hold(rq));
2450 GEM_BUG_ON(rq->engine != engine);
2451 __execlists_hold(rq);
2452
2453 unlock:
2454 spin_unlock_irq(&engine->active.lock);
2455 return rq;
2456 }
2457
2458 static bool hold_request(const struct i915_request *rq)
2459 {
2460 struct i915_dependency *p;
2461
2462 /*
2463 * If one of our ancestors is on hold, we must also be on hold,
2464 * otherwise we will bypass it and execute before it.
2465 */
2466 list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2467 const struct i915_request *s =
2468 container_of(p->signaler, typeof(*s), sched);
2469
2470 if (s->engine != rq->engine)
2471 continue;
2472
2473 if (i915_request_on_hold(s))
2474 return true;
2475 }
2476
2477 return false;
2478 }
2479
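/*
 * Inverse of __execlists_hold(): return the request to the priority
 * queue, along with any of its waiters on this engine that no longer
 * have a parent on hold.
 */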
2480 static void __execlists_unhold(struct i915_request *rq)
2481 {
2482 LIST_HEAD(list);
2483
2484 do {
2485 struct i915_dependency *p;
2486
2487 GEM_BUG_ON(!i915_request_on_hold(rq));
2488 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2489
2490 i915_request_clear_hold(rq);
2491 list_move_tail(&rq->sched.link,
2492 i915_sched_lookup_priolist(rq->engine,
2493 rq_prio(rq)));
2494 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2495 RQ_TRACE(rq, "hold release\n");
2496
2497 /* Also release any children on this engine that are ready */
2498 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2499 struct i915_request *w =
2500 container_of(p->waiter, typeof(*w), sched);
2501
2502 if (w->engine != rq->engine)
2503 continue;
2504
2505 if (!i915_request_on_hold(w))
2506 continue;
2507
2508 /* Check that no other parents are also on hold */
2509 if (hold_request(w))
2510 continue;
2511
2512 list_move_tail(&w->sched.link, &list);
2513 }
2514
2515 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2516 } while (rq);
2517 }
2518
2519 static void execlists_unhold(struct intel_engine_cs *engine,
2520 struct i915_request *rq)
2521 {
2522 spin_lock_irq(&engine->active.lock);
2523
2524 /*
2525 * Move this request back to the priority queue, and all of its
2526 * children and grandchildren that were suspended along with it.
2527 */
2528 __execlists_unhold(rq);
2529
2530 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2531 engine->execlists.queue_priority_hint = rq_prio(rq);
2532 tasklet_hi_schedule(&engine->execlists.tasklet);
2533 }
2534
2535 spin_unlock_irq(&engine->active.lock);
2536 }
2537
2538 struct execlists_capture {
2539 struct work_struct work;
2540 struct i915_request *rq;
2541 struct i915_gpu_coredump *error;
2542 };
2543
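/*
 * Worker that compresses and publishes the error state captured on a
 * preempt-timeout, then releases the request we kept on hold while
 * capturing it.
 */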
2544 static void execlists_capture_work(struct work_struct *work)
2545 {
2546 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2547 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2548 struct intel_engine_cs *engine = cap->rq->engine;
2549 struct intel_gt_coredump *gt = cap->error->gt;
2550 struct intel_engine_capture_vma *vma;
2551
2552 /* Compress all the objects attached to the request, slow! */
2553 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2554 if (vma) {
2555 struct i915_vma_compress *compress =
2556 i915_vma_capture_prepare(gt);
2557
2558 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2559 i915_vma_capture_finish(gt, compress);
2560 }
2561
2562 gt->simulated = gt->engine->simulated;
2563 cap->error->simulated = gt->simulated;
2564
2565 /* Publish the error state, and announce it to the world */
2566 i915_error_state_store(cap->error);
2567 i915_gpu_coredump_put(cap->error);
2568
2569 /* Return this request and all that depend upon it for signaling */
2570 execlists_unhold(engine, cap->rq);
2571 i915_request_put(cap->rq);
2572
2573 kfree(cap);
2574 }
2575
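/*
 * Allocate the error state containers for the capture using GFP_ATOMIC
 * (we cannot sleep inside the tasklet); returns NULL if any allocation
 * fails.
 */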
2576 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2577 {
2578 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2579 struct execlists_capture *cap;
2580
2581 cap = kmalloc(sizeof(*cap), gfp);
2582 if (!cap)
2583 return NULL;
2584
2585 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2586 if (!cap->error)
2587 goto err_cap;
2588
2589 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2590 if (!cap->error->gt)
2591 goto err_gpu;
2592
2593 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2594 if (!cap->error->gt->engine)
2595 goto err_gt;
2596
2597 return cap;
2598
2599 err_gt:
2600 kfree(cap->error->gt);
2601 err_gpu:
2602 kfree(cap->error);
2603 err_cap:
2604 kfree(cap);
2605 return NULL;
2606 }
2607
2608 static bool execlists_capture(struct intel_engine_cs *engine)
2609 {
2610 struct execlists_capture *cap;
2611
2612 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2613 return true;
2614
2615 /*
2616 * We need to _quickly_ capture the engine state before we reset.
2617 * We are inside an atomic section (softirq) here and we are delaying
2618 * the forced preemption event.
2619 */
2620 cap = capture_regs(engine);
2621 if (!cap)
2622 return true;
2623
2624 cap->rq = execlists_active(&engine->execlists);
2625 GEM_BUG_ON(!cap->rq);
2626
2627 rcu_read_lock();
2628 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2629 cap->rq = i915_request_get_rcu(cap->rq);
2630 rcu_read_unlock();
2631 if (!cap->rq)
2632 goto err_free;
2633
2634 /*
2635 * Remove the request from the execlists queue, and take ownership
2636 * of the request. We pass it to our worker who will _slowly_ compress
2637 * all the pages the _user_ requested for debugging their batch, after
2638 * which we return it to the queue for signaling.
2639 *
2640 * By removing them from the execlists queue, we also remove the
2641 * requests from being processed by __unwind_incomplete_requests()
2642 * during the intel_engine_reset(), and so they will *not* be replayed
2643 * afterwards.
2644 *
2645 * Note that because we have not yet reset the engine at this point,
2646 * it is possible that the request we have identified as being
2647 * guilty did in fact complete and we will then hit an arbitration
2648 * point allowing the outstanding preemption to succeed. The likelihood
2649 * of that is very low (as capturing of the engine registers should be
2650 * fast enough to run inside an irq-off atomic section!), so we will
2651 * simply hold that request accountable for being non-preemptible
2652 * long enough to force the reset.
2653 */
2654 if (!execlists_hold(engine, cap->rq))
2655 goto err_rq;
2656
2657 INIT_WORK(&cap->work, execlists_capture_work);
2658 schedule_work(&cap->work);
2659 return true;
2660
2661 err_rq:
2662 i915_request_put(cap->rq);
2663 err_free:
2664 i915_gpu_coredump_put(cap->error);
2665 kfree(cap);
2666 return false;
2667 }
2668
2669 static noinline void preempt_reset(struct intel_engine_cs *engine)
2670 {
2671 const unsigned int bit = I915_RESET_ENGINE + engine->id;
2672 unsigned long *lock = &engine->gt->reset.flags;
2673
2674 if (i915_modparams.reset < 3)
2675 return;
2676
2677 if (test_and_set_bit(bit, lock))
2678 return;
2679
2680 /* Mark this tasklet as disabled to avoid waiting for it to complete */
2681 tasklet_disable_nosync(&engine->execlists.tasklet);
2682
2683 ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2684 READ_ONCE(engine->props.preempt_timeout_ms),
2685 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2686
2687 ring_set_paused(engine, 1); /* Freeze the current request in place */
2688 if (execlists_capture(engine))
2689 intel_engine_reset(engine, "preemption time out");
2690 else
2691 ring_set_paused(engine, 0);
2692
2693 tasklet_enable(&engine->execlists.tasklet);
2694 clear_and_wake_up_bit(bit, lock);
2695 }
2696
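/*
 * A preempt-timeout has only truly fired if the preemption timer has
 * expired while we still have a submission waiting in pending[] for
 * the HW to acknowledge.
 */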
2697 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2698 {
2699 const struct timer_list *t = &engine->execlists.preempt;
2700
2701 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2702 return false;
2703
2704 if (!timer_expired(t))
2705 return false;
2706
2707 return READ_ONCE(engine->execlists.pending[0]);
2708 }
2709
2710 /*
2711 * Check the unread Context Status Buffers and manage the submission of new
2712 * contexts to the ELSP accordingly.
2713 */
2714 static void execlists_submission_tasklet(unsigned long data)
2715 {
2716 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2717 bool timeout = preempt_timeout(engine);
2718
2719 process_csb(engine);
2720 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2721 unsigned long flags;
2722
2723 spin_lock_irqsave(&engine->active.lock, flags);
2724 __execlists_submission_tasklet(engine);
2725 spin_unlock_irqrestore(&engine->active.lock, flags);
2726
2727 /* Recheck after serialising with direct-submission */
2728 if (timeout && preempt_timeout(engine))
2729 preempt_reset(engine);
2730 }
2731 }
2732
2733 static void __execlists_kick(struct intel_engine_execlists *execlists)
2734 {
2735 /* Kick the tasklet for some interrupt coalescing and reset handling */
2736 tasklet_hi_schedule(&execlists->tasklet);
2737 }
2738
2739 #define execlists_kick(t, member) \
2740 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2741
2742 static void execlists_timeslice(struct timer_list *timer)
2743 {
2744 execlists_kick(timer, timer);
2745 }
2746
2747 static void execlists_preempt(struct timer_list *timer)
2748 {
2749 execlists_kick(timer, preempt);
2750 }
2751
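/* Add the request to the priority queue for the next dequeue to pick up. */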
2752 static void queue_request(struct intel_engine_cs *engine,
2753 struct i915_request *rq)
2754 {
2755 GEM_BUG_ON(!list_empty(&rq->sched.link));
2756 list_add_tail(&rq->sched.link,
2757 i915_sched_lookup_priolist(engine, rq_prio(rq)));
2758 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2759 }
2760
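/*
 * Kick the submission: run the dequeue directly if the tasklet is ours
 * and no reset is in progress, otherwise leave it to the scheduled
 * tasklet.
 */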
2761 static void __submit_queue_imm(struct intel_engine_cs *engine)
2762 {
2763 struct intel_engine_execlists * const execlists = &engine->execlists;
2764
2765 if (reset_in_progress(execlists))
2766 return; /* defer until we restart the engine following reset */
2767
2768 if (execlists->tasklet.func == execlists_submission_tasklet)
2769 __execlists_submission_tasklet(engine);
2770 else
2771 tasklet_hi_schedule(&execlists->tasklet);
2772 }
2773
2774 static void submit_queue(struct intel_engine_cs *engine,
2775 const struct i915_request *rq)
2776 {
2777 struct intel_engine_execlists *execlists = &engine->execlists;
2778
2779 if (rq_prio(rq) <= execlists->queue_priority_hint)
2780 return;
2781
2782 execlists->queue_priority_hint = rq_prio(rq);
2783 __submit_queue_imm(engine);
2784 }
2785
2786 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2787 const struct i915_request *rq)
2788 {
2789 GEM_BUG_ON(i915_request_on_hold(rq));
2790 return !list_empty(&engine->active.hold) && hold_request(rq);
2791 }
2792
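/*
 * Backend entry point for request submission: either queue the request
 * for the next dequeue (kicking it if this raises the priority hint),
 * or park it on the hold list if one of its ancestors is on hold.
 */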
2793 static void execlists_submit_request(struct i915_request *request)
2794 {
2795 struct intel_engine_cs *engine = request->engine;
2796 unsigned long flags;
2797
2798 /* Will be called from irq-context when using foreign fences. */
2799 spin_lock_irqsave(&engine->active.lock, flags);
2800
2801 if (unlikely(ancestor_on_hold(engine, request))) {
2802 list_add_tail(&request->sched.link, &engine->active.hold);
2803 i915_request_set_hold(request);
2804 } else {
2805 queue_request(engine, request);
2806
2807 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2808 GEM_BUG_ON(list_empty(&request->sched.link));
2809
2810 submit_queue(engine, request);
2811 }
2812
2813 spin_unlock_irqrestore(&engine->active.lock, flags);
2814 }
2815
2816 static void __execlists_context_fini(struct intel_context *ce)
2817 {
2818 intel_ring_put(ce->ring);
2819 i915_vma_put(ce->state);
2820 }
2821
2822 static void execlists_context_destroy(struct kref *kref)
2823 {
2824 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2825
2826 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2827 GEM_BUG_ON(intel_context_is_pinned(ce));
2828
2829 if (ce->state)
2830 __execlists_context_fini(ce);
2831
2832 intel_context_fini(ce);
2833 intel_context_free(ce);
2834 }
2835
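/*
 * Under CONFIG_DRM_I915_DEBUG_GEM we poison the page after the context
 * image so that check_redzone() can detect the context state being
 * overwritten beyond its expected size.
 */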
2836 static void
2837 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2838 {
2839 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2840 return;
2841
2842 vaddr += engine->context_size;
2843
2844 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2845 }
2846
2847 static void
2848 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2849 {
2850 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2851 return;
2852
2853 vaddr += engine->context_size;
2854
2855 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2856 dev_err_once(engine->i915->drm.dev,
2857 "%s context redzone overwritten!\n",
2858 engine->name);
2859 }
2860
2861 static void execlists_context_unpin(struct intel_context *ce)
2862 {
2863 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2864 ce->engine);
2865
2866 i915_gem_object_unpin_map(ce->state->obj);
2867 }
2868
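/*
 * Refresh the ring registers (START/HEAD/TAIL), and for the render
 * class the RPCS/OA state, stored in the context image so the HW picks
 * up our current bookkeeping on the next context restore.
 */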
2869 static void
2870 __execlists_update_reg_state(const struct intel_context *ce,
2871 const struct intel_engine_cs *engine,
2872 u32 head)
2873 {
2874 struct intel_ring *ring = ce->ring;
2875 u32 *regs = ce->lrc_reg_state;
2876
2877 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2878 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2879
2880 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2881 regs[CTX_RING_HEAD] = head;
2882 regs[CTX_RING_TAIL] = ring->tail;
2883
2884 /* RPCS */
2885 if (engine->class == RENDER_CLASS) {
2886 regs[CTX_R_PWR_CLK_STATE] =
2887 intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2888
2889 i915_oa_init_reg_state(ce, engine);
2890 }
2891 }
2892
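/*
 * Map the context image and reinitialise our pointers into it; the
 * backing vma must already be pinned by the caller.
 */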
2893 static int
2894 __execlists_context_pin(struct intel_context *ce,
2895 struct intel_engine_cs *engine)
2896 {
2897 void *vaddr;
2898
2899 GEM_BUG_ON(!ce->state);
2900 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2901
2902 vaddr = i915_gem_object_pin_map(ce->state->obj,
2903 i915_coherent_map_type(engine->i915) |
2904 I915_MAP_OVERRIDE);
2905 if (IS_ERR(vaddr))
2906 return PTR_ERR(vaddr);
2907
2908 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2909 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2910 __execlists_update_reg_state(ce, engine, ce->ring->tail);
2911
2912 return 0;
2913 }
2914
2915 static int execlists_context_pin(struct intel_context *ce)
2916 {
2917 return __execlists_context_pin(ce, ce->engine);
2918 }
2919
2920 static int execlists_context_alloc(struct intel_context *ce)
2921 {
2922 return __execlists_context_alloc(ce, ce->engine);
2923 }
2924
2925 static void execlists_context_reset(struct intel_context *ce)
2926 {
2927 CE_TRACE(ce, "reset\n");
2928 GEM_BUG_ON(!intel_context_is_pinned(ce));
2929
2930 /*
2931 * Because we emit WA_TAIL_DWORDS there may be a disparity
2932 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2933 * that stored in the context. As we only write new commands from
2934 * ce->ring->tail onwards, everything before that is junk. If the GPU
2935 * starts reading from the RING_HEAD stored in the context, it may try
2936 * to execute that junk and die.
2937 *
2938 * The contexts that are still pinned on resume belong to the
2939 * kernel, and are local to each engine. All other contexts will
2940 * have their head/tail sanitized upon pinning before use, so they
2941 * will never see garbage.
2942 *
2943 * So to avoid that we reset the context images upon resume. For
2944 * simplicity, we just zero everything out.
2945 */
2946 intel_ring_reset(ce->ring, ce->ring->emit);
2947
2948 /* Scrub away the garbage */
2949 execlists_init_reg_state(ce->lrc_reg_state,
2950 ce, ce->engine, ce->ring, true);
2951 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
2952
2953 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2954 }
2955
2956 static const struct intel_context_ops execlists_context_ops = {
2957 .alloc = execlists_context_alloc,
2958
2959 .pin = execlists_context_pin,
2960 .unpin = execlists_context_unpin,
2961
2962 .enter = intel_context_enter_engine,
2963 .exit = intel_context_exit_engine,
2964
2965 .reset = execlists_context_reset,
2966 .destroy = execlists_context_destroy,
2967 };
2968
2969 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2970 {
2971 u32 *cs;
2972
2973 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2974
2975 cs = intel_ring_begin(rq, 6);
2976 if (IS_ERR(cs))
2977 return PTR_ERR(cs);
2978
2979 /*
2980 * Check if we have been preempted before we even get started.
2981 *
2982 * After this point i915_request_started() reports true, even if
2983 * we get preempted and so are no longer running.
2984 */
2985 *cs++ = MI_ARB_CHECK;
2986 *cs++ = MI_NOOP;
2987
2988 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2989 *cs++ = i915_request_timeline(rq)->hwsp_offset;
2990 *cs++ = 0;
2991 *cs++ = rq->fence.seqno - 1;
2992
2993 intel_ring_advance(rq, cs);
2994
2995 /* Record the updated position of the request's payload */
2996 rq->infix = intel_ring_offset(rq, cs);
2997
2998 return 0;
2999 }
3000
3001 static int execlists_request_alloc(struct i915_request *request)
3002 {
3003 int ret;
3004
3005 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3006
3007 /*
3008 * Flush enough space to reduce the likelihood of waiting after
3009 * we start building the request - in which case we will just
3010 * have to repeat work.
3011 */
3012 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3013
3014 /*
3015 * Note that after this point, we have committed to using
3016 * this request as it is being used to both track the
3017 * state of engine initialisation and liveness of the
3018 * golden renderstate above. Think twice before you try
3019 * to cancel/unwind this request now.
3020 */
3021
3022 /* Unconditionally invalidate GPU caches and TLBs. */
3023 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3024 if (ret)
3025 return ret;
3026
3027 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3028 return 0;
3029 }
3030
3031 /*
3032 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3033 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3034 * but there is a slight complication as this is applied in the WA batch where
3035 * the values are only initialized once, so we cannot read the register value
3036 * at the beginning and reuse it later; hence we save its value to memory, upload
3037 * a constant value with bit21 set and then restore it back from the saved value.
3038 * To simplify the WA, a constant value is formed by using the default value
3039 * of this register. This shouldn't be a problem because we are only modifying
3040 * it for a short period and this batch is non-preemptible. We could of course
3041 * use additional instructions that read the actual value of the register
3042 * at that time and set our bit of interest, but that makes the WA complicated.
3043 *
3044 * This WA is also required for Gen9 so extracting as a function avoids
3045 * code duplication.
3046 */
3047 static u32 *
3048 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3049 {
3050 /* NB no one else is allowed to scribble over scratch + 256! */
3051 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3052 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3053 *batch++ = intel_gt_scratch_offset(engine->gt,
3054 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3055 *batch++ = 0;
3056
3057 *batch++ = MI_LOAD_REGISTER_IMM(1);
3058 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3059 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3060
3061 batch = gen8_emit_pipe_control(batch,
3062 PIPE_CONTROL_CS_STALL |
3063 PIPE_CONTROL_DC_FLUSH_ENABLE,
3064 0);
3065
3066 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3067 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3068 *batch++ = intel_gt_scratch_offset(engine->gt,
3069 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3070 *batch++ = 0;
3071
3072 return batch;
3073 }
3074
3075 /*
3076 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3077 * initialized at the beginning and shared across all contexts, but this field
3078 * helps us to have multiple batches at different offsets and select them based
3079 * on some criteria. At the moment this batch always starts at the beginning of
3080 * the page and we don't have multiple wa_ctx batch buffers.
3081 *
3082 * The number of WAs applied is not known at the beginning; we use this field
3083 * to return the number of DWORDs written.
3084 *
3085 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
3086 * so it adds NOOPs as padding to make it cacheline aligned.
3087 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
3088 * together make a complete batch buffer.
3089 */
3090 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3091 {
3092 /* WaDisableCtxRestoreArbitration:bdw,chv */
3093 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3094
3095 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3096 if (IS_BROADWELL(engine->i915))
3097 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3098
3099 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3100 /* Actual scratch location is at 128 bytes offset */
3101 batch = gen8_emit_pipe_control(batch,
3102 PIPE_CONTROL_FLUSH_L3 |
3103 PIPE_CONTROL_STORE_DATA_INDEX |
3104 PIPE_CONTROL_CS_STALL |
3105 PIPE_CONTROL_QW_WRITE,
3106 LRC_PPHWSP_SCRATCH_ADDR);
3107
3108 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3109
3110 /* Pad to end of cacheline */
3111 while ((unsigned long)batch % CACHELINE_BYTES)
3112 *batch++ = MI_NOOP;
3113
3114 /*
3115 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3116 * execution depends on the length specified in terms of cache lines
3117 * in the register CTX_RCS_INDIRECT_CTX
3118 */
3119
3120 return batch;
3121 }
3122
3123 struct lri {
3124 i915_reg_t reg;
3125 u32 value;
3126 };
3127
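/*
 * Emit a single MI_LOAD_REGISTER_IMM covering the whole table of
 * (register, value) pairs.
 */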
3128 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3129 {
3130 GEM_BUG_ON(!count || count > 63);
3131
3132 *batch++ = MI_LOAD_REGISTER_IMM(count);
3133 do {
3134 *batch++ = i915_mmio_reg_offset(lri->reg);
3135 *batch++ = lri->value;
3136 } while (lri++, --count);
3137 *batch++ = MI_NOOP;
3138
3139 return batch;
3140 }
3141
3142 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3143 {
3144 static const struct lri lri[] = {
3145 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3146 {
3147 COMMON_SLICE_CHICKEN2,
3148 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3149 0),
3150 },
3151
3152 /* BSpec: 11391 */
3153 {
3154 FF_SLICE_CHICKEN,
3155 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3156 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3157 },
3158
3159 /* BSpec: 11299 */
3160 {
3161 _3D_CHICKEN3,
3162 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3163 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3164 }
3165 };
3166
3167 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3168
3169 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3170 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3171
3172 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3173 batch = gen8_emit_pipe_control(batch,
3174 PIPE_CONTROL_FLUSH_L3 |
3175 PIPE_CONTROL_STORE_DATA_INDEX |
3176 PIPE_CONTROL_CS_STALL |
3177 PIPE_CONTROL_QW_WRITE,
3178 LRC_PPHWSP_SCRATCH_ADDR);
3179
3180 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3181
3182 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3183 if (HAS_POOLED_EU(engine->i915)) {
3184 /*
3185 * EU pool configuration is set up along with the golden context
3186 * during context initialization. This value depends on the
3187 * device type (2x6 or 3x6) and needs to be updated based
3188 * on which subslice is disabled, especially for 2x6
3189 * devices. However, it is safe to load the default
3190 * configuration of a 3x6 device instead of masking off the
3191 * corresponding bits, because HW ignores bits of a disabled
3192 * subslice and drops down to the appropriate config. Please
3193 * see render_state_setup() in i915_gem_render_state.c for
3194 * possible configurations; to avoid duplication they are
3195 * not shown here again.
3196 */
3197 *batch++ = GEN9_MEDIA_POOL_STATE;
3198 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3199 *batch++ = 0x00777000;
3200 *batch++ = 0;
3201 *batch++ = 0;
3202 *batch++ = 0;
3203 }
3204
3205 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3206
3207 /* Pad to end of cacheline */
3208 while ((unsigned long)batch % CACHELINE_BYTES)
3209 *batch++ = MI_NOOP;
3210
3211 return batch;
3212 }
3213
3214 static u32 *
3215 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3216 {
3217 int i;
3218
3219 /*
3220 * WaPipeControlBefore3DStateSamplePattern: cnl
3221 *
3222 * Ensure the engine is idle prior to programming a
3223 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3224 */
3225 batch = gen8_emit_pipe_control(batch,
3226 PIPE_CONTROL_CS_STALL,
3227 0);
3228 /*
3229 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3230 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3231 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3232 * confusing. Since gen8_emit_pipe_control() already advances the
3233 * batch by 6 dwords, we advance the other 10 here, completing a
3234 * cacheline. It's not clear if the workaround requires this padding
3235 * before other commands, or if it's just the regular padding we would
3236 * already have for the workaround bb, so leave it here for now.
3237 */
3238 for (i = 0; i < 10; i++)
3239 *batch++ = MI_NOOP;
3240
3241 /* Pad to end of cacheline */
3242 while ((unsigned long)batch % CACHELINE_BYTES)
3243 *batch++ = MI_NOOP;
3244
3245 return batch;
3246 }
3247
3248 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3249
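/*
 * Allocate and pin a single page in the GGTT to hold the workaround
 * batch buffers written by intel_init_workaround_bb().
 */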
3250 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3251 {
3252 struct drm_i915_gem_object *obj;
3253 struct i915_vma *vma;
3254 int err;
3255
3256 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3257 if (IS_ERR(obj))
3258 return PTR_ERR(obj);
3259
3260 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3261 if (IS_ERR(vma)) {
3262 err = PTR_ERR(vma);
3263 goto err;
3264 }
3265
3266 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3267 if (err)
3268 goto err;
3269
3270 engine->wa_ctx.vma = vma;
3271 return 0;
3272
3273 err:
3274 i915_gem_object_put(obj);
3275 return err;
3276 }
3277
3278 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3279 {
3280 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3281 }
3282
3283 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3284
3285 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3286 {
3287 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3288 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3289 &wa_ctx->per_ctx };
3290 wa_bb_func_t wa_bb_fn[2];
3291 struct page *page;
3292 void *batch, *batch_ptr;
3293 unsigned int i;
3294 int ret;
3295
3296 if (engine->class != RENDER_CLASS)
3297 return 0;
3298
3299 switch (INTEL_GEN(engine->i915)) {
3300 case 12:
3301 case 11:
3302 return 0;
3303 case 10:
3304 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3305 wa_bb_fn[1] = NULL;
3306 break;
3307 case 9:
3308 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3309 wa_bb_fn[1] = NULL;
3310 break;
3311 case 8:
3312 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3313 wa_bb_fn[1] = NULL;
3314 break;
3315 default:
3316 MISSING_CASE(INTEL_GEN(engine->i915));
3317 return 0;
3318 }
3319
3320 ret = lrc_setup_wa_ctx(engine);
3321 if (ret) {
3322 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3323 return ret;
3324 }
3325
3326 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3327 batch = batch_ptr = kmap_atomic(page);
3328
3329 /*
3330 * Emit the two workaround batch buffers, recording the offset from the
3331 * start of the workaround batch buffer object for each and their
3332 * respective sizes.
3333 */
3334 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3335 wa_bb[i]->offset = batch_ptr - batch;
3336 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3337 CACHELINE_BYTES))) {
3338 ret = -EINVAL;
3339 break;
3340 }
3341 if (wa_bb_fn[i])
3342 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3343 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3344 }
3345
3346 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3347
3348 kunmap_atomic(batch);
3349 if (ret)
3350 lrc_destroy_wa_ctx(engine);
3351
3352 return ret;
3353 }
3354
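/*
 * Program the engine to run in execlists mode and point it at its
 * status page; the caller must already hold forcewake.
 */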
3355 static void enable_execlists(struct intel_engine_cs *engine)
3356 {
3357 u32 mode;
3358
3359 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3360
3361 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3362
3363 if (INTEL_GEN(engine->i915) >= 11)
3364 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3365 else
3366 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3367 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3368
3369 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3370
3371 ENGINE_WRITE_FW(engine,
3372 RING_HWS_PGA,
3373 i915_ggtt_offset(engine->status_page.vma));
3374 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3375
3376 engine->context_tag = 0;
3377 }
3378
3379 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3380 {
3381 bool unexpected = false;
3382
3383 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3384 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3385 unexpected = true;
3386 }
3387
3388 return unexpected;
3389 }
3390
3391 static int execlists_resume(struct intel_engine_cs *engine)
3392 {
3393 intel_engine_apply_workarounds(engine);
3394 intel_engine_apply_whitelist(engine);
3395
3396 intel_mocs_init_engine(engine);
3397
3398 intel_engine_reset_breadcrumbs(engine);
3399
3400 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3401 struct drm_printer p = drm_debug_printer(__func__);
3402
3403 intel_engine_dump(engine, &p, NULL);
3404 }
3405
3406 enable_execlists(engine);
3407
3408 return 0;
3409 }
3410
3411 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3412 {
3413 struct intel_engine_execlists * const execlists = &engine->execlists;
3414 unsigned long flags;
3415
3416 ENGINE_TRACE(engine, "depth<-%d\n",
3417 atomic_read(&execlists->tasklet.count));
3418
3419 /*
3420 * Prevent request submission to the hardware until we have
3421 * completed the reset in i915_gem_reset_finish(). If a request
3422 * is completed by one engine, it may then queue a request
3423 * to a second via its execlists->tasklet *just* as we are
3424 * calling engine->resume() and also writing the ELSP.
3425 * Turning off the execlists->tasklet until the reset is over
3426 * prevents the race.
3427 */
3428 __tasklet_disable_sync_once(&execlists->tasklet);
3429 GEM_BUG_ON(!reset_in_progress(execlists));
3430
3431 /* And flush any current direct submission. */
3432 spin_lock_irqsave(&engine->active.lock, flags);
3433 spin_unlock_irqrestore(&engine->active.lock, flags);
3434
3435 /*
3436 * We stop engines, otherwise we might get failed reset and a
3437 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
3438 * from a system hang if a batchbuffer is in progress when
3439 * the reset is issued, regardless of the READY_TO_RESET ack.
3440 * Thus assume it is best to stop engines on all gens
3441 * where we have a gpu reset.
3442 *
3443 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3444 *
3445 * FIXME: Wa for more modern gens needs to be validated
3446 */
3447 intel_engine_stop_cs(engine);
3448 }
3449
3450 static void reset_csb_pointers(struct intel_engine_cs *engine)
3451 {
3452 struct intel_engine_execlists * const execlists = &engine->execlists;
3453 const unsigned int reset_value = execlists->csb_size - 1;
3454
3455 ring_set_paused(engine, 0);
3456
3457 /*
3458 * After a reset, the HW starts writing into CSB entry [0]. We
3459 * therefore have to set our HEAD pointer back one entry so that
3460 * the *first* entry we check is entry 0. To complicate this further,
3461 * as we don't wait for the first interrupt after reset, we have to
3462 * fake the HW write to point back to the last entry so that our
3463 * inline comparison of our cached head position against the last HW
3464 * write works even before the first interrupt.
3465 */
3466 execlists->csb_head = reset_value;
3467 WRITE_ONCE(*execlists->csb_write, reset_value);
3468 wmb(); /* Make sure this is visible to HW (paranoia?) */
3469
3470 /*
3471 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3472 * Bludgeon them with a mmio update to be sure.
3473 */
3474 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3475 reset_value << 8 | reset_value);
3476 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3477
3478 invalidate_csb_entries(&execlists->csb_status[0],
3479 &execlists->csb_status[reset_value]);
3480 }
3481
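/*
 * Clear STOP_RING (a masked-write register) in the saved context image
 * so the engine does not restart frozen after the reset.
 */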
3482 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3483 {
3484 int x;
3485
3486 x = lrc_ring_mi_mode(engine);
3487 if (x != -1) {
3488 regs[x + 1] &= ~STOP_RING;
3489 regs[x + 1] |= STOP_RING << 16;
3490 }
3491 }
3492
3493 static void __execlists_reset_reg_state(const struct intel_context *ce,
3494 const struct intel_engine_cs *engine)
3495 {
3496 u32 *regs = ce->lrc_reg_state;
3497
3498 __reset_stop_ring(regs, engine);
3499 }
3500
3501 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3502 {
3503 struct intel_engine_execlists * const execlists = &engine->execlists;
3504 struct intel_context *ce;
3505 struct i915_request *rq;
3506 u32 head;
3507
3508 mb(); /* paranoia: read the CSB pointers from after the reset */
3509 clflush(execlists->csb_write);
3510 mb();
3511
3512 process_csb(engine); /* drain preemption events */
3513
3514 /* Following the reset, we need to reload the CSB read/write pointers */
3515 reset_csb_pointers(engine);
3516
3517 /*
3518 * Save the currently executing context, even if we completed
3519 * its request, it was still running at the time of the
3520 * reset and will have been clobbered.
3521 */
3522 rq = execlists_active(execlists);
3523 if (!rq)
3524 goto unwind;
3525
3526 /* We still have requests in-flight; the engine should be active */
3527 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3528
3529 ce = rq->context;
3530 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3531
3532 if (i915_request_completed(rq)) {
3533 /* Idle context; tidy up the ring so we can restart afresh */
3534 head = intel_ring_wrap(ce->ring, rq->tail);
3535 goto out_replay;
3536 }
3537
3538 /* Context has requests still in-flight; it should not be idle! */
3539 GEM_BUG_ON(i915_active_is_idle(&ce->active));
3540 rq = active_request(ce->timeline, rq);
3541 head = intel_ring_wrap(ce->ring, rq->head);
3542 GEM_BUG_ON(head == ce->ring->tail);
3543
3544 /*
3545 * If this request hasn't started yet, e.g. it is waiting on a
3546 * semaphore, we need to avoid skipping the request or else we
3547 * break the signaling chain. However, if the context is corrupt
3548 * the request will not restart and we will be stuck with a wedged
3549 * device. It is quite often the case that if we issue a reset
3550 * while the GPU is loading the context image, the context
3551 * image becomes corrupt.
3552 *
3553 * Otherwise, if we have not started yet, the request should replay
3554 * perfectly and we do not need to flag the result as being erroneous.
3555 */
3556 if (!i915_request_started(rq))
3557 goto out_replay;
3558
3559 /*
3560 * If the request was innocent, we leave the request in the ELSP
3561 * and will try to replay it on restarting. The context image may
3562 * have been corrupted by the reset, in which case we may have
3563 * to service a new GPU hang, but more likely we can continue on
3564 * without impact.
3565 *
3566 * If the request was guilty, we presume the context is corrupt
3567 * and have to at least restore the RING register in the context
3568 * image back to the expected values to skip over the guilty request.
3569 */
3570 __i915_request_reset(rq, stalled);
3571 if (!stalled)
3572 goto out_replay;
3573
3574 /*
3575 * We want a simple context + ring to execute the breadcrumb update.
3576 * We cannot rely on the context being intact across the GPU hang,
3577 * so clear it and rebuild just what we need for the breadcrumb.
3578 * All pending requests for this context will be zapped, and any
3579 * future request will be after userspace has had the opportunity
3580 * to recreate its own state.
3581 */
3582 GEM_BUG_ON(!intel_context_is_pinned(ce));
3583 restore_default_state(ce, engine);
3584
3585 out_replay:
3586 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3587 head, ce->ring->tail);
3588 __execlists_reset_reg_state(ce, engine);
3589 __execlists_update_reg_state(ce, engine, head);
3590 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3591
3592 unwind:
3593 /* Push back any incomplete requests for replay after the reset. */
3594 cancel_port_requests(execlists);
3595 __unwind_incomplete_requests(engine);
3596 }
3597
3598 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3599 {
3600 unsigned long flags;
3601
3602 ENGINE_TRACE(engine, "\n");
3603
3604 spin_lock_irqsave(&engine->active.lock, flags);
3605
3606 __execlists_reset(engine, stalled);
3607
3608 spin_unlock_irqrestore(&engine->active.lock, flags);
3609 }
3610
3611 static void nop_submission_tasklet(unsigned long data)
3612 {
3613 /* The driver is wedged; don't process any more events. */
3614 }
3615
3616 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3617 {
3618 struct intel_engine_execlists * const execlists = &engine->execlists;
3619 struct i915_request *rq, *rn;
3620 struct rb_node *rb;
3621 unsigned long flags;
3622
3623 ENGINE_TRACE(engine, "\n");
3624
3625 /*
3626 * Before we call engine->cancel_requests(), we should have exclusive
3627 * access to the submission state. This is arranged for us by the
3628 * caller disabling the interrupt generation, the tasklet and other
3629 * threads that may then access the same state, giving us a free hand
3630 	 * to reset state. However, we still need to let lockdep know that
3631 	 * this state may be accessed in hardirq context, so we disable
3632 	 * the irq around this manipulation. We also want to keep the
3633 	 * spinlock focused on its duties and not accidentally conflate
3634 	 * its coverage with the submission's irq state. (Similarly,
3635 	 * although we shouldn't need to disable the irq around the
3636 	 * manipulation of the submission's irq state, we also wish to
3637 	 * remind ourselves that it is irq state.)
3638 */
3639 spin_lock_irqsave(&engine->active.lock, flags);
3640
3641 __execlists_reset(engine, true);
3642
3643 /* Mark all executing requests as skipped. */
3644 list_for_each_entry(rq, &engine->active.requests, sched.link)
3645 mark_eio(rq);
3646
3647 /* Flush the queued requests to the timeline list (for retiring). */
3648 while ((rb = rb_first_cached(&execlists->queue))) {
3649 struct i915_priolist *p = to_priolist(rb);
3650 int i;
3651
3652 priolist_for_each_request_consume(rq, rn, p, i) {
3653 mark_eio(rq);
3654 __i915_request_submit(rq);
3655 }
3656
3657 rb_erase_cached(&p->node, &execlists->queue);
3658 i915_priolist_free(p);
3659 }
3660
3661 /* On-hold requests will be flushed to timeline upon their release */
3662 list_for_each_entry(rq, &engine->active.hold, sched.link)
3663 mark_eio(rq);
3664
3665 /* Cancel all attached virtual engines */
3666 while ((rb = rb_first_cached(&execlists->virtual))) {
3667 struct virtual_engine *ve =
3668 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3669
3670 rb_erase_cached(rb, &execlists->virtual);
3671 RB_CLEAR_NODE(rb);
3672
3673 spin_lock(&ve->base.active.lock);
3674 rq = fetch_and_zero(&ve->request);
3675 if (rq) {
3676 mark_eio(rq);
3677
3678 rq->engine = engine;
3679 __i915_request_submit(rq);
3680 i915_request_put(rq);
3681
3682 ve->base.execlists.queue_priority_hint = INT_MIN;
3683 }
3684 spin_unlock(&ve->base.active.lock);
3685 }
3686
3687 /* Remaining _unready_ requests will be nop'ed when submitted */
3688
3689 execlists->queue_priority_hint = INT_MIN;
3690 execlists->queue = RB_ROOT_CACHED;
3691
3692 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3693 execlists->tasklet.func = nop_submission_tasklet;
3694
3695 spin_unlock_irqrestore(&engine->active.lock, flags);
3696 }
3697
3698 static void execlists_reset_finish(struct intel_engine_cs *engine)
3699 {
3700 struct intel_engine_execlists * const execlists = &engine->execlists;
3701
3702 /*
3703 * After a GPU reset, we may have requests to replay. Do so now while
3704 * we still have the forcewake to be sure that the GPU is not allowed
3705 * to sleep before we restart and reload a context.
3706 */
3707 GEM_BUG_ON(!reset_in_progress(execlists));
3708 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3709 execlists->tasklet.func(execlists->tasklet.data);
3710
3711 if (__tasklet_enable(&execlists->tasklet))
3712 /* And kick in case we missed a new request submission. */
3713 tasklet_hi_schedule(&execlists->tasklet);
3714 ENGINE_TRACE(engine, "depth->%d\n",
3715 atomic_read(&execlists->tasklet.count));
3716 }
3717
3718 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3719 u64 offset, u32 len,
3720 const unsigned int flags)
3721 {
3722 u32 *cs;
3723
3724 cs = intel_ring_begin(rq, 4);
3725 if (IS_ERR(cs))
3726 return PTR_ERR(cs);
3727
3728 /*
3729 * WaDisableCtxRestoreArbitration:bdw,chv
3730 *
3731 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3732 	 * particular all the gen that do not need the w/a at all!): if we
3733 	 * took care to make sure that on every switch into this context
3734 	 * (both ordinary and for preemption) arbitration was enabled,
3735 * we would be fine. However, for gen8 there is another w/a that
3736 * requires us to not preempt inside GPGPU execution, so we keep
3737 * arbitration disabled for gen8 batches. Arbitration will be
3738 * re-enabled before we close the request
3739 * (engine->emit_fini_breadcrumb).
3740 */
3741 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3742
3743 /* FIXME(BDW+): Address space and security selectors. */
3744 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3745 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3746 *cs++ = lower_32_bits(offset);
3747 *cs++ = upper_32_bits(offset);
3748
3749 intel_ring_advance(rq, cs);
3750
3751 return 0;
3752 }
3753
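/*
 * gen8_emit_bb_start: as gen8_emit_bb_start_noarb, except that arbitration
 * is explicitly enabled around the batch so it may be preempted, and then
 * disabled again until the closing breadcrumb re-enables it.
 */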
3754 static int gen8_emit_bb_start(struct i915_request *rq,
3755 u64 offset, u32 len,
3756 const unsigned int flags)
3757 {
3758 u32 *cs;
3759
3760 cs = intel_ring_begin(rq, 6);
3761 if (IS_ERR(cs))
3762 return PTR_ERR(cs);
3763
3764 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3765
3766 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3767 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3768 *cs++ = lower_32_bits(offset);
3769 *cs++ = upper_32_bits(offset);
3770
3771 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3772 *cs++ = MI_NOOP;
3773
3774 intel_ring_advance(rq, cs);
3775
3776 return 0;
3777 }
3778
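/*
 * RING_IMR helpers: enabling unmasks both the user interrupt and the
 * irq_keep_mask bits (context-switch events); disabling re-masks only the
 * user interrupt, leaving irq_keep_mask unmasked at all times.
 */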
3779 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3780 {
3781 ENGINE_WRITE(engine, RING_IMR,
3782 ~(engine->irq_enable_mask | engine->irq_keep_mask));
3783 ENGINE_POSTING_READ(engine, RING_IMR);
3784 }
3785
3786 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3787 {
3788 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3789 }
3790
3791 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3792 {
3793 u32 cmd, *cs;
3794
3795 cs = intel_ring_begin(request, 4);
3796 if (IS_ERR(cs))
3797 return PTR_ERR(cs);
3798
3799 cmd = MI_FLUSH_DW + 1;
3800
3801 /* We always require a command barrier so that subsequent
3802 * commands, such as breadcrumb interrupts, are strictly ordered
3803 * wrt the contents of the write cache being flushed to memory
3804 * (and thus being coherent from the CPU).
3805 */
3806 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3807
3808 if (mode & EMIT_INVALIDATE) {
3809 cmd |= MI_INVALIDATE_TLB;
3810 if (request->engine->class == VIDEO_DECODE_CLASS)
3811 cmd |= MI_INVALIDATE_BSD;
3812 }
3813
3814 *cs++ = cmd;
3815 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3816 *cs++ = 0; /* upper addr */
3817 *cs++ = 0; /* value */
3818 intel_ring_advance(request, cs);
3819
3820 return 0;
3821 }
3822
3823 static int gen8_emit_flush_render(struct i915_request *request,
3824 u32 mode)
3825 {
3826 bool vf_flush_wa = false, dc_flush_wa = false;
3827 u32 *cs, flags = 0;
3828 int len;
3829
3830 flags |= PIPE_CONTROL_CS_STALL;
3831
3832 if (mode & EMIT_FLUSH) {
3833 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3834 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3835 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3836 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3837 }
3838
3839 if (mode & EMIT_INVALIDATE) {
3840 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3841 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3842 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3843 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3844 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3845 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3846 flags |= PIPE_CONTROL_QW_WRITE;
3847 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3848
3849 /*
3850 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3851 * pipe control.
3852 */
3853 if (IS_GEN(request->i915, 9))
3854 vf_flush_wa = true;
3855
3856 /* WaForGAMHang:kbl */
3857 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3858 dc_flush_wa = true;
3859 }
3860
3861 len = 6;
3862
3863 if (vf_flush_wa)
3864 len += 6;
3865
3866 if (dc_flush_wa)
3867 len += 12;
3868
3869 cs = intel_ring_begin(request, len);
3870 if (IS_ERR(cs))
3871 return PTR_ERR(cs);
3872
3873 if (vf_flush_wa)
3874 cs = gen8_emit_pipe_control(cs, 0, 0);
3875
3876 if (dc_flush_wa)
3877 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3878 0);
3879
3880 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3881
3882 if (dc_flush_wa)
3883 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3884
3885 intel_ring_advance(request, cs);
3886
3887 return 0;
3888 }
3889
3890 static int gen11_emit_flush_render(struct i915_request *request,
3891 u32 mode)
3892 {
3893 if (mode & EMIT_FLUSH) {
3894 u32 *cs;
3895 u32 flags = 0;
3896
3897 flags |= PIPE_CONTROL_CS_STALL;
3898
3899 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3900 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3901 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3902 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3903 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3904 flags |= PIPE_CONTROL_QW_WRITE;
3905 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3906
3907 cs = intel_ring_begin(request, 6);
3908 if (IS_ERR(cs))
3909 return PTR_ERR(cs);
3910
3911 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3912 intel_ring_advance(request, cs);
3913 }
3914
3915 if (mode & EMIT_INVALIDATE) {
3916 u32 *cs;
3917 u32 flags = 0;
3918
3919 flags |= PIPE_CONTROL_CS_STALL;
3920
3921 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3922 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3923 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3924 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3925 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3926 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3927 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3928 flags |= PIPE_CONTROL_QW_WRITE;
3929 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3930
3931 cs = intel_ring_begin(request, 6);
3932 if (IS_ERR(cs))
3933 return PTR_ERR(cs);
3934
3935 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3936 intel_ring_advance(request, cs);
3937 }
3938
3939 return 0;
3940 }
3941
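/*
 * preparser_disable: gen12 extends MI_ARB_CHECK to toggle the command
 * streamer's pre-parser; bit 8 appears to qualify the toggle and bit 0
 * carries the requested disable state. Used below around TLB invalidation,
 * e.g. *cs++ = preparser_disable(true); ... *cs++ = preparser_disable(false);
 */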
3942 static u32 preparser_disable(bool state)
3943 {
3944 return MI_ARB_CHECK | 1 << 8 | state;
3945 }
3946
3947 static int gen12_emit_flush_render(struct i915_request *request,
3948 u32 mode)
3949 {
3950 if (mode & EMIT_FLUSH) {
3951 u32 flags = 0;
3952 u32 *cs;
3953
3954 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3955 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3956 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3957 /* Wa_1409600907:tgl */
3958 flags |= PIPE_CONTROL_DEPTH_STALL;
3959 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3960 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3961 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3962
3963 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3964 flags |= PIPE_CONTROL_QW_WRITE;
3965
3966 flags |= PIPE_CONTROL_CS_STALL;
3967
3968 cs = intel_ring_begin(request, 6);
3969 if (IS_ERR(cs))
3970 return PTR_ERR(cs);
3971
3972 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3973 intel_ring_advance(request, cs);
3974 }
3975
3976 if (mode & EMIT_INVALIDATE) {
3977 u32 flags = 0;
3978 u32 *cs;
3979
3980 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3981 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3982 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3983 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3984 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3985 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3986 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3987 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3988
3989 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3990 flags |= PIPE_CONTROL_QW_WRITE;
3991
3992 flags |= PIPE_CONTROL_CS_STALL;
3993
3994 cs = intel_ring_begin(request, 8);
3995 if (IS_ERR(cs))
3996 return PTR_ERR(cs);
3997
3998 /*
3999 * Prevent the pre-parser from skipping past the TLB
4000 * invalidate and loading a stale page for the batch
4001 * buffer / request payload.
4002 */
4003 *cs++ = preparser_disable(true);
4004
4005 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4006
4007 *cs++ = preparser_disable(false);
4008 intel_ring_advance(request, cs);
4009
4010 /*
4011 * Wa_1604544889:tgl
4012 */
4013 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
4014 flags = 0;
4015 flags |= PIPE_CONTROL_CS_STALL;
4016 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4017
4018 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4019 flags |= PIPE_CONTROL_QW_WRITE;
4020
4021 cs = intel_ring_begin(request, 6);
4022 if (IS_ERR(cs))
4023 return PTR_ERR(cs);
4024
4025 cs = gen8_emit_pipe_control(cs, flags,
4026 LRC_PPHWSP_SCRATCH_ADDR);
4027 intel_ring_advance(request, cs);
4028 }
4029 }
4030
4031 return 0;
4032 }
4033
4034 /*
4035 * Reserve space for 2 NOOPs at the end of each request to be
4036 * used as a workaround for not being allowed to do lite
4037 * restore with HEAD==TAIL (WaIdleLiteRestore).
4038 */
4039 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4040 {
4041 /* Ensure there's always at least one preemption point per-request. */
4042 *cs++ = MI_ARB_CHECK;
4043 *cs++ = MI_NOOP;
4044 request->wa_tail = intel_ring_offset(request, cs);
4045
4046 return cs;
4047 }
4048
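/*
 * emit_preempt_busywait: emit an MI_SEMAPHORE_WAIT that polls the engine's
 * preemption slot in the HWSP (via its global GTT address) until it reads
 * back as zero; writing a non-zero value into that slot therefore stalls
 * the ring at this point at the end of each request during preemption.
 */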
4049 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4050 {
4051 *cs++ = MI_SEMAPHORE_WAIT |
4052 MI_SEMAPHORE_GLOBAL_GTT |
4053 MI_SEMAPHORE_POLL |
4054 MI_SEMAPHORE_SAD_EQ_SDD;
4055 *cs++ = 0;
4056 *cs++ = intel_hws_preempt_address(request->engine);
4057 *cs++ = 0;
4058
4059 return cs;
4060 }
4061
4062 static __always_inline u32 *
4063 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4064 u32 *cs)
4065 {
4066 *cs++ = MI_USER_INTERRUPT;
4067
4068 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4069 if (intel_engine_has_semaphores(request->engine))
4070 cs = emit_preempt_busywait(request, cs);
4071
4072 request->tail = intel_ring_offset(request, cs);
4073 assert_ring_tail_valid(request->ring, request->tail);
4074
4075 return gen8_emit_wa_tail(request, cs);
4076 }
4077
4078 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4079 {
4080 cs = gen8_emit_ggtt_write(cs,
4081 request->fence.seqno,
4082 i915_request_active_timeline(request)->hwsp_offset,
4083 0);
4084
4085 return gen8_emit_fini_breadcrumb_footer(request, cs);
4086 }
4087
4088 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4089 {
4090 cs = gen8_emit_pipe_control(cs,
4091 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4092 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4093 PIPE_CONTROL_DC_FLUSH_ENABLE,
4094 0);
4095
4096 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4097 cs = gen8_emit_ggtt_write_rcs(cs,
4098 request->fence.seqno,
4099 i915_request_active_timeline(request)->hwsp_offset,
4100 PIPE_CONTROL_FLUSH_ENABLE |
4101 PIPE_CONTROL_CS_STALL);
4102
4103 return gen8_emit_fini_breadcrumb_footer(request, cs);
4104 }
4105
4106 static u32 *
4107 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4108 {
4109 cs = gen8_emit_ggtt_write_rcs(cs,
4110 request->fence.seqno,
4111 i915_request_active_timeline(request)->hwsp_offset,
4112 PIPE_CONTROL_CS_STALL |
4113 PIPE_CONTROL_TILE_CACHE_FLUSH |
4114 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4115 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4116 PIPE_CONTROL_DC_FLUSH_ENABLE |
4117 PIPE_CONTROL_FLUSH_ENABLE);
4118
4119 return gen8_emit_fini_breadcrumb_footer(request, cs);
4120 }
4121
4122 /*
4123 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4124 * flush and will continue pre-fetching the instructions after it before the
4125 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4126  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4127 * of the next request before the memory has been flushed, we're guaranteed that
4128 * we won't access the batch itself too early.
4129 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4130 * so, if the current request is modifying an instruction in the next request on
4131 * the same intel_context, we might pre-fetch and then execute the pre-update
4132 * instruction. To avoid this, the users of self-modifying code should either
4133 * disable the parser around the code emitting the memory writes, via a new flag
4134 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4135 * the in-kernel use-cases we've opted to use a separate context, see
4136 * reloc_gpu() as an example.
4137 * All the above applies only to the instructions themselves. Non-inline data
4138 * used by the instructions is not pre-fetched.
4139 */
4140
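/*
 * gen12_emit_preempt_busywait: gen12 variant of emit_preempt_busywait using
 * the token form of MI_SEMAPHORE_WAIT, which is two dwords longer and
 * padded with an MI_NOOP to keep the emitted dword count even.
 */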
4141 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4142 {
4143 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4144 MI_SEMAPHORE_GLOBAL_GTT |
4145 MI_SEMAPHORE_POLL |
4146 MI_SEMAPHORE_SAD_EQ_SDD;
4147 *cs++ = 0;
4148 *cs++ = intel_hws_preempt_address(request->engine);
4149 *cs++ = 0;
4150 *cs++ = 0;
4151 *cs++ = MI_NOOP;
4152
4153 return cs;
4154 }
4155
4156 static __always_inline u32 *
4157 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4158 {
4159 *cs++ = MI_USER_INTERRUPT;
4160
4161 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4162 if (intel_engine_has_semaphores(request->engine))
4163 cs = gen12_emit_preempt_busywait(request, cs);
4164
4165 request->tail = intel_ring_offset(request, cs);
4166 assert_ring_tail_valid(request->ring, request->tail);
4167
4168 return gen8_emit_wa_tail(request, cs);
4169 }
4170
4171 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4172 {
4173 cs = gen8_emit_ggtt_write(cs,
4174 request->fence.seqno,
4175 i915_request_active_timeline(request)->hwsp_offset,
4176 0);
4177
4178 return gen12_emit_fini_breadcrumb_footer(request, cs);
4179 }
4180
4181 static u32 *
4182 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4183 {
4184 cs = gen8_emit_ggtt_write_rcs(cs,
4185 request->fence.seqno,
4186 i915_request_active_timeline(request)->hwsp_offset,
4187 PIPE_CONTROL_CS_STALL |
4188 PIPE_CONTROL_TILE_CACHE_FLUSH |
4189 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4190 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4191 /* Wa_1409600907:tgl */
4192 PIPE_CONTROL_DEPTH_STALL |
4193 PIPE_CONTROL_DC_FLUSH_ENABLE |
4194 PIPE_CONTROL_FLUSH_ENABLE |
4195 PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4196
4197 return gen12_emit_fini_breadcrumb_footer(request, cs);
4198 }
4199
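/* On parking (idling) the engine, stop the timeslice and preemption timers. */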
4200 static void execlists_park(struct intel_engine_cs *engine)
4201 {
4202 cancel_timer(&engine->execlists.timer);
4203 cancel_timer(&engine->execlists.preempt);
4204 }
4205
4206 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4207 {
4208 engine->submit_request = execlists_submit_request;
4209 engine->schedule = i915_schedule;
4210 engine->execlists.tasklet.func = execlists_submission_tasklet;
4211
4212 engine->reset.prepare = execlists_reset_prepare;
4213 engine->reset.rewind = execlists_reset_rewind;
4214 engine->reset.cancel = execlists_reset_cancel;
4215 engine->reset.finish = execlists_reset_finish;
4216
4217 engine->park = execlists_park;
4218 engine->unpark = NULL;
4219
4220 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4221 if (!intel_vgpu_active(engine->i915)) {
4222 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4223 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4224 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4225 }
4226
4227 if (INTEL_GEN(engine->i915) >= 12)
4228 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4229
4230 if (intel_engine_has_preemption(engine))
4231 engine->emit_bb_start = gen8_emit_bb_start;
4232 else
4233 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4234 }
4235
4236 static void execlists_shutdown(struct intel_engine_cs *engine)
4237 {
4238 /* Synchronise with residual timers and any softirq they raise */
4239 del_timer_sync(&engine->execlists.timer);
4240 del_timer_sync(&engine->execlists.preempt);
4241 tasklet_kill(&engine->execlists.tasklet);
4242 }
4243
4244 static void execlists_release(struct intel_engine_cs *engine)
4245 {
4246 execlists_shutdown(engine);
4247
4248 intel_engine_cleanup_common(engine);
4249 lrc_destroy_wa_ctx(engine);
4250 }
4251
4252 static void
4253 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4254 {
4255 	/* Default vfuncs which can be overridden by each engine. */
4256
4257 engine->resume = execlists_resume;
4258
4259 engine->cops = &execlists_context_ops;
4260 engine->request_alloc = execlists_request_alloc;
4261
4262 engine->emit_flush = gen8_emit_flush;
4263 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4264 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4265 if (INTEL_GEN(engine->i915) >= 12)
4266 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4267
4268 engine->set_default_submission = intel_execlists_set_default_submission;
4269
4270 if (INTEL_GEN(engine->i915) < 11) {
4271 engine->irq_enable = gen8_logical_ring_enable_irq;
4272 engine->irq_disable = gen8_logical_ring_disable_irq;
4273 } else {
4274 /*
4275 * TODO: On Gen11 interrupt masks need to be clear
4276 		 * to allow C6 entry. Keep interrupts enabled at all times
4277 * and take the hit of generating extra interrupts
4278 * until a more refined solution exists.
4279 */
4280 }
4281 }
4282
4283 static inline void
4284 logical_ring_default_irqs(struct intel_engine_cs *engine)
4285 {
4286 unsigned int shift = 0;
4287
4288 if (INTEL_GEN(engine->i915) < 11) {
4289 const u8 irq_shifts[] = {
4290 [RCS0] = GEN8_RCS_IRQ_SHIFT,
4291 [BCS0] = GEN8_BCS_IRQ_SHIFT,
4292 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
4293 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
4294 [VECS0] = GEN8_VECS_IRQ_SHIFT,
4295 };
4296
4297 shift = irq_shifts[engine->id];
4298 }
4299
4300 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4301 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4302 }
4303
4304 static void rcs_submission_override(struct intel_engine_cs *engine)
4305 {
4306 switch (INTEL_GEN(engine->i915)) {
4307 case 12:
4308 engine->emit_flush = gen12_emit_flush_render;
4309 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4310 break;
4311 case 11:
4312 engine->emit_flush = gen11_emit_flush_render;
4313 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4314 break;
4315 default:
4316 engine->emit_flush = gen8_emit_flush_render;
4317 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4318 break;
4319 }
4320 }
4321
4322 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4323 {
4324 struct intel_engine_execlists * const execlists = &engine->execlists;
4325 struct drm_i915_private *i915 = engine->i915;
4326 struct intel_uncore *uncore = engine->uncore;
4327 u32 base = engine->mmio_base;
4328
4329 tasklet_init(&engine->execlists.tasklet,
4330 execlists_submission_tasklet, (unsigned long)engine);
4331 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4332 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4333
4334 logical_ring_default_vfuncs(engine);
4335 logical_ring_default_irqs(engine);
4336
4337 if (engine->class == RENDER_CLASS)
4338 rcs_submission_override(engine);
4339
4340 if (intel_init_workaround_bb(engine))
4341 /*
4342 		 * We continue even if we fail to initialize the WA batch
4343 		 * because we only expect rare glitches, nothing critical
4344 		 * enough to prevent us from using the GPU.
4345 */
4346 DRM_ERROR("WA batch buffer initialization failed\n");
4347
4348 if (HAS_LOGICAL_RING_ELSQ(i915)) {
4349 execlists->submit_reg = uncore->regs +
4350 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4351 execlists->ctrl_reg = uncore->regs +
4352 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4353 } else {
4354 execlists->submit_reg = uncore->regs +
4355 i915_mmio_reg_offset(RING_ELSP(base));
4356 }
4357
4358 execlists->csb_status =
4359 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4360
4361 execlists->csb_write =
4362 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4363
4364 if (INTEL_GEN(i915) < 11)
4365 execlists->csb_size = GEN8_CSB_ENTRIES;
4366 else
4367 execlists->csb_size = GEN11_CSB_ENTRIES;
4368
4369 reset_csb_pointers(engine);
4370
4371 /* Finally, take ownership and responsibility for cleanup! */
4372 engine->release = execlists_release;
4373
4374 return 0;
4375 }
4376
4377 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4378 {
4379 u32 indirect_ctx_offset;
4380
4381 switch (INTEL_GEN(engine->i915)) {
4382 default:
4383 MISSING_CASE(INTEL_GEN(engine->i915));
4384 /* fall through */
4385 case 12:
4386 indirect_ctx_offset =
4387 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4388 break;
4389 case 11:
4390 indirect_ctx_offset =
4391 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4392 break;
4393 case 10:
4394 indirect_ctx_offset =
4395 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4396 break;
4397 case 9:
4398 indirect_ctx_offset =
4399 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4400 break;
4401 case 8:
4402 indirect_ctx_offset =
4403 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4404 break;
4405 }
4406
4407 return indirect_ctx_offset;
4408 }
4409
4410
4411 static void init_common_reg_state(u32 * const regs,
4412 const struct intel_engine_cs *engine,
4413 const struct intel_ring *ring,
4414 bool inhibit)
4415 {
4416 u32 ctl;
4417
4418 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4419 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4420 if (inhibit)
4421 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4422 if (INTEL_GEN(engine->i915) < 11)
4423 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4424 CTX_CTRL_RS_CTX_ENABLE);
4425 regs[CTX_CONTEXT_CONTROL] = ctl;
4426
4427 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4428 }
4429
4430 static void init_wa_bb_reg_state(u32 * const regs,
4431 const struct intel_engine_cs *engine,
4432 u32 pos_bb_per_ctx)
4433 {
4434 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4435
4436 if (wa_ctx->per_ctx.size) {
4437 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4438
4439 regs[pos_bb_per_ctx] =
4440 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4441 }
4442
4443 if (wa_ctx->indirect_ctx.size) {
4444 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4445
4446 regs[pos_bb_per_ctx + 2] =
4447 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4448 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4449
4450 regs[pos_bb_per_ctx + 4] =
4451 intel_lr_indirect_ctx_offset(engine) << 6;
4452 }
4453 }
4454
4455 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4456 {
4457 if (i915_vm_is_4lvl(&ppgtt->vm)) {
4458 		/* 64b PPGTT (48bit canonical):
4459 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
4460 		 * the other PDP descriptors are ignored.
4461 */
4462 ASSIGN_CTX_PML4(ppgtt, regs);
4463 } else {
4464 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4465 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4466 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4467 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4468 }
4469 }
4470
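/*
 * vm_alias: contexts using the global GTT execute via its aliasing ppgtt,
 * so return that alias; any other address space is already a ppgtt.
 */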
4471 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4472 {
4473 if (i915_is_ggtt(vm))
4474 return i915_vm_to_ggtt(vm)->alias;
4475 else
4476 return i915_vm_to_ppgtt(vm);
4477 }
4478
4479 static void execlists_init_reg_state(u32 *regs,
4480 const struct intel_context *ce,
4481 const struct intel_engine_cs *engine,
4482 const struct intel_ring *ring,
4483 bool inhibit)
4484 {
4485 /*
4486 * A context is actually a big batch buffer with several
4487 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4488 * values we are setting here are only for the first context restore:
4489 * on a subsequent save, the GPU will recreate this batchbuffer with new
4490 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4491 * we are not initializing here).
4492 *
4493 * Must keep consistent with virtual_update_register_offsets().
4494 */
4495 set_offsets(regs, reg_offsets(engine), engine, inhibit);
4496
4497 init_common_reg_state(regs, engine, ring, inhibit);
4498 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4499
4500 init_wa_bb_reg_state(regs, engine,
4501 INTEL_GEN(engine->i915) >= 12 ?
4502 GEN12_CTX_BB_PER_CTX_PTR :
4503 CTX_BB_PER_CTX_PTR);
4504
4505 __reset_stop_ring(regs, engine);
4506 }
4507
4508 static int
4509 populate_lr_context(struct intel_context *ce,
4510 struct drm_i915_gem_object *ctx_obj,
4511 struct intel_engine_cs *engine,
4512 struct intel_ring *ring)
4513 {
4514 bool inhibit = true;
4515 void *vaddr;
4516 int ret;
4517
4518 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4519 if (IS_ERR(vaddr)) {
4520 ret = PTR_ERR(vaddr);
4521 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4522 return ret;
4523 }
4524
4525 set_redzone(vaddr, engine);
4526
4527 if (engine->default_state) {
4528 void *defaults;
4529
4530 defaults = i915_gem_object_pin_map(engine->default_state,
4531 I915_MAP_WB);
4532 if (IS_ERR(defaults)) {
4533 ret = PTR_ERR(defaults);
4534 goto err_unpin_ctx;
4535 }
4536
4537 memcpy(vaddr, defaults, engine->context_size);
4538 i915_gem_object_unpin_map(engine->default_state);
4539 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4540 inhibit = false;
4541 }
4542
4543 /* The second page of the context object contains some fields which must
4544 * be set up prior to the first execution. */
4545 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4546 ce, engine, ring, inhibit);
4547
4548 ret = 0;
4549 err_unpin_ctx:
4550 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4551 i915_gem_object_unpin_map(ctx_obj);
4552 return ret;
4553 }
4554
4555 static int __execlists_context_alloc(struct intel_context *ce,
4556 struct intel_engine_cs *engine)
4557 {
4558 struct drm_i915_gem_object *ctx_obj;
4559 struct intel_ring *ring;
4560 struct i915_vma *vma;
4561 u32 context_size;
4562 int ret;
4563
4564 GEM_BUG_ON(ce->state);
4565 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4566
4567 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4568 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4569
4570 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4571 if (IS_ERR(ctx_obj))
4572 return PTR_ERR(ctx_obj);
4573
4574 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4575 if (IS_ERR(vma)) {
4576 ret = PTR_ERR(vma);
4577 goto error_deref_obj;
4578 }
4579
4580 if (!ce->timeline) {
4581 struct intel_timeline *tl;
4582
4583 tl = intel_timeline_create(engine->gt, NULL);
4584 if (IS_ERR(tl)) {
4585 ret = PTR_ERR(tl);
4586 goto error_deref_obj;
4587 }
4588
4589 ce->timeline = tl;
4590 }
4591
4592 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4593 if (IS_ERR(ring)) {
4594 ret = PTR_ERR(ring);
4595 goto error_deref_obj;
4596 }
4597
4598 ret = populate_lr_context(ce, ctx_obj, engine, ring);
4599 if (ret) {
4600 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4601 goto error_ring_free;
4602 }
4603
4604 ce->ring = ring;
4605 ce->state = vma;
4606
4607 return 0;
4608
4609 error_ring_free:
4610 intel_ring_put(ring);
4611 error_deref_obj:
4612 i915_gem_object_put(ctx_obj);
4613 return ret;
4614 }
4615
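/*
 * A virtual engine carries at most one ready request at a time, kept on
 * the (otherwise unused) default priolist of its execlists state;
 * virtual_queue() returns that list head.
 */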
4616 static struct list_head *virtual_queue(struct virtual_engine *ve)
4617 {
4618 return &ve->base.execlists.default_priolist.requests[0];
4619 }
4620
4621 static void virtual_context_destroy(struct kref *kref)
4622 {
4623 struct virtual_engine *ve =
4624 container_of(kref, typeof(*ve), context.ref);
4625 unsigned int n;
4626
4627 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4628 GEM_BUG_ON(ve->request);
4629 GEM_BUG_ON(ve->context.inflight);
4630
4631 for (n = 0; n < ve->num_siblings; n++) {
4632 struct intel_engine_cs *sibling = ve->siblings[n];
4633 struct rb_node *node = &ve->nodes[sibling->id].rb;
4634 unsigned long flags;
4635
4636 if (RB_EMPTY_NODE(node))
4637 continue;
4638
4639 spin_lock_irqsave(&sibling->active.lock, flags);
4640
4641 /* Detachment is lazily performed in the execlists tasklet */
4642 if (!RB_EMPTY_NODE(node))
4643 rb_erase_cached(node, &sibling->execlists.virtual);
4644
4645 spin_unlock_irqrestore(&sibling->active.lock, flags);
4646 }
4647 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4648
4649 if (ve->context.state)
4650 __execlists_context_fini(&ve->context);
4651 intel_context_fini(&ve->context);
4652
4653 kfree(ve->bonds);
4654 kfree(ve);
4655 }
4656
4657 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4658 {
4659 int swp;
4660
4661 /*
4662 * Pick a random sibling on starting to help spread the load around.
4663 *
4664 * New contexts are typically created with exactly the same order
4665 * of siblings, and often started in batches. Due to the way we iterate
4666 	 * the array of siblings when submitting requests, sibling[0] is
4667 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4668 * randomised across the system, we also help spread the load by the
4669 * first engine we inspect being different each time.
4670 *
4671 * NB This does not force us to execute on this engine, it will just
4672 * typically be the first we inspect for submission.
4673 */
4674 swp = prandom_u32_max(ve->num_siblings);
4675 if (!swp)
4676 return;
4677
4678 swap(ve->siblings[swp], ve->siblings[0]);
4679 if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4680 virtual_update_register_offsets(ve->context.lrc_reg_state,
4681 ve->siblings[0]);
4682 }
4683
4684 static int virtual_context_alloc(struct intel_context *ce)
4685 {
4686 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4687
4688 return __execlists_context_alloc(ce, ve->siblings[0]);
4689 }
4690
4691 static int virtual_context_pin(struct intel_context *ce)
4692 {
4693 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4694 int err;
4695
4696 /* Note: we must use a real engine class for setting up reg state */
4697 err = __execlists_context_pin(ce, ve->siblings[0]);
4698 if (err)
4699 return err;
4700
4701 virtual_engine_initial_hint(ve);
4702 return 0;
4703 }
4704
4705 static void virtual_context_enter(struct intel_context *ce)
4706 {
4707 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4708 unsigned int n;
4709
4710 for (n = 0; n < ve->num_siblings; n++)
4711 intel_engine_pm_get(ve->siblings[n]);
4712
4713 intel_timeline_enter(ce->timeline);
4714 }
4715
4716 static void virtual_context_exit(struct intel_context *ce)
4717 {
4718 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4719 unsigned int n;
4720
4721 intel_timeline_exit(ce->timeline);
4722
4723 for (n = 0; n < ve->num_siblings; n++)
4724 intel_engine_pm_put(ve->siblings[n]);
4725 }
4726
4727 static const struct intel_context_ops virtual_context_ops = {
4728 .alloc = virtual_context_alloc,
4729
4730 .pin = virtual_context_pin,
4731 .unpin = execlists_context_unpin,
4732
4733 .enter = virtual_context_enter,
4734 .exit = virtual_context_exit,
4735
4736 .destroy = virtual_context_destroy,
4737 };
4738
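/*
 * virtual_submission_mask: snapshot the set of physical engines the queued
 * virtual request may run on. An empty execution mask is an invalid
 * selection, so the request is skipped with -ENODEV and handed to
 * siblings[0] for completion.
 */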
4739 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4740 {
4741 struct i915_request *rq;
4742 intel_engine_mask_t mask;
4743
4744 rq = READ_ONCE(ve->request);
4745 if (!rq)
4746 return 0;
4747
4748 /* The rq is ready for submission; rq->execution_mask is now stable. */
4749 mask = rq->execution_mask;
4750 if (unlikely(!mask)) {
4751 		/* Invalid selection: fail the request and fall back to siblings[0] */
4752 i915_request_skip(rq, -ENODEV);
4753 mask = ve->siblings[0]->mask;
4754 }
4755
4756 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4757 rq->fence.context, rq->fence.seqno,
4758 mask, ve->base.execlists.queue_priority_hint);
4759
4760 return mask;
4761 }
4762
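/*
 * virtual_submission_tasklet: advertise the queued virtual request to every
 * allowed sibling by (re)inserting this virtual engine's node into each
 * sibling's rb-tree of virtual engines, ordered by priority, and kick a
 * sibling's own submission tasklet whenever we become its highest-priority
 * candidate.
 */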
4763 static void virtual_submission_tasklet(unsigned long data)
4764 {
4765 struct virtual_engine * const ve = (struct virtual_engine *)data;
4766 const int prio = ve->base.execlists.queue_priority_hint;
4767 intel_engine_mask_t mask;
4768 unsigned int n;
4769
4770 rcu_read_lock();
4771 mask = virtual_submission_mask(ve);
4772 rcu_read_unlock();
4773 if (unlikely(!mask))
4774 return;
4775
4776 local_irq_disable();
4777 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4778 struct intel_engine_cs *sibling = ve->siblings[n];
4779 struct ve_node * const node = &ve->nodes[sibling->id];
4780 struct rb_node **parent, *rb;
4781 bool first;
4782
4783 if (unlikely(!(mask & sibling->mask))) {
4784 if (!RB_EMPTY_NODE(&node->rb)) {
4785 spin_lock(&sibling->active.lock);
4786 rb_erase_cached(&node->rb,
4787 &sibling->execlists.virtual);
4788 RB_CLEAR_NODE(&node->rb);
4789 spin_unlock(&sibling->active.lock);
4790 }
4791 continue;
4792 }
4793
4794 spin_lock(&sibling->active.lock);
4795
4796 if (!RB_EMPTY_NODE(&node->rb)) {
4797 /*
4798 * Cheat and avoid rebalancing the tree if we can
4799 * reuse this node in situ.
4800 */
4801 first = rb_first_cached(&sibling->execlists.virtual) ==
4802 &node->rb;
4803 if (prio == node->prio || (prio > node->prio && first))
4804 goto submit_engine;
4805
4806 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4807 }
4808
4809 rb = NULL;
4810 first = true;
4811 parent = &sibling->execlists.virtual.rb_root.rb_node;
4812 while (*parent) {
4813 struct ve_node *other;
4814
4815 rb = *parent;
4816 other = rb_entry(rb, typeof(*other), rb);
4817 if (prio > other->prio) {
4818 parent = &rb->rb_left;
4819 } else {
4820 parent = &rb->rb_right;
4821 first = false;
4822 }
4823 }
4824
4825 rb_link_node(&node->rb, rb, parent);
4826 rb_insert_color_cached(&node->rb,
4827 &sibling->execlists.virtual,
4828 first);
4829
4830 submit_engine:
4831 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4832 node->prio = prio;
4833 if (first && prio > sibling->execlists.queue_priority_hint) {
4834 sibling->execlists.queue_priority_hint = prio;
4835 tasklet_hi_schedule(&sibling->execlists.tasklet);
4836 }
4837
4838 spin_unlock(&sibling->active.lock);
4839 }
4840 local_irq_enable();
4841 }
4842
4843 static void virtual_submit_request(struct i915_request *rq)
4844 {
4845 struct virtual_engine *ve = to_virtual_engine(rq->engine);
4846 struct i915_request *old;
4847 unsigned long flags;
4848
4849 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4850 rq->fence.context,
4851 rq->fence.seqno);
4852
4853 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4854
4855 spin_lock_irqsave(&ve->base.active.lock, flags);
4856
4857 old = ve->request;
4858 if (old) { /* background completion event from preempt-to-busy */
4859 GEM_BUG_ON(!i915_request_completed(old));
4860 __i915_request_submit(old);
4861 i915_request_put(old);
4862 }
4863
4864 if (i915_request_completed(rq)) {
4865 __i915_request_submit(rq);
4866
4867 ve->base.execlists.queue_priority_hint = INT_MIN;
4868 ve->request = NULL;
4869 } else {
4870 ve->base.execlists.queue_priority_hint = rq_prio(rq);
4871 ve->request = i915_request_get(rq);
4872
4873 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4874 list_move_tail(&rq->sched.link, virtual_queue(ve));
4875
4876 tasklet_schedule(&ve->base.execlists.tasklet);
4877 }
4878
4879 spin_unlock_irqrestore(&ve->base.active.lock, flags);
4880 }
4881
4882 static struct ve_bond *
4883 virtual_find_bond(struct virtual_engine *ve,
4884 const struct intel_engine_cs *master)
4885 {
4886 int i;
4887
4888 for (i = 0; i < ve->num_bonds; i++) {
4889 if (ve->bonds[i].master == master)
4890 return &ve->bonds[i];
4891 }
4892
4893 return NULL;
4894 }
4895
4896 static void
4897 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4898 {
4899 struct virtual_engine *ve = to_virtual_engine(rq->engine);
4900 intel_engine_mask_t allowed, exec;
4901 struct ve_bond *bond;
4902
4903 allowed = ~to_request(signal)->engine->mask;
4904
4905 bond = virtual_find_bond(ve, to_request(signal)->engine);
4906 if (bond)
4907 allowed &= bond->sibling_mask;
4908
4909 /* Restrict the bonded request to run on only the available engines */
4910 exec = READ_ONCE(rq->execution_mask);
4911 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4912 ;
4913
4914 /* Prevent the master from being re-run on the bonded engines */
4915 to_request(signal)->execution_mask &= ~allowed;
4916 }
4917
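/*
 * intel_execlists_create_virtual: build a load-balancing virtual engine on
 * top of the given siblings (which must share an engine class) and return
 * its context. A single sibling degenerates to an ordinary context on that
 * engine.
 */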
4918 struct intel_context *
4919 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4920 unsigned int count)
4921 {
4922 struct virtual_engine *ve;
4923 unsigned int n;
4924 int err;
4925
4926 if (count == 0)
4927 return ERR_PTR(-EINVAL);
4928
4929 if (count == 1)
4930 return intel_context_create(siblings[0]);
4931
4932 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4933 if (!ve)
4934 return ERR_PTR(-ENOMEM);
4935
4936 ve->base.i915 = siblings[0]->i915;
4937 ve->base.gt = siblings[0]->gt;
4938 ve->base.uncore = siblings[0]->uncore;
4939 ve->base.id = -1;
4940
4941 ve->base.class = OTHER_CLASS;
4942 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4943 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4944 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4945
4946 /*
4947 * The decision on whether to submit a request using semaphores
4948 * depends on the saturated state of the engine. We only compute
4949 	 * this during HW submission of the request, and we need this
4950 * state to be globally applied to all requests being submitted
4951 * to this engine. Virtual engines encompass more than one physical
4952 * engine and so we cannot accurately tell in advance if one of those
4953 * engines is already saturated and so cannot afford to use a semaphore
4954 * and be pessimized in priority for doing so -- if we are the only
4955 * context using semaphores after all other clients have stopped, we
4956 * will be starved on the saturated system. Such a global switch for
4957 * semaphores is less than ideal, but alas is the current compromise.
4958 */
4959 ve->base.saturated = ALL_ENGINES;
4960
4961 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4962
4963 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4964 intel_engine_init_breadcrumbs(&ve->base);
4965 intel_engine_init_execlists(&ve->base);
4966
4967 ve->base.cops = &virtual_context_ops;
4968 ve->base.request_alloc = execlists_request_alloc;
4969
4970 ve->base.schedule = i915_schedule;
4971 ve->base.submit_request = virtual_submit_request;
4972 ve->base.bond_execute = virtual_bond_execute;
4973
4974 INIT_LIST_HEAD(virtual_queue(ve));
4975 ve->base.execlists.queue_priority_hint = INT_MIN;
4976 tasklet_init(&ve->base.execlists.tasklet,
4977 virtual_submission_tasklet,
4978 (unsigned long)ve);
4979
4980 intel_context_init(&ve->context, &ve->base);
4981
4982 for (n = 0; n < count; n++) {
4983 struct intel_engine_cs *sibling = siblings[n];
4984
4985 GEM_BUG_ON(!is_power_of_2(sibling->mask));
4986 if (sibling->mask & ve->base.mask) {
4987 DRM_DEBUG("duplicate %s entry in load balancer\n",
4988 sibling->name);
4989 err = -EINVAL;
4990 goto err_put;
4991 }
4992
4993 /*
4994 * The virtual engine implementation is tightly coupled to
4995 		 * the execlists backend -- we push requests directly
4996 * into a tree inside each physical engine. We could support
4997 * layering if we handle cloning of the requests and
4998 * submitting a copy into each backend.
4999 */
5000 if (sibling->execlists.tasklet.func !=
5001 execlists_submission_tasklet) {
5002 err = -ENODEV;
5003 goto err_put;
5004 }
5005
5006 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5007 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5008
5009 ve->siblings[ve->num_siblings++] = sibling;
5010 ve->base.mask |= sibling->mask;
5011
5012 /*
5013 * All physical engines must be compatible for their emission
5014 * functions (as we build the instructions during request
5015 * construction and do not alter them before submission
5016 * on the physical engine). We use the engine class as a guide
5017 * here, although that could be refined.
5018 */
5019 if (ve->base.class != OTHER_CLASS) {
5020 if (ve->base.class != sibling->class) {
5021 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5022 sibling->class, ve->base.class);
5023 err = -EINVAL;
5024 goto err_put;
5025 }
5026 continue;
5027 }
5028
5029 ve->base.class = sibling->class;
5030 ve->base.uabi_class = sibling->uabi_class;
5031 snprintf(ve->base.name, sizeof(ve->base.name),
5032 "v%dx%d", ve->base.class, count);
5033 ve->base.context_size = sibling->context_size;
5034
5035 ve->base.emit_bb_start = sibling->emit_bb_start;
5036 ve->base.emit_flush = sibling->emit_flush;
5037 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5038 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5039 ve->base.emit_fini_breadcrumb_dw =
5040 sibling->emit_fini_breadcrumb_dw;
5041
5042 ve->base.flags = sibling->flags;
5043 }
5044
5045 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5046
5047 return &ve->context;
5048
5049 err_put:
5050 intel_context_put(&ve->context);
5051 return ERR_PTR(err);
5052 }
5053
5054 struct intel_context *
5055 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5056 {
5057 struct virtual_engine *se = to_virtual_engine(src);
5058 struct intel_context *dst;
5059
5060 dst = intel_execlists_create_virtual(se->siblings,
5061 se->num_siblings);
5062 if (IS_ERR(dst))
5063 return dst;
5064
5065 if (se->num_bonds) {
5066 struct virtual_engine *de = to_virtual_engine(dst->engine);
5067
5068 de->bonds = kmemdup(se->bonds,
5069 sizeof(*se->bonds) * se->num_bonds,
5070 GFP_KERNEL);
5071 if (!de->bonds) {
5072 intel_context_put(dst);
5073 return ERR_PTR(-ENOMEM);
5074 }
5075
5076 de->num_bonds = se->num_bonds;
5077 }
5078
5079 return dst;
5080 }
5081
5082 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5083 const struct intel_engine_cs *master,
5084 const struct intel_engine_cs *sibling)
5085 {
5086 struct virtual_engine *ve = to_virtual_engine(engine);
5087 struct ve_bond *bond;
5088 int n;
5089
5090 /* Sanity check the sibling is part of the virtual engine */
5091 for (n = 0; n < ve->num_siblings; n++)
5092 if (sibling == ve->siblings[n])
5093 break;
5094 if (n == ve->num_siblings)
5095 return -EINVAL;
5096
5097 bond = virtual_find_bond(ve, master);
5098 if (bond) {
5099 bond->sibling_mask |= sibling->mask;
5100 return 0;
5101 }
5102
5103 bond = krealloc(ve->bonds,
5104 sizeof(*bond) * (ve->num_bonds + 1),
5105 GFP_KERNEL);
5106 if (!bond)
5107 return -ENOMEM;
5108
5109 bond[ve->num_bonds].master = master;
5110 bond[ve->num_bonds].sibling_mask = sibling->mask;
5111
5112 ve->bonds = bond;
5113 ve->num_bonds++;
5114
5115 return 0;
5116 }
5117
5118 struct intel_engine_cs *
5119 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5120 unsigned int sibling)
5121 {
5122 struct virtual_engine *ve = to_virtual_engine(engine);
5123
5124 if (sibling >= ve->num_siblings)
5125 return NULL;
5126
5127 return ve->siblings[sibling];
5128 }
5129
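/*
 * intel_execlists_show_requests: debug helper that prints up to @max
 * requests from each of the executing, queued and virtual lists via
 * @show_request, noting how many entries were skipped in between for
 * longer lists.
 */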
5130 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5131 struct drm_printer *m,
5132 void (*show_request)(struct drm_printer *m,
5133 struct i915_request *rq,
5134 const char *prefix),
5135 unsigned int max)
5136 {
5137 const struct intel_engine_execlists *execlists = &engine->execlists;
5138 struct i915_request *rq, *last;
5139 unsigned long flags;
5140 unsigned int count;
5141 struct rb_node *rb;
5142
5143 spin_lock_irqsave(&engine->active.lock, flags);
5144
5145 last = NULL;
5146 count = 0;
5147 list_for_each_entry(rq, &engine->active.requests, sched.link) {
5148 if (count++ < max - 1)
5149 show_request(m, rq, "\t\tE ");
5150 else
5151 last = rq;
5152 }
5153 if (last) {
5154 if (count > max) {
5155 drm_printf(m,
5156 "\t\t...skipping %d executing requests...\n",
5157 count - max);
5158 }
5159 show_request(m, last, "\t\tE ");
5160 }
5161
5162 last = NULL;
5163 count = 0;
5164 if (execlists->queue_priority_hint != INT_MIN)
5165 drm_printf(m, "\t\tQueue priority hint: %d\n",
5166 execlists->queue_priority_hint);
5167 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5168 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5169 int i;
5170
5171 priolist_for_each_request(rq, p, i) {
5172 if (count++ < max - 1)
5173 show_request(m, rq, "\t\tQ ");
5174 else
5175 last = rq;
5176 }
5177 }
5178 if (last) {
5179 if (count > max) {
5180 drm_printf(m,
5181 "\t\t...skipping %d queued requests...\n",
5182 count - max);
5183 }
5184 show_request(m, last, "\t\tQ ");
5185 }
5186
5187 last = NULL;
5188 count = 0;
5189 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5190 struct virtual_engine *ve =
5191 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5192 struct i915_request *rq = READ_ONCE(ve->request);
5193
5194 if (rq) {
5195 if (count++ < max - 1)
5196 show_request(m, rq, "\t\tV ");
5197 else
5198 last = rq;
5199 }
5200 }
5201 if (last) {
5202 if (count > max) {
5203 drm_printf(m,
5204 "\t\t...skipping %d virtual requests...\n",
5205 count - max);
5206 }
5207 show_request(m, last, "\t\tV ");
5208 }
5209
5210 spin_unlock_irqrestore(&engine->active.lock, flags);
5211 }
5212
5213 void intel_lr_context_reset(struct intel_engine_cs *engine,
5214 struct intel_context *ce,
5215 u32 head,
5216 bool scrub)
5217 {
5218 GEM_BUG_ON(!intel_context_is_pinned(ce));
5219
5220 /*
5221 * We want a simple context + ring to execute the breadcrumb update.
5222 * We cannot rely on the context being intact across the GPU hang,
5223 * so clear it and rebuild just what we need for the breadcrumb.
5224 * All pending requests for this context will be zapped, and any
5225 * future request will be after userspace has had the opportunity
5226 * to recreate its own state.
5227 */
5228 if (scrub)
5229 restore_default_state(ce, engine);
5230
5231 /* Rerun the request; its payload has been neutered (if guilty). */
5232 __execlists_update_reg_state(ce, engine, head);
5233 }
5234
5235 bool
5236 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5237 {
5238 return engine->set_default_submission ==
5239 intel_execlists_set_default_submission;
5240 }
5241
5242 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5243 #include "selftest_lrc.c"
5244 #endif
5245