      1 /*	$NetBSD: intel_lrc.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright © 2014 Intel Corporation
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     23  * IN THE SOFTWARE.
     24  *
     25  * Authors:
     26  *    Ben Widawsky <ben (at) bwidawsk.net>
     27  *    Michel Thierry <michel.thierry (at) intel.com>
     28  *    Thomas Daniel <thomas.daniel (at) intel.com>
     29  *    Oscar Mateo <oscar.mateo (at) intel.com>
     30  *
     31  */
     32 
     33 /**
     34  * DOC: Logical Rings, Logical Ring Contexts and Execlists
     35  *
     36  * Motivation:
     37  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
     38  * These expanded contexts enable a number of new abilities, especially
     39  * "Execlists" (also implemented in this file).
     40  *
     41  * One of the main differences with the legacy HW contexts is that logical
     42  * ring contexts incorporate many more things into the context's state, like
     43  * PDPs or ringbuffer control registers:
     44  *
     45  * The reason why PDPs are included in the context is straightforward: as
     46  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
     47  * contained there means you don't need to do a ppgtt->switch_mm yourself;
     48  * instead, the GPU will do it for you on the context switch.
     49  *
     50  * But what about the ringbuffer control registers (head, tail, etc.)?
     51  * Shouldn't we just need a set of those per engine command streamer? This is
     52  * where the name "Logical Rings" starts to make sense: by virtualizing the
     53  * rings, the engine cs shifts to a new "ring buffer" with every context
     54  * switch. When you want to submit a workload to the GPU you: A) choose your
     55  * context, B) find its appropriate virtualized ring, C) write commands to it
     56  * and then, finally, D) tell the GPU to switch to that context.
     57  *
     58  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
     59  * to a context is via a context execution list, ergo "Execlists".
     60  *
     61  * LRC implementation:
     62  * Regarding the creation of contexts, we have:
     63  *
     64  * - One global default context.
     65  * - One local default context for each opened fd.
     66  * - One local extra context for each context create ioctl call.
     67  *
     68  * Now that ringbuffers belong per-context (and not per-engine, like before)
     69  * and that contexts are uniquely tied to a given engine (and not reusable,
     70  * like before), we need:
     71  *
     72  * - One ringbuffer per-engine inside each context.
     73  * - One backing object per-engine inside each context.
     74  *
     75  * The global default context starts its life with these new objects fully
     76  * allocated and populated. The local default context for each opened fd is
     77  * more complex, because we don't know at creation time which engine is going
     78  * to use them. To handle this, we have implemented a deferred creation of LR
     79  * contexts:
     80  *
     81  * The local context starts its life as a hollow or blank holder that only
     82  * gets populated for a given engine once we receive an execbuffer. If later
     83  * on we receive another execbuffer ioctl for the same context but a different
     84  * engine, we allocate/populate a new ringbuffer and context backing object and
     85  * so on.
     86  *
     87  * Finally, regarding local contexts created using the ioctl call: as they are
     88  * only allowed with the render ring, we can allocate & populate them right
     89  * away (no need to defer anything, at least for now).
     90  *
     91  * Execlists implementation:
     92  * Execlists are the new method by which, on gen8+ hardware, workloads are
     93  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
     94  * This method works as follows:
     95  *
     96  * When a request is committed, its commands (the BB start and any leading or
     97  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
     98  * for the appropriate context. The tail pointer in the hardware context is not
     99  * updated at this time, but is instead kept by the driver in the ringbuffer
    100  * structure. A structure representing this request is added to a request queue
    101  * for the appropriate engine: this structure contains a copy of the context's
    102  * tail after the request was written to the ring buffer and a pointer to the
    103  * context itself.
    104  *
    105  * If the engine's request queue was empty before the request was added, the
    106  * queue is processed immediately. Otherwise the queue will be processed during
    107  * a context switch interrupt. In any case, elements on the queue will get sent
    108  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
    109  * globally unique 20-bit submission ID.
    110  *
    111  * When execution of a request completes, the GPU updates the context status
    112  * buffer with a context complete event and generates a context switch interrupt.
    113  * During the interrupt handling, the driver examines the events in the buffer:
    114  * for each context complete event, if the announced ID matches that on the head
    115  * of the request queue, then that request is retired and removed from the queue.
    116  *
    117  * After processing, if any requests were retired and the queue is not empty
    118  * then a new execution list can be submitted. The two requests at the front of
    119  * the queue are next to be submitted but since a context may not occur twice in
    120  * an execution list, if subsequent requests have the same ID as the first then
    121  * the two requests must be combined. This is done simply by discarding requests
    122  * at the head of the queue until either only one request is left (in which case
    123  * we use a NULL second context) or the first two requests have unique IDs.
    124  *
    125  * By always executing the first two requests in the queue the driver ensures
    126  * that the GPU is kept as busy as possible. In the case where a single context
    127  * completes but a second context is still executing, the request for this second
    128  * context will be at the head of the queue when we remove the first one. This
    129  * request will then be resubmitted along with a new request for a different context,
    130  * which will cause the hardware to continue executing the second request and queue
    131  * the new request (the GPU detects the condition of a context getting preempted
    132  * with the same context and optimizes the context switch flow by not doing
    133  * preemption, but just sampling the new tail pointer).
    134  *
    135  */
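
/*
 * Illustrative sketch only (not part of the driver, compiled out): the
 * ELSP pair-selection rule described above, written against hypothetical
 * types. Requests at the head of the queue that share a context with the
 * first request are folded into one submission, and the second port is
 * left NULL if no request for a different context follows.
 */
#if 0
struct sketch_request {
	struct sketch_request *next;
	unsigned long ctx_id;		/* hypothetical per-context tag */
};

static void sketch_pick_elsp_pair(struct sketch_request *head,
				  struct sketch_request *elsp[2])
{
	elsp[0] = head;

	/* Discard/merge requests sharing a context with ELSP[0]. */
	while (head && head->ctx_id == elsp[0]->ctx_id)
		head = head->next;

	/* A context may not appear twice in one execution list. */
	elsp[1] = head;
}
#endif
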
    136 #include <sys/cdefs.h>
    137 __KERNEL_RCSID(0, "$NetBSD: intel_lrc.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $");
    138 
    139 #include <linux/interrupt.h>
    140 
    141 #include "i915_drv.h"
    142 #include "i915_perf.h"
    143 #include "i915_trace.h"
    144 #include "i915_vgpu.h"
    145 #include "intel_context.h"
    146 #include "intel_engine_pm.h"
    147 #include "intel_gt.h"
    148 #include "intel_gt_pm.h"
    149 #include "intel_gt_requests.h"
    150 #include "intel_lrc_reg.h"
    151 #include "intel_mocs.h"
    152 #include "intel_reset.h"
    153 #include "intel_ring.h"
    154 #include "intel_workarounds.h"
    155 
    156 #define RING_EXECLIST_QFULL		(1 << 0x2)
    157 #define RING_EXECLIST1_VALID		(1 << 0x3)
    158 #define RING_EXECLIST0_VALID		(1 << 0x4)
    159 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
    160 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
    161 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
    162 
    163 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
    164 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
    165 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
    166 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
    167 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
    168 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
    169 
    170 #define GEN8_CTX_STATUS_COMPLETED_MASK \
    171 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
    172 
    173 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
    174 
    175 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
    176 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
    177 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
    178 #define GEN12_IDLE_CTX_ID		0x7FF
    179 #define GEN12_CSB_CTX_VALID(csb_dw) \
    180 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
    181 
    182 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
    183 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
    184 #define WA_TAIL_DWORDS 2
    185 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
    186 
    187 struct virtual_engine {
    188 	struct intel_engine_cs base;
    189 	struct intel_context context;
    190 
    191 	/*
    192 	 * We allow only a single request through the virtual engine at a time
    193 	 * (each request in the timeline waits for the completion fence of
    194 	 * the previous before being submitted). By restricting ourselves to
    195 	 * only submitting a single request, each request is placed on to a
    196 	 * physical engine to maximise load spreading (by virtue of the late greedy
    197 	 * scheduling -- each real engine takes the next available request
    198 	 * upon idling).
    199 	 */
    200 	struct i915_request *request;
    201 
    202 	/*
    203 	 * We keep a rbtree of available virtual engines inside each physical
    204 	 * engine, sorted by priority. Here we preallocate the nodes we need
    205 	 * for the virtual engine, indexed by physical_engine->id.
    206 	 */
    207 	struct ve_node {
    208 		struct rb_node rb;
    209 		int prio;
    210 	} nodes[I915_NUM_ENGINES];
    211 
    212 	/*
    213 	 * Keep track of bonded pairs -- restrictions upon our selection
    214 	 * of physical engines any particular request may be submitted to.
    215 	 * If we receive a submit-fence from a master engine, we will only
    216 	 * use one of sibling_mask physical engines.
    217 	 */
    218 	struct ve_bond {
    219 		const struct intel_engine_cs *master;
    220 		intel_engine_mask_t sibling_mask;
    221 	} *bonds;
    222 	unsigned int num_bonds;
    223 
    224 	/* And finally, which physical engines this virtual engine maps onto. */
    225 	unsigned int num_siblings;
    226 	struct intel_engine_cs *siblings[0];
    227 };
    228 
    229 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
    230 {
    231 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
    232 	return container_of(engine, struct virtual_engine, base);
    233 }
    234 
    235 static int __execlists_context_alloc(struct intel_context *ce,
    236 				     struct intel_engine_cs *engine);
    237 
    238 static void execlists_init_reg_state(u32 *reg_state,
    239 				     const struct intel_context *ce,
    240 				     const struct intel_engine_cs *engine,
    241 				     const struct intel_ring *ring,
    242 				     bool close);
    243 static void
    244 __execlists_update_reg_state(const struct intel_context *ce,
    245 			     const struct intel_engine_cs *engine,
    246 			     u32 head);
    247 
    248 static void mark_eio(struct i915_request *rq)
    249 {
    250 	if (i915_request_completed(rq))
    251 		return;
    252 
    253 	GEM_BUG_ON(i915_request_signaled(rq));
    254 
    255 	dma_fence_set_error(&rq->fence, -EIO);
    256 	i915_request_mark_complete(rq);
    257 }
    258 
    259 static struct i915_request *
    260 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
    261 {
    262 	struct i915_request *active = rq;
    263 
    264 	rcu_read_lock();
    265 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
    266 		if (i915_request_completed(rq))
    267 			break;
    268 
    269 		active = rq;
    270 	}
    271 	rcu_read_unlock();
    272 
    273 	return active;
    274 }
    275 
    276 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
    277 {
    278 	return (i915_ggtt_offset(engine->status_page.vma) +
    279 		I915_GEM_HWS_PREEMPT_ADDR);
    280 }
    281 
    282 static inline void
    283 ring_set_paused(const struct intel_engine_cs *engine, int state)
    284 {
    285 	/*
    286 	 * We inspect HWS_PREEMPT with a semaphore inside
    287 	 * engine->emit_fini_breadcrumb. If the dword is true,
    288 	 * the ring is paused as the semaphore will busywait
    289 	 * until the dword is false.
    290 	 */
    291 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
    292 	if (state)
    293 		wmb();
    294 }
    295 
    296 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
    297 {
    298 	return rb_entry(rb, struct i915_priolist, node);
    299 }
    300 
    301 static inline int rq_prio(const struct i915_request *rq)
    302 {
    303 	return rq->sched.attr.priority;
    304 }
    305 
    306 static int effective_prio(const struct i915_request *rq)
    307 {
    308 	int prio = rq_prio(rq);
    309 
    310 	/*
    311 	 * If this request is special and must not be interrupted at any
    312 	 * cost, so be it. Note we are only checking the most recent request
    313 	 * in the context and so may be masking an earlier vip request. It
    314 	 * is hoped that under the conditions where nopreempt is used, this
    315 	 * will not matter (i.e. all requests to that context will be
    316 	 * nopreempt for as long as desired).
    317 	 */
    318 	if (i915_request_has_nopreempt(rq))
    319 		prio = I915_PRIORITY_UNPREEMPTABLE;
    320 
    321 	/*
    322 	 * On unwinding the active request, we give it a priority bump
    323 	 * if it has completed waiting on any semaphore. If we know that
    324 	 * the request has already started, we can prevent an unwanted
    325 	 * preempt-to-idle cycle by taking that into account now.
    326 	 */
    327 	if (__i915_request_has_started(rq))
    328 		prio |= I915_PRIORITY_NOSEMAPHORE;
    329 
    330 	/* Restrict mere WAIT boosts from triggering preemption */
    331 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
    332 	return prio | __NO_PREEMPTION;
    333 }
    334 
    335 static int queue_prio(const struct intel_engine_execlists *execlists)
    336 {
    337 	struct i915_priolist *p;
    338 	struct rb_node *rb;
    339 
    340 	rb = rb_first_cached(&execlists->queue);
    341 	if (!rb)
    342 		return INT_MIN;
    343 
    344 	/*
    345 	 * As the priolist[] are inverted, with the highest priority in [0],
    346 	 * we have to flip the index value to obtain the priority.
    347 	 */
    348 	p = to_priolist(rb);
    349 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
    350 }
    351 
    352 static inline bool need_preempt(const struct intel_engine_cs *engine,
    353 				const struct i915_request *rq,
    354 				struct rb_node *rb)
    355 {
    356 	int last_prio;
    357 
    358 	if (!intel_engine_has_semaphores(engine))
    359 		return false;
    360 
    361 	/*
    362 	 * Check if the current priority hint merits a preemption attempt.
    363 	 *
    364 	 * We record the highest value priority we saw during rescheduling
    365 	 * prior to this dequeue, therefore we know that if it is strictly
    366 	 * less than the current tail of ELSP[0], we do not need to force
    367 	 * a preempt-to-idle cycle.
    368 	 *
    369 	 * However, the priority hint is a mere hint that we may need to
    370 	 * preempt. If that hint is stale or we may be trying to preempt
    371 	 * ourselves, ignore the request.
    372 	 *
    373 	 * More naturally we would write
    374 	 *      prio >= max(0, last);
    375 	 * except that we wish to prevent triggering preemption at the same
    376 	 * priority level: the task that is running should remain running
    377 	 * to preserve FIFO ordering of dependencies.
    378 	 */
    379 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
    380 	if (engine->execlists.queue_priority_hint <= last_prio)
    381 		return false;
    382 
    383 	/*
    384 	 * Check against the first request in ELSP[1], it will, thanks to the
    385 	 * power of PI, be the highest priority of that context.
    386 	 */
    387 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
    388 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
    389 		return true;
    390 
    391 	if (rb) {
    392 		struct virtual_engine *ve =
    393 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
    394 		bool preempt = false;
    395 
    396 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
    397 			struct i915_request *next;
    398 
    399 			rcu_read_lock();
    400 			next = READ_ONCE(ve->request);
    401 			if (next)
    402 				preempt = rq_prio(next) > last_prio;
    403 			rcu_read_unlock();
    404 		}
    405 
    406 		if (preempt)
    407 			return preempt;
    408 	}
    409 
    410 	/*
    411 	 * If the inflight context did not trigger the preemption, then maybe
    412 	 * it was the set of queued requests? Pick the highest priority in
    413 	 * the queue (the first active priolist) and see if it deserves to be
    414 	 * running instead of ELSP[0].
    415 	 *
    416 	 * The highest priority request in the queue cannot be either
    417 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
    418 	 * context, its priority would not exceed ELSP[0] aka last_prio.
    419 	 */
    420 	return queue_prio(&engine->execlists) > last_prio;
    421 }
    422 
    423 __maybe_unused static inline bool
    424 assert_priority_queue(const struct i915_request *prev,
    425 		      const struct i915_request *next)
    426 {
    427 	/*
    428 	 * Without preemption, the prev may refer to the still active element
    429 	 * which we refuse to let go.
    430 	 *
    431 	 * Even with preemption, there are times when we think it is better not
    432 	 * to preempt and leave an ostensibly lower priority request in flight.
    433 	 */
    434 	if (i915_request_is_active(prev))
    435 		return true;
    436 
    437 	return rq_prio(prev) >= rq_prio(next);
    438 }
    439 
    440 /*
    441  * The context descriptor encodes various attributes of a context,
    442  * including its GTT address and some flags. Because it's fairly
    443  * expensive to calculate, we'll just do it once and cache the result,
    444  * which remains valid until the context is unpinned.
    445  *
    446  * This is what a descriptor looks like, from LSB to MSB::
    447  *
    448  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
    449  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
    450  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
    451  *      bits 53-54:    mbz, reserved for use by hardware
    452  *      bits 55-63:    group ID, currently unused and set to 0
    453  *
    454  * Starting from Gen11, the upper dword of the descriptor has a new format:
    455  *
    456  *      bits 32-36:    reserved
    457  *      bits 37-47:    SW context ID
    458  *      bits 48-53:    engine instance
    459  *      bit 54:        mbz, reserved for use by hardware
    460  *      bits 55-60:    SW counter
    461  *      bits 61-63:    engine class
    462  *
    463  * engine info, SW context ID and SW counter need to form a unique number
    464  * (Context ID) per lrc.
    465  */
    466 static u64
    467 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
    468 {
    469 	u64 desc;
    470 
    471 	desc = INTEL_LEGACY_32B_CONTEXT;
    472 	if (i915_vm_is_4lvl(ce->vm))
    473 		desc = INTEL_LEGACY_64B_CONTEXT;
    474 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
    475 
    476 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
    477 	if (IS_GEN(engine->i915, 8))
    478 		desc |= GEN8_CTX_L3LLC_COHERENT;
    479 
    480 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
    481 	/*
    482 	 * The following 32 bits are copied into the OA reports (dword 2).
    483 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
    484 	 * anything below.
    485 	 */
    486 	if (INTEL_GEN(engine->i915) >= 11) {
    487 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
    488 								/* bits 48-53 */
    489 
    490 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
    491 								/* bits 61-63 */
    492 	}
    493 
    494 	return desc;
    495 }
    496 
    497 static inline unsigned int dword_in_page(void *addr)
    498 {
    499 	return offset_in_page(addr) / sizeof(u32);
    500 }
    501 
    502 static void set_offsets(u32 *regs,
    503 			const u8 *data,
    504 			const struct intel_engine_cs *engine,
    505 			bool clear)
    506 #define NOP(x) (BIT(7) | (x))
    507 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
    508 #define POSTED BIT(0)
    509 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
    510 #define REG16(x) \
    511 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
    512 	(((x) >> 2) & 0x7f)
    513 #define END(x) 0, (x)
    514 {
    515 	const u32 base = engine->mmio_base;
    516 
    517 	while (*data) {
    518 		u8 count, flags;
    519 
    520 		if (*data & BIT(7)) { /* skip */
    521 			count = *data++ & ~BIT(7);
    522 			if (clear)
    523 				memset32(regs, MI_NOOP, count);
    524 			regs += count;
    525 			continue;
    526 		}
    527 
    528 		count = *data & 0x3f;
    529 		flags = *data >> 6;
    530 		data++;
    531 
    532 		*regs = MI_LOAD_REGISTER_IMM(count);
    533 		if (flags & POSTED)
    534 			*regs |= MI_LRI_FORCE_POSTED;
    535 		if (INTEL_GEN(engine->i915) >= 11)
    536 			*regs |= MI_LRI_CS_MMIO;
    537 		regs++;
    538 
    539 		GEM_BUG_ON(!count);
    540 		do {
    541 			u32 offset = 0;
    542 			u8 v;
    543 
    544 			do {
    545 				v = *data++;
    546 				offset <<= 7;
    547 				offset |= v & ~BIT(7);
    548 			} while (v & BIT(7));
    549 
    550 			regs[0] = base + (offset << 2);
    551 			if (clear)
    552 				regs[1] = 0;
    553 			regs += 2;
    554 		} while (--count);
    555 	}
    556 
    557 	if (clear) {
    558 		u8 count = *++data;
    559 
    560 		/* Clear past the tail for HW access */
    561 		GEM_BUG_ON(dword_in_page(regs) > count);
    562 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
    563 
    564 		/* Close the batch; used mainly by live_lrc_layout() */
    565 		*regs = MI_BATCH_BUFFER_END;
    566 		if (INTEL_GEN(engine->i915) >= 10)
    567 			*regs |= BIT(0);
    568 	}
    569 }
    570 
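/*
 * Worked example of the table encoding consumed by set_offsets() above:
 * in gen8_xcs_offsets below, NOP(1) skips one dword (cleared to MI_NOOP
 * when building the default image), LRI(11, 0) emits an
 * MI_LOAD_REGISTER_IMM(11) header, and each REG()/REG16() entry that
 * follows is a register offset relative to engine->mmio_base whose value
 * dword is only zeroed when clear=true. END(x) terminates the list and
 * gives the dword-in-page limit up to which the remainder is padded with
 * MI_NOOP before the closing MI_BATCH_BUFFER_END.
 */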
    571 static const u8 gen8_xcs_offsets[] = {
    572 	NOP(1),
    573 	LRI(11, 0),
    574 	REG16(0x244),
    575 	REG(0x034),
    576 	REG(0x030),
    577 	REG(0x038),
    578 	REG(0x03c),
    579 	REG(0x168),
    580 	REG(0x140),
    581 	REG(0x110),
    582 	REG(0x11c),
    583 	REG(0x114),
    584 	REG(0x118),
    585 
    586 	NOP(9),
    587 	LRI(9, 0),
    588 	REG16(0x3a8),
    589 	REG16(0x28c),
    590 	REG16(0x288),
    591 	REG16(0x284),
    592 	REG16(0x280),
    593 	REG16(0x27c),
    594 	REG16(0x278),
    595 	REG16(0x274),
    596 	REG16(0x270),
    597 
    598 	NOP(13),
    599 	LRI(2, 0),
    600 	REG16(0x200),
    601 	REG(0x028),
    602 
    603 	END(80)
    604 };
    605 
    606 static const u8 gen9_xcs_offsets[] = {
    607 	NOP(1),
    608 	LRI(14, POSTED),
    609 	REG16(0x244),
    610 	REG(0x034),
    611 	REG(0x030),
    612 	REG(0x038),
    613 	REG(0x03c),
    614 	REG(0x168),
    615 	REG(0x140),
    616 	REG(0x110),
    617 	REG(0x11c),
    618 	REG(0x114),
    619 	REG(0x118),
    620 	REG(0x1c0),
    621 	REG(0x1c4),
    622 	REG(0x1c8),
    623 
    624 	NOP(3),
    625 	LRI(9, POSTED),
    626 	REG16(0x3a8),
    627 	REG16(0x28c),
    628 	REG16(0x288),
    629 	REG16(0x284),
    630 	REG16(0x280),
    631 	REG16(0x27c),
    632 	REG16(0x278),
    633 	REG16(0x274),
    634 	REG16(0x270),
    635 
    636 	NOP(13),
    637 	LRI(1, POSTED),
    638 	REG16(0x200),
    639 
    640 	NOP(13),
    641 	LRI(44, POSTED),
    642 	REG(0x028),
    643 	REG(0x09c),
    644 	REG(0x0c0),
    645 	REG(0x178),
    646 	REG(0x17c),
    647 	REG16(0x358),
    648 	REG(0x170),
    649 	REG(0x150),
    650 	REG(0x154),
    651 	REG(0x158),
    652 	REG16(0x41c),
    653 	REG16(0x600),
    654 	REG16(0x604),
    655 	REG16(0x608),
    656 	REG16(0x60c),
    657 	REG16(0x610),
    658 	REG16(0x614),
    659 	REG16(0x618),
    660 	REG16(0x61c),
    661 	REG16(0x620),
    662 	REG16(0x624),
    663 	REG16(0x628),
    664 	REG16(0x62c),
    665 	REG16(0x630),
    666 	REG16(0x634),
    667 	REG16(0x638),
    668 	REG16(0x63c),
    669 	REG16(0x640),
    670 	REG16(0x644),
    671 	REG16(0x648),
    672 	REG16(0x64c),
    673 	REG16(0x650),
    674 	REG16(0x654),
    675 	REG16(0x658),
    676 	REG16(0x65c),
    677 	REG16(0x660),
    678 	REG16(0x664),
    679 	REG16(0x668),
    680 	REG16(0x66c),
    681 	REG16(0x670),
    682 	REG16(0x674),
    683 	REG16(0x678),
    684 	REG16(0x67c),
    685 	REG(0x068),
    686 
    687 	END(176)
    688 };
    689 
    690 static const u8 gen12_xcs_offsets[] = {
    691 	NOP(1),
    692 	LRI(13, POSTED),
    693 	REG16(0x244),
    694 	REG(0x034),
    695 	REG(0x030),
    696 	REG(0x038),
    697 	REG(0x03c),
    698 	REG(0x168),
    699 	REG(0x140),
    700 	REG(0x110),
    701 	REG(0x1c0),
    702 	REG(0x1c4),
    703 	REG(0x1c8),
    704 	REG(0x180),
    705 	REG16(0x2b4),
    706 
    707 	NOP(5),
    708 	LRI(9, POSTED),
    709 	REG16(0x3a8),
    710 	REG16(0x28c),
    711 	REG16(0x288),
    712 	REG16(0x284),
    713 	REG16(0x280),
    714 	REG16(0x27c),
    715 	REG16(0x278),
    716 	REG16(0x274),
    717 	REG16(0x270),
    718 
    719 	END(80)
    720 };
    721 
    722 static const u8 gen8_rcs_offsets[] = {
    723 	NOP(1),
    724 	LRI(14, POSTED),
    725 	REG16(0x244),
    726 	REG(0x034),
    727 	REG(0x030),
    728 	REG(0x038),
    729 	REG(0x03c),
    730 	REG(0x168),
    731 	REG(0x140),
    732 	REG(0x110),
    733 	REG(0x11c),
    734 	REG(0x114),
    735 	REG(0x118),
    736 	REG(0x1c0),
    737 	REG(0x1c4),
    738 	REG(0x1c8),
    739 
    740 	NOP(3),
    741 	LRI(9, POSTED),
    742 	REG16(0x3a8),
    743 	REG16(0x28c),
    744 	REG16(0x288),
    745 	REG16(0x284),
    746 	REG16(0x280),
    747 	REG16(0x27c),
    748 	REG16(0x278),
    749 	REG16(0x274),
    750 	REG16(0x270),
    751 
    752 	NOP(13),
    753 	LRI(1, 0),
    754 	REG(0x0c8),
    755 
    756 	END(80)
    757 };
    758 
    759 static const u8 gen9_rcs_offsets[] = {
    760 	NOP(1),
    761 	LRI(14, POSTED),
    762 	REG16(0x244),
    763 	REG(0x34),
    764 	REG(0x30),
    765 	REG(0x38),
    766 	REG(0x3c),
    767 	REG(0x168),
    768 	REG(0x140),
    769 	REG(0x110),
    770 	REG(0x11c),
    771 	REG(0x114),
    772 	REG(0x118),
    773 	REG(0x1c0),
    774 	REG(0x1c4),
    775 	REG(0x1c8),
    776 
    777 	NOP(3),
    778 	LRI(9, POSTED),
    779 	REG16(0x3a8),
    780 	REG16(0x28c),
    781 	REG16(0x288),
    782 	REG16(0x284),
    783 	REG16(0x280),
    784 	REG16(0x27c),
    785 	REG16(0x278),
    786 	REG16(0x274),
    787 	REG16(0x270),
    788 
    789 	NOP(13),
    790 	LRI(1, 0),
    791 	REG(0xc8),
    792 
    793 	NOP(13),
    794 	LRI(44, POSTED),
    795 	REG(0x28),
    796 	REG(0x9c),
    797 	REG(0xc0),
    798 	REG(0x178),
    799 	REG(0x17c),
    800 	REG16(0x358),
    801 	REG(0x170),
    802 	REG(0x150),
    803 	REG(0x154),
    804 	REG(0x158),
    805 	REG16(0x41c),
    806 	REG16(0x600),
    807 	REG16(0x604),
    808 	REG16(0x608),
    809 	REG16(0x60c),
    810 	REG16(0x610),
    811 	REG16(0x614),
    812 	REG16(0x618),
    813 	REG16(0x61c),
    814 	REG16(0x620),
    815 	REG16(0x624),
    816 	REG16(0x628),
    817 	REG16(0x62c),
    818 	REG16(0x630),
    819 	REG16(0x634),
    820 	REG16(0x638),
    821 	REG16(0x63c),
    822 	REG16(0x640),
    823 	REG16(0x644),
    824 	REG16(0x648),
    825 	REG16(0x64c),
    826 	REG16(0x650),
    827 	REG16(0x654),
    828 	REG16(0x658),
    829 	REG16(0x65c),
    830 	REG16(0x660),
    831 	REG16(0x664),
    832 	REG16(0x668),
    833 	REG16(0x66c),
    834 	REG16(0x670),
    835 	REG16(0x674),
    836 	REG16(0x678),
    837 	REG16(0x67c),
    838 	REG(0x68),
    839 
    840 	END(176)
    841 };
    842 
    843 static const u8 gen11_rcs_offsets[] = {
    844 	NOP(1),
    845 	LRI(15, POSTED),
    846 	REG16(0x244),
    847 	REG(0x034),
    848 	REG(0x030),
    849 	REG(0x038),
    850 	REG(0x03c),
    851 	REG(0x168),
    852 	REG(0x140),
    853 	REG(0x110),
    854 	REG(0x11c),
    855 	REG(0x114),
    856 	REG(0x118),
    857 	REG(0x1c0),
    858 	REG(0x1c4),
    859 	REG(0x1c8),
    860 	REG(0x180),
    861 
    862 	NOP(1),
    863 	LRI(9, POSTED),
    864 	REG16(0x3a8),
    865 	REG16(0x28c),
    866 	REG16(0x288),
    867 	REG16(0x284),
    868 	REG16(0x280),
    869 	REG16(0x27c),
    870 	REG16(0x278),
    871 	REG16(0x274),
    872 	REG16(0x270),
    873 
    874 	LRI(1, POSTED),
    875 	REG(0x1b0),
    876 
    877 	NOP(10),
    878 	LRI(1, 0),
    879 	REG(0x0c8),
    880 
    881 	END(80)
    882 };
    883 
    884 static const u8 gen12_rcs_offsets[] = {
    885 	NOP(1),
    886 	LRI(13, POSTED),
    887 	REG16(0x244),
    888 	REG(0x034),
    889 	REG(0x030),
    890 	REG(0x038),
    891 	REG(0x03c),
    892 	REG(0x168),
    893 	REG(0x140),
    894 	REG(0x110),
    895 	REG(0x1c0),
    896 	REG(0x1c4),
    897 	REG(0x1c8),
    898 	REG(0x180),
    899 	REG16(0x2b4),
    900 
    901 	NOP(5),
    902 	LRI(9, POSTED),
    903 	REG16(0x3a8),
    904 	REG16(0x28c),
    905 	REG16(0x288),
    906 	REG16(0x284),
    907 	REG16(0x280),
    908 	REG16(0x27c),
    909 	REG16(0x278),
    910 	REG16(0x274),
    911 	REG16(0x270),
    912 
    913 	LRI(3, POSTED),
    914 	REG(0x1b0),
    915 	REG16(0x5a8),
    916 	REG16(0x5ac),
    917 
    918 	NOP(6),
    919 	LRI(1, 0),
    920 	REG(0x0c8),
    921 
    922 	END(80)
    923 };
    924 
    925 #undef END
    926 #undef REG16
    927 #undef REG
    928 #undef LRI
    929 #undef NOP
    930 
    931 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
    932 {
    933 	/*
    934 	 * The gen12+ lists only have the registers we program in the basic
    935 	 * default state. We rely on the context image using relative
    936 	 * addressing to automatically fix up the register state between the
    937 	 * physical engines for the virtual engine.
    938 	 */
    939 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
    940 		   !intel_engine_has_relative_mmio(engine));
    941 
    942 	if (engine->class == RENDER_CLASS) {
    943 		if (INTEL_GEN(engine->i915) >= 12)
    944 			return gen12_rcs_offsets;
    945 		else if (INTEL_GEN(engine->i915) >= 11)
    946 			return gen11_rcs_offsets;
    947 		else if (INTEL_GEN(engine->i915) >= 9)
    948 			return gen9_rcs_offsets;
    949 		else
    950 			return gen8_rcs_offsets;
    951 	} else {
    952 		if (INTEL_GEN(engine->i915) >= 12)
    953 			return gen12_xcs_offsets;
    954 		else if (INTEL_GEN(engine->i915) >= 9)
    955 			return gen9_xcs_offsets;
    956 		else
    957 			return gen8_xcs_offsets;
    958 	}
    959 }
    960 
    961 static struct i915_request *
    962 __unwind_incomplete_requests(struct intel_engine_cs *engine)
    963 {
    964 	struct i915_request *rq, *rn, *active = NULL;
    965 	struct list_head *uninitialized_var(pl);
    966 	int prio = I915_PRIORITY_INVALID;
    967 
    968 	lockdep_assert_held(&engine->active.lock);
    969 
    970 	list_for_each_entry_safe_reverse(rq, rn,
    971 					 &engine->active.requests,
    972 					 sched.link) {
    973 		if (i915_request_completed(rq))
    974 			continue; /* XXX */
    975 
    976 		__i915_request_unsubmit(rq);
    977 
    978 		/*
    979 		 * Push the request back into the queue for later resubmission.
    980 		 * If this request is not native to this physical engine (i.e.
    981 		 * it came from a virtual source), push it back onto the virtual
    982 		 * engine so that it can be moved across onto another physical
    983 		 * engine as load dictates.
    984 		 */
    985 		if (likely(rq->execution_mask == engine->mask)) {
    986 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
    987 			if (rq_prio(rq) != prio) {
    988 				prio = rq_prio(rq);
    989 				pl = i915_sched_lookup_priolist(engine, prio);
    990 			}
    991 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
    992 
    993 			list_move(&rq->sched.link, pl);
    994 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
    995 
    996 			active = rq;
    997 		} else {
    998 			struct intel_engine_cs *owner = rq->context->engine;
    999 
   1000 			/*
   1001 			 * Decouple the virtual breadcrumb before moving it
   1002 			 * back to the virtual engine -- we don't want the
   1003 			 * request to complete in the background and try
   1004 			 * and cancel the breadcrumb on the virtual engine
   1005 			 * (instead of the old engine where it is linked)!
   1006 			 */
   1007 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
   1008 				     &rq->fence.flags)) {
   1009 				spin_lock_nested(&rq->lock,
   1010 						 SINGLE_DEPTH_NESTING);
   1011 				i915_request_cancel_breadcrumb(rq);
   1012 				spin_unlock(&rq->lock);
   1013 			}
   1014 			rq->engine = owner;
   1015 			owner->submit_request(rq);
   1016 			active = NULL;
   1017 		}
   1018 	}
   1019 
   1020 	return active;
   1021 }
   1022 
   1023 struct i915_request *
   1024 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
   1025 {
   1026 	struct intel_engine_cs *engine =
   1027 		container_of(execlists, typeof(*engine), execlists);
   1028 
   1029 	return __unwind_incomplete_requests(engine);
   1030 }
   1031 
   1032 static inline void
   1033 execlists_context_status_change(struct i915_request *rq, unsigned long status)
   1034 {
   1035 	/*
   1036 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
   1037 	 * the compiler should eliminate this function as dead code.
   1038 	 */
   1039 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
   1040 		return;
   1041 
   1042 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
   1043 				   status, rq);
   1044 }
   1045 
   1046 static void intel_engine_context_in(struct intel_engine_cs *engine)
   1047 {
   1048 	unsigned long flags;
   1049 
   1050 	if (READ_ONCE(engine->stats.enabled) == 0)
   1051 		return;
   1052 
   1053 	write_seqlock_irqsave(&engine->stats.lock, flags);
   1054 
   1055 	if (engine->stats.enabled > 0) {
   1056 		if (engine->stats.active++ == 0)
   1057 			engine->stats.start = ktime_get();
   1058 		GEM_BUG_ON(engine->stats.active == 0);
   1059 	}
   1060 
   1061 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
   1062 }
   1063 
   1064 static void intel_engine_context_out(struct intel_engine_cs *engine)
   1065 {
   1066 	unsigned long flags;
   1067 
   1068 	if (READ_ONCE(engine->stats.enabled) == 0)
   1069 		return;
   1070 
   1071 	write_seqlock_irqsave(&engine->stats.lock, flags);
   1072 
   1073 	if (engine->stats.enabled > 0) {
   1074 		ktime_t last;
   1075 
   1076 		if (engine->stats.active && --engine->stats.active == 0) {
   1077 			/*
   1078 			 * Decrement the active context count and, in case the GPU
   1079 			 * is now idle, add the elapsed time to the running total.
   1080 			 */
   1081 			last = ktime_sub(ktime_get(), engine->stats.start);
   1082 
   1083 			engine->stats.total = ktime_add(engine->stats.total,
   1084 							last);
   1085 		} else if (engine->stats.active == 0) {
   1086 			/*
   1087 			 * After turning on engine stats, context out might be
   1088 			 * the first event, in which case we account from the
   1089 			 * time stats gathering was turned on.
   1090 			 */
   1091 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
   1092 
   1093 			engine->stats.total = ktime_add(engine->stats.total,
   1094 							last);
   1095 		}
   1096 	}
   1097 
   1098 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
   1099 }
   1100 
   1101 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
   1102 {
   1103 	if (INTEL_GEN(engine->i915) >= 12)
   1104 		return 0x60;
   1105 	else if (INTEL_GEN(engine->i915) >= 9)
   1106 		return 0x54;
   1107 	else if (engine->class == RENDER_CLASS)
   1108 		return 0x58;
   1109 	else
   1110 		return -1;
   1111 }
   1112 
   1113 static void
   1114 execlists_check_context(const struct intel_context *ce,
   1115 			const struct intel_engine_cs *engine)
   1116 {
   1117 	const struct intel_ring *ring = ce->ring;
   1118 	u32 *regs = ce->lrc_reg_state;
   1119 	bool valid = true;
   1120 	int x;
   1121 
   1122 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
   1123 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
   1124 		       engine->name,
   1125 		       regs[CTX_RING_START],
   1126 		       i915_ggtt_offset(ring->vma));
   1127 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
   1128 		valid = false;
   1129 	}
   1130 
   1131 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
   1132 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
   1133 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
   1134 		       engine->name,
   1135 		       regs[CTX_RING_CTL],
   1136 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
   1137 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
   1138 		valid = false;
   1139 	}
   1140 
   1141 	x = lrc_ring_mi_mode(engine);
   1142 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
   1143 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
   1144 		       engine->name, regs[x + 1]);
   1145 		regs[x + 1] &= ~STOP_RING;
   1146 		regs[x + 1] |= STOP_RING << 16;
   1147 		valid = false;
   1148 	}
   1149 
   1150 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
   1151 }
   1152 
   1153 static void restore_default_state(struct intel_context *ce,
   1154 				  struct intel_engine_cs *engine)
   1155 {
   1156 	u32 *regs = ce->lrc_reg_state;
   1157 
   1158 	if (engine->pinned_default_state)
   1159 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
   1160 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
   1161 		       engine->context_size - PAGE_SIZE);
   1162 
   1163 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
   1164 }
   1165 
   1166 static void reset_active(struct i915_request *rq,
   1167 			 struct intel_engine_cs *engine)
   1168 {
   1169 	struct intel_context * const ce = rq->context;
   1170 	u32 head;
   1171 
   1172 	/*
   1173 	 * The executing context has been cancelled. We want to prevent
   1174 	 * further execution along this context and propagate the error on
   1175 	 * to anything depending on its results.
   1176 	 *
   1177 	 * In __i915_request_submit(), we apply the -EIO and remove the
   1178 	 * requests' payloads for any banned requests. But first, we must
   1179 	 * rewind the context back to the start of the incomplete request so
   1180 	 * that we do not jump back into the middle of the batch.
   1181 	 *
   1182 	 * We preserve the breadcrumbs and semaphores of the incomplete
   1183 	 * requests so that inter-timeline dependencies (i.e. other timelines)
   1184 	 * remain correctly ordered. And we defer to __i915_request_submit()
   1185 	 * so that all asynchronous waits are correctly handled.
   1186 	 */
   1187 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
   1188 		     rq->fence.context, rq->fence.seqno);
   1189 
   1190 	/* On resubmission of the active request, payload will be scrubbed */
   1191 	if (i915_request_completed(rq))
   1192 		head = rq->tail;
   1193 	else
   1194 		head = active_request(ce->timeline, rq)->head;
   1195 	head = intel_ring_wrap(ce->ring, head);
   1196 
   1197 	/* Scrub the context image to prevent replaying the previous batch */
   1198 	restore_default_state(ce, engine);
   1199 	__execlists_update_reg_state(ce, engine, head);
   1200 
   1201 	/* We've switched away, so this should be a no-op, but intent matters */
   1202 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
   1203 }
   1204 
   1205 static inline struct intel_engine_cs *
   1206 __execlists_schedule_in(struct i915_request *rq)
   1207 {
   1208 	struct intel_engine_cs * const engine = rq->engine;
   1209 	struct intel_context * const ce = rq->context;
   1210 
   1211 	intel_context_get(ce);
   1212 
   1213 	if (unlikely(intel_context_is_banned(ce)))
   1214 		reset_active(rq, engine);
   1215 
   1216 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
   1217 		execlists_check_context(ce, engine);
   1218 
   1219 	if (ce->tag) {
   1220 		/* Use a fixed tag for OA and friends */
   1221 		ce->lrc_desc |= (u64)ce->tag << 32;
   1222 	} else {
   1223 		/* We don't need a strict matching tag, just different values */
   1224 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
   1225 		ce->lrc_desc |=
   1226 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
   1227 			GEN11_SW_CTX_ID_SHIFT;
   1228 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
   1229 	}
   1230 
   1231 	__intel_gt_pm_get(engine->gt);
   1232 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
   1233 	intel_engine_context_in(engine);
   1234 
   1235 	return engine;
   1236 }
   1237 
   1238 static inline struct i915_request *
   1239 execlists_schedule_in(struct i915_request *rq, int idx)
   1240 {
   1241 	struct intel_context * const ce = rq->context;
   1242 	struct intel_engine_cs *old;
   1243 
   1244 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
   1245 	trace_i915_request_in(rq, idx);
   1246 
   1247 	old = READ_ONCE(ce->inflight);
   1248 	do {
   1249 		if (!old) {
   1250 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
   1251 			break;
   1252 		}
   1253 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
   1254 
   1255 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
   1256 	return i915_request_get(rq);
   1257 }
   1258 
   1259 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
   1260 {
   1261 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
   1262 	struct i915_request *next = READ_ONCE(ve->request);
   1263 
   1264 	if (next && next->execution_mask & ~rq->execution_mask)
   1265 		tasklet_schedule(&ve->base.execlists.tasklet);
   1266 }
   1267 
   1268 static inline void
   1269 __execlists_schedule_out(struct i915_request *rq,
   1270 			 struct intel_engine_cs * const engine)
   1271 {
   1272 	struct intel_context * const ce = rq->context;
   1273 
   1274 	/*
   1275 	 * NB process_csb() is not under the engine->active.lock and hence
   1276 	 * schedule_out can race with schedule_in meaning that we should
   1277 	 * refrain from doing non-trivial work here.
   1278 	 */
   1279 
   1280 	/*
   1281 	 * If we have just completed this context, the engine may now be
   1282 	 * idle and we want to re-enter powersaving.
   1283 	 */
   1284 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
   1285 	    i915_request_completed(rq))
   1286 		intel_engine_add_retire(engine, ce->timeline);
   1287 
   1288 	intel_engine_context_out(engine);
   1289 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
   1290 	intel_gt_pm_put_async(engine->gt);
   1291 
   1292 	/*
   1293 	 * If this is part of a virtual engine, its next request may
   1294 	 * have been blocked waiting for access to the active context.
   1295 	 * We have to kick all the siblings again in case we need to
   1296 	 * switch (e.g. the next request is not runnable on this
   1297 	 * engine). Hopefully, we will already have submitted the next
   1298 	 * request before the tasklet runs and do not need to rebuild
   1299 	 * each virtual tree and kick everyone again.
   1300 	 */
   1301 	if (ce->engine != engine)
   1302 		kick_siblings(rq, ce);
   1303 
   1304 	intel_context_put(ce);
   1305 }
   1306 
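/*
 * ce->inflight packs the owning physical engine pointer together with a
 * small count, kept in the low pointer bits, of how many requests from
 * this context currently occupy the ELSP. schedule_out below decrements
 * that count and only performs the real context-out work once the last
 * inflight request for the context has been retired from the ports.
 */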
   1307 static inline void
   1308 execlists_schedule_out(struct i915_request *rq)
   1309 {
   1310 	struct intel_context * const ce = rq->context;
   1311 	struct intel_engine_cs *cur, *old;
   1312 
   1313 	trace_i915_request_out(rq);
   1314 
   1315 	old = READ_ONCE(ce->inflight);
   1316 	do
   1317 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
   1318 	while (!try_cmpxchg(&ce->inflight, &old, cur));
   1319 	if (!cur)
   1320 		__execlists_schedule_out(rq, old);
   1321 
   1322 	i915_request_put(rq);
   1323 }
   1324 
   1325 static u64 execlists_update_context(struct i915_request *rq)
   1326 {
   1327 	struct intel_context *ce = rq->context;
   1328 	u64 desc = ce->lrc_desc;
   1329 	u32 tail, prev;
   1330 
   1331 	/*
   1332 	 * WaIdleLiteRestore:bdw,skl
   1333 	 *
   1334 	 * We should never submit the context with the same RING_TAIL twice
   1335 	 * just in case we submit an empty ring, which confuses the HW.
   1336 	 *
   1337 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
   1338 	 * the normal request to be able to always advance the RING_TAIL on
   1339 	 * subsequent resubmissions (for lite restore). Should that fail us,
   1340 	 * and we try and submit the same tail again, force the context
   1341 	 * reload.
   1342 	 *
   1343 	 * If we need to return to a preempted context, we need to skip the
   1344 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
   1345 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
   1346 	 * an earlier request.
   1347 	 */
   1348 	tail = intel_ring_set_tail(rq->ring, rq->tail);
   1349 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
   1350 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
   1351 		desc |= CTX_DESC_FORCE_RESTORE;
   1352 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
   1353 	rq->tail = rq->wa_tail;
   1354 
   1355 	/*
   1356 	 * Make sure the context image is complete before we submit it to HW.
   1357 	 *
   1358 	 * Ostensibly, writes (including the WCB) should be flushed prior to
   1359 	 * an uncached write such as our mmio register access, but the empirical
   1360 	 * evidence (esp. on Braswell) suggests that the WC write into memory
   1361 	 * may not be visible to the HW prior to the completion of the UC
   1362 	 * register write and that we may begin execution from the context
   1363 	 * before its image is complete leading to invalid PD chasing.
   1364 	 */
   1365 	wmb();
   1366 
   1367 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
   1368 	return desc;
   1369 }
   1370 
   1371 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
   1372 {
   1373 	if (execlists->ctrl_reg) {
   1374 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
   1375 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
   1376 	} else {
   1377 		writel(upper_32_bits(desc), execlists->submit_reg);
   1378 		writel(lower_32_bits(desc), execlists->submit_reg);
   1379 	}
   1380 }
   1381 
   1382 static __maybe_unused void
   1383 trace_ports(const struct intel_engine_execlists *execlists,
   1384 	    const char *msg,
   1385 	    struct i915_request * const *ports)
   1386 {
   1387 	const struct intel_engine_cs *engine =
   1388 		container_of(execlists, typeof(*engine), execlists);
   1389 
   1390 	if (!ports[0])
   1391 		return;
   1392 
   1393 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
   1394 		     ports[0]->fence.context,
   1395 		     ports[0]->fence.seqno,
   1396 		     i915_request_completed(ports[0]) ? "!" :
   1397 		     i915_request_started(ports[0]) ? "*" :
   1398 		     "",
   1399 		     ports[1] ? ports[1]->fence.context : 0,
   1400 		     ports[1] ? ports[1]->fence.seqno : 0);
   1401 }
   1402 
   1403 static __maybe_unused bool
   1404 assert_pending_valid(const struct intel_engine_execlists *execlists,
   1405 		     const char *msg)
   1406 {
   1407 	struct i915_request * const *port, *rq;
   1408 	struct intel_context *ce = NULL;
   1409 
   1410 	trace_ports(execlists, msg, execlists->pending);
   1411 
   1412 	if (!execlists->pending[0]) {
   1413 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
   1414 		return false;
   1415 	}
   1416 
   1417 	if (execlists->pending[execlists_num_ports(execlists)]) {
   1418 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
   1419 			      execlists_num_ports(execlists));
   1420 		return false;
   1421 	}
   1422 
   1423 	for (port = execlists->pending; (rq = *port); port++) {
   1424 		unsigned long flags;
   1425 		bool ok = true;
   1426 
   1427 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
   1428 		GEM_BUG_ON(!i915_request_is_active(rq));
   1429 
   1430 		if (ce == rq->context) {
   1431 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
   1432 				      ce->timeline->fence_context,
   1433 				      port - execlists->pending);
   1434 			return false;
   1435 		}
   1436 		ce = rq->context;
   1437 
   1438 		/* Hold tightly onto the lock to prevent concurrent retires! */
   1439 		if (!spin_trylock_irqsave(&rq->lock, flags))
   1440 			continue;
   1441 
   1442 		if (i915_request_completed(rq))
   1443 			goto unlock;
   1444 
   1445 		if (i915_active_is_idle(&ce->active) &&
   1446 		    !intel_context_is_barrier(ce)) {
   1447 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
   1448 				      ce->timeline->fence_context,
   1449 				      port - execlists->pending);
   1450 			ok = false;
   1451 			goto unlock;
   1452 		}
   1453 
   1454 		if (!i915_vma_is_pinned(ce->state)) {
   1455 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
   1456 				      ce->timeline->fence_context,
   1457 				      port - execlists->pending);
   1458 			ok = false;
   1459 			goto unlock;
   1460 		}
   1461 
   1462 		if (!i915_vma_is_pinned(ce->ring->vma)) {
   1463 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
   1464 				      ce->timeline->fence_context,
   1465 				      port - execlists->pending);
   1466 			ok = false;
   1467 			goto unlock;
   1468 		}
   1469 
   1470 unlock:
   1471 		spin_unlock_irqrestore(&rq->lock, flags);
   1472 		if (!ok)
   1473 			return false;
   1474 	}
   1475 
   1476 	return ce;
   1477 }
   1478 
   1479 static void execlists_submit_ports(struct intel_engine_cs *engine)
   1480 {
   1481 	struct intel_engine_execlists *execlists = &engine->execlists;
   1482 	unsigned int n;
   1483 
   1484 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
   1485 
   1486 	/*
   1487 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
   1488 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
   1489 	 * not be relinquished until the device is idle (see
   1490 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
   1491 	 * that all ELSP are drained i.e. we have processed the CSB,
   1492 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
   1493 	 */
   1494 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
   1495 
   1496 	/*
   1497 	 * ELSQ note: the submit queue is not cleared after being submitted
   1498 	 * to the HW so we need to make sure we always clean it up. This is
   1499 	 * currently ensured by the fact that we always write the same number
   1500 	 * of elsq entries, keep this in mind before changing the loop below.
   1501 	 */
   1502 	for (n = execlists_num_ports(execlists); n--; ) {
   1503 		struct i915_request *rq = execlists->pending[n];
   1504 
   1505 		write_desc(execlists,
   1506 			   rq ? execlists_update_context(rq) : 0,
   1507 			   n);
   1508 	}
   1509 
   1510 	/* we need to manually load the submit queue */
   1511 	if (execlists->ctrl_reg)
   1512 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
   1513 }
   1514 
   1515 static bool ctx_single_port_submission(const struct intel_context *ce)
   1516 {
   1517 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
   1518 		intel_context_force_single_submission(ce));
   1519 }
   1520 
   1521 static bool can_merge_ctx(const struct intel_context *prev,
   1522 			  const struct intel_context *next)
   1523 {
   1524 	if (prev != next)
   1525 		return false;
   1526 
   1527 	if (ctx_single_port_submission(prev))
   1528 		return false;
   1529 
   1530 	return true;
   1531 }
   1532 
   1533 static bool can_merge_rq(const struct i915_request *prev,
   1534 			 const struct i915_request *next)
   1535 {
   1536 	GEM_BUG_ON(prev == next);
   1537 	GEM_BUG_ON(!assert_priority_queue(prev, next));
   1538 
   1539 	/*
   1540 	 * We do not submit known completed requests. Therefore if the next
   1541 	 * request is already completed, we can pretend to merge it in
   1542 	 * with the previous context (and we will skip updating the ELSP
   1543 	 * and tracking). Thus hopefully keeping the ELSP full with active
   1544 	 * contexts, despite the best efforts of preempt-to-busy to confuse
   1545 	 * us.
   1546 	 */
   1547 	if (i915_request_completed(next))
   1548 		return true;
   1549 
   1550 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
   1551 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
   1552 		      BIT(I915_FENCE_FLAG_SENTINEL))))
   1553 		return false;
   1554 
   1555 	if (!can_merge_ctx(prev->context, next->context))
   1556 		return false;
   1557 
   1558 	return true;
   1559 }
   1560 
   1561 static void virtual_update_register_offsets(u32 *regs,
   1562 					    struct intel_engine_cs *engine)
   1563 {
   1564 	set_offsets(regs, reg_offsets(engine), engine, false);
   1565 }
   1566 
   1567 static bool virtual_matches(const struct virtual_engine *ve,
   1568 			    const struct i915_request *rq,
   1569 			    const struct intel_engine_cs *engine)
   1570 {
   1571 	const struct intel_engine_cs *inflight;
   1572 
   1573 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
   1574 		return false;
   1575 
   1576 	/*
   1577 	 * We track when the HW has completed saving the context image
   1578 	 * (i.e. when we have seen the final CS event switching out of
   1579 	 * the context) and must not overwrite the context image before
   1580 	 * then. This restricts us to only using the active engine
   1581 	 * while the previous virtualized request is inflight (so
   1582 	 * we reuse the register offsets). This is a very small
   1583 	 * hysteresis on the greedy selection algorithm.
   1584 	 */
   1585 	inflight = intel_context_inflight(&ve->context);
   1586 	if (inflight && inflight != engine)
   1587 		return false;
   1588 
   1589 	return true;
   1590 }
   1591 
   1592 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
   1593 				     struct intel_engine_cs *engine)
   1594 {
   1595 	struct intel_engine_cs *old = ve->siblings[0];
   1596 
   1597 	/* All unattached (rq->engine == old) must already be completed */
   1598 
   1599 	spin_lock(&old->breadcrumbs.irq_lock);
   1600 	if (!list_empty(&ve->context.signal_link)) {
   1601 		list_move_tail(&ve->context.signal_link,
   1602 			       &engine->breadcrumbs.signalers);
   1603 		intel_engine_signal_breadcrumbs(engine);
   1604 	}
   1605 	spin_unlock(&old->breadcrumbs.irq_lock);
   1606 }
   1607 
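         /*
          * Skip over any entries in the active ELSP ports that have already
          * completed and return the first request the HW is still executing
          * (or NULL if everything inflight has completed).
          */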
   1608 static struct i915_request *
   1609 last_active(const struct intel_engine_execlists *execlists)
   1610 {
   1611 	struct i915_request * const *last = READ_ONCE(execlists->active);
   1612 
   1613 	while (*last && i915_request_completed(*last))
   1614 		last++;
   1615 
   1616 	return *last;
   1617 }
   1618 
   1619 #define for_each_waiter(p__, rq__) \
   1620 	list_for_each_entry_lockless(p__, \
   1621 				     &(rq__)->sched.waiters_list, \
   1622 				     wait_link)
   1623 
   1624 static void defer_request(struct i915_request *rq, struct list_head * const pl)
   1625 {
   1626 	LIST_HEAD(list);
   1627 
   1628 	/*
   1629 	 * We want to move the interrupted request to the back of
   1630 	 * the round-robin list (i.e. its priority level), but
    1631 	 * in doing so, we must also move all in-flight requests that
    1632 	 * were waiting on the interrupted request so that they run
    1633 	 * after it again.
   1634 	 */
   1635 	do {
   1636 		struct i915_dependency *p;
   1637 
   1638 		GEM_BUG_ON(i915_request_is_active(rq));
   1639 		list_move_tail(&rq->sched.link, pl);
   1640 
   1641 		for_each_waiter(p, rq) {
   1642 			struct i915_request *w =
   1643 				container_of(p->waiter, typeof(*w), sched);
   1644 
   1645 			/* Leave semaphores spinning on the other engines */
   1646 			if (w->engine != rq->engine)
   1647 				continue;
   1648 
   1649 			/* No waiter should start before its signaler */
   1650 			GEM_BUG_ON(i915_request_started(w) &&
   1651 				   !i915_request_completed(rq));
   1652 
   1653 			GEM_BUG_ON(i915_request_is_active(w));
   1654 			if (!i915_request_is_ready(w))
   1655 				continue;
   1656 
   1657 			if (rq_prio(w) < rq_prio(rq))
   1658 				continue;
   1659 
   1660 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
   1661 			list_move_tail(&w->sched.link, &list);
   1662 		}
   1663 
   1664 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
   1665 	} while (rq);
   1666 }
   1667 
   1668 static void defer_active(struct intel_engine_cs *engine)
   1669 {
   1670 	struct i915_request *rq;
   1671 
   1672 	rq = __unwind_incomplete_requests(engine);
   1673 	if (!rq)
   1674 		return;
   1675 
   1676 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
   1677 }
   1678 
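         /*
          * A request only deserves a timeslice if the engine supports
          * timeslicing and there is more work queued behind it on this
          * engine of at least the same effective priority (taking the
          * queue_priority_hint of not-yet-dequeued requests into account).
          */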
   1679 static bool
   1680 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
   1681 {
   1682 	int hint;
   1683 
   1684 	if (!intel_engine_has_timeslices(engine))
   1685 		return false;
   1686 
   1687 	if (list_is_last(&rq->sched.link, &engine->active.requests))
   1688 		return false;
   1689 
   1690 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
   1691 		   engine->execlists.queue_priority_hint);
   1692 
   1693 	return hint >= effective_prio(rq);
   1694 }
   1695 
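         /*
          * Report the priority of the request that will run after @rq on
          * this engine, or INT_MIN if @rq is the last submitted request;
          * cached as switch_priority_hint when we submit a new ELSP.
          */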
   1696 static int
   1697 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
   1698 {
   1699 	if (list_is_last(&rq->sched.link, &engine->active.requests))
   1700 		return INT_MIN;
   1701 
   1702 	return rq_prio(list_next_entry(rq, sched.link));
   1703 }
   1704 
   1705 static inline unsigned long
   1706 timeslice(const struct intel_engine_cs *engine)
   1707 {
   1708 	return READ_ONCE(engine->props.timeslice_duration_ms);
   1709 }
   1710 
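         /*
          * Pick the timeslice to (re)arm for the currently executing
          * request: 0 (disarm the timer) if nothing is running, the request
          * has completed, or nothing of comparable priority is waiting;
          * otherwise the engine's configured duration.
          */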
   1711 static unsigned long
   1712 active_timeslice(const struct intel_engine_cs *engine)
   1713 {
   1714 	const struct i915_request *rq = *engine->execlists.active;
   1715 
   1716 	if (!rq || i915_request_completed(rq))
   1717 		return 0;
   1718 
   1719 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
   1720 		return 0;
   1721 
   1722 	return timeslice(engine);
   1723 }
   1724 
   1725 static void set_timeslice(struct intel_engine_cs *engine)
   1726 {
   1727 	if (!intel_engine_has_timeslices(engine))
   1728 		return;
   1729 
   1730 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
   1731 }
   1732 
   1733 static void record_preemption(struct intel_engine_execlists *execlists)
   1734 {
   1735 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
   1736 }
   1737 
   1738 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
   1739 {
   1740 	struct i915_request *rq;
   1741 
   1742 	rq = last_active(&engine->execlists);
   1743 	if (!rq)
   1744 		return 0;
   1745 
   1746 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
   1747 	if (unlikely(intel_context_is_banned(rq->context)))
   1748 		return 1;
   1749 
   1750 	return READ_ONCE(engine->props.preempt_timeout_ms);
   1751 }
   1752 
   1753 static void set_preempt_timeout(struct intel_engine_cs *engine)
   1754 {
   1755 	if (!intel_engine_has_preempt_reset(engine))
   1756 		return;
   1757 
   1758 	set_timer_ms(&engine->execlists.preempt,
   1759 		     active_preempt_timeout(engine));
   1760 }
   1761 
   1762 static inline void clear_ports(struct i915_request **ports, int count)
   1763 {
   1764 	memset_p((void **)ports, NULL, count);
   1765 }
   1766 
   1767 static void execlists_dequeue(struct intel_engine_cs *engine)
   1768 {
   1769 	struct intel_engine_execlists * const execlists = &engine->execlists;
   1770 	struct i915_request **port = execlists->pending;
   1771 	struct i915_request ** const last_port = port + execlists->port_mask;
   1772 	struct i915_request *last;
   1773 	struct rb_node *rb;
   1774 	bool submit = false;
   1775 
   1776 	/*
   1777 	 * Hardware submission is through 2 ports. Conceptually each port
   1778 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
   1779 	 * static for a context, and unique to each, so we only execute
   1780 	 * requests belonging to a single context from each ring. RING_HEAD
   1781 	 * is maintained by the CS in the context image, it marks the place
   1782 	 * where it got up to last time, and through RING_TAIL we tell the CS
   1783 	 * where we want to execute up to this time.
   1784 	 *
   1785 	 * In this list the requests are in order of execution. Consecutive
   1786 	 * requests from the same context are adjacent in the ringbuffer. We
   1787 	 * can combine these requests into a single RING_TAIL update:
   1788 	 *
   1789 	 *              RING_HEAD...req1...req2
   1790 	 *                                    ^- RING_TAIL
   1791 	 * since to execute req2 the CS must first execute req1.
   1792 	 *
   1793 	 * Our goal then is to point each port to the end of a consecutive
    1794 	 * sequence of requests, as that gives the optimal (fewest wake ups
    1795 	 * and context switches) submission.
   1796 	 */
   1797 
   1798 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
   1799 		struct virtual_engine *ve =
   1800 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
   1801 		struct i915_request *rq = READ_ONCE(ve->request);
   1802 
   1803 		if (!rq) { /* lazily cleanup after another engine handled rq */
   1804 			rb_erase_cached(rb, &execlists->virtual);
   1805 			RB_CLEAR_NODE(rb);
   1806 			rb = rb_first_cached(&execlists->virtual);
   1807 			continue;
   1808 		}
   1809 
   1810 		if (!virtual_matches(ve, rq, engine)) {
   1811 			rb = rb_next(rb);
   1812 			continue;
   1813 		}
   1814 
   1815 		break;
   1816 	}
   1817 
   1818 	/*
   1819 	 * If the queue is higher priority than the last
   1820 	 * request in the currently active context, submit afresh.
   1821 	 * We will resubmit again afterwards in case we need to split
   1822 	 * the active context to interject the preemption request,
   1823 	 * i.e. we will retrigger preemption following the ack in case
   1824 	 * of trouble.
   1825 	 */
   1826 	last = last_active(execlists);
   1827 	if (last) {
   1828 		if (need_preempt(engine, last, rb)) {
   1829 			ENGINE_TRACE(engine,
   1830 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
   1831 				     last->fence.context,
   1832 				     last->fence.seqno,
   1833 				     last->sched.attr.priority,
   1834 				     execlists->queue_priority_hint);
   1835 			record_preemption(execlists);
   1836 
   1837 			/*
   1838 			 * Don't let the RING_HEAD advance past the breadcrumb
   1839 			 * as we unwind (and until we resubmit) so that we do
   1840 			 * not accidentally tell it to go backwards.
   1841 			 */
   1842 			ring_set_paused(engine, 1);
   1843 
   1844 			/*
   1845 			 * Note that we have not stopped the GPU at this point,
   1846 			 * so we are unwinding the incomplete requests as they
   1847 			 * remain inflight and so by the time we do complete
   1848 			 * the preemption, some of the unwound requests may
   1849 			 * complete!
   1850 			 */
   1851 			__unwind_incomplete_requests(engine);
   1852 
   1853 			last = NULL;
   1854 		} else if (need_timeslice(engine, last) &&
   1855 			   timer_expired(&engine->execlists.timer)) {
   1856 			ENGINE_TRACE(engine,
   1857 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
   1858 				     last->fence.context,
   1859 				     last->fence.seqno,
   1860 				     last->sched.attr.priority,
   1861 				     execlists->queue_priority_hint);
   1862 
   1863 			ring_set_paused(engine, 1);
   1864 			defer_active(engine);
   1865 
   1866 			/*
   1867 			 * Unlike for preemption, if we rewind and continue
   1868 			 * executing the same context as previously active,
   1869 			 * the order of execution will remain the same and
   1870 			 * the tail will only advance. We do not need to
   1871 			 * force a full context restore, as a lite-restore
   1872 			 * is sufficient to resample the monotonic TAIL.
   1873 			 *
   1874 			 * If we switch to any other context, similarly we
   1875 			 * will not rewind TAIL of current context, and
   1876 			 * normal save/restore will preserve state and allow
   1877 			 * us to later continue executing the same request.
   1878 			 */
   1879 			last = NULL;
   1880 		} else {
   1881 			/*
   1882 			 * Otherwise if we already have a request pending
   1883 			 * for execution after the current one, we can
   1884 			 * just wait until the next CS event before
   1885 			 * queuing more. In either case we will force a
   1886 			 * lite-restore preemption event, but if we wait
   1887 			 * we hopefully coalesce several updates into a single
   1888 			 * submission.
   1889 			 */
   1890 			if (!list_is_last(&last->sched.link,
   1891 					  &engine->active.requests)) {
   1892 				/*
   1893 				 * Even if ELSP[1] is occupied and not worthy
   1894 				 * of timeslices, our queue might be.
   1895 				 */
   1896 				if (!execlists->timer.expires &&
   1897 				    need_timeslice(engine, last))
   1898 					set_timer_ms(&execlists->timer,
   1899 						     timeslice(engine));
   1900 
   1901 				return;
   1902 			}
   1903 		}
   1904 	}
   1905 
   1906 	while (rb) { /* XXX virtual is always taking precedence */
   1907 		struct virtual_engine *ve =
   1908 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
   1909 		struct i915_request *rq;
   1910 
   1911 		spin_lock(&ve->base.active.lock);
   1912 
   1913 		rq = ve->request;
   1914 		if (unlikely(!rq)) { /* lost the race to a sibling */
   1915 			spin_unlock(&ve->base.active.lock);
   1916 			rb_erase_cached(rb, &execlists->virtual);
   1917 			RB_CLEAR_NODE(rb);
   1918 			rb = rb_first_cached(&execlists->virtual);
   1919 			continue;
   1920 		}
   1921 
   1922 		GEM_BUG_ON(rq != ve->request);
   1923 		GEM_BUG_ON(rq->engine != &ve->base);
   1924 		GEM_BUG_ON(rq->context != &ve->context);
   1925 
   1926 		if (rq_prio(rq) >= queue_prio(execlists)) {
   1927 			if (!virtual_matches(ve, rq, engine)) {
   1928 				spin_unlock(&ve->base.active.lock);
   1929 				rb = rb_next(rb);
   1930 				continue;
   1931 			}
   1932 
   1933 			if (last && !can_merge_rq(last, rq)) {
   1934 				spin_unlock(&ve->base.active.lock);
   1935 				return; /* leave this for another */
   1936 			}
   1937 
   1938 			ENGINE_TRACE(engine,
   1939 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
   1940 				     rq->fence.context,
   1941 				     rq->fence.seqno,
   1942 				     i915_request_completed(rq) ? "!" :
   1943 				     i915_request_started(rq) ? "*" :
   1944 				     "",
   1945 				     yesno(engine != ve->siblings[0]));
   1946 
   1947 			ve->request = NULL;
   1948 			ve->base.execlists.queue_priority_hint = INT_MIN;
   1949 			rb_erase_cached(rb, &execlists->virtual);
   1950 			RB_CLEAR_NODE(rb);
   1951 
   1952 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
   1953 			rq->engine = engine;
   1954 
   1955 			if (engine != ve->siblings[0]) {
   1956 				u32 *regs = ve->context.lrc_reg_state;
   1957 				unsigned int n;
   1958 
   1959 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
   1960 
   1961 				if (!intel_engine_has_relative_mmio(engine))
   1962 					virtual_update_register_offsets(regs,
   1963 									engine);
   1964 
   1965 				if (!list_empty(&ve->context.signals))
   1966 					virtual_xfer_breadcrumbs(ve, engine);
   1967 
   1968 				/*
   1969 				 * Move the bound engine to the top of the list
   1970 				 * for future execution. We then kick this
   1971 				 * tasklet first before checking others, so that
   1972 				 * we preferentially reuse this set of bound
   1973 				 * registers.
   1974 				 */
   1975 				for (n = 1; n < ve->num_siblings; n++) {
   1976 					if (ve->siblings[n] == engine) {
   1977 						swap(ve->siblings[n],
   1978 						     ve->siblings[0]);
   1979 						break;
   1980 					}
   1981 				}
   1982 
   1983 				GEM_BUG_ON(ve->siblings[0] != engine);
   1984 			}
   1985 
   1986 			if (__i915_request_submit(rq)) {
   1987 				submit = true;
   1988 				last = rq;
   1989 			}
   1990 			i915_request_put(rq);
   1991 
   1992 			/*
   1993 			 * Hmm, we have a bunch of virtual engine requests,
   1994 			 * but the first one was already completed (thanks
   1995 			 * preempt-to-busy!). Keep looking at the veng queue
   1996 			 * until we have no more relevant requests (i.e.
   1997 			 * the normal submit queue has higher priority).
   1998 			 */
   1999 			if (!submit) {
   2000 				spin_unlock(&ve->base.active.lock);
   2001 				rb = rb_first_cached(&execlists->virtual);
   2002 				continue;
   2003 			}
   2004 		}
   2005 
   2006 		spin_unlock(&ve->base.active.lock);
   2007 		break;
   2008 	}
   2009 
   2010 	while ((rb = rb_first_cached(&execlists->queue))) {
   2011 		struct i915_priolist *p = to_priolist(rb);
   2012 		struct i915_request *rq, *rn;
   2013 		int i;
   2014 
   2015 		priolist_for_each_request_consume(rq, rn, p, i) {
   2016 			bool merge = true;
   2017 
   2018 			/*
   2019 			 * Can we combine this request with the current port?
   2020 			 * It has to be the same context/ringbuffer and not
   2021 			 * have any exceptions (e.g. GVT saying never to
   2022 			 * combine contexts).
   2023 			 *
   2024 			 * If we can combine the requests, we can execute both
   2025 			 * by updating the RING_TAIL to point to the end of the
   2026 			 * second request, and so we never need to tell the
   2027 			 * hardware about the first.
   2028 			 */
   2029 			if (last && !can_merge_rq(last, rq)) {
   2030 				/*
   2031 				 * If we are on the second port and cannot
   2032 				 * combine this request with the last, then we
   2033 				 * are done.
   2034 				 */
   2035 				if (port == last_port)
   2036 					goto done;
   2037 
   2038 				/*
   2039 				 * We must not populate both ELSP[] with the
   2040 				 * same LRCA, i.e. we must submit 2 different
   2041 				 * contexts if we submit 2 ELSP.
   2042 				 */
   2043 				if (last->context == rq->context)
   2044 					goto done;
   2045 
   2046 				if (i915_request_has_sentinel(last))
   2047 					goto done;
   2048 
   2049 				/*
   2050 				 * If GVT overrides us we only ever submit
   2051 				 * port[0], leaving port[1] empty. Note that we
   2052 				 * also have to be careful that we don't queue
   2053 				 * the same context (even though a different
   2054 				 * request) to the second port.
   2055 				 */
   2056 				if (ctx_single_port_submission(last->context) ||
   2057 				    ctx_single_port_submission(rq->context))
   2058 					goto done;
   2059 
   2060 				merge = false;
   2061 			}
   2062 
   2063 			if (__i915_request_submit(rq)) {
   2064 				if (!merge) {
   2065 					*port = execlists_schedule_in(last, port - execlists->pending);
   2066 					port++;
   2067 					last = NULL;
   2068 				}
   2069 
   2070 				GEM_BUG_ON(last &&
   2071 					   !can_merge_ctx(last->context,
   2072 							  rq->context));
   2073 
   2074 				submit = true;
   2075 				last = rq;
   2076 			}
   2077 		}
   2078 
   2079 		rb_erase_cached(&p->node, &execlists->queue);
   2080 		i915_priolist_free(p);
   2081 	}
   2082 
   2083 done:
   2084 	/*
   2085 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
   2086 	 *
   2087 	 * We choose the priority hint such that if we add a request of greater
   2088 	 * priority than this, we kick the submission tasklet to decide on
   2089 	 * the right order of submitting the requests to hardware. We must
   2090 	 * also be prepared to reorder requests as they are in-flight on the
   2091 	 * HW. We derive the priority hint then as the first "hole" in
   2092 	 * the HW submission ports and if there are no available slots,
   2093 	 * the priority of the lowest executing request, i.e. last.
   2094 	 *
   2095 	 * When we do receive a higher priority request ready to run from the
   2096 	 * user, see queue_request(), the priority hint is bumped to that
   2097 	 * request triggering preemption on the next dequeue (or subsequent
   2098 	 * interrupt for secondary ports).
   2099 	 */
   2100 	execlists->queue_priority_hint = queue_prio(execlists);
   2101 
   2102 	if (submit) {
   2103 		*port = execlists_schedule_in(last, port - execlists->pending);
   2104 		execlists->switch_priority_hint =
   2105 			switch_prio(engine, *execlists->pending);
   2106 
   2107 		/*
   2108 		 * Skip if we ended up with exactly the same set of requests,
   2109 		 * e.g. trying to timeslice a pair of ordered contexts
   2110 		 */
   2111 		if (!memcmp(execlists->active, execlists->pending,
   2112 			    (port - execlists->pending + 1) * sizeof(*port))) {
   2113 			do
   2114 				execlists_schedule_out(fetch_and_zero(port));
   2115 			while (port-- != execlists->pending);
   2116 
   2117 			goto skip_submit;
   2118 		}
   2119 		clear_ports(port + 1, last_port - port);
   2120 
   2121 		execlists_submit_ports(engine);
   2122 		set_preempt_timeout(engine);
   2123 	} else {
   2124 skip_submit:
   2125 		ring_set_paused(engine, 0);
   2126 	}
   2127 }
   2128 
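         /*
          * Release the references held by both the pending and inflight
          * ELSP ports (e.g. on the reset paths), leaving execlists->active
          * pointing at the now empty inflight array.
          */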
   2129 static void
   2130 cancel_port_requests(struct intel_engine_execlists * const execlists)
   2131 {
   2132 	struct i915_request * const *port;
   2133 
   2134 	for (port = execlists->pending; *port; port++)
   2135 		execlists_schedule_out(*port);
   2136 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
   2137 
   2138 	/* Mark the end of active before we overwrite *active */
   2139 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
   2140 		execlists_schedule_out(*port);
   2141 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
   2142 
   2143 	WRITE_ONCE(execlists->active, execlists->inflight);
   2144 }
   2145 
   2146 static inline void
   2147 invalidate_csb_entries(const u32 *first, const u32 *last)
   2148 {
   2149 	clflush((void *)first);
   2150 	clflush((void *)last);
   2151 }
   2152 
   2153 static inline bool
   2154 reset_in_progress(const struct intel_engine_execlists *execlists)
   2155 {
   2156 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
   2157 }
   2158 
   2159 /*
   2160  * Starting with Gen12, the status has a new format:
   2161  *
   2162  *     bit  0:     switched to new queue
   2163  *     bit  1:     reserved
   2164  *     bit  2:     semaphore wait mode (poll or signal), only valid when
   2165  *                 switch detail is set to "wait on semaphore"
   2166  *     bits 3-5:   engine class
   2167  *     bits 6-11:  engine instance
   2168  *     bits 12-14: reserved
   2169  *     bits 15-25: sw context id of the lrc the GT switched to
   2170  *     bits 26-31: sw counter of the lrc the GT switched to
   2171  *     bits 32-35: context switch detail
   2172  *                  - 0: ctx complete
   2173  *                  - 1: wait on sync flip
   2174  *                  - 2: wait on vblank
   2175  *                  - 3: wait on scanline
   2176  *                  - 4: wait on semaphore
   2177  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
   2178  *                       WAIT_FOR_EVENT)
   2179  *     bit  36:    reserved
   2180  *     bits 37-43: wait detail (for switch detail 1 to 4)
   2181  *     bits 44-46: reserved
   2182  *     bits 47-57: sw context id of the lrc the GT switched away from
   2183  *     bits 58-63: sw counter of the lrc the GT switched away from
   2184  */
   2185 static inline bool
   2186 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
   2187 {
   2188 	u32 lower_dw = csb[0];
   2189 	u32 upper_dw = csb[1];
   2190 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
   2191 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
   2192 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
   2193 
   2194 	/*
   2195 	 * The context switch detail is not guaranteed to be 5 when a preemption
   2196 	 * occurs, so we can't just check for that. The check below works for
   2197 	 * all the cases we care about, including preemptions of WAIT
   2198 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
   2199 	 * would require some extra handling, but we don't support that.
   2200 	 */
   2201 	if (!ctx_away_valid || new_queue) {
   2202 		GEM_BUG_ON(!ctx_to_valid);
   2203 		return true;
   2204 	}
   2205 
   2206 	/*
   2207 	 * switch detail = 5 is covered by the case above and we do not expect a
   2208 	 * context switch on an unsuccessful wait instruction since we always
   2209 	 * use polling mode.
   2210 	 */
   2211 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
   2212 	return false;
   2213 }
   2214 
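         /*
          * Before Gen12, a CSB event promotes the pending ELSP to active
          * whenever the HW reports an idle->active transition or a
          * preemption.
          */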
   2215 static inline bool
   2216 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
   2217 {
   2218 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
   2219 }
   2220 
   2221 static void process_csb(struct intel_engine_cs *engine)
   2222 {
   2223 	struct intel_engine_execlists * const execlists = &engine->execlists;
   2224 	const u32 * const buf = execlists->csb_status;
   2225 	const u8 num_entries = execlists->csb_size;
   2226 	u8 head, tail;
   2227 
   2228 	/*
   2229 	 * As we modify our execlists state tracking we require exclusive
   2230 	 * access. Either we are inside the tasklet, or the tasklet is disabled
   2231 	 * and we assume that is only inside the reset paths and so serialised.
   2232 	 */
   2233 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
   2234 		   !reset_in_progress(execlists));
   2235 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
   2236 
   2237 	/*
   2238 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
   2239 	 * When reading from the csb_write mmio register, we have to be
   2240 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
   2241 	 * the low 4bits. As it happens we know the next 4bits are always
    2242 	 * zero and so we can simply mask off the low u8 of the register
   2243 	 * and treat it identically to reading from the HWSP (without having
   2244 	 * to use explicit shifting and masking, and probably bifurcating
   2245 	 * the code to handle the legacy mmio read).
   2246 	 */
   2247 	head = execlists->csb_head;
   2248 	tail = READ_ONCE(*execlists->csb_write);
   2249 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
   2250 	if (unlikely(head == tail))
   2251 		return;
   2252 
   2253 	/*
   2254 	 * Hopefully paired with a wmb() in HW!
   2255 	 *
   2256 	 * We must complete the read of the write pointer before any reads
   2257 	 * from the CSB, so that we do not see stale values. Without an rmb
   2258 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
   2259 	 * we perform the READ_ONCE(*csb_write).
   2260 	 */
   2261 	rmb();
   2262 
   2263 	do {
   2264 		bool promote;
   2265 
   2266 		if (++head == num_entries)
   2267 			head = 0;
   2268 
   2269 		/*
   2270 		 * We are flying near dragons again.
   2271 		 *
   2272 		 * We hold a reference to the request in execlist_port[]
   2273 		 * but no more than that. We are operating in softirq
   2274 		 * context and so cannot hold any mutex or sleep. That
   2275 		 * prevents us stopping the requests we are processing
   2276 		 * in port[] from being retired simultaneously (the
   2277 		 * breadcrumb will be complete before we see the
   2278 		 * context-switch). As we only hold the reference to the
   2279 		 * request, any pointer chasing underneath the request
   2280 		 * is subject to a potential use-after-free. Thus we
   2281 		 * store all of the bookkeeping within port[] as
   2282 		 * required, and avoid using unguarded pointers beneath
   2283 		 * request itself. The same applies to the atomic
   2284 		 * status notifier.
   2285 		 */
   2286 
   2287 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
   2288 			     head, buf[2 * head + 0], buf[2 * head + 1]);
   2289 
   2290 		if (INTEL_GEN(engine->i915) >= 12)
   2291 			promote = gen12_csb_parse(execlists, buf + 2 * head);
   2292 		else
   2293 			promote = gen8_csb_parse(execlists, buf + 2 * head);
   2294 		if (promote) {
   2295 			struct i915_request * const *old = execlists->active;
   2296 
   2297 			/* Point active to the new ELSP; prevent overwriting */
   2298 			WRITE_ONCE(execlists->active, execlists->pending);
   2299 
   2300 			if (!inject_preempt_hang(execlists))
   2301 				ring_set_paused(engine, 0);
   2302 
   2303 			/* cancel old inflight, prepare for switch */
   2304 			trace_ports(execlists, "preempted", old);
   2305 			while (*old)
   2306 				execlists_schedule_out(*old++);
   2307 
   2308 			/* switch pending to inflight */
   2309 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
   2310 			WRITE_ONCE(execlists->active,
   2311 				   memcpy(execlists->inflight,
   2312 					  execlists->pending,
   2313 					  execlists_num_ports(execlists) *
   2314 					  sizeof(*execlists->pending)));
   2315 
   2316 			WRITE_ONCE(execlists->pending[0], NULL);
   2317 		} else {
   2318 			GEM_BUG_ON(!*execlists->active);
   2319 
   2320 			/* port0 completed, advanced to port1 */
   2321 			trace_ports(execlists, "completed", execlists->active);
   2322 
   2323 			/*
   2324 			 * We rely on the hardware being strongly
   2325 			 * ordered, that the breadcrumb write is
   2326 			 * coherent (visible from the CPU) before the
   2327 			 * user interrupt and CSB is processed.
   2328 			 */
   2329 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
   2330 				   !reset_in_progress(execlists));
   2331 			execlists_schedule_out(*execlists->active++);
   2332 
   2333 			GEM_BUG_ON(execlists->active - execlists->inflight >
   2334 				   execlists_num_ports(execlists));
   2335 		}
   2336 	} while (head != tail);
   2337 
   2338 	execlists->csb_head = head;
   2339 	set_timeslice(engine);
   2340 
   2341 	/*
    2342 	 * Gen11 has proven to fail wrt the global observation point
    2343 	 * between entry and tail update, failing on the ordering and thus
    2344 	 * we may see an old entry in the context status buffer.
   2345 	 *
   2346 	 * Forcibly evict out entries for the next gpu csb update,
    2347 	 * to increase the odds that we get fresh entries with
    2348 	 * non-working hardware. The cost of doing so mostly comes out
    2349 	 * in the wash, as the hardware, working or not, will need to
    2350 	 * do the invalidation beforehand.
   2351 	 */
   2352 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
   2353 }
   2354 
   2355 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
   2356 {
   2357 	lockdep_assert_held(&engine->active.lock);
   2358 	if (!engine->execlists.pending[0]) {
   2359 		rcu_read_lock(); /* protect peeking at execlists->active */
   2360 		execlists_dequeue(engine);
   2361 		rcu_read_unlock();
   2362 	}
   2363 }
   2364 
   2365 static void __execlists_hold(struct i915_request *rq)
   2366 {
   2367 	LIST_HEAD(list);
   2368 
   2369 	do {
   2370 		struct i915_dependency *p;
   2371 
   2372 		if (i915_request_is_active(rq))
   2373 			__i915_request_unsubmit(rq);
   2374 
   2375 		RQ_TRACE(rq, "on hold\n");
   2376 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
   2377 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
   2378 		i915_request_set_hold(rq);
   2379 
   2380 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
   2381 			struct i915_request *w =
   2382 				container_of(p->waiter, typeof(*w), sched);
   2383 
   2384 			/* Leave semaphores spinning on the other engines */
   2385 			if (w->engine != rq->engine)
   2386 				continue;
   2387 
   2388 			if (!i915_request_is_ready(w))
   2389 				continue;
   2390 
   2391 			if (i915_request_completed(w))
   2392 				continue;
   2393 
    2394 			if (i915_request_on_hold(w))
   2395 				continue;
   2396 
   2397 			list_move_tail(&w->sched.link, &list);
   2398 		}
   2399 
   2400 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
   2401 	} while (rq);
   2402 }
   2403 
   2404 static bool execlists_hold(struct intel_engine_cs *engine,
   2405 			   struct i915_request *rq)
   2406 {
   2407 	spin_lock_irq(&engine->active.lock);
   2408 
   2409 	if (i915_request_completed(rq)) { /* too late! */
   2410 		rq = NULL;
   2411 		goto unlock;
   2412 	}
   2413 
   2414 	if (rq->engine != engine) { /* preempted virtual engine */
   2415 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
   2416 
   2417 		/*
   2418 		 * intel_context_inflight() is only protected by virtue
   2419 		 * of process_csb() being called only by the tasklet (or
   2420 		 * directly from inside reset while the tasklet is suspended).
   2421 		 * Assert that neither of those are allowed to run while we
   2422 		 * poke at the request queues.
   2423 		 */
   2424 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
   2425 
   2426 		/*
   2427 		 * An unsubmitted request along a virtual engine will
   2428 		 * remain on the active (this) engine until we are able
   2429 		 * to process the context switch away (and so mark the
   2430 		 * context as no longer in flight). That cannot have happened
   2431 		 * yet, otherwise we would not be hanging!
   2432 		 */
   2433 		spin_lock(&ve->base.active.lock);
   2434 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
   2435 		GEM_BUG_ON(ve->request != rq);
   2436 		ve->request = NULL;
   2437 		spin_unlock(&ve->base.active.lock);
   2438 		i915_request_put(rq);
   2439 
   2440 		rq->engine = engine;
   2441 	}
   2442 
   2443 	/*
   2444 	 * Transfer this request onto the hold queue to prevent it
    2445 	 * being resubmitted to HW (and potentially completed) before we have
   2446 	 * released it. Since we may have already submitted following
   2447 	 * requests, we need to remove those as well.
   2448 	 */
   2449 	GEM_BUG_ON(i915_request_on_hold(rq));
   2450 	GEM_BUG_ON(rq->engine != engine);
   2451 	__execlists_hold(rq);
   2452 
   2453 unlock:
   2454 	spin_unlock_irq(&engine->active.lock);
   2455 	return rq;
   2456 }
   2457 
   2458 static bool hold_request(const struct i915_request *rq)
   2459 {
   2460 	struct i915_dependency *p;
   2461 
   2462 	/*
   2463 	 * If one of our ancestors is on hold, we must also be on hold,
   2464 	 * otherwise we will bypass it and execute before it.
   2465 	 */
   2466 	list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
   2467 		const struct i915_request *s =
   2468 			container_of(p->signaler, typeof(*s), sched);
   2469 
   2470 		if (s->engine != rq->engine)
   2471 			continue;
   2472 
   2473 		if (i915_request_on_hold(s))
   2474 			return true;
   2475 	}
   2476 
   2477 	return false;
   2478 }
   2479 
   2480 static void __execlists_unhold(struct i915_request *rq)
   2481 {
   2482 	LIST_HEAD(list);
   2483 
   2484 	do {
   2485 		struct i915_dependency *p;
   2486 
   2487 		GEM_BUG_ON(!i915_request_on_hold(rq));
   2488 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
   2489 
   2490 		i915_request_clear_hold(rq);
   2491 		list_move_tail(&rq->sched.link,
   2492 			       i915_sched_lookup_priolist(rq->engine,
   2493 							  rq_prio(rq)));
   2494 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
   2495 		RQ_TRACE(rq, "hold release\n");
   2496 
   2497 		/* Also release any children on this engine that are ready */
   2498 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
   2499 			struct i915_request *w =
   2500 				container_of(p->waiter, typeof(*w), sched);
   2501 
   2502 			if (w->engine != rq->engine)
   2503 				continue;
   2504 
    2505 			if (!i915_request_on_hold(w))
    2506 				continue;
    2507 
    2508 			/* Check that no other parents are also on hold */
    2509 			if (hold_request(w))
   2510 				continue;
   2511 
   2512 			list_move_tail(&w->sched.link, &list);
   2513 		}
   2514 
   2515 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
   2516 	} while (rq);
   2517 }
   2518 
   2519 static void execlists_unhold(struct intel_engine_cs *engine,
   2520 			     struct i915_request *rq)
   2521 {
   2522 	spin_lock_irq(&engine->active.lock);
   2523 
   2524 	/*
   2525 	 * Move this request back to the priority queue, and all of its
   2526 	 * children and grandchildren that were suspended along with it.
   2527 	 */
   2528 	__execlists_unhold(rq);
   2529 
   2530 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
   2531 		engine->execlists.queue_priority_hint = rq_prio(rq);
   2532 		tasklet_hi_schedule(&engine->execlists.tasklet);
   2533 	}
   2534 
   2535 	spin_unlock_irq(&engine->active.lock);
   2536 }
   2537 
   2538 struct execlists_capture {
   2539 	struct work_struct work;
   2540 	struct i915_request *rq;
   2541 	struct i915_gpu_coredump *error;
   2542 };
   2543 
   2544 static void execlists_capture_work(struct work_struct *work)
   2545 {
   2546 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
   2547 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
   2548 	struct intel_engine_cs *engine = cap->rq->engine;
   2549 	struct intel_gt_coredump *gt = cap->error->gt;
   2550 	struct intel_engine_capture_vma *vma;
   2551 
   2552 	/* Compress all the objects attached to the request, slow! */
   2553 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
   2554 	if (vma) {
   2555 		struct i915_vma_compress *compress =
   2556 			i915_vma_capture_prepare(gt);
   2557 
   2558 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
   2559 		i915_vma_capture_finish(gt, compress);
   2560 	}
   2561 
   2562 	gt->simulated = gt->engine->simulated;
   2563 	cap->error->simulated = gt->simulated;
   2564 
   2565 	/* Publish the error state, and announce it to the world */
   2566 	i915_error_state_store(cap->error);
   2567 	i915_gpu_coredump_put(cap->error);
   2568 
   2569 	/* Return this request and all that depend upon it for signaling */
   2570 	execlists_unhold(engine, cap->rq);
   2571 	i915_request_put(cap->rq);
   2572 
   2573 	kfree(cap);
   2574 }
   2575 
   2576 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
   2577 {
   2578 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
   2579 	struct execlists_capture *cap;
   2580 
   2581 	cap = kmalloc(sizeof(*cap), gfp);
   2582 	if (!cap)
   2583 		return NULL;
   2584 
   2585 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
   2586 	if (!cap->error)
   2587 		goto err_cap;
   2588 
   2589 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
   2590 	if (!cap->error->gt)
   2591 		goto err_gpu;
   2592 
   2593 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
   2594 	if (!cap->error->gt->engine)
   2595 		goto err_gt;
   2596 
   2597 	return cap;
   2598 
   2599 err_gt:
   2600 	kfree(cap->error->gt);
   2601 err_gpu:
   2602 	kfree(cap->error);
   2603 err_cap:
   2604 	kfree(cap);
   2605 	return NULL;
   2606 }
   2607 
   2608 static bool execlists_capture(struct intel_engine_cs *engine)
   2609 {
   2610 	struct execlists_capture *cap;
   2611 
   2612 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
   2613 		return true;
   2614 
   2615 	/*
   2616 	 * We need to _quickly_ capture the engine state before we reset.
   2617 	 * We are inside an atomic section (softirq) here and we are delaying
   2618 	 * the forced preemption event.
   2619 	 */
   2620 	cap = capture_regs(engine);
   2621 	if (!cap)
   2622 		return true;
   2623 
   2624 	cap->rq = execlists_active(&engine->execlists);
   2625 	GEM_BUG_ON(!cap->rq);
   2626 
   2627 	rcu_read_lock();
   2628 	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
   2629 	cap->rq = i915_request_get_rcu(cap->rq);
   2630 	rcu_read_unlock();
   2631 	if (!cap->rq)
   2632 		goto err_free;
   2633 
   2634 	/*
   2635 	 * Remove the request from the execlists queue, and take ownership
   2636 	 * of the request. We pass it to our worker who will _slowly_ compress
   2637 	 * all the pages the _user_ requested for debugging their batch, after
   2638 	 * which we return it to the queue for signaling.
   2639 	 *
   2640 	 * By removing them from the execlists queue, we also remove the
   2641 	 * requests from being processed by __unwind_incomplete_requests()
   2642 	 * during the intel_engine_reset(), and so they will *not* be replayed
   2643 	 * afterwards.
   2644 	 *
   2645 	 * Note that because we have not yet reset the engine at this point,
    2646 	 * it is possible that the request we have identified as being
    2647 	 * guilty did in fact complete, and we will then hit an arbitration
    2648 	 * point allowing the outstanding preemption to succeed.
   2649 	 * of that is very low (as capturing of the engine registers should be
   2650 	 * fast enough to run inside an irq-off atomic section!), so we will
   2651 	 * simply hold that request accountable for being non-preemptible
   2652 	 * long enough to force the reset.
   2653 	 */
   2654 	if (!execlists_hold(engine, cap->rq))
   2655 		goto err_rq;
   2656 
   2657 	INIT_WORK(&cap->work, execlists_capture_work);
   2658 	schedule_work(&cap->work);
   2659 	return true;
   2660 
   2661 err_rq:
   2662 	i915_request_put(cap->rq);
   2663 err_free:
   2664 	i915_gpu_coredump_put(cap->error);
   2665 	kfree(cap);
   2666 	return false;
   2667 }
   2668 
   2669 static noinline void preempt_reset(struct intel_engine_cs *engine)
   2670 {
   2671 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
   2672 	unsigned long *lock = &engine->gt->reset.flags;
   2673 
   2674 	if (i915_modparams.reset < 3)
   2675 		return;
   2676 
   2677 	if (test_and_set_bit(bit, lock))
   2678 		return;
   2679 
   2680 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
   2681 	tasklet_disable_nosync(&engine->execlists.tasklet);
   2682 
   2683 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
   2684 		     READ_ONCE(engine->props.preempt_timeout_ms),
   2685 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
   2686 
   2687 	ring_set_paused(engine, 1); /* Freeze the current request in place */
   2688 	if (execlists_capture(engine))
   2689 		intel_engine_reset(engine, "preemption time out");
   2690 	else
   2691 		ring_set_paused(engine, 0);
   2692 
   2693 	tasklet_enable(&engine->execlists.tasklet);
   2694 	clear_and_wake_up_bit(bit, lock);
   2695 }
   2696 
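         /*
          * True if the forced-preemption timer has expired while the
          * preemption we issued (execlists->pending[0]) is still
          * outstanding, in which case the caller escalates to an engine
          * reset.
          */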
   2697 static bool preempt_timeout(const struct intel_engine_cs *const engine)
   2698 {
   2699 	const struct timer_list *t = &engine->execlists.preempt;
   2700 
   2701 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
   2702 		return false;
   2703 
   2704 	if (!timer_expired(t))
   2705 		return false;
   2706 
   2707 	return READ_ONCE(engine->execlists.pending[0]);
   2708 }
   2709 
   2710 /*
   2711  * Check the unread Context Status Buffers and manage the submission of new
   2712  * contexts to the ELSP accordingly.
   2713  */
   2714 static void execlists_submission_tasklet(unsigned long data)
   2715 {
   2716 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
   2717 	bool timeout = preempt_timeout(engine);
   2718 
   2719 	process_csb(engine);
   2720 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
   2721 		unsigned long flags;
   2722 
   2723 		spin_lock_irqsave(&engine->active.lock, flags);
   2724 		__execlists_submission_tasklet(engine);
   2725 		spin_unlock_irqrestore(&engine->active.lock, flags);
   2726 
   2727 		/* Recheck after serialising with direct-submission */
   2728 		if (timeout && preempt_timeout(engine))
   2729 			preempt_reset(engine);
   2730 	}
   2731 }
   2732 
   2733 static void __execlists_kick(struct intel_engine_execlists *execlists)
   2734 {
   2735 	/* Kick the tasklet for some interrupt coalescing and reset handling */
   2736 	tasklet_hi_schedule(&execlists->tasklet);
   2737 }
   2738 
   2739 #define execlists_kick(t, member) \
   2740 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
   2741 
   2742 static void execlists_timeslice(struct timer_list *timer)
   2743 {
   2744 	execlists_kick(timer, timer);
   2745 }
   2746 
   2747 static void execlists_preempt(struct timer_list *timer)
   2748 {
   2749 	execlists_kick(timer, preempt);
   2750 }
   2751 
   2752 static void queue_request(struct intel_engine_cs *engine,
   2753 			  struct i915_request *rq)
   2754 {
   2755 	GEM_BUG_ON(!list_empty(&rq->sched.link));
   2756 	list_add_tail(&rq->sched.link,
   2757 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
   2758 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
   2759 }
   2760 
   2761 static void __submit_queue_imm(struct intel_engine_cs *engine)
   2762 {
   2763 	struct intel_engine_execlists * const execlists = &engine->execlists;
   2764 
   2765 	if (reset_in_progress(execlists))
   2766 		return; /* defer until we restart the engine following reset */
   2767 
   2768 	if (execlists->tasklet.func == execlists_submission_tasklet)
   2769 		__execlists_submission_tasklet(engine);
   2770 	else
   2771 		tasklet_hi_schedule(&execlists->tasklet);
   2772 }
   2773 
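         /*
          * Only kick the tasklet if the new request is of higher priority
          * than anything we have already decided to submit; lower priority
          * work can wait for the next natural dequeue.
          */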
   2774 static void submit_queue(struct intel_engine_cs *engine,
   2775 			 const struct i915_request *rq)
   2776 {
   2777 	struct intel_engine_execlists *execlists = &engine->execlists;
   2778 
   2779 	if (rq_prio(rq) <= execlists->queue_priority_hint)
   2780 		return;
   2781 
   2782 	execlists->queue_priority_hint = rq_prio(rq);
   2783 	__submit_queue_imm(engine);
   2784 }
   2785 
   2786 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
   2787 			     const struct i915_request *rq)
   2788 {
   2789 	GEM_BUG_ON(i915_request_on_hold(rq));
   2790 	return !list_empty(&engine->active.hold) && hold_request(rq);
   2791 }
   2792 
   2793 static void execlists_submit_request(struct i915_request *request)
   2794 {
   2795 	struct intel_engine_cs *engine = request->engine;
   2796 	unsigned long flags;
   2797 
   2798 	/* Will be called from irq-context when using foreign fences. */
   2799 	spin_lock_irqsave(&engine->active.lock, flags);
   2800 
   2801 	if (unlikely(ancestor_on_hold(engine, request))) {
   2802 		list_add_tail(&request->sched.link, &engine->active.hold);
   2803 		i915_request_set_hold(request);
   2804 	} else {
   2805 		queue_request(engine, request);
   2806 
   2807 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
   2808 		GEM_BUG_ON(list_empty(&request->sched.link));
   2809 
   2810 		submit_queue(engine, request);
   2811 	}
   2812 
   2813 	spin_unlock_irqrestore(&engine->active.lock, flags);
   2814 }
   2815 
   2816 static void __execlists_context_fini(struct intel_context *ce)
   2817 {
   2818 	intel_ring_put(ce->ring);
   2819 	i915_vma_put(ce->state);
   2820 }
   2821 
   2822 static void execlists_context_destroy(struct kref *kref)
   2823 {
   2824 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
   2825 
   2826 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
   2827 	GEM_BUG_ON(intel_context_is_pinned(ce));
   2828 
   2829 	if (ce->state)
   2830 		__execlists_context_fini(ce);
   2831 
   2832 	intel_context_fini(ce);
   2833 	intel_context_free(ce);
   2834 }
   2835 
   2836 static void
   2837 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
   2838 {
   2839 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
   2840 		return;
   2841 
   2842 	vaddr += engine->context_size;
   2843 
   2844 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
   2845 }
   2846 
   2847 static void
   2848 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
   2849 {
   2850 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
   2851 		return;
   2852 
   2853 	vaddr += engine->context_size;
   2854 
   2855 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
   2856 		dev_err_once(engine->i915->drm.dev,
   2857 			     "%s context redzone overwritten!\n",
   2858 			     engine->name);
   2859 }
   2860 
   2861 static void execlists_context_unpin(struct intel_context *ce)
   2862 {
   2863 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
   2864 		      ce->engine);
   2865 
   2866 	i915_gem_object_unpin_map(ce->state->obj);
   2867 }
   2868 
   2869 static void
   2870 __execlists_update_reg_state(const struct intel_context *ce,
   2871 			     const struct intel_engine_cs *engine,
   2872 			     u32 head)
   2873 {
   2874 	struct intel_ring *ring = ce->ring;
   2875 	u32 *regs = ce->lrc_reg_state;
   2876 
   2877 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
   2878 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
   2879 
   2880 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
   2881 	regs[CTX_RING_HEAD] = head;
   2882 	regs[CTX_RING_TAIL] = ring->tail;
   2883 
   2884 	/* RPCS */
   2885 	if (engine->class == RENDER_CLASS) {
   2886 		regs[CTX_R_PWR_CLK_STATE] =
   2887 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
   2888 
   2889 		i915_oa_init_reg_state(ce, engine);
   2890 	}
   2891 }
   2892 
   2893 static int
   2894 __execlists_context_pin(struct intel_context *ce,
   2895 			struct intel_engine_cs *engine)
   2896 {
   2897 	void *vaddr;
   2898 
   2899 	GEM_BUG_ON(!ce->state);
   2900 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
   2901 
   2902 	vaddr = i915_gem_object_pin_map(ce->state->obj,
   2903 					i915_coherent_map_type(engine->i915) |
   2904 					I915_MAP_OVERRIDE);
   2905 	if (IS_ERR(vaddr))
   2906 		return PTR_ERR(vaddr);
   2907 
   2908 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
   2909 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
   2910 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
   2911 
   2912 	return 0;
   2913 }
   2914 
   2915 static int execlists_context_pin(struct intel_context *ce)
   2916 {
   2917 	return __execlists_context_pin(ce, ce->engine);
   2918 }
   2919 
   2920 static int execlists_context_alloc(struct intel_context *ce)
   2921 {
   2922 	return __execlists_context_alloc(ce, ce->engine);
   2923 }
   2924 
   2925 static void execlists_context_reset(struct intel_context *ce)
   2926 {
   2927 	CE_TRACE(ce, "reset\n");
   2928 	GEM_BUG_ON(!intel_context_is_pinned(ce));
   2929 
   2930 	/*
   2931 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
   2932 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
   2933 	 * that stored in context. As we only write new commands from
   2934 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
   2935 	 * starts reading from its RING_HEAD from the context, it may try to
   2936 	 * execute that junk and die.
   2937 	 *
    2938 	 * The contexts that are still pinned on resume belong to the
    2939 	 * kernel, and are local to each engine. All other contexts will
    2940 	 * have their head/tail sanitized upon pinning before use, so they
    2941 	 * will never see garbage.
   2942 	 *
   2943 	 * So to avoid that we reset the context images upon resume. For
   2944 	 * simplicity, we just zero everything out.
   2945 	 */
   2946 	intel_ring_reset(ce->ring, ce->ring->emit);
   2947 
   2948 	/* Scrub away the garbage */
   2949 	execlists_init_reg_state(ce->lrc_reg_state,
   2950 				 ce, ce->engine, ce->ring, true);
   2951 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
   2952 
   2953 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
   2954 }
   2955 
   2956 static const struct intel_context_ops execlists_context_ops = {
   2957 	.alloc = execlists_context_alloc,
   2958 
   2959 	.pin = execlists_context_pin,
   2960 	.unpin = execlists_context_unpin,
   2961 
   2962 	.enter = intel_context_enter_engine,
   2963 	.exit = intel_context_exit_engine,
   2964 
   2965 	.reset = execlists_context_reset,
   2966 	.destroy = execlists_context_destroy,
   2967 };
   2968 
   2969 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
   2970 {
   2971 	u32 *cs;
   2972 
   2973 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
   2974 
   2975 	cs = intel_ring_begin(rq, 6);
   2976 	if (IS_ERR(cs))
   2977 		return PTR_ERR(cs);
   2978 
   2979 	/*
   2980 	 * Check if we have been preempted before we even get started.
   2981 	 *
   2982 	 * After this point i915_request_started() reports true, even if
   2983 	 * we get preempted and so are no longer running.
   2984 	 */
   2985 	*cs++ = MI_ARB_CHECK;
   2986 	*cs++ = MI_NOOP;
   2987 
   2988 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
   2989 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
   2990 	*cs++ = 0;
   2991 	*cs++ = rq->fence.seqno - 1;
   2992 
   2993 	intel_ring_advance(rq, cs);
   2994 
   2995 	/* Record the updated position of the request's payload */
   2996 	rq->infix = intel_ring_offset(rq, cs);
   2997 
   2998 	return 0;
   2999 }
   3000 
   3001 static int execlists_request_alloc(struct i915_request *request)
   3002 {
   3003 	int ret;
   3004 
   3005 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
   3006 
   3007 	/*
   3008 	 * Flush enough space to reduce the likelihood of waiting after
   3009 	 * we start building the request - in which case we will just
   3010 	 * have to repeat work.
   3011 	 */
   3012 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
   3013 
   3014 	/*
   3015 	 * Note that after this point, we have committed to using
   3016 	 * this request as it is being used to both track the
   3017 	 * state of engine initialisation and liveness of the
   3018 	 * golden renderstate above. Think twice before you try
   3019 	 * to cancel/unwind this request now.
   3020 	 */
   3021 
   3022 	/* Unconditionally invalidate GPU caches and TLBs. */
   3023 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
   3024 	if (ret)
   3025 		return ret;
   3026 
   3027 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
   3028 	return 0;
   3029 }
   3030 
   3031 /*
   3032  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
   3033  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
   3034  * but there is a slight complication as this is applied in WA batch where the
   3035  * values are only initialized once so we cannot take register value at the
   3036  * beginning and reuse it further; hence we save its value to memory, upload a
   3037  * constant value with bit21 set and then we restore it back with the saved value.
   3038  * To simplify the WA, a constant value is formed by using the default value
   3039  * of this register. This shouldn't be a problem because we are only modifying
    3040  * it for a short period and this batch is non-preemptible. We can of course
   3041  * use additional instructions that read the actual value of the register
   3042  * at that time and set our bit of interest but it makes the WA complicated.
   3043  *
   3044  * This WA is also required for Gen9 so extracting as a function avoids
   3045  * code duplication.
   3046  */
   3047 static u32 *
   3048 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
   3049 {
   3050 	/* NB no one else is allowed to scribble over scratch + 256! */
   3051 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
   3052 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   3053 	*batch++ = intel_gt_scratch_offset(engine->gt,
   3054 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
   3055 	*batch++ = 0;
   3056 
   3057 	*batch++ = MI_LOAD_REGISTER_IMM(1);
   3058 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   3059 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
   3060 
   3061 	batch = gen8_emit_pipe_control(batch,
   3062 				       PIPE_CONTROL_CS_STALL |
   3063 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
   3064 				       0);
   3065 
   3066 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
   3067 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   3068 	*batch++ = intel_gt_scratch_offset(engine->gt,
   3069 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
   3070 	*batch++ = 0;
   3071 
   3072 	return batch;
   3073 }
   3074 
   3075 /*
   3076  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
   3077  * initialized at the beginning and shared across all contexts but this field
   3078  * helps us to have multiple batches at different offsets and select them based
    3079  * on a criterion. At the moment this batch always starts at the beginning of the page
   3080  * and at this point we don't have multiple wa_ctx batch buffers.
   3081  *
    3082  * The number of WAs applied is not known at the beginning; we use this field
    3083  * to return the number of DWORDs written.
   3084  *
   3085  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
   3086  * so it adds NOOPs as padding to make it cacheline aligned.
    3087  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
    3088  * together make a complete batch buffer.
   3089  */
   3090 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
   3091 {
   3092 	/* WaDisableCtxRestoreArbitration:bdw,chv */
   3093 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   3094 
   3095 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
   3096 	if (IS_BROADWELL(engine->i915))
   3097 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
   3098 
   3099 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
   3100 	/* Actual scratch location is at 128 bytes offset */
   3101 	batch = gen8_emit_pipe_control(batch,
   3102 				       PIPE_CONTROL_FLUSH_L3 |
   3103 				       PIPE_CONTROL_STORE_DATA_INDEX |
   3104 				       PIPE_CONTROL_CS_STALL |
   3105 				       PIPE_CONTROL_QW_WRITE,
   3106 				       LRC_PPHWSP_SCRATCH_ADDR);
   3107 
   3108 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   3109 
   3110 	/* Pad to end of cacheline */
   3111 	while ((unsigned long)batch % CACHELINE_BYTES)
   3112 		*batch++ = MI_NOOP;
   3113 
   3114 	/*
   3115 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
   3116 	 * execution depends on the length specified in terms of cache lines
   3117 	 * in the register CTX_RCS_INDIRECT_CTX
   3118 	 */
   3119 
   3120 	return batch;
   3121 }
   3122 
   3123 struct lri {
   3124 	i915_reg_t reg;
   3125 	u32 value;
   3126 };
   3127 
   3128 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
   3129 {
   3130 	GEM_BUG_ON(!count || count > 63);
   3131 
   3132 	*batch++ = MI_LOAD_REGISTER_IMM(count);
   3133 	do {
   3134 		*batch++ = i915_mmio_reg_offset(lri->reg);
   3135 		*batch++ = lri->value;
   3136 	} while (lri++, --count);
   3137 	*batch++ = MI_NOOP;
   3138 
   3139 	return batch;
   3140 }
   3141 
   3142 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
   3143 {
   3144 	static const struct lri lri[] = {
   3145 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
   3146 		{
   3147 			COMMON_SLICE_CHICKEN2,
   3148 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
   3149 				       0),
   3150 		},
   3151 
   3152 		/* BSpec: 11391 */
   3153 		{
   3154 			FF_SLICE_CHICKEN,
   3155 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
   3156 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
   3157 		},
   3158 
   3159 		/* BSpec: 11299 */
   3160 		{
   3161 			_3D_CHICKEN3,
   3162 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
   3163 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
   3164 		}
   3165 	};
   3166 
   3167 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   3168 
   3169 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
   3170 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
   3171 
   3172 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
   3173 	batch = gen8_emit_pipe_control(batch,
   3174 				       PIPE_CONTROL_FLUSH_L3 |
   3175 				       PIPE_CONTROL_STORE_DATA_INDEX |
   3176 				       PIPE_CONTROL_CS_STALL |
   3177 				       PIPE_CONTROL_QW_WRITE,
   3178 				       LRC_PPHWSP_SCRATCH_ADDR);
   3179 
   3180 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
   3181 
   3182 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
   3183 	if (HAS_POOLED_EU(engine->i915)) {
   3184 		/*
    3185 		 * EU pool configuration is set up along with the golden
    3186 		 * context during context initialization. This value depends
    3187 		 * on the device type (2x6 or 3x6) and needs to be updated
    3188 		 * based on which subslice is disabled, especially for 2x6
    3189 		 * devices. However, it is safe to load the default 3x6
    3190 		 * configuration instead of masking off the corresponding
    3191 		 * bits, because the HW ignores the bits of a disabled
    3192 		 * subslice and drops down to the appropriate config. See
    3193 		 * render_state_setup() in i915_gem_render_state.c for the
    3194 		 * possible configurations; to avoid duplication they are
    3195 		 * not shown here again.
   3196 		 */
   3197 		*batch++ = GEN9_MEDIA_POOL_STATE;
   3198 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
   3199 		*batch++ = 0x00777000;
   3200 		*batch++ = 0;
   3201 		*batch++ = 0;
   3202 		*batch++ = 0;
   3203 	}
   3204 
   3205 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   3206 
   3207 	/* Pad to end of cacheline */
   3208 	while ((unsigned long)batch % CACHELINE_BYTES)
   3209 		*batch++ = MI_NOOP;
   3210 
   3211 	return batch;
   3212 }
   3213 
   3214 static u32 *
   3215 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
   3216 {
   3217 	int i;
   3218 
   3219 	/*
   3220 	 * WaPipeControlBefore3DStateSamplePattern: cnl
   3221 	 *
   3222 	 * Ensure the engine is idle prior to programming a
   3223 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
   3224 	 */
   3225 	batch = gen8_emit_pipe_control(batch,
   3226 				       PIPE_CONTROL_CS_STALL,
   3227 				       0);
   3228 	/*
   3229 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
   3230 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
   3231 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
   3232 	 * confusing. Since gen8_emit_pipe_control() already advances the
   3233 	 * batch by 6 dwords, we advance the other 10 here, completing a
   3234 	 * cacheline. It's not clear if the workaround requires this padding
   3235 	 * before other commands, or if it's just the regular padding we would
   3236 	 * already have for the workaround bb, so leave it here for now.
   3237 	 */
   3238 	for (i = 0; i < 10; i++)
   3239 		*batch++ = MI_NOOP;
   3240 
   3241 	/* Pad to end of cacheline */
   3242 	while ((unsigned long)batch % CACHELINE_BYTES)
   3243 		*batch++ = MI_NOOP;
   3244 
   3245 	return batch;
   3246 }
   3247 
   3248 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
   3249 
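         /*
          * Allocate the backing storage for the per-engine workaround batch
          * buffers: a single shmem page, pinned high in the global GTT.
          */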
   3250 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
   3251 {
   3252 	struct drm_i915_gem_object *obj;
   3253 	struct i915_vma *vma;
   3254 	int err;
   3255 
   3256 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
   3257 	if (IS_ERR(obj))
   3258 		return PTR_ERR(obj);
   3259 
   3260 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
   3261 	if (IS_ERR(vma)) {
   3262 		err = PTR_ERR(vma);
   3263 		goto err;
   3264 	}
   3265 
   3266 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
   3267 	if (err)
   3268 		goto err;
   3269 
   3270 	engine->wa_ctx.vma = vma;
   3271 	return 0;
   3272 
   3273 err:
   3274 	i915_gem_object_put(obj);
   3275 	return err;
   3276 }
   3277 
   3278 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
   3279 {
   3280 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
   3281 }
   3282 
   3283 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
   3284 
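         /*
          * Pick the per-gen indirect-ctx/per-ctx workaround batch emitters,
          * allocate the WA page and emit each batch into it, recording the
          * offset and size of every batch (offsets must stay cacheline
          * aligned).
          */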
   3285 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
   3286 {
   3287 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
   3288 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
   3289 					    &wa_ctx->per_ctx };
   3290 	wa_bb_func_t wa_bb_fn[2];
   3291 	struct page *page;
   3292 	void *batch, *batch_ptr;
   3293 	unsigned int i;
   3294 	int ret;
   3295 
   3296 	if (engine->class != RENDER_CLASS)
   3297 		return 0;
   3298 
   3299 	switch (INTEL_GEN(engine->i915)) {
   3300 	case 12:
   3301 	case 11:
   3302 		return 0;
   3303 	case 10:
   3304 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
   3305 		wa_bb_fn[1] = NULL;
   3306 		break;
   3307 	case 9:
   3308 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
   3309 		wa_bb_fn[1] = NULL;
   3310 		break;
   3311 	case 8:
   3312 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
   3313 		wa_bb_fn[1] = NULL;
   3314 		break;
   3315 	default:
   3316 		MISSING_CASE(INTEL_GEN(engine->i915));
   3317 		return 0;
   3318 	}
   3319 
   3320 	ret = lrc_setup_wa_ctx(engine);
   3321 	if (ret) {
   3322 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
   3323 		return ret;
   3324 	}
   3325 
   3326 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
   3327 	batch = batch_ptr = kmap_atomic(page);
   3328 
   3329 	/*
   3330 	 * Emit the two workaround batch buffers, recording the offset from the
   3331 	 * start of the workaround batch buffer object for each and their
   3332 	 * respective sizes.
   3333 	 */
   3334 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
   3335 		wa_bb[i]->offset = batch_ptr - batch;
   3336 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
   3337 						  CACHELINE_BYTES))) {
   3338 			ret = -EINVAL;
   3339 			break;
   3340 		}
   3341 		if (wa_bb_fn[i])
   3342 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
   3343 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
   3344 	}
   3345 
   3346 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
   3347 
   3348 	kunmap_atomic(batch);
   3349 	if (ret)
   3350 		lrc_destroy_wa_ctx(engine);
   3351 
   3352 	return ret;
   3353 }
   3354 
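         /*
          * Program the engine for execlist submission: set the HWSTAM mask,
          * select execlists (gen11+: disable legacy mode) in RING_MODE,
          * clear STOP_RING and point RING_HWS_PGA at the status page.
          */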
   3355 static void enable_execlists(struct intel_engine_cs *engine)
   3356 {
   3357 	u32 mode;
   3358 
   3359 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
   3360 
   3361 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
   3362 
   3363 	if (INTEL_GEN(engine->i915) >= 11)
   3364 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
   3365 	else
   3366 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
   3367 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
   3368 
   3369 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
   3370 
   3371 	ENGINE_WRITE_FW(engine,
   3372 			RING_HWS_PGA,
   3373 			i915_ggtt_offset(engine->status_page.vma));
   3374 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
   3375 
   3376 	engine->context_tag = 0;
   3377 }
   3378 
   3379 static bool unexpected_starting_state(struct intel_engine_cs *engine)
   3380 {
   3381 	bool unexpected = false;
   3382 
   3383 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
   3384 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
   3385 		unexpected = true;
   3386 	}
   3387 
   3388 	return unexpected;
   3389 }
   3390 
   3391 static int execlists_resume(struct intel_engine_cs *engine)
   3392 {
   3393 	intel_engine_apply_workarounds(engine);
   3394 	intel_engine_apply_whitelist(engine);
   3395 
   3396 	intel_mocs_init_engine(engine);
   3397 
   3398 	intel_engine_reset_breadcrumbs(engine);
   3399 
   3400 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
   3401 		struct drm_printer p = drm_debug_printer(__func__);
   3402 
   3403 		intel_engine_dump(engine, &p, NULL);
   3404 	}
   3405 
   3406 	enable_execlists(engine);
   3407 
   3408 	return 0;
   3409 }
   3410 
   3411 static void execlists_reset_prepare(struct intel_engine_cs *engine)
   3412 {
   3413 	struct intel_engine_execlists * const execlists = &engine->execlists;
   3414 	unsigned long flags;
   3415 
   3416 	ENGINE_TRACE(engine, "depth<-%d\n",
   3417 		     atomic_read(&execlists->tasklet.count));
   3418 
   3419 	/*
   3420 	 * Prevent request submission to the hardware until we have
   3421 	 * completed the reset in i915_gem_reset_finish(). If a request
   3422 	 * is completed by one engine, it may then queue a request
   3423 	 * to a second via its execlists->tasklet *just* as we are
   3424 	 * calling engine->resume() and also writing the ELSP.
   3425 	 * Turning off the execlists->tasklet until the reset is over
   3426 	 * prevents the race.
   3427 	 */
   3428 	__tasklet_disable_sync_once(&execlists->tasklet);
   3429 	GEM_BUG_ON(!reset_in_progress(execlists));
   3430 
   3431 	/* And flush any current direct submission. */
   3432 	spin_lock_irqsave(&engine->active.lock, flags);
   3433 	spin_unlock_irqrestore(&engine->active.lock, flags);
   3434 
   3435 	/*
    3436 	 * We stop the engines; otherwise we might get a failed reset and
    3437 	 * a dead gpu (on elk). Even a gpu as modern as kbl can suffer
    3438 	 * from a system hang if a batchbuffer is progressing when
    3439 	 * the reset is issued, regardless of the READY_TO_RESET ack.
    3440 	 * Thus assume it is best to stop the engines on all gens
    3441 	 * where we have a gpu reset.
   3442 	 *
   3443 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
   3444 	 *
   3445 	 * FIXME: Wa for more modern gens needs to be validated
   3446 	 */
   3447 	intel_engine_stop_cs(engine);
   3448 }
   3449 
   3450 static void reset_csb_pointers(struct intel_engine_cs *engine)
   3451 {
   3452 	struct intel_engine_execlists * const execlists = &engine->execlists;
   3453 	const unsigned int reset_value = execlists->csb_size - 1;
   3454 
   3455 	ring_set_paused(engine, 0);
   3456 
   3457 	/*
   3458 	 * After a reset, the HW starts writing into CSB entry [0]. We
   3459 	 * therefore have to set our HEAD pointer back one entry so that
   3460 	 * the *first* entry we check is entry 0. To complicate this further,
   3461 	 * as we don't wait for the first interrupt after reset, we have to
   3462 	 * fake the HW write to point back to the last entry so that our
   3463 	 * inline comparison of our cached head position against the last HW
   3464 	 * write works even before the first interrupt.
   3465 	 */
   3466 	execlists->csb_head = reset_value;
   3467 	WRITE_ONCE(*execlists->csb_write, reset_value);
   3468 	wmb(); /* Make sure this is visible to HW (paranoia?) */
   3469 
   3470 	/*
   3471 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
   3472 	 * Bludgeon them with a mmio update to be sure.
   3473 	 */
   3474 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
   3475 		     reset_value << 8 | reset_value);
   3476 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
   3477 
   3478 	invalidate_csb_entries(&execlists->csb_status[0],
   3479 			       &execlists->csb_status[reset_value]);
   3480 }
   3481 
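         /*
          * Arrange for the next context restore to clear STOP_RING: write
          * the RING_MI_MODE slot of the context image with a masked-bit
          * disable of STOP_RING.
          */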
   3482 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
   3483 {
   3484 	int x;
   3485 
   3486 	x = lrc_ring_mi_mode(engine);
   3487 	if (x != -1) {
   3488 		regs[x + 1] &= ~STOP_RING;
   3489 		regs[x + 1] |= STOP_RING << 16;
   3490 	}
   3491 }
   3492 
   3493 static void __execlists_reset_reg_state(const struct intel_context *ce,
   3494 					const struct intel_engine_cs *engine)
   3495 {
   3496 	u32 *regs = ce->lrc_reg_state;
   3497 
   3498 	__reset_stop_ring(regs, engine);
   3499 }
   3500 
   3501 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
   3502 {
   3503 	struct intel_engine_execlists * const execlists = &engine->execlists;
   3504 	struct intel_context *ce;
   3505 	struct i915_request *rq;
   3506 	u32 head;
   3507 
   3508 	mb(); /* paranoia: read the CSB pointers from after the reset */
   3509 	clflush(execlists->csb_write);
   3510 	mb();
   3511 
   3512 	process_csb(engine); /* drain preemption events */
   3513 
   3514 	/* Following the reset, we need to reload the CSB read/write pointers */
   3515 	reset_csb_pointers(engine);
   3516 
   3517 	/*
    3518 	 * Save the currently executing context; even if we completed
   3519 	 * its request, it was still running at the time of the
   3520 	 * reset and will have been clobbered.
   3521 	 */
   3522 	rq = execlists_active(execlists);
   3523 	if (!rq)
   3524 		goto unwind;
   3525 
   3526 	/* We still have requests in-flight; the engine should be active */
   3527 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
   3528 
   3529 	ce = rq->context;
   3530 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
   3531 
   3532 	if (i915_request_completed(rq)) {
   3533 		/* Idle context; tidy up the ring so we can restart afresh */
   3534 		head = intel_ring_wrap(ce->ring, rq->tail);
   3535 		goto out_replay;
   3536 	}
   3537 
   3538 	/* Context has requests still in-flight; it should not be idle! */
   3539 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
   3540 	rq = active_request(ce->timeline, rq);
   3541 	head = intel_ring_wrap(ce->ring, rq->head);
   3542 	GEM_BUG_ON(head == ce->ring->tail);
   3543 
   3544 	/*
   3545 	 * If this request hasn't started yet, e.g. it is waiting on a
   3546 	 * semaphore, we need to avoid skipping the request or else we
   3547 	 * break the signaling chain. However, if the context is corrupt
   3548 	 * the request will not restart and we will be stuck with a wedged
   3549 	 * device. It is quite often the case that if we issue a reset
    3550 	 * while the GPU is loading the context image, the context
   3551 	 * image becomes corrupt.
   3552 	 *
   3553 	 * Otherwise, if we have not started yet, the request should replay
   3554 	 * perfectly and we do not need to flag the result as being erroneous.
   3555 	 */
   3556 	if (!i915_request_started(rq))
   3557 		goto out_replay;
   3558 
   3559 	/*
   3560 	 * If the request was innocent, we leave the request in the ELSP
   3561 	 * and will try to replay it on restarting. The context image may
   3562 	 * have been corrupted by the reset, in which case we may have
   3563 	 * to service a new GPU hang, but more likely we can continue on
   3564 	 * without impact.
   3565 	 *
   3566 	 * If the request was guilty, we presume the context is corrupt
   3567 	 * and have to at least restore the RING register in the context
   3568 	 * image back to the expected values to skip over the guilty request.
   3569 	 */
   3570 	__i915_request_reset(rq, stalled);
   3571 	if (!stalled)
   3572 		goto out_replay;
   3573 
   3574 	/*
   3575 	 * We want a simple context + ring to execute the breadcrumb update.
   3576 	 * We cannot rely on the context being intact across the GPU hang,
   3577 	 * so clear it and rebuild just what we need for the breadcrumb.
   3578 	 * All pending requests for this context will be zapped, and any
   3579 	 * future request will be after userspace has had the opportunity
   3580 	 * to recreate its own state.
   3581 	 */
   3582 	GEM_BUG_ON(!intel_context_is_pinned(ce));
   3583 	restore_default_state(ce, engine);
   3584 
   3585 out_replay:
   3586 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
   3587 		     head, ce->ring->tail);
   3588 	__execlists_reset_reg_state(ce, engine);
   3589 	__execlists_update_reg_state(ce, engine, head);
   3590 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
   3591 
   3592 unwind:
   3593 	/* Push back any incomplete requests for replay after the reset. */
   3594 	cancel_port_requests(execlists);
   3595 	__unwind_incomplete_requests(engine);
   3596 }
   3597 
   3598 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
   3599 {
   3600 	unsigned long flags;
   3601 
   3602 	ENGINE_TRACE(engine, "\n");
   3603 
   3604 	spin_lock_irqsave(&engine->active.lock, flags);
   3605 
   3606 	__execlists_reset(engine, stalled);
   3607 
   3608 	spin_unlock_irqrestore(&engine->active.lock, flags);
   3609 }
   3610 
   3611 static void nop_submission_tasklet(unsigned long data)
   3612 {
   3613 	/* The driver is wedged; don't process any more events. */
   3614 }
   3615 
   3616 static void execlists_reset_cancel(struct intel_engine_cs *engine)
   3617 {
   3618 	struct intel_engine_execlists * const execlists = &engine->execlists;
   3619 	struct i915_request *rq, *rn;
   3620 	struct rb_node *rb;
   3621 	unsigned long flags;
   3622 
   3623 	ENGINE_TRACE(engine, "\n");
   3624 
   3625 	/*
    3626 	 * Before we call engine->reset.cancel(), we should have exclusive
   3627 	 * access to the submission state. This is arranged for us by the
   3628 	 * caller disabling the interrupt generation, the tasklet and other
   3629 	 * threads that may then access the same state, giving us a free hand
   3630 	 * to reset state. However, we still need to let lockdep be aware that
   3631 	 * we know this state may be accessed in hardirq context, so we
   3632 	 * disable the irq around this manipulation and we want to keep
   3633 	 * the spinlock focused on its duties and not accidentally conflate
    3634 	 * coverage with the submission's irq state. (Similarly, although we
   3635 	 * shouldn't need to disable irq around the manipulation of the
   3636 	 * submission's irq state, we also wish to remind ourselves that
   3637 	 * it is irq state.)
   3638 	 */
   3639 	spin_lock_irqsave(&engine->active.lock, flags);
   3640 
   3641 	__execlists_reset(engine, true);
   3642 
   3643 	/* Mark all executing requests as skipped. */
   3644 	list_for_each_entry(rq, &engine->active.requests, sched.link)
   3645 		mark_eio(rq);
   3646 
   3647 	/* Flush the queued requests to the timeline list (for retiring). */
   3648 	while ((rb = rb_first_cached(&execlists->queue))) {
   3649 		struct i915_priolist *p = to_priolist(rb);
   3650 		int i;
   3651 
   3652 		priolist_for_each_request_consume(rq, rn, p, i) {
   3653 			mark_eio(rq);
   3654 			__i915_request_submit(rq);
   3655 		}
   3656 
   3657 		rb_erase_cached(&p->node, &execlists->queue);
   3658 		i915_priolist_free(p);
   3659 	}
   3660 
   3661 	/* On-hold requests will be flushed to timeline upon their release */
   3662 	list_for_each_entry(rq, &engine->active.hold, sched.link)
   3663 		mark_eio(rq);
   3664 
   3665 	/* Cancel all attached virtual engines */
   3666 	while ((rb = rb_first_cached(&execlists->virtual))) {
   3667 		struct virtual_engine *ve =
   3668 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
   3669 
   3670 		rb_erase_cached(rb, &execlists->virtual);
   3671 		RB_CLEAR_NODE(rb);
   3672 
   3673 		spin_lock(&ve->base.active.lock);
   3674 		rq = fetch_and_zero(&ve->request);
   3675 		if (rq) {
   3676 			mark_eio(rq);
   3677 
   3678 			rq->engine = engine;
   3679 			__i915_request_submit(rq);
   3680 			i915_request_put(rq);
   3681 
   3682 			ve->base.execlists.queue_priority_hint = INT_MIN;
   3683 		}
   3684 		spin_unlock(&ve->base.active.lock);
   3685 	}
   3686 
   3687 	/* Remaining _unready_ requests will be nop'ed when submitted */
   3688 
   3689 	execlists->queue_priority_hint = INT_MIN;
   3690 	execlists->queue = RB_ROOT_CACHED;
   3691 
   3692 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
   3693 	execlists->tasklet.func = nop_submission_tasklet;
   3694 
   3695 	spin_unlock_irqrestore(&engine->active.lock, flags);
   3696 }
   3697 
   3698 static void execlists_reset_finish(struct intel_engine_cs *engine)
   3699 {
   3700 	struct intel_engine_execlists * const execlists = &engine->execlists;
   3701 
   3702 	/*
   3703 	 * After a GPU reset, we may have requests to replay. Do so now while
   3704 	 * we still have the forcewake to be sure that the GPU is not allowed
   3705 	 * to sleep before we restart and reload a context.
   3706 	 */
   3707 	GEM_BUG_ON(!reset_in_progress(execlists));
   3708 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
   3709 		execlists->tasklet.func(execlists->tasklet.data);
   3710 
   3711 	if (__tasklet_enable(&execlists->tasklet))
   3712 		/* And kick in case we missed a new request submission. */
   3713 		tasklet_hi_schedule(&execlists->tasklet);
   3714 	ENGINE_TRACE(engine, "depth->%d\n",
   3715 		     atomic_read(&execlists->tasklet.count));
   3716 }
   3717 
   3718 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
   3719 				    u64 offset, u32 len,
   3720 				    const unsigned int flags)
   3721 {
   3722 	u32 *cs;
   3723 
   3724 	cs = intel_ring_begin(rq, 4);
   3725 	if (IS_ERR(cs))
   3726 		return PTR_ERR(cs);
   3727 
   3728 	/*
   3729 	 * WaDisableCtxRestoreArbitration:bdw,chv
   3730 	 *
    3731 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
    3732 	 * particular on all the gens that do not need the w/a at all!); if
    3733 	 * we took care to make sure that on every switch into this context
    3734 	 * (both ordinary and for preemption) arbitration was enabled, we
    3735 	 * would be fine.  However, for gen8 there is another w/a that
   3736 	 * requires us to not preempt inside GPGPU execution, so we keep
   3737 	 * arbitration disabled for gen8 batches. Arbitration will be
   3738 	 * re-enabled before we close the request
   3739 	 * (engine->emit_fini_breadcrumb).
   3740 	 */
   3741 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   3742 
   3743 	/* FIXME(BDW+): Address space and security selectors. */
   3744 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
   3745 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
   3746 	*cs++ = lower_32_bits(offset);
   3747 	*cs++ = upper_32_bits(offset);
   3748 
   3749 	intel_ring_advance(rq, cs);
   3750 
   3751 	return 0;
   3752 }
   3753 
   3754 static int gen8_emit_bb_start(struct i915_request *rq,
   3755 			      u64 offset, u32 len,
   3756 			      const unsigned int flags)
   3757 {
   3758 	u32 *cs;
   3759 
   3760 	cs = intel_ring_begin(rq, 6);
   3761 	if (IS_ERR(cs))
   3762 		return PTR_ERR(cs);
   3763 
   3764 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   3765 
   3766 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
   3767 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
   3768 	*cs++ = lower_32_bits(offset);
   3769 	*cs++ = upper_32_bits(offset);
   3770 
   3771 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   3772 	*cs++ = MI_NOOP;
   3773 
   3774 	intel_ring_advance(rq, cs);
   3775 
   3776 	return 0;
   3777 }
   3778 
   3779 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
   3780 {
   3781 	ENGINE_WRITE(engine, RING_IMR,
   3782 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
   3783 	ENGINE_POSTING_READ(engine, RING_IMR);
   3784 }
   3785 
   3786 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
   3787 {
   3788 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
   3789 }
   3790 
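         /*
          * MI_FLUSH_DW based flush for the non-render engines, posting a
          * dummy store to the per-context PPHWSP scratch slot as the
          * command barrier; EMIT_INVALIDATE adds a TLB (and, for video
          * decode, BSD) invalidation.
          */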
   3791 static int gen8_emit_flush(struct i915_request *request, u32 mode)
   3792 {
   3793 	u32 cmd, *cs;
   3794 
   3795 	cs = intel_ring_begin(request, 4);
   3796 	if (IS_ERR(cs))
   3797 		return PTR_ERR(cs);
   3798 
   3799 	cmd = MI_FLUSH_DW + 1;
   3800 
   3801 	/* We always require a command barrier so that subsequent
   3802 	 * commands, such as breadcrumb interrupts, are strictly ordered
   3803 	 * wrt the contents of the write cache being flushed to memory
   3804 	 * (and thus being coherent from the CPU).
   3805 	 */
   3806 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
   3807 
   3808 	if (mode & EMIT_INVALIDATE) {
   3809 		cmd |= MI_INVALIDATE_TLB;
   3810 		if (request->engine->class == VIDEO_DECODE_CLASS)
   3811 			cmd |= MI_INVALIDATE_BSD;
   3812 	}
   3813 
   3814 	*cs++ = cmd;
   3815 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
   3816 	*cs++ = 0; /* upper addr */
   3817 	*cs++ = 0; /* value */
   3818 	intel_ring_advance(request, cs);
   3819 
   3820 	return 0;
   3821 }
   3822 
   3823 static int gen8_emit_flush_render(struct i915_request *request,
   3824 				  u32 mode)
   3825 {
   3826 	bool vf_flush_wa = false, dc_flush_wa = false;
   3827 	u32 *cs, flags = 0;
   3828 	int len;
   3829 
   3830 	flags |= PIPE_CONTROL_CS_STALL;
   3831 
   3832 	if (mode & EMIT_FLUSH) {
   3833 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
   3834 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
   3835 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
   3836 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
   3837 	}
   3838 
   3839 	if (mode & EMIT_INVALIDATE) {
   3840 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
   3841 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
   3842 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
   3843 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
   3844 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
   3845 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
   3846 		flags |= PIPE_CONTROL_QW_WRITE;
   3847 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   3848 
   3849 		/*
   3850 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
   3851 		 * pipe control.
   3852 		 */
   3853 		if (IS_GEN(request->i915, 9))
   3854 			vf_flush_wa = true;
   3855 
   3856 		/* WaForGAMHang:kbl */
   3857 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
   3858 			dc_flush_wa = true;
   3859 	}
   3860 
   3861 	len = 6;
   3862 
   3863 	if (vf_flush_wa)
   3864 		len += 6;
   3865 
   3866 	if (dc_flush_wa)
   3867 		len += 12;
   3868 
   3869 	cs = intel_ring_begin(request, len);
   3870 	if (IS_ERR(cs))
   3871 		return PTR_ERR(cs);
   3872 
   3873 	if (vf_flush_wa)
   3874 		cs = gen8_emit_pipe_control(cs, 0, 0);
   3875 
   3876 	if (dc_flush_wa)
   3877 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
   3878 					    0);
   3879 
   3880 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
   3881 
   3882 	if (dc_flush_wa)
   3883 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
   3884 
   3885 	intel_ring_advance(request, cs);
   3886 
   3887 	return 0;
   3888 }
   3889 
   3890 static int gen11_emit_flush_render(struct i915_request *request,
   3891 				   u32 mode)
   3892 {
   3893 	if (mode & EMIT_FLUSH) {
   3894 		u32 *cs;
   3895 		u32 flags = 0;
   3896 
   3897 		flags |= PIPE_CONTROL_CS_STALL;
   3898 
   3899 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
   3900 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
   3901 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
   3902 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
   3903 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
   3904 		flags |= PIPE_CONTROL_QW_WRITE;
   3905 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   3906 
   3907 		cs = intel_ring_begin(request, 6);
   3908 		if (IS_ERR(cs))
   3909 			return PTR_ERR(cs);
   3910 
   3911 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
   3912 		intel_ring_advance(request, cs);
   3913 	}
   3914 
   3915 	if (mode & EMIT_INVALIDATE) {
   3916 		u32 *cs;
   3917 		u32 flags = 0;
   3918 
   3919 		flags |= PIPE_CONTROL_CS_STALL;
   3920 
   3921 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
   3922 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
   3923 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
   3924 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
   3925 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
   3926 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
   3927 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
   3928 		flags |= PIPE_CONTROL_QW_WRITE;
   3929 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   3930 
   3931 		cs = intel_ring_begin(request, 6);
   3932 		if (IS_ERR(cs))
   3933 			return PTR_ERR(cs);
   3934 
   3935 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
   3936 		intel_ring_advance(request, cs);
   3937 	}
   3938 
   3939 	return 0;
   3940 }
   3941 
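         /*
          * Build the gen12 MI_ARB_CHECK dword used by
          * gen12_emit_flush_render() to turn the command streamer
          * pre-parser off (true) or back on (false) around a TLB
          * invalidation.
          */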
   3942 static u32 preparser_disable(bool state)
   3943 {
   3944 	return MI_ARB_CHECK | 1 << 8 | state;
   3945 }
   3946 
   3947 static int gen12_emit_flush_render(struct i915_request *request,
   3948 				   u32 mode)
   3949 {
   3950 	if (mode & EMIT_FLUSH) {
   3951 		u32 flags = 0;
   3952 		u32 *cs;
   3953 
   3954 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
   3955 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
   3956 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
   3957 		/* Wa_1409600907:tgl */
   3958 		flags |= PIPE_CONTROL_DEPTH_STALL;
   3959 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
   3960 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
   3961 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
   3962 
   3963 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   3964 		flags |= PIPE_CONTROL_QW_WRITE;
   3965 
   3966 		flags |= PIPE_CONTROL_CS_STALL;
   3967 
   3968 		cs = intel_ring_begin(request, 6);
   3969 		if (IS_ERR(cs))
   3970 			return PTR_ERR(cs);
   3971 
   3972 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
   3973 		intel_ring_advance(request, cs);
   3974 	}
   3975 
   3976 	if (mode & EMIT_INVALIDATE) {
   3977 		u32 flags = 0;
   3978 		u32 *cs;
   3979 
   3980 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
   3981 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
   3982 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
   3983 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
   3984 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
   3985 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
   3986 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
   3987 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
   3988 
   3989 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   3990 		flags |= PIPE_CONTROL_QW_WRITE;
   3991 
   3992 		flags |= PIPE_CONTROL_CS_STALL;
   3993 
   3994 		cs = intel_ring_begin(request, 8);
   3995 		if (IS_ERR(cs))
   3996 			return PTR_ERR(cs);
   3997 
   3998 		/*
   3999 		 * Prevent the pre-parser from skipping past the TLB
   4000 		 * invalidate and loading a stale page for the batch
   4001 		 * buffer / request payload.
   4002 		 */
   4003 		*cs++ = preparser_disable(true);
   4004 
   4005 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
   4006 
   4007 		*cs++ = preparser_disable(false);
   4008 		intel_ring_advance(request, cs);
   4009 
   4010 		/*
   4011 		 * Wa_1604544889:tgl
   4012 		 */
   4013 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
   4014 			flags = 0;
   4015 			flags |= PIPE_CONTROL_CS_STALL;
   4016 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
   4017 
   4018 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
   4019 			flags |= PIPE_CONTROL_QW_WRITE;
   4020 
   4021 			cs = intel_ring_begin(request, 6);
   4022 			if (IS_ERR(cs))
   4023 				return PTR_ERR(cs);
   4024 
   4025 			cs = gen8_emit_pipe_control(cs, flags,
   4026 						    LRC_PPHWSP_SCRATCH_ADDR);
   4027 			intel_ring_advance(request, cs);
   4028 		}
   4029 	}
   4030 
   4031 	return 0;
   4032 }
   4033 
   4034 /*
   4035  * Reserve space for 2 NOOPs at the end of each request to be
   4036  * used as a workaround for not being allowed to do lite
   4037  * restore with HEAD==TAIL (WaIdleLiteRestore).
   4038  */
   4039 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
   4040 {
   4041 	/* Ensure there's always at least one preemption point per-request. */
   4042 	*cs++ = MI_ARB_CHECK;
   4043 	*cs++ = MI_NOOP;
   4044 	request->wa_tail = intel_ring_offset(request, cs);
   4045 
   4046 	return cs;
   4047 }
   4048 
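         /*
          * Emit an MI_SEMAPHORE_WAIT that polls the engine's preemption
          * semaphore in the global GTT until it reads as zero, so that the
          * submission code can hold the ring at the end of a request (see
          * ring_set_paused()).
          */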
   4049 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
   4050 {
   4051 	*cs++ = MI_SEMAPHORE_WAIT |
   4052 		MI_SEMAPHORE_GLOBAL_GTT |
   4053 		MI_SEMAPHORE_POLL |
   4054 		MI_SEMAPHORE_SAD_EQ_SDD;
   4055 	*cs++ = 0;
   4056 	*cs++ = intel_hws_preempt_address(request->engine);
   4057 	*cs++ = 0;
   4058 
   4059 	return cs;
   4060 }
   4061 
    4062 static __always_inline u32 *
   4063 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
   4064 				 u32 *cs)
   4065 {
   4066 	*cs++ = MI_USER_INTERRUPT;
   4067 
   4068 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   4069 	if (intel_engine_has_semaphores(request->engine))
   4070 		cs = emit_preempt_busywait(request, cs);
   4071 
   4072 	request->tail = intel_ring_offset(request, cs);
   4073 	assert_ring_tail_valid(request->ring, request->tail);
   4074 
   4075 	return gen8_emit_wa_tail(request, cs);
   4076 }
   4077 
   4078 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
   4079 {
   4080 	cs = gen8_emit_ggtt_write(cs,
   4081 				  request->fence.seqno,
   4082 				  i915_request_active_timeline(request)->hwsp_offset,
   4083 				  0);
   4084 
   4085 	return gen8_emit_fini_breadcrumb_footer(request, cs);
   4086 }
   4087 
   4088 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
   4089 {
   4090 	cs = gen8_emit_pipe_control(cs,
   4091 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
   4092 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
   4093 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
   4094 				    0);
   4095 
   4096 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
   4097 	cs = gen8_emit_ggtt_write_rcs(cs,
   4098 				      request->fence.seqno,
   4099 				      i915_request_active_timeline(request)->hwsp_offset,
   4100 				      PIPE_CONTROL_FLUSH_ENABLE |
   4101 				      PIPE_CONTROL_CS_STALL);
   4102 
   4103 	return gen8_emit_fini_breadcrumb_footer(request, cs);
   4104 }
   4105 
   4106 static u32 *
   4107 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
   4108 {
   4109 	cs = gen8_emit_ggtt_write_rcs(cs,
   4110 				      request->fence.seqno,
   4111 				      i915_request_active_timeline(request)->hwsp_offset,
   4112 				      PIPE_CONTROL_CS_STALL |
   4113 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
   4114 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
   4115 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
   4116 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
   4117 				      PIPE_CONTROL_FLUSH_ENABLE);
   4118 
   4119 	return gen8_emit_fini_breadcrumb_footer(request, cs);
   4120 }
   4121 
   4122 /*
   4123  * Note that the CS instruction pre-parser will not stall on the breadcrumb
   4124  * flush and will continue pre-fetching the instructions after it before the
   4125  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
   4126  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
   4127  * of the next request before the memory has been flushed, we're guaranteed that
   4128  * we won't access the batch itself too early.
   4129  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
   4130  * so, if the current request is modifying an instruction in the next request on
   4131  * the same intel_context, we might pre-fetch and then execute the pre-update
   4132  * instruction. To avoid this, the users of self-modifying code should either
   4133  * disable the parser around the code emitting the memory writes, via a new flag
   4134  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
   4135  * the in-kernel use-cases we've opted to use a separate context, see
   4136  * reloc_gpu() as an example.
   4137  * All the above applies only to the instructions themselves. Non-inline data
   4138  * used by the instructions is not pre-fetched.
   4139  */
   4140 
   4141 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
   4142 {
   4143 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
   4144 		MI_SEMAPHORE_GLOBAL_GTT |
   4145 		MI_SEMAPHORE_POLL |
   4146 		MI_SEMAPHORE_SAD_EQ_SDD;
   4147 	*cs++ = 0;
   4148 	*cs++ = intel_hws_preempt_address(request->engine);
   4149 	*cs++ = 0;
   4150 	*cs++ = 0;
   4151 	*cs++ = MI_NOOP;
   4152 
   4153 	return cs;
   4154 }
   4155 
    4156 static __always_inline u32 *
   4157 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
   4158 {
   4159 	*cs++ = MI_USER_INTERRUPT;
   4160 
   4161 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   4162 	if (intel_engine_has_semaphores(request->engine))
   4163 		cs = gen12_emit_preempt_busywait(request, cs);
   4164 
   4165 	request->tail = intel_ring_offset(request, cs);
   4166 	assert_ring_tail_valid(request->ring, request->tail);
   4167 
   4168 	return gen8_emit_wa_tail(request, cs);
   4169 }
   4170 
   4171 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
   4172 {
   4173 	cs = gen8_emit_ggtt_write(cs,
   4174 				  request->fence.seqno,
   4175 				  i915_request_active_timeline(request)->hwsp_offset,
   4176 				  0);
   4177 
   4178 	return gen12_emit_fini_breadcrumb_footer(request, cs);
   4179 }
   4180 
   4181 static u32 *
   4182 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
   4183 {
   4184 	cs = gen8_emit_ggtt_write_rcs(cs,
   4185 				      request->fence.seqno,
   4186 				      i915_request_active_timeline(request)->hwsp_offset,
   4187 				      PIPE_CONTROL_CS_STALL |
   4188 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
   4189 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
   4190 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
   4191 				      /* Wa_1409600907:tgl */
   4192 				      PIPE_CONTROL_DEPTH_STALL |
   4193 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
   4194 				      PIPE_CONTROL_FLUSH_ENABLE |
   4195 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
   4196 
   4197 	return gen12_emit_fini_breadcrumb_footer(request, cs);
   4198 }
   4199 
   4200 static void execlists_park(struct intel_engine_cs *engine)
   4201 {
   4202 	cancel_timer(&engine->execlists.timer);
   4203 	cancel_timer(&engine->execlists.preempt);
   4204 }
   4205 
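         /*
          * Install the execlists submission backend: the request submission
          * and scheduling hooks, the reset vfuncs, engine park, and the
          * feature flags (semaphores/preemption are left off when running
          * under a vGPU).
          */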
   4206 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
   4207 {
   4208 	engine->submit_request = execlists_submit_request;
   4209 	engine->schedule = i915_schedule;
   4210 	engine->execlists.tasklet.func = execlists_submission_tasklet;
   4211 
   4212 	engine->reset.prepare = execlists_reset_prepare;
   4213 	engine->reset.rewind = execlists_reset_rewind;
   4214 	engine->reset.cancel = execlists_reset_cancel;
   4215 	engine->reset.finish = execlists_reset_finish;
   4216 
   4217 	engine->park = execlists_park;
   4218 	engine->unpark = NULL;
   4219 
   4220 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
   4221 	if (!intel_vgpu_active(engine->i915)) {
   4222 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
   4223 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
   4224 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
   4225 	}
   4226 
   4227 	if (INTEL_GEN(engine->i915) >= 12)
   4228 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
   4229 
   4230 	if (intel_engine_has_preemption(engine))
   4231 		engine->emit_bb_start = gen8_emit_bb_start;
   4232 	else
   4233 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
   4234 }
   4235 
   4236 static void execlists_shutdown(struct intel_engine_cs *engine)
   4237 {
   4238 	/* Synchronise with residual timers and any softirq they raise */
   4239 	del_timer_sync(&engine->execlists.timer);
   4240 	del_timer_sync(&engine->execlists.preempt);
   4241 	tasklet_kill(&engine->execlists.tasklet);
   4242 }
   4243 
   4244 static void execlists_release(struct intel_engine_cs *engine)
   4245 {
   4246 	execlists_shutdown(engine);
   4247 
   4248 	intel_engine_cleanup_common(engine);
   4249 	lrc_destroy_wa_ctx(engine);
   4250 }
   4251 
   4252 static void
   4253 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
   4254 {
    4255 	/* Default vfuncs which can be overridden by each engine. */
   4256 
   4257 	engine->resume = execlists_resume;
   4258 
   4259 	engine->cops = &execlists_context_ops;
   4260 	engine->request_alloc = execlists_request_alloc;
   4261 
   4262 	engine->emit_flush = gen8_emit_flush;
   4263 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
   4264 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
   4265 	if (INTEL_GEN(engine->i915) >= 12)
   4266 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
   4267 
   4268 	engine->set_default_submission = intel_execlists_set_default_submission;
   4269 
   4270 	if (INTEL_GEN(engine->i915) < 11) {
   4271 		engine->irq_enable = gen8_logical_ring_enable_irq;
   4272 		engine->irq_disable = gen8_logical_ring_disable_irq;
   4273 	} else {
   4274 		/*
   4275 		 * TODO: On Gen11 interrupt masks need to be clear
   4276 		 * to allow C6 entry. Keep interrupts enabled at
    4277 		 * to allow C6 entry. Keep interrupts enabled and take
    4278 		 * the hit of generating extra interrupts
   4279 		 */
   4280 	}
   4281 }
   4282 
   4283 static inline void
   4284 logical_ring_default_irqs(struct intel_engine_cs *engine)
   4285 {
   4286 	unsigned int shift = 0;
   4287 
   4288 	if (INTEL_GEN(engine->i915) < 11) {
   4289 		const u8 irq_shifts[] = {
   4290 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
   4291 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
   4292 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
   4293 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
   4294 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
   4295 		};
   4296 
   4297 		shift = irq_shifts[engine->id];
   4298 	}
   4299 
   4300 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
   4301 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
   4302 }
   4303 
   4304 static void rcs_submission_override(struct intel_engine_cs *engine)
   4305 {
   4306 	switch (INTEL_GEN(engine->i915)) {
   4307 	case 12:
   4308 		engine->emit_flush = gen12_emit_flush_render;
   4309 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
   4310 		break;
   4311 	case 11:
   4312 		engine->emit_flush = gen11_emit_flush_render;
   4313 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
   4314 		break;
   4315 	default:
   4316 		engine->emit_flush = gen8_emit_flush_render;
   4317 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
   4318 		break;
   4319 	}
   4320 }
   4321 
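         /*
          * One-time execlists setup for an engine: the submission tasklet
          * and timeslice/preempt timers, default vfuncs and IRQ masks, the
          * workaround batch buffers, and the ELSP/ELSQ and CSB register
          * plumbing.
          */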
   4322 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
   4323 {
   4324 	struct intel_engine_execlists * const execlists = &engine->execlists;
   4325 	struct drm_i915_private *i915 = engine->i915;
   4326 	struct intel_uncore *uncore = engine->uncore;
   4327 	u32 base = engine->mmio_base;
   4328 
   4329 	tasklet_init(&engine->execlists.tasklet,
   4330 		     execlists_submission_tasklet, (unsigned long)engine);
   4331 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
   4332 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
   4333 
   4334 	logical_ring_default_vfuncs(engine);
   4335 	logical_ring_default_irqs(engine);
   4336 
   4337 	if (engine->class == RENDER_CLASS)
   4338 		rcs_submission_override(engine);
   4339 
   4340 	if (intel_init_workaround_bb(engine))
   4341 		/*
    4342 		 * We continue even if we fail to initialize the WA batch
    4343 		 * because we only expect rare glitches, nothing critical
    4344 		 * that would prevent us from using the GPU.
   4345 		 */
   4346 		DRM_ERROR("WA batch buffer initialization failed\n");
   4347 
   4348 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
   4349 		execlists->submit_reg = uncore->regs +
   4350 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
   4351 		execlists->ctrl_reg = uncore->regs +
   4352 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
   4353 	} else {
   4354 		execlists->submit_reg = uncore->regs +
   4355 			i915_mmio_reg_offset(RING_ELSP(base));
   4356 	}
   4357 
   4358 	execlists->csb_status =
   4359 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
   4360 
   4361 	execlists->csb_write =
   4362 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
   4363 
   4364 	if (INTEL_GEN(i915) < 11)
   4365 		execlists->csb_size = GEN8_CSB_ENTRIES;
   4366 	else
   4367 		execlists->csb_size = GEN11_CSB_ENTRIES;
   4368 
   4369 	reset_csb_pointers(engine);
   4370 
   4371 	/* Finally, take ownership and responsibility for cleanup! */
   4372 	engine->release = execlists_release;
   4373 
   4374 	return 0;
   4375 }
   4376 
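         /*
          * Per-gen default offset of the RCS indirect context within the
          * context image; shifted into place by init_wa_bb_reg_state().
          */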
   4377 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
   4378 {
   4379 	u32 indirect_ctx_offset;
   4380 
   4381 	switch (INTEL_GEN(engine->i915)) {
   4382 	default:
   4383 		MISSING_CASE(INTEL_GEN(engine->i915));
   4384 		/* fall through */
   4385 	case 12:
   4386 		indirect_ctx_offset =
   4387 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
   4388 		break;
   4389 	case 11:
   4390 		indirect_ctx_offset =
   4391 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
   4392 		break;
   4393 	case 10:
   4394 		indirect_ctx_offset =
   4395 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
   4396 		break;
   4397 	case 9:
   4398 		indirect_ctx_offset =
   4399 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
   4400 		break;
   4401 	case 8:
   4402 		indirect_ctx_offset =
   4403 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
   4404 		break;
   4405 	}
   4406 
   4407 	return indirect_ctx_offset;
   4408 }
   4409 
   4411 static void init_common_reg_state(u32 * const regs,
   4412 				  const struct intel_engine_cs *engine,
   4413 				  const struct intel_ring *ring,
   4414 				  bool inhibit)
   4415 {
   4416 	u32 ctl;
   4417 
   4418 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
   4419 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
   4420 	if (inhibit)
   4421 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
   4422 	if (INTEL_GEN(engine->i915) < 11)
   4423 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
   4424 					   CTX_CTRL_RS_CTX_ENABLE);
   4425 	regs[CTX_CONTEXT_CONTROL] = ctl;
   4426 
   4427 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
   4428 }
   4429 
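         /*
          * Point the context image at the workaround batch buffers emitted
          * by intel_init_workaround_bb(): the per-ctx pointer and the
          * indirect-ctx pointer (with its size encoded in cachelines), plus
          * the per-gen indirect context offset.
          */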
   4430 static void init_wa_bb_reg_state(u32 * const regs,
   4431 				 const struct intel_engine_cs *engine,
   4432 				 u32 pos_bb_per_ctx)
   4433 {
   4434 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
   4435 
   4436 	if (wa_ctx->per_ctx.size) {
   4437 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
   4438 
   4439 		regs[pos_bb_per_ctx] =
   4440 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
   4441 	}
   4442 
   4443 	if (wa_ctx->indirect_ctx.size) {
   4444 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
   4445 
   4446 		regs[pos_bb_per_ctx + 2] =
   4447 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
   4448 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
   4449 
   4450 		regs[pos_bb_per_ctx + 4] =
   4451 			intel_lr_indirect_ctx_offset(engine) << 6;
   4452 	}
   4453 }
   4454 
   4455 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
   4456 {
   4457 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
   4458 		/* 64b PPGTT (48bit canonical)
   4459 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
   4460 		 * other PDP Descriptors are ignored.
   4461 		 */
   4462 		ASSIGN_CTX_PML4(ppgtt, regs);
   4463 	} else {
   4464 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
   4465 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
   4466 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
   4467 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
   4468 	}
   4469 }
   4470 
   4471 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
   4472 {
   4473 	if (i915_is_ggtt(vm))
   4474 		return i915_vm_to_ggtt(vm)->alias;
   4475 	else
   4476 		return i915_vm_to_ppgtt(vm);
   4477 }
   4478 
   4479 static void execlists_init_reg_state(u32 *regs,
   4480 				     const struct intel_context *ce,
   4481 				     const struct intel_engine_cs *engine,
   4482 				     const struct intel_ring *ring,
   4483 				     bool inhibit)
   4484 {
   4485 	/*
   4486 	 * A context is actually a big batch buffer with several
   4487 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
   4488 	 * values we are setting here are only for the first context restore:
   4489 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
   4490 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
   4491 	 * we are not initializing here).
   4492 	 *
   4493 	 * Must keep consistent with virtual_update_register_offsets().
   4494 	 */
   4495 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
   4496 
   4497 	init_common_reg_state(regs, engine, ring, inhibit);
   4498 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
   4499 
   4500 	init_wa_bb_reg_state(regs, engine,
   4501 			     INTEL_GEN(engine->i915) >= 12 ?
   4502 			     GEN12_CTX_BB_PER_CTX_PTR :
   4503 			     CTX_BB_PER_CTX_PTR);
   4504 
   4505 	__reset_stop_ring(regs, engine);
   4506 }
   4507 
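         /*
          * Fill a newly allocated context image: copy the engine's default
          * (golden) state when available, otherwise leave the image
          * uninitialized and inhibit the first restore, then write the
          * register state page.
          */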
   4508 static int
   4509 populate_lr_context(struct intel_context *ce,
   4510 		    struct drm_i915_gem_object *ctx_obj,
   4511 		    struct intel_engine_cs *engine,
   4512 		    struct intel_ring *ring)
   4513 {
   4514 	bool inhibit = true;
   4515 	void *vaddr;
   4516 	int ret;
   4517 
   4518 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
   4519 	if (IS_ERR(vaddr)) {
   4520 		ret = PTR_ERR(vaddr);
   4521 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
   4522 		return ret;
   4523 	}
   4524 
   4525 	set_redzone(vaddr, engine);
   4526 
   4527 	if (engine->default_state) {
   4528 		void *defaults;
   4529 
   4530 		defaults = i915_gem_object_pin_map(engine->default_state,
   4531 						   I915_MAP_WB);
   4532 		if (IS_ERR(defaults)) {
   4533 			ret = PTR_ERR(defaults);
   4534 			goto err_unpin_ctx;
   4535 		}
   4536 
   4537 		memcpy(vaddr, defaults, engine->context_size);
   4538 		i915_gem_object_unpin_map(engine->default_state);
   4539 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
   4540 		inhibit = false;
   4541 	}
   4542 
   4543 	/* The second page of the context object contains some fields which must
   4544 	 * be set up prior to the first execution. */
   4545 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
   4546 				 ce, engine, ring, inhibit);
   4547 
   4548 	ret = 0;
   4549 err_unpin_ctx:
   4550 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
   4551 	i915_gem_object_unpin_map(ctx_obj);
   4552 	return ret;
   4553 }
   4554 
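         /*
          * Allocate everything backing a logical ring context: the context
          * image object and its GGTT vma, a timeline (if none was supplied)
          * and the ring buffer, then populate the initial register state.
          */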
   4555 static int __execlists_context_alloc(struct intel_context *ce,
   4556 				     struct intel_engine_cs *engine)
   4557 {
   4558 	struct drm_i915_gem_object *ctx_obj;
   4559 	struct intel_ring *ring;
   4560 	struct i915_vma *vma;
   4561 	u32 context_size;
   4562 	int ret;
   4563 
   4564 	GEM_BUG_ON(ce->state);
   4565 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
   4566 
   4567 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
   4568 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
   4569 
   4570 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
   4571 	if (IS_ERR(ctx_obj))
   4572 		return PTR_ERR(ctx_obj);
   4573 
   4574 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
   4575 	if (IS_ERR(vma)) {
   4576 		ret = PTR_ERR(vma);
   4577 		goto error_deref_obj;
   4578 	}
   4579 
   4580 	if (!ce->timeline) {
   4581 		struct intel_timeline *tl;
   4582 
   4583 		tl = intel_timeline_create(engine->gt, NULL);
   4584 		if (IS_ERR(tl)) {
   4585 			ret = PTR_ERR(tl);
   4586 			goto error_deref_obj;
   4587 		}
   4588 
   4589 		ce->timeline = tl;
   4590 	}
   4591 
   4592 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
   4593 	if (IS_ERR(ring)) {
   4594 		ret = PTR_ERR(ring);
   4595 		goto error_deref_obj;
   4596 	}
   4597 
   4598 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
   4599 	if (ret) {
   4600 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
   4601 		goto error_ring_free;
   4602 	}
   4603 
   4604 	ce->ring = ring;
   4605 	ce->state = vma;
   4606 
   4607 	return 0;
   4608 
   4609 error_ring_free:
   4610 	intel_ring_put(ring);
   4611 error_deref_obj:
   4612 	i915_gem_object_put(ctx_obj);
   4613 	return ret;
   4614 }
   4615 
   4616 static struct list_head *virtual_queue(struct virtual_engine *ve)
   4617 {
   4618 	return &ve->base.execlists.default_priolist.requests[0];
   4619 }
   4620 
   4621 static void virtual_context_destroy(struct kref *kref)
   4622 {
   4623 	struct virtual_engine *ve =
   4624 		container_of(kref, typeof(*ve), context.ref);
   4625 	unsigned int n;
   4626 
   4627 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
   4628 	GEM_BUG_ON(ve->request);
   4629 	GEM_BUG_ON(ve->context.inflight);
   4630 
   4631 	for (n = 0; n < ve->num_siblings; n++) {
   4632 		struct intel_engine_cs *sibling = ve->siblings[n];
   4633 		struct rb_node *node = &ve->nodes[sibling->id].rb;
   4634 		unsigned long flags;
   4635 
   4636 		if (RB_EMPTY_NODE(node))
   4637 			continue;
   4638 
   4639 		spin_lock_irqsave(&sibling->active.lock, flags);
   4640 
   4641 		/* Detachment is lazily performed in the execlists tasklet */
   4642 		if (!RB_EMPTY_NODE(node))
   4643 			rb_erase_cached(node, &sibling->execlists.virtual);
   4644 
   4645 		spin_unlock_irqrestore(&sibling->active.lock, flags);
   4646 	}
   4647 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
   4648 
   4649 	if (ve->context.state)
   4650 		__execlists_context_fini(&ve->context);
   4651 	intel_context_fini(&ve->context);
   4652 
   4653 	kfree(ve->bonds);
   4654 	kfree(ve);
   4655 }
   4656 
   4657 static void virtual_engine_initial_hint(struct virtual_engine *ve)
   4658 {
   4659 	int swp;
   4660 
   4661 	/*
   4662 	 * Pick a random sibling on starting to help spread the load around.
   4663 	 *
   4664 	 * New contexts are typically created with exactly the same order
   4665 	 * of siblings, and often started in batches. Due to the way we iterate
    4666 	 * the array of siblings when submitting requests, sibling[0] is
   4667 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
   4668 	 * randomised across the system, we also help spread the load by the
   4669 	 * first engine we inspect being different each time.
   4670 	 *
   4671 	 * NB This does not force us to execute on this engine, it will just
   4672 	 * typically be the first we inspect for submission.
   4673 	 */
   4674 	swp = prandom_u32_max(ve->num_siblings);
   4675 	if (!swp)
   4676 		return;
   4677 
   4678 	swap(ve->siblings[swp], ve->siblings[0]);
   4679 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
   4680 		virtual_update_register_offsets(ve->context.lrc_reg_state,
   4681 						ve->siblings[0]);
   4682 }
   4683 
   4684 static int virtual_context_alloc(struct intel_context *ce)
   4685 {
   4686 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
   4687 
   4688 	return __execlists_context_alloc(ce, ve->siblings[0]);
   4689 }
   4690 
   4691 static int virtual_context_pin(struct intel_context *ce)
   4692 {
   4693 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
   4694 	int err;
   4695 
   4696 	/* Note: we must use a real engine class for setting up reg state */
   4697 	err = __execlists_context_pin(ce, ve->siblings[0]);
   4698 	if (err)
   4699 		return err;
   4700 
   4701 	virtual_engine_initial_hint(ve);
   4702 	return 0;
   4703 }
   4704 
   4705 static void virtual_context_enter(struct intel_context *ce)
   4706 {
   4707 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
   4708 	unsigned int n;
   4709 
   4710 	for (n = 0; n < ve->num_siblings; n++)
   4711 		intel_engine_pm_get(ve->siblings[n]);
   4712 
   4713 	intel_timeline_enter(ce->timeline);
   4714 }
   4715 
   4716 static void virtual_context_exit(struct intel_context *ce)
   4717 {
   4718 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
   4719 	unsigned int n;
   4720 
   4721 	intel_timeline_exit(ce->timeline);
   4722 
   4723 	for (n = 0; n < ve->num_siblings; n++)
   4724 		intel_engine_pm_put(ve->siblings[n]);
   4725 }
   4726 
   4727 static const struct intel_context_ops virtual_context_ops = {
   4728 	.alloc = virtual_context_alloc,
   4729 
   4730 	.pin = virtual_context_pin,
   4731 	.unpin = execlists_context_unpin,
   4732 
   4733 	.enter = virtual_context_enter,
   4734 	.exit = virtual_context_exit,
   4735 
   4736 	.destroy = virtual_context_destroy,
   4737 };
   4738 
   4739 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
   4740 {
   4741 	struct i915_request *rq;
   4742 	intel_engine_mask_t mask;
   4743 
   4744 	rq = READ_ONCE(ve->request);
   4745 	if (!rq)
   4746 		return 0;
   4747 
   4748 	/* The rq is ready for submission; rq->execution_mask is now stable. */
   4749 	mask = rq->execution_mask;
   4750 	if (unlikely(!mask)) {
    4751 		/* Invalid selection: flag the error and fall back to the first sibling */
   4752 		i915_request_skip(rq, -ENODEV);
   4753 		mask = ve->siblings[0]->mask;
   4754 	}
   4755 
   4756 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
   4757 		     rq->fence.context, rq->fence.seqno,
   4758 		     mask, ve->base.execlists.queue_priority_hint);
   4759 
   4760 	return mask;
   4761 }
   4762 
   4763 static void virtual_submission_tasklet(unsigned long data)
   4764 {
   4765 	struct virtual_engine * const ve = (struct virtual_engine *)data;
   4766 	const int prio = ve->base.execlists.queue_priority_hint;
   4767 	intel_engine_mask_t mask;
   4768 	unsigned int n;
   4769 
   4770 	rcu_read_lock();
   4771 	mask = virtual_submission_mask(ve);
   4772 	rcu_read_unlock();
   4773 	if (unlikely(!mask))
   4774 		return;
   4775 
   4776 	local_irq_disable();
   4777 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
   4778 		struct intel_engine_cs *sibling = ve->siblings[n];
   4779 		struct ve_node * const node = &ve->nodes[sibling->id];
   4780 		struct rb_node **parent, *rb;
   4781 		bool first;
   4782 
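         		/*
         		 * This sibling may not run the request; drop any stale
         		 * node left in its tree from an earlier submission.
         		 */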
   4783 		if (unlikely(!(mask & sibling->mask))) {
   4784 			if (!RB_EMPTY_NODE(&node->rb)) {
   4785 				spin_lock(&sibling->active.lock);
   4786 				rb_erase_cached(&node->rb,
   4787 						&sibling->execlists.virtual);
   4788 				RB_CLEAR_NODE(&node->rb);
   4789 				spin_unlock(&sibling->active.lock);
   4790 			}
   4791 			continue;
   4792 		}
   4793 
   4794 		spin_lock(&sibling->active.lock);
   4795 
   4796 		if (!RB_EMPTY_NODE(&node->rb)) {
   4797 			/*
   4798 			 * Cheat and avoid rebalancing the tree if we can
   4799 			 * reuse this node in situ.
   4800 			 */
   4801 			first = rb_first_cached(&sibling->execlists.virtual) ==
   4802 				&node->rb;
   4803 			if (prio == node->prio || (prio > node->prio && first))
   4804 				goto submit_engine;
   4805 
   4806 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
   4807 		}
   4808 
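         		/*
         		 * Insert our node into the sibling's tree of waiting
         		 * virtual engines, ordered so that the highest priority
         		 * is leftmost.
         		 */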
   4809 		rb = NULL;
   4810 		first = true;
   4811 		parent = &sibling->execlists.virtual.rb_root.rb_node;
   4812 		while (*parent) {
   4813 			struct ve_node *other;
   4814 
   4815 			rb = *parent;
   4816 			other = rb_entry(rb, typeof(*other), rb);
   4817 			if (prio > other->prio) {
   4818 				parent = &rb->rb_left;
   4819 			} else {
   4820 				parent = &rb->rb_right;
   4821 				first = false;
   4822 			}
   4823 		}
   4824 
   4825 		rb_link_node(&node->rb, rb, parent);
   4826 		rb_insert_color_cached(&node->rb,
   4827 				       &sibling->execlists.virtual,
   4828 				       first);
   4829 
   4830 submit_engine:
   4831 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
   4832 		node->prio = prio;
   4833 		if (first && prio > sibling->execlists.queue_priority_hint) {
   4834 			sibling->execlists.queue_priority_hint = prio;
   4835 			tasklet_hi_schedule(&sibling->execlists.tasklet);
   4836 		}
   4837 
   4838 		spin_unlock(&sibling->active.lock);
   4839 	}
   4840 	local_irq_enable();
   4841 }
   4842 
   4843 static void virtual_submit_request(struct i915_request *rq)
   4844 {
   4845 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
   4846 	struct i915_request *old;
   4847 	unsigned long flags;
   4848 
   4849 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
   4850 		     rq->fence.context,
   4851 		     rq->fence.seqno);
   4852 
   4853 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
   4854 
   4855 	spin_lock_irqsave(&ve->base.active.lock, flags);
   4856 
   4857 	old = ve->request;
   4858 	if (old) { /* background completion event from preempt-to-busy */
   4859 		GEM_BUG_ON(!i915_request_completed(old));
   4860 		__i915_request_submit(old);
   4861 		i915_request_put(old);
   4862 	}
   4863 
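         	/*
         	 * A request that is already complete only needs to be submitted
         	 * so it can be retired; otherwise park it on the virtual queue
         	 * and kick the tasklet to choose a physical sibling.
         	 */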
   4864 	if (i915_request_completed(rq)) {
   4865 		__i915_request_submit(rq);
   4866 
   4867 		ve->base.execlists.queue_priority_hint = INT_MIN;
   4868 		ve->request = NULL;
   4869 	} else {
   4870 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
   4871 		ve->request = i915_request_get(rq);
   4872 
   4873 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
   4874 		list_move_tail(&rq->sched.link, virtual_queue(ve));
   4875 
   4876 		tasklet_schedule(&ve->base.execlists.tasklet);
   4877 	}
   4878 
   4879 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
   4880 }
   4881 
   4882 static struct ve_bond *
   4883 virtual_find_bond(struct virtual_engine *ve,
   4884 		  const struct intel_engine_cs *master)
   4885 {
   4886 	int i;
   4887 
   4888 	for (i = 0; i < ve->num_bonds; i++) {
   4889 		if (ve->bonds[i].master == master)
   4890 			return &ve->bonds[i];
   4891 	}
   4892 
   4893 	return NULL;
   4894 }
   4895 
   4896 static void
   4897 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
   4898 {
   4899 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
   4900 	intel_engine_mask_t allowed, exec;
   4901 	struct ve_bond *bond;
   4902 
   4903 	allowed = ~to_request(signal)->engine->mask;
   4904 
   4905 	bond = virtual_find_bond(ve, to_request(signal)->engine);
   4906 	if (bond)
   4907 		allowed &= bond->sibling_mask;
   4908 
   4909 	/* Restrict the bonded request to run on only the available engines */
   4910 	exec = READ_ONCE(rq->execution_mask);
   4911 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
   4912 		;
   4913 
   4914 	/* Prevent the master from being re-run on the bonded engines */
   4915 	to_request(signal)->execution_mask &= ~allowed;
   4916 }
   4917 
   4918 struct intel_context *
   4919 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
   4920 			       unsigned int count)
   4921 {
   4922 	struct virtual_engine *ve;
   4923 	unsigned int n;
   4924 	int err;
   4925 
   4926 	if (count == 0)
   4927 		return ERR_PTR(-EINVAL);
   4928 
   4929 	if (count == 1)
   4930 		return intel_context_create(siblings[0]);
   4931 
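         	/* Allocate the engine plus a trailing array of sibling pointers. */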
   4932 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
   4933 	if (!ve)
   4934 		return ERR_PTR(-ENOMEM);
   4935 
   4936 	ve->base.i915 = siblings[0]->i915;
   4937 	ve->base.gt = siblings[0]->gt;
   4938 	ve->base.uncore = siblings[0]->uncore;
   4939 	ve->base.id = -1;
   4940 
   4941 	ve->base.class = OTHER_CLASS;
   4942 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
   4943 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
   4944 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
   4945 
   4946 	/*
   4947 	 * The decision on whether to submit a request using semaphores
   4948 	 * depends on the saturated state of the engine. We only compute
    4949 	 * this during HW submission of the request, and we need this
   4950 	 * state to be globally applied to all requests being submitted
   4951 	 * to this engine. Virtual engines encompass more than one physical
   4952 	 * engine and so we cannot accurately tell in advance if one of those
   4953 	 * engines is already saturated and so cannot afford to use a semaphore
   4954 	 * and be pessimized in priority for doing so -- if we are the only
   4955 	 * context using semaphores after all other clients have stopped, we
   4956 	 * will be starved on the saturated system. Such a global switch for
   4957 	 * semaphores is less than ideal, but alas is the current compromise.
   4958 	 */
   4959 	ve->base.saturated = ALL_ENGINES;
   4960 
   4961 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
   4962 
   4963 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
   4964 	intel_engine_init_breadcrumbs(&ve->base);
   4965 	intel_engine_init_execlists(&ve->base);
   4966 
   4967 	ve->base.cops = &virtual_context_ops;
   4968 	ve->base.request_alloc = execlists_request_alloc;
   4969 
   4970 	ve->base.schedule = i915_schedule;
   4971 	ve->base.submit_request = virtual_submit_request;
   4972 	ve->base.bond_execute = virtual_bond_execute;
   4973 
   4974 	INIT_LIST_HEAD(virtual_queue(ve));
   4975 	ve->base.execlists.queue_priority_hint = INT_MIN;
   4976 	tasklet_init(&ve->base.execlists.tasklet,
   4977 		     virtual_submission_tasklet,
   4978 		     (unsigned long)ve);
   4979 
   4980 	intel_context_init(&ve->context, &ve->base);
   4981 
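         	/*
         	 * Validate each sibling and fold its capabilities into the
         	 * virtual engine: the first sibling determines the engine class
         	 * and the emission vfuncs we inherit.
         	 */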
   4982 	for (n = 0; n < count; n++) {
   4983 		struct intel_engine_cs *sibling = siblings[n];
   4984 
   4985 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
   4986 		if (sibling->mask & ve->base.mask) {
   4987 			DRM_DEBUG("duplicate %s entry in load balancer\n",
   4988 				  sibling->name);
   4989 			err = -EINVAL;
   4990 			goto err_put;
   4991 		}
   4992 
   4993 		/*
   4994 		 * The virtual engine implementation is tightly coupled to
    4995 		 * the execlists backend -- we push requests directly
   4996 		 * into a tree inside each physical engine. We could support
   4997 		 * layering if we handle cloning of the requests and
   4998 		 * submitting a copy into each backend.
   4999 		 */
   5000 		if (sibling->execlists.tasklet.func !=
   5001 		    execlists_submission_tasklet) {
   5002 			err = -ENODEV;
   5003 			goto err_put;
   5004 		}
   5005 
   5006 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
   5007 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
   5008 
   5009 		ve->siblings[ve->num_siblings++] = sibling;
   5010 		ve->base.mask |= sibling->mask;
   5011 
   5012 		/*
   5013 		 * All physical engines must be compatible for their emission
   5014 		 * functions (as we build the instructions during request
   5015 		 * construction and do not alter them before submission
   5016 		 * on the physical engine). We use the engine class as a guide
   5017 		 * here, although that could be refined.
   5018 		 */
   5019 		if (ve->base.class != OTHER_CLASS) {
   5020 			if (ve->base.class != sibling->class) {
   5021 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
   5022 					  sibling->class, ve->base.class);
   5023 				err = -EINVAL;
   5024 				goto err_put;
   5025 			}
   5026 			continue;
   5027 		}
   5028 
   5029 		ve->base.class = sibling->class;
   5030 		ve->base.uabi_class = sibling->uabi_class;
   5031 		snprintf(ve->base.name, sizeof(ve->base.name),
   5032 			 "v%dx%d", ve->base.class, count);
   5033 		ve->base.context_size = sibling->context_size;
   5034 
   5035 		ve->base.emit_bb_start = sibling->emit_bb_start;
   5036 		ve->base.emit_flush = sibling->emit_flush;
   5037 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
   5038 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
   5039 		ve->base.emit_fini_breadcrumb_dw =
   5040 			sibling->emit_fini_breadcrumb_dw;
   5041 
   5042 		ve->base.flags = sibling->flags;
   5043 	}
   5044 
   5045 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
   5046 
   5047 	return &ve->context;
   5048 
   5049 err_put:
   5050 	intel_context_put(&ve->context);
   5051 	return ERR_PTR(err);
   5052 }
   5053 
   5054 struct intel_context *
   5055 intel_execlists_clone_virtual(struct intel_engine_cs *src)
   5056 {
   5057 	struct virtual_engine *se = to_virtual_engine(src);
   5058 	struct intel_context *dst;
   5059 
   5060 	dst = intel_execlists_create_virtual(se->siblings,
   5061 					     se->num_siblings);
   5062 	if (IS_ERR(dst))
   5063 		return dst;
   5064 
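         	/* Duplicate the bonding table so the clone matches the source. */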
   5065 	if (se->num_bonds) {
   5066 		struct virtual_engine *de = to_virtual_engine(dst->engine);
   5067 
   5068 		de->bonds = kmemdup(se->bonds,
   5069 				    sizeof(*se->bonds) * se->num_bonds,
   5070 				    GFP_KERNEL);
   5071 		if (!de->bonds) {
   5072 			intel_context_put(dst);
   5073 			return ERR_PTR(-ENOMEM);
   5074 		}
   5075 
   5076 		de->num_bonds = se->num_bonds;
   5077 	}
   5078 
   5079 	return dst;
   5080 }
   5081 
   5082 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
   5083 				     const struct intel_engine_cs *master,
   5084 				     const struct intel_engine_cs *sibling)
   5085 {
   5086 	struct virtual_engine *ve = to_virtual_engine(engine);
   5087 	struct ve_bond *bond;
   5088 	int n;
   5089 
    5090 	/* Sanity check that the sibling is part of the virtual engine */
   5091 	for (n = 0; n < ve->num_siblings; n++)
   5092 		if (sibling == ve->siblings[n])
   5093 			break;
   5094 	if (n == ve->num_siblings)
   5095 		return -EINVAL;
   5096 
   5097 	bond = virtual_find_bond(ve, master);
   5098 	if (bond) {
   5099 		bond->sibling_mask |= sibling->mask;
   5100 		return 0;
   5101 	}
   5102 
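         	/* No bond yet for this master; grow the array by one entry. */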
   5103 	bond = krealloc(ve->bonds,
   5104 			sizeof(*bond) * (ve->num_bonds + 1),
   5105 			GFP_KERNEL);
   5106 	if (!bond)
   5107 		return -ENOMEM;
   5108 
   5109 	bond[ve->num_bonds].master = master;
   5110 	bond[ve->num_bonds].sibling_mask = sibling->mask;
   5111 
   5112 	ve->bonds = bond;
   5113 	ve->num_bonds++;
   5114 
   5115 	return 0;
   5116 }
   5117 
   5118 struct intel_engine_cs *
   5119 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
   5120 				 unsigned int sibling)
   5121 {
   5122 	struct virtual_engine *ve = to_virtual_engine(engine);
   5123 
   5124 	if (sibling >= ve->num_siblings)
   5125 		return NULL;
   5126 
   5127 	return ve->siblings[sibling];
   5128 }
   5129 
   5130 void intel_execlists_show_requests(struct intel_engine_cs *engine,
   5131 				   struct drm_printer *m,
   5132 				   void (*show_request)(struct drm_printer *m,
   5133 							struct i915_request *rq,
   5134 							const char *prefix),
   5135 				   unsigned int max)
   5136 {
   5137 	const struct intel_engine_execlists *execlists = &engine->execlists;
   5138 	struct i915_request *rq, *last;
   5139 	unsigned long flags;
   5140 	unsigned int count;
   5141 	struct rb_node *rb;
   5142 
   5143 	spin_lock_irqsave(&engine->active.lock, flags);
   5144 
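         	/* First, the requests already submitted to the backend (prefix "E"). */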
   5145 	last = NULL;
   5146 	count = 0;
   5147 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
   5148 		if (count++ < max - 1)
   5149 			show_request(m, rq, "\t\tE ");
   5150 		else
   5151 			last = rq;
   5152 	}
   5153 	if (last) {
   5154 		if (count > max) {
   5155 			drm_printf(m,
   5156 				   "\t\t...skipping %d executing requests...\n",
   5157 				   count - max);
   5158 		}
   5159 		show_request(m, last, "\t\tE ");
   5160 	}
   5161 
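         	/* Next, the requests still waiting in the priority queue (prefix "Q"). */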
   5162 	last = NULL;
   5163 	count = 0;
   5164 	if (execlists->queue_priority_hint != INT_MIN)
   5165 		drm_printf(m, "\t\tQueue priority hint: %d\n",
   5166 			   execlists->queue_priority_hint);
   5167 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
   5168 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
   5169 		int i;
   5170 
   5171 		priolist_for_each_request(rq, p, i) {
   5172 			if (count++ < max - 1)
   5173 				show_request(m, rq, "\t\tQ ");
   5174 			else
   5175 				last = rq;
   5176 		}
   5177 	}
   5178 	if (last) {
   5179 		if (count > max) {
   5180 			drm_printf(m,
   5181 				   "\t\t...skipping %d queued requests...\n",
   5182 				   count - max);
   5183 		}
   5184 		show_request(m, last, "\t\tQ ");
   5185 	}
   5186 
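         	/* Finally, requests held on virtual engines targeting this engine (prefix "V"). */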
   5187 	last = NULL;
   5188 	count = 0;
   5189 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
   5190 		struct virtual_engine *ve =
   5191 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
   5192 		struct i915_request *rq = READ_ONCE(ve->request);
   5193 
   5194 		if (rq) {
   5195 			if (count++ < max - 1)
   5196 				show_request(m, rq, "\t\tV ");
   5197 			else
   5198 				last = rq;
   5199 		}
   5200 	}
   5201 	if (last) {
   5202 		if (count > max) {
   5203 			drm_printf(m,
   5204 				   "\t\t...skipping %d virtual requests...\n",
   5205 				   count - max);
   5206 		}
   5207 		show_request(m, last, "\t\tV ");
   5208 	}
   5209 
   5210 	spin_unlock_irqrestore(&engine->active.lock, flags);
   5211 }
   5212 
   5213 void intel_lr_context_reset(struct intel_engine_cs *engine,
   5214 			    struct intel_context *ce,
   5215 			    u32 head,
   5216 			    bool scrub)
   5217 {
   5218 	GEM_BUG_ON(!intel_context_is_pinned(ce));
   5219 
   5220 	/*
   5221 	 * We want a simple context + ring to execute the breadcrumb update.
   5222 	 * We cannot rely on the context being intact across the GPU hang,
   5223 	 * so clear it and rebuild just what we need for the breadcrumb.
   5224 	 * All pending requests for this context will be zapped, and any
   5225 	 * future request will be after userspace has had the opportunity
   5226 	 * to recreate its own state.
   5227 	 */
   5228 	if (scrub)
   5229 		restore_default_state(ce, engine);
   5230 
   5231 	/* Rerun the request; its payload has been neutered (if guilty). */
   5232 	__execlists_update_reg_state(ce, engine, head);
   5233 }
   5234 
   5235 bool
   5236 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
   5237 {
   5238 	return engine->set_default_submission ==
   5239 	       intel_execlists_set_default_submission;
   5240 }
   5241 
   5242 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
   5243 #include "selftest_lrc.c"
   5244 #endif
   5245