      1 /*	$NetBSD: intel_ring_submission.c,v 1.3 2021/12/19 11:49:11 riastradh Exp $	*/
      2 
      3 /*
       4  * Copyright © 2008-2010 Intel Corporation
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     23  * IN THE SOFTWARE.
     24  *
     25  * Authors:
      26  *    Eric Anholt <eric@anholt.net>
      27  *    Zou Nan hai <nanhai.zou@intel.com>
      28  *    Xiang Hai hao <haihao.xiang@intel.com>
     29  *
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: intel_ring_submission.c,v 1.3 2021/12/19 11:49:11 riastradh Exp $");
     34 
     35 #include <linux/log2.h>
     36 
     37 #include <drm/i915_drm.h>
     38 
     39 #include "gem/i915_gem_context.h"
     40 
     41 #include "gen6_ppgtt.h"
     42 #include "i915_drv.h"
     43 #include "i915_trace.h"
     44 #include "intel_context.h"
     45 #include "intel_gt.h"
     46 #include "intel_gt_irq.h"
     47 #include "intel_gt_pm_irq.h"
     48 #include "intel_reset.h"
     49 #include "intel_ring.h"
     50 #include "intel_workarounds.h"
     51 
     52 /* Rough estimate of the typical request size, performing a flush,
     53  * set-context and then emitting the batch.
     54  */
     55 #define LEGACY_REQUEST_SIZE 200
     56 
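         /*
          * Gen2 flush: a single MI_FLUSH, with MI_READ_FLUSH added when an
          * invalidate is requested. For a write flush we also emit a few
          * dummy MI_STORE_DWORD_IMM writes into the scratch page (presumably
          * to give the flush somewhere to land) before closing with
          * MI_FLUSH | MI_NO_WRITE_FLUSH.
          */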
     57 static int
     58 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
     59 {
     60 	unsigned int num_store_dw;
     61 	u32 cmd, *cs;
     62 
     63 	cmd = MI_FLUSH;
     64 	num_store_dw = 0;
     65 	if (mode & EMIT_INVALIDATE)
     66 		cmd |= MI_READ_FLUSH;
     67 	if (mode & EMIT_FLUSH)
     68 		num_store_dw = 4;
     69 
     70 	cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
     71 	if (IS_ERR(cs))
     72 		return PTR_ERR(cs);
     73 
     74 	*cs++ = cmd;
     75 	while (num_store_dw--) {
     76 		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
     77 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
     78 						INTEL_GT_SCRATCH_FIELD_DEFAULT);
     79 		*cs++ = 0;
     80 	}
     81 	*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
     82 
     83 	intel_ring_advance(rq, cs);
     84 
     85 	return 0;
     86 }
     87 
     88 static int
     89 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
     90 {
     91 	u32 cmd, *cs;
     92 	int i;
     93 
     94 	/*
     95 	 * read/write caches:
     96 	 *
     97 	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
     98 	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
     99 	 * also flushed at 2d versus 3d pipeline switches.
    100 	 *
    101 	 * read-only caches:
    102 	 *
    103 	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
    104 	 * MI_READ_FLUSH is set, and is always flushed on 965.
    105 	 *
    106 	 * I915_GEM_DOMAIN_COMMAND may not exist?
    107 	 *
    108 	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
    109 	 * invalidated when MI_EXE_FLUSH is set.
    110 	 *
    111 	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
    112 	 * invalidated with every MI_FLUSH.
    113 	 *
    114 	 * TLBs:
    115 	 *
    116 	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
     117 	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write, and
    118 	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
    119 	 * are flushed at any MI_FLUSH.
    120 	 */
    121 
    122 	cmd = MI_FLUSH;
    123 	if (mode & EMIT_INVALIDATE) {
    124 		cmd |= MI_EXE_FLUSH;
    125 		if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
    126 			cmd |= MI_INVALIDATE_ISP;
    127 	}
    128 
    129 	i = 2;
    130 	if (mode & EMIT_INVALIDATE)
    131 		i += 20;
    132 
    133 	cs = intel_ring_begin(rq, i);
    134 	if (IS_ERR(cs))
    135 		return PTR_ERR(cs);
    136 
    137 	*cs++ = cmd;
    138 
    139 	/*
    140 	 * A random delay to let the CS invalidate take effect? Without this
    141 	 * delay, the GPU relocation path fails as the CS does not see
    142 	 * the updated contents. Just as important, if we apply the flushes
    143 	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
    144 	 * write and before the invalidate on the next batch), the relocations
     145 	 * still fail. This implies that it is a delay following invalidation
    146 	 * that is required to reset the caches as opposed to a delay to
    147 	 * ensure the memory is written.
    148 	 */
    149 	if (mode & EMIT_INVALIDATE) {
    150 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
    151 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
    152 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
    153 			PIPE_CONTROL_GLOBAL_GTT;
    154 		*cs++ = 0;
    155 		*cs++ = 0;
    156 
    157 		for (i = 0; i < 12; i++)
    158 			*cs++ = MI_FLUSH;
    159 
    160 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
    161 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
    162 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
    163 			PIPE_CONTROL_GLOBAL_GTT;
    164 		*cs++ = 0;
    165 		*cs++ = 0;
    166 	}
    167 
    168 	*cs++ = cmd;
    169 
    170 	intel_ring_advance(rq, cs);
    171 
    172 	return 0;
    173 }
    174 
    175 /*
    176  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
    177  * implementing two workarounds on gen6.  From section 1.4.7.1
    178  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
    179  *
    180  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
    181  * produced by non-pipelined state commands), software needs to first
    182  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
    183  * 0.
    184  *
    185  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
    186  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
    187  *
    188  * And the workaround for these two requires this workaround first:
    189  *
    190  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
    191  * BEFORE the pipe-control with a post-sync op and no write-cache
    192  * flushes.
    193  *
    194  * And this last workaround is tricky because of the requirements on
    195  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
    196  * volume 2 part 1:
    197  *
    198  *     "1 of the following must also be set:
    199  *      - Render Target Cache Flush Enable ([12] of DW1)
    200  *      - Depth Cache Flush Enable ([0] of DW1)
    201  *      - Stall at Pixel Scoreboard ([1] of DW1)
    202  *      - Depth Stall ([13] of DW1)
    203  *      - Post-Sync Operation ([13] of DW1)
    204  *      - Notify Enable ([8] of DW1)"
    205  *
    206  * The cache flushes require the workaround flush that triggered this
    207  * one, so we can't use it.  Depth stall would trigger the same.
    208  * Post-sync nonzero is what triggered this second workaround, so we
    209  * can't use that one either.  Notify enable is IRQs, which aren't
    210  * really our business.  That leaves only stall at scoreboard.
    211  */
    212 static int
    213 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
    214 {
    215 	u32 scratch_addr =
    216 		intel_gt_scratch_offset(rq->engine->gt,
    217 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    218 	u32 *cs;
    219 
    220 	cs = intel_ring_begin(rq, 6);
    221 	if (IS_ERR(cs))
    222 		return PTR_ERR(cs);
    223 
    224 	*cs++ = GFX_OP_PIPE_CONTROL(5);
    225 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    226 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    227 	*cs++ = 0; /* low dword */
    228 	*cs++ = 0; /* high dword */
    229 	*cs++ = MI_NOOP;
    230 	intel_ring_advance(rq, cs);
    231 
    232 	cs = intel_ring_begin(rq, 6);
    233 	if (IS_ERR(cs))
    234 		return PTR_ERR(cs);
    235 
    236 	*cs++ = GFX_OP_PIPE_CONTROL(5);
    237 	*cs++ = PIPE_CONTROL_QW_WRITE;
    238 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    239 	*cs++ = 0;
    240 	*cs++ = 0;
    241 	*cs++ = MI_NOOP;
    242 	intel_ring_advance(rq, cs);
    243 
    244 	return 0;
    245 }
    246 
    247 static int
    248 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
    249 {
    250 	u32 scratch_addr =
    251 		intel_gt_scratch_offset(rq->engine->gt,
    252 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    253 	u32 *cs, flags = 0;
    254 	int ret;
    255 
    256 	/* Force SNB workarounds for PIPE_CONTROL flushes */
    257 	ret = gen6_emit_post_sync_nonzero_flush(rq);
    258 	if (ret)
    259 		return ret;
    260 
    261 	/* Just flush everything.  Experiments have shown that reducing the
    262 	 * number of bits based on the write domains has little performance
    263 	 * impact.
    264 	 */
    265 	if (mode & EMIT_FLUSH) {
    266 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
    267 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
    268 		/*
    269 		 * Ensure that any following seqno writes only happen
    270 		 * when the render cache is indeed flushed.
    271 		 */
    272 		flags |= PIPE_CONTROL_CS_STALL;
    273 	}
    274 	if (mode & EMIT_INVALIDATE) {
    275 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
    276 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
    277 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
    278 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
    279 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
    280 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
    281 		/*
    282 		 * TLB invalidate requires a post-sync write.
    283 		 */
    284 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
    285 	}
    286 
    287 	cs = intel_ring_begin(rq, 4);
    288 	if (IS_ERR(cs))
    289 		return PTR_ERR(cs);
    290 
    291 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    292 	*cs++ = flags;
    293 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
    294 	*cs++ = 0;
    295 	intel_ring_advance(rq, cs);
    296 
    297 	return 0;
    298 }
    299 
    300 static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    301 {
    302 	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
    303 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    304 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    305 	*cs++ = 0;
    306 	*cs++ = 0;
    307 
    308 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    309 	*cs++ = PIPE_CONTROL_QW_WRITE;
    310 	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
    311 					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
    312 		PIPE_CONTROL_GLOBAL_GTT;
    313 	*cs++ = 0;
    314 
    315 	/* Finally we can flush and with it emit the breadcrumb */
    316 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    317 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
    318 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
    319 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
    320 		 PIPE_CONTROL_QW_WRITE |
    321 		 PIPE_CONTROL_CS_STALL);
    322 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset |
    323 		PIPE_CONTROL_GLOBAL_GTT;
    324 	*cs++ = rq->fence.seqno;
    325 
    326 	*cs++ = MI_USER_INTERRUPT;
    327 	*cs++ = MI_NOOP;
    328 
    329 	rq->tail = intel_ring_offset(rq, cs);
    330 	assert_ring_tail_valid(rq->ring, rq->tail);
    331 
    332 	return cs;
    333 }
    334 
    335 static int
    336 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
    337 {
    338 	u32 *cs;
    339 
    340 	cs = intel_ring_begin(rq, 4);
    341 	if (IS_ERR(cs))
    342 		return PTR_ERR(cs);
    343 
    344 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    345 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
    346 	*cs++ = 0;
    347 	*cs++ = 0;
    348 	intel_ring_advance(rq, cs);
    349 
    350 	return 0;
    351 }
    352 
    353 static int
    354 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
    355 {
    356 	u32 scratch_addr =
    357 		intel_gt_scratch_offset(rq->engine->gt,
    358 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
    359 	u32 *cs, flags = 0;
    360 
    361 	/*
    362 	 * Ensure that any following seqno writes only happen when the render
    363 	 * cache is indeed flushed.
    364 	 *
    365 	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
    366 	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
    367 	 * don't try to be clever and just set it unconditionally.
    368 	 */
    369 	flags |= PIPE_CONTROL_CS_STALL;
    370 
    371 	/*
    372 	 * CS_STALL suggests at least a post-sync write.
    373 	 */
    374 	flags |= PIPE_CONTROL_QW_WRITE;
    375 	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
    376 
    377 	/* Just flush everything.  Experiments have shown that reducing the
    378 	 * number of bits based on the write domains has little performance
    379 	 * impact.
    380 	 */
    381 	if (mode & EMIT_FLUSH) {
    382 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
    383 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
    384 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
    385 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
    386 	}
    387 	if (mode & EMIT_INVALIDATE) {
    388 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
    389 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
    390 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
    391 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
    392 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
    393 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
    394 		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
    395 
    396 		/* Workaround: we must issue a pipe_control with CS-stall bit
    397 		 * set before a pipe_control command that has the state cache
    398 		 * invalidate bit set. */
    399 		gen7_render_ring_cs_stall_wa(rq);
    400 	}
    401 
    402 	cs = intel_ring_begin(rq, 4);
    403 	if (IS_ERR(cs))
    404 		return PTR_ERR(cs);
    405 
    406 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    407 	*cs++ = flags;
    408 	*cs++ = scratch_addr;
    409 	*cs++ = 0;
    410 	intel_ring_advance(rq, cs);
    411 
    412 	return 0;
    413 }
    414 
    415 static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    416 {
    417 	*cs++ = GFX_OP_PIPE_CONTROL(4);
    418 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
    419 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
    420 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
    421 		 PIPE_CONTROL_FLUSH_ENABLE |
    422 		 PIPE_CONTROL_QW_WRITE |
    423 		 PIPE_CONTROL_GLOBAL_GTT_IVB |
    424 		 PIPE_CONTROL_CS_STALL);
    425 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset;
    426 	*cs++ = rq->fence.seqno;
    427 
    428 	*cs++ = MI_USER_INTERRUPT;
    429 	*cs++ = MI_NOOP;
    430 
    431 	rq->tail = intel_ring_offset(rq, cs);
    432 	assert_ring_tail_valid(rq->ring, rq->tail);
    433 
    434 	return cs;
    435 }
    436 
    437 static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    438 {
    439 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    440 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
    441 
    442 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
    443 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
    444 	*cs++ = rq->fence.seqno;
    445 
    446 	*cs++ = MI_USER_INTERRUPT;
    447 
    448 	rq->tail = intel_ring_offset(rq, cs);
    449 	assert_ring_tail_valid(rq->ring, rq->tail);
    450 
    451 	return cs;
    452 }
    453 
    454 #define GEN7_XCS_WA 32
    455 static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    456 {
    457 	int i;
    458 
    459 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    460 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
    461 
    462 	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
    463 		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
    464 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
    465 	*cs++ = rq->fence.seqno;
    466 
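         	/*
         	 * Follow up with GEN7_XCS_WA (32) dummy seqno stores to the
         	 * status page; presumably a workaround to ensure the breadcrumb
         	 * write above has landed before the MI_USER_INTERRUPT below
         	 * wakes any waiters.
         	 */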
    467 	for (i = 0; i < GEN7_XCS_WA; i++) {
    468 		*cs++ = MI_STORE_DWORD_INDEX;
    469 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
    470 		*cs++ = rq->fence.seqno;
    471 	}
    472 
    473 	*cs++ = MI_FLUSH_DW;
    474 	*cs++ = 0;
    475 	*cs++ = 0;
    476 
    477 	*cs++ = MI_USER_INTERRUPT;
    478 	*cs++ = MI_NOOP;
    479 
    480 	rq->tail = intel_ring_offset(rq, cs);
    481 	assert_ring_tail_valid(rq->ring, rq->tail);
    482 
    483 	return cs;
    484 }
    485 #undef GEN7_XCS_WA
    486 
    487 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
    488 {
    489 	/*
    490 	 * Keep the render interrupt unmasked as this papers over
    491 	 * lost interrupts following a reset.
    492 	 */
    493 	if (engine->class == RENDER_CLASS) {
    494 		if (INTEL_GEN(engine->i915) >= 6)
    495 			mask &= ~BIT(0);
    496 		else
    497 			mask &= ~I915_USER_INTERRUPT;
    498 	}
    499 
    500 	intel_engine_set_hwsp_writemask(engine, mask);
    501 }
    502 
    503 static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
    504 {
    505 	u32 addr;
    506 
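         	/*
         	 * HWS_PGA takes the low 32 bits of the page address; on gen4+
         	 * the code below also folds bits 35:32 of the physical address
         	 * into bits 7:4 of the register value.
         	 */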
    507 	addr = lower_32_bits(phys);
    508 	if (INTEL_GEN(engine->i915) >= 4)
    509 		addr |= (phys >> 28) & 0xf0;
    510 
    511 	intel_uncore_write(engine->uncore, HWS_PGA, addr);
    512 }
    513 
    514 #ifdef __NetBSD__
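         /*
          * NetBSD: the status page object is backed by a bus_dma map rather
          * than struct pages, so use the bus address of its first DMA
          * segment for HWS_PGA.
          */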
    515 static void
    516 ring_setup_phys_status_page(struct intel_engine_cs *engine)
    517 {
    518 	struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
    519 	bus_addr_t addr;
    520 
    521 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
    522 	addr = obj->mm.pages->sgl[0].sg_dmamap->dm_segs[0].ds_addr;
    523 	set_hws_pga(engine, addr);
    524 	set_hwstam(engine, ~0u);
    525 }
    526 #else
    527 static struct page *status_page(struct intel_engine_cs *engine)
    528 {
    529 	struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
    530 
    531 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
    532 	return sg_page(obj->mm.pages->sgl);
    533 }
    534 
    535 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
    536 {
    537 	set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
    538 	set_hwstam(engine, ~0u);
    539 }
    540 #endif
    541 
    542 static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
    543 {
    544 	i915_reg_t hwsp;
    545 
    546 	/*
    547 	 * The ring status page addresses are no longer next to the rest of
    548 	 * the ring registers as of gen7.
    549 	 */
    550 	if (IS_GEN(engine->i915, 7)) {
    551 		switch (engine->id) {
    552 		/*
     553 		 * No more rings exist on Gen7. The default case exists only to
     554 		 * quiet the gcc switch-check warning.
    555 		 */
    556 		default:
    557 			GEM_BUG_ON(engine->id);
    558 			/* fallthrough */
    559 		case RCS0:
    560 			hwsp = RENDER_HWS_PGA_GEN7;
    561 			break;
    562 		case BCS0:
    563 			hwsp = BLT_HWS_PGA_GEN7;
    564 			break;
    565 		case VCS0:
    566 			hwsp = BSD_HWS_PGA_GEN7;
    567 			break;
    568 		case VECS0:
    569 			hwsp = VEBOX_HWS_PGA_GEN7;
    570 			break;
    571 		}
    572 	} else if (IS_GEN(engine->i915, 6)) {
    573 		hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
    574 	} else {
    575 		hwsp = RING_HWS_PGA(engine->mmio_base);
    576 	}
    577 
    578 	intel_uncore_write(engine->uncore, hwsp, offset);
    579 	intel_uncore_posting_read(engine->uncore, hwsp);
    580 }
    581 
    582 static void flush_cs_tlb(struct intel_engine_cs *engine)
    583 {
    584 	struct drm_i915_private *dev_priv = engine->i915;
    585 
    586 	if (!IS_GEN_RANGE(dev_priv, 6, 7))
    587 		return;
    588 
     589 	/* ring should be idle before issuing a sync flush */
    590 	WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
    591 
    592 	ENGINE_WRITE(engine, RING_INSTPM,
    593 		     _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
    594 					INSTPM_SYNC_FLUSH));
    595 	if (intel_wait_for_register(engine->uncore,
    596 				    RING_INSTPM(engine->mmio_base),
    597 				    INSTPM_SYNC_FLUSH, 0,
    598 				    1000))
    599 		DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
    600 			  engine->name);
    601 }
    602 
    603 static void ring_setup_status_page(struct intel_engine_cs *engine)
    604 {
    605 	set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
    606 	set_hwstam(engine, ~0u);
    607 
    608 	flush_cs_tlb(engine);
    609 }
    610 
    611 static bool stop_ring(struct intel_engine_cs *engine)
    612 {
    613 	struct drm_i915_private *dev_priv = engine->i915;
    614 
    615 	if (INTEL_GEN(dev_priv) > 2) {
    616 		ENGINE_WRITE(engine,
    617 			     RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
    618 		if (intel_wait_for_register(engine->uncore,
    619 					    RING_MI_MODE(engine->mmio_base),
    620 					    MODE_IDLE,
    621 					    MODE_IDLE,
    622 					    1000)) {
    623 			DRM_ERROR("%s : timed out trying to stop ring\n",
    624 				  engine->name);
    625 
    626 			/*
    627 			 * Sometimes we observe that the idle flag is not
    628 			 * set even though the ring is empty. So double
    629 			 * check before giving up.
    630 			 */
    631 			if (ENGINE_READ(engine, RING_HEAD) !=
    632 			    ENGINE_READ(engine, RING_TAIL))
    633 				return false;
    634 		}
    635 	}
    636 
    637 	ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
    638 
    639 	ENGINE_WRITE(engine, RING_HEAD, 0);
    640 	ENGINE_WRITE(engine, RING_TAIL, 0);
    641 
    642 	/* The ring must be empty before it is disabled */
    643 	ENGINE_WRITE(engine, RING_CTL, 0);
    644 
    645 	return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
    646 }
    647 
    648 static int xcs_resume(struct intel_engine_cs *engine)
    649 {
    650 	struct drm_i915_private *dev_priv = engine->i915;
    651 	struct intel_ring *ring = engine->legacy.ring;
    652 	int ret = 0;
    653 
    654 	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
    655 		     ring->head, ring->tail);
    656 
    657 	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
    658 
    659 	/* WaClearRingBufHeadRegAtInit:ctg,elk */
    660 	if (!stop_ring(engine)) {
    661 		/* G45 ring initialization often fails to reset head to zero */
    662 		DRM_DEBUG_DRIVER("%s head not reset to zero "
    663 				"ctl %08x head %08x tail %08x start %08x\n",
    664 				engine->name,
    665 				ENGINE_READ(engine, RING_CTL),
    666 				ENGINE_READ(engine, RING_HEAD),
    667 				ENGINE_READ(engine, RING_TAIL),
    668 				ENGINE_READ(engine, RING_START));
    669 
    670 		if (!stop_ring(engine)) {
    671 			DRM_ERROR("failed to set %s head to zero "
    672 				  "ctl %08x head %08x tail %08x start %08x\n",
    673 				  engine->name,
    674 				  ENGINE_READ(engine, RING_CTL),
    675 				  ENGINE_READ(engine, RING_HEAD),
    676 				  ENGINE_READ(engine, RING_TAIL),
    677 				  ENGINE_READ(engine, RING_START));
    678 			ret = -EIO;
    679 			goto out;
    680 		}
    681 	}
    682 
    683 	if (HWS_NEEDS_PHYSICAL(dev_priv))
    684 		ring_setup_phys_status_page(engine);
    685 	else
    686 		ring_setup_status_page(engine);
    687 
    688 	intel_engine_reset_breadcrumbs(engine);
    689 
    690 	/* Enforce ordering by reading HEAD register back */
    691 	ENGINE_POSTING_READ(engine, RING_HEAD);
    692 
    693 	/*
    694 	 * Initialize the ring. This must happen _after_ we've cleared the ring
    695 	 * registers with the above sequence (the readback of the HEAD registers
    696 	 * also enforces ordering), otherwise the hw might lose the new ring
    697 	 * register values.
    698 	 */
    699 	ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
    700 
    701 	/* Check that the ring offsets point within the ring! */
    702 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
    703 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
    704 	intel_ring_update_space(ring);
    705 
    706 	/* First wake the ring up to an empty/idle ring */
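         	/* (note: TAIL is deliberately set to HEAD here; the real tail is restored below) */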
    707 	ENGINE_WRITE(engine, RING_HEAD, ring->head);
    708 	ENGINE_WRITE(engine, RING_TAIL, ring->head);
    709 	ENGINE_POSTING_READ(engine, RING_TAIL);
    710 
    711 	ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
    712 
     713 	/* Wait for the ring to report VALID; if it never does, it is dead */
    714 	if (intel_wait_for_register(engine->uncore,
    715 				    RING_CTL(engine->mmio_base),
    716 				    RING_VALID, RING_VALID,
    717 				    50)) {
    718 		DRM_ERROR("%s initialization failed "
    719 			  "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
    720 			  engine->name,
    721 			  ENGINE_READ(engine, RING_CTL),
    722 			  ENGINE_READ(engine, RING_CTL) & RING_VALID,
    723 			  ENGINE_READ(engine, RING_HEAD), ring->head,
    724 			  ENGINE_READ(engine, RING_TAIL), ring->tail,
    725 			  ENGINE_READ(engine, RING_START),
    726 			  i915_ggtt_offset(ring->vma));
    727 		ret = -EIO;
    728 		goto out;
    729 	}
    730 
    731 	if (INTEL_GEN(dev_priv) > 2)
    732 		ENGINE_WRITE(engine,
    733 			     RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
    734 
    735 	/* Now awake, let it get started */
    736 	if (ring->tail != ring->head) {
    737 		ENGINE_WRITE(engine, RING_TAIL, ring->tail);
    738 		ENGINE_POSTING_READ(engine, RING_TAIL);
    739 	}
    740 
    741 	/* Papering over lost _interrupts_ immediately following the restart */
    742 	intel_engine_signal_breadcrumbs(engine);
    743 out:
    744 	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
    745 
    746 	return ret;
    747 }
    748 
    749 static void reset_prepare(struct intel_engine_cs *engine)
    750 {
    751 	struct intel_uncore *uncore = engine->uncore;
    752 	const u32 base = engine->mmio_base;
    753 
    754 	/*
     755 	 * We stop the engines, otherwise we might get a failed reset and a
     756 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
     757 	 * from a system hang if a batchbuffer is progressing when
     758 	 * the reset is issued, regardless of the READY_TO_RESET ack.
     759 	 * Thus assume it is best to stop the engines on all gens
     760 	 * where we have a gpu reset.
    761 	 *
    762 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
    763 	 *
    764 	 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
    765 	 *
    766 	 * FIXME: Wa for more modern gens needs to be validated
    767 	 */
    768 	ENGINE_TRACE(engine, "\n");
    769 
    770 	if (intel_engine_stop_cs(engine))
    771 		ENGINE_TRACE(engine, "timed out on STOP_RING\n");
    772 
    773 	intel_uncore_write_fw(uncore,
    774 			      RING_HEAD(base),
    775 			      intel_uncore_read_fw(uncore, RING_TAIL(base)));
    776 	intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
    777 
    778 	intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
    779 	intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
    780 	intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
    781 
    782 	/* The ring must be empty before it is disabled */
    783 	intel_uncore_write_fw(uncore, RING_CTL(base), 0);
    784 
    785 	/* Check acts as a post */
    786 	if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
    787 		ENGINE_TRACE(engine, "ring head [%x] not parked\n",
    788 			     intel_uncore_read_fw(uncore, RING_HEAD(base)));
    789 }
    790 
    791 static void reset_rewind(struct intel_engine_cs *engine, bool stalled)
    792 {
    793 	struct i915_request *pos, *rq;
    794 	unsigned long flags;
    795 	u32 head;
    796 
    797 	rq = NULL;
    798 	spin_lock_irqsave(&engine->active.lock, flags);
    799 	list_for_each_entry(pos, &engine->active.requests, sched.link) {
    800 		if (!i915_request_completed(pos)) {
    801 			rq = pos;
    802 			break;
    803 		}
    804 	}
    805 
    806 	/*
    807 	 * The guilty request will get skipped on a hung engine.
    808 	 *
    809 	 * Users of client default contexts do not rely on logical
    810 	 * state preserved between batches so it is safe to execute
    811 	 * queued requests following the hang. Non default contexts
    812 	 * rely on preserved state, so skipping a batch loses the
    813 	 * evolution of the state and it needs to be considered corrupted.
    814 	 * Executing more queued batches on top of corrupted state is
    815 	 * risky. But we take the risk by trying to advance through
    816 	 * the queued requests in order to make the client behaviour
     817 	 * more predictable around resets, by not throwing away a random
     818 	 * amount of batches it has prepared for execution. Sophisticated
     819 	 * clients can use gem_reset_stats_ioctl and dma fence status
     820 	 * (exported via sync_file info ioctl on explicit fences) to observe
     821 	 * when they lose the context state and should rebuild accordingly.
    822 	 *
    823 	 * The context ban, and ultimately the client ban, mechanism are safety
    824 	 * valves if client submission ends up resulting in nothing more than
    825 	 * subsequent hangs.
    826 	 */
    827 
    828 	if (rq) {
    829 		/*
    830 		 * Try to restore the logical GPU state to match the
    831 		 * continuation of the request queue. If we skip the
    832 		 * context/PD restore, then the next request may try to execute
    833 		 * assuming that its context is valid and loaded on the GPU and
    834 		 * so may try to access invalid memory, prompting repeated GPU
    835 		 * hangs.
    836 		 *
    837 		 * If the request was guilty, we still restore the logical
    838 		 * state in case the next request requires it (e.g. the
    839 		 * aliasing ppgtt), but skip over the hung batch.
    840 		 *
    841 		 * If the request was innocent, we try to replay the request
    842 		 * with the restored context.
    843 		 */
    844 		__i915_request_reset(rq, stalled);
    845 
    846 		GEM_BUG_ON(rq->ring != engine->legacy.ring);
    847 		head = rq->head;
    848 	} else {
    849 		head = engine->legacy.ring->tail;
    850 	}
    851 	engine->legacy.ring->head = intel_ring_wrap(engine->legacy.ring, head);
    852 
    853 	spin_unlock_irqrestore(&engine->active.lock, flags);
    854 }
    855 
    856 static void reset_finish(struct intel_engine_cs *engine)
    857 {
    858 }
    859 
    860 static int rcs_resume(struct intel_engine_cs *engine)
    861 {
    862 	struct drm_i915_private *i915 = engine->i915;
    863 	struct intel_uncore *uncore = engine->uncore;
    864 
    865 	/*
    866 	 * Disable CONSTANT_BUFFER before it is loaded from the context
     867 	 * image. As soon as it is loaded, it is executed and the stored
    868 	 * address may no longer be valid, leading to a GPU hang.
    869 	 *
    870 	 * This imposes the requirement that userspace reload their
    871 	 * CONSTANT_BUFFER on every batch, fortunately a requirement
    872 	 * they are already accustomed to from before contexts were
    873 	 * enabled.
    874 	 */
    875 	if (IS_GEN(i915, 4))
    876 		intel_uncore_write(uncore, ECOSKPD,
    877 			   _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
    878 
    879 	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
    880 	if (IS_GEN_RANGE(i915, 4, 6))
    881 		intel_uncore_write(uncore, MI_MODE,
    882 				   _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
    883 
    884 	/* We need to disable the AsyncFlip performance optimisations in order
    885 	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
    886 	 * programmed to '1' on all products.
    887 	 *
    888 	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
    889 	 */
    890 	if (IS_GEN_RANGE(i915, 6, 7))
    891 		intel_uncore_write(uncore, MI_MODE,
    892 				   _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
    893 
    894 	/* Required for the hardware to program scanline values for waiting */
    895 	/* WaEnableFlushTlbInvalidationMode:snb */
    896 	if (IS_GEN(i915, 6))
    897 		intel_uncore_write(uncore, GFX_MODE,
    898 			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
    899 
    900 	/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
    901 	if (IS_GEN(i915, 7))
    902 		intel_uncore_write(uncore, GFX_MODE_GEN7,
    903 			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
    904 			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
    905 
    906 	if (IS_GEN(i915, 6)) {
    907 		/* From the Sandybridge PRM, volume 1 part 3, page 24:
    908 		 * "If this bit is set, STCunit will have LRA as replacement
    909 		 *  policy. [...] This bit must be reset.  LRA replacement
    910 		 *  policy is not supported."
    911 		 */
    912 		intel_uncore_write(uncore, CACHE_MODE_0,
    913 			   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
    914 	}
    915 
    916 	if (IS_GEN_RANGE(i915, 6, 7))
    917 		intel_uncore_write(uncore, INSTPM,
    918 				   _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
    919 
    920 	return xcs_resume(engine);
    921 }
    922 
    923 static void reset_cancel(struct intel_engine_cs *engine)
    924 {
    925 	struct i915_request *request;
    926 	unsigned long flags;
    927 
    928 	spin_lock_irqsave(&engine->active.lock, flags);
    929 
    930 	/* Mark all submitted requests as skipped. */
    931 	list_for_each_entry(request, &engine->active.requests, sched.link) {
    932 		if (!i915_request_signaled(request))
    933 			dma_fence_set_error(&request->fence, -EIO);
    934 
    935 		i915_request_mark_complete(request);
    936 	}
    937 
    938 	/* Remaining _unready_ requests will be nop'ed when submitted */
    939 
    940 	spin_unlock_irqrestore(&engine->active.lock, flags);
    941 }
    942 
    943 static void i9xx_submit_request(struct i915_request *request)
    944 {
    945 	i915_request_submit(request);
    946 	wmb(); /* paranoid flush writes out of the WCB before mmio */
    947 
    948 	ENGINE_WRITE(request->engine, RING_TAIL,
    949 		     intel_ring_set_tail(request->ring, request->tail));
    950 }
    951 
    952 static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    953 {
    954 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    955 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
    956 
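         	/*
         	 * Flush, then post the request's seqno into the fixed seqno
         	 * slot of the status page and raise a user interrupt to wake
         	 * any waiters.
         	 */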
    957 	*cs++ = MI_FLUSH;
    958 
    959 	*cs++ = MI_STORE_DWORD_INDEX;
    960 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
    961 	*cs++ = rq->fence.seqno;
    962 
    963 	*cs++ = MI_USER_INTERRUPT;
    964 	*cs++ = MI_NOOP;
    965 
    966 	rq->tail = intel_ring_offset(rq, cs);
    967 	assert_ring_tail_valid(rq->ring, rq->tail);
    968 
    969 	return cs;
    970 }
    971 
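         /*
          * Ilk repeats the seqno store GEN5_WA_STORES times below;
          * presumably a workaround for the breadcrumb write occasionally
          * not being visible by the time the interrupt is serviced.
          */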
    972 #define GEN5_WA_STORES 8 /* must be at least 1! */
    973 static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    974 {
    975 	int i;
    976 
    977 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
    978 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
    979 
    980 	*cs++ = MI_FLUSH;
    981 
    982 	BUILD_BUG_ON(GEN5_WA_STORES < 1);
    983 	for (i = 0; i < GEN5_WA_STORES; i++) {
    984 		*cs++ = MI_STORE_DWORD_INDEX;
    985 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
    986 		*cs++ = rq->fence.seqno;
    987 	}
    988 
    989 	*cs++ = MI_USER_INTERRUPT;
    990 
    991 	rq->tail = intel_ring_offset(rq, cs);
    992 	assert_ring_tail_valid(rq->ring, rq->tail);
    993 
    994 	return cs;
    995 }
    996 #undef GEN5_WA_STORES
    997 
    998 static void
    999 gen5_irq_enable(struct intel_engine_cs *engine)
   1000 {
   1001 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
   1002 }
   1003 
   1004 static void
   1005 gen5_irq_disable(struct intel_engine_cs *engine)
   1006 {
   1007 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
   1008 }
   1009 
   1010 static void
   1011 i9xx_irq_enable(struct intel_engine_cs *engine)
   1012 {
   1013 	engine->i915->irq_mask &= ~engine->irq_enable_mask;
   1014 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
   1015 	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
   1016 }
   1017 
   1018 static void
   1019 i9xx_irq_disable(struct intel_engine_cs *engine)
   1020 {
   1021 	engine->i915->irq_mask |= engine->irq_enable_mask;
   1022 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
   1023 }
   1024 
   1025 static void
   1026 i8xx_irq_enable(struct intel_engine_cs *engine)
   1027 {
   1028 	struct drm_i915_private *i915 = engine->i915;
   1029 
   1030 	i915->irq_mask &= ~engine->irq_enable_mask;
   1031 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
   1032 	ENGINE_POSTING_READ16(engine, RING_IMR);
   1033 }
   1034 
   1035 static void
   1036 i8xx_irq_disable(struct intel_engine_cs *engine)
   1037 {
   1038 	struct drm_i915_private *i915 = engine->i915;
   1039 
   1040 	i915->irq_mask |= engine->irq_enable_mask;
   1041 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
   1042 }
   1043 
   1044 static int
   1045 bsd_ring_flush(struct i915_request *rq, u32 mode)
   1046 {
   1047 	u32 *cs;
   1048 
   1049 	cs = intel_ring_begin(rq, 2);
   1050 	if (IS_ERR(cs))
   1051 		return PTR_ERR(cs);
   1052 
   1053 	*cs++ = MI_FLUSH;
   1054 	*cs++ = MI_NOOP;
   1055 	intel_ring_advance(rq, cs);
   1056 	return 0;
   1057 }
   1058 
   1059 static void
   1060 gen6_irq_enable(struct intel_engine_cs *engine)
   1061 {
   1062 	ENGINE_WRITE(engine, RING_IMR,
   1063 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
   1064 
   1065 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
   1066 	ENGINE_POSTING_READ(engine, RING_IMR);
   1067 
   1068 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
   1069 }
   1070 
   1071 static void
   1072 gen6_irq_disable(struct intel_engine_cs *engine)
   1073 {
   1074 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
   1075 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
   1076 }
   1077 
   1078 static void
   1079 hsw_vebox_irq_enable(struct intel_engine_cs *engine)
   1080 {
   1081 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
   1082 
   1083 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
   1084 	ENGINE_POSTING_READ(engine, RING_IMR);
   1085 
   1086 	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
   1087 }
   1088 
   1089 static void
   1090 hsw_vebox_irq_disable(struct intel_engine_cs *engine)
   1091 {
   1092 	ENGINE_WRITE(engine, RING_IMR, ~0);
   1093 	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
   1094 }
   1095 
   1096 static int
   1097 i965_emit_bb_start(struct i915_request *rq,
   1098 		   u64 offset, u32 length,
   1099 		   unsigned int dispatch_flags)
   1100 {
   1101 	u32 *cs;
   1102 
   1103 	cs = intel_ring_begin(rq, 2);
   1104 	if (IS_ERR(cs))
   1105 		return PTR_ERR(cs);
   1106 
   1107 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
   1108 		I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
   1109 	*cs++ = offset;
   1110 	intel_ring_advance(rq, cs);
   1111 
   1112 	return 0;
   1113 }
   1114 
    1115 /* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
   1116 #define I830_BATCH_LIMIT SZ_256K
   1117 #define I830_TLB_ENTRIES (2)
   1118 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
   1119 static int
   1120 i830_emit_bb_start(struct i915_request *rq,
   1121 		   u64 offset, u32 len,
   1122 		   unsigned int dispatch_flags)
   1123 {
   1124 	u32 *cs, cs_offset =
   1125 		intel_gt_scratch_offset(rq->engine->gt,
   1126 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
   1127 
   1128 	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
   1129 
   1130 	cs = intel_ring_begin(rq, 6);
   1131 	if (IS_ERR(cs))
   1132 		return PTR_ERR(cs);
   1133 
   1134 	/* Evict the invalid PTE TLBs */
   1135 	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
   1136 	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
   1137 	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
   1138 	*cs++ = cs_offset;
   1139 	*cs++ = 0xdeadbeef;
   1140 	*cs++ = MI_NOOP;
   1141 	intel_ring_advance(rq, cs);
   1142 
   1143 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
   1144 		if (len > I830_BATCH_LIMIT)
   1145 			return -ENOSPC;
   1146 
   1147 		cs = intel_ring_begin(rq, 6 + 2);
   1148 		if (IS_ERR(cs))
   1149 			return PTR_ERR(cs);
   1150 
    1151 		/* Blit the batch (which now has all relocs applied) to the
   1152 		 * stable batch scratch bo area (so that the CS never
   1153 		 * stumbles over its tlb invalidation bug) ...
   1154 		 */
   1155 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
   1156 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
   1157 		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
   1158 		*cs++ = cs_offset;
   1159 		*cs++ = 4096;
   1160 		*cs++ = offset;
   1161 
   1162 		*cs++ = MI_FLUSH;
   1163 		*cs++ = MI_NOOP;
   1164 		intel_ring_advance(rq, cs);
   1165 
   1166 		/* ... and execute it. */
   1167 		offset = cs_offset;
   1168 	}
   1169 
   1170 	cs = intel_ring_begin(rq, 2);
   1171 	if (IS_ERR(cs))
   1172 		return PTR_ERR(cs);
   1173 
   1174 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
   1175 	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
   1176 		MI_BATCH_NON_SECURE);
   1177 	intel_ring_advance(rq, cs);
   1178 
   1179 	return 0;
   1180 }
   1181 
   1182 static int
   1183 i915_emit_bb_start(struct i915_request *rq,
   1184 		   u64 offset, u32 len,
   1185 		   unsigned int dispatch_flags)
   1186 {
   1187 	u32 *cs;
   1188 
   1189 	cs = intel_ring_begin(rq, 2);
   1190 	if (IS_ERR(cs))
   1191 		return PTR_ERR(cs);
   1192 
   1193 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
   1194 	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
   1195 		MI_BATCH_NON_SECURE);
   1196 	intel_ring_advance(rq, cs);
   1197 
   1198 	return 0;
   1199 }
   1200 
   1201 static void __ring_context_fini(struct intel_context *ce)
   1202 {
   1203 	i915_vma_put(ce->state);
   1204 }
   1205 
   1206 static void ring_context_destroy(struct kref *ref)
   1207 {
   1208 	struct intel_context *ce = container_of(ref, typeof(*ce), ref);
   1209 
   1210 	GEM_BUG_ON(intel_context_is_pinned(ce));
   1211 
   1212 	if (ce->state)
   1213 		__ring_context_fini(ce);
   1214 
   1215 	intel_context_fini(ce);
   1216 	intel_context_free(ce);
   1217 }
   1218 
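         /*
          * Page-table switches for a context bound to the global GTT are
          * performed through the aliasing ppgtt behind that GGTT, so hand
          * back the aliasing ppgtt's address space in that case.
          */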
   1219 static struct i915_address_space *vm_alias(struct intel_context *ce)
   1220 {
   1221 	struct i915_address_space *vm;
   1222 
   1223 	vm = ce->vm;
   1224 	if (i915_is_ggtt(vm))
   1225 		vm = &i915_vm_to_ggtt(vm)->alias->vm;
   1226 
   1227 	return vm;
   1228 }
   1229 
   1230 static int __context_pin_ppgtt(struct intel_context *ce)
   1231 {
   1232 	struct i915_address_space *vm;
   1233 	int err = 0;
   1234 
   1235 	vm = vm_alias(ce);
   1236 	if (vm)
   1237 		err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)));
   1238 
   1239 	return err;
   1240 }
   1241 
   1242 static void __context_unpin_ppgtt(struct intel_context *ce)
   1243 {
   1244 	struct i915_address_space *vm;
   1245 
   1246 	vm = vm_alias(ce);
   1247 	if (vm)
   1248 		gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm));
   1249 }
   1250 
   1251 static void ring_context_unpin(struct intel_context *ce)
   1252 {
   1253 	__context_unpin_ppgtt(ce);
   1254 }
   1255 
   1256 static struct i915_vma *
   1257 alloc_context_vma(struct intel_engine_cs *engine)
   1258 {
   1259 	struct drm_i915_private *i915 = engine->i915;
   1260 	struct drm_i915_gem_object *obj;
   1261 	struct i915_vma *vma;
   1262 	int err;
   1263 
   1264 	obj = i915_gem_object_create_shmem(i915, engine->context_size);
   1265 	if (IS_ERR(obj))
   1266 		return ERR_CAST(obj);
   1267 
   1268 	/*
   1269 	 * Try to make the context utilize L3 as well as LLC.
   1270 	 *
   1271 	 * On VLV we don't have L3 controls in the PTEs so we
   1272 	 * shouldn't touch the cache level, especially as that
   1273 	 * would make the object snooped which might have a
   1274 	 * negative performance impact.
   1275 	 *
   1276 	 * Snooping is required on non-llc platforms in execlist
   1277 	 * mode, but since all GGTT accesses use PAT entry 0 we
   1278 	 * get snooping anyway regardless of cache_level.
   1279 	 *
   1280 	 * This is only applicable for Ivy Bridge devices since
   1281 	 * later platforms don't have L3 control bits in the PTE.
   1282 	 */
   1283 	if (IS_IVYBRIDGE(i915))
   1284 		i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
   1285 
   1286 	if (engine->default_state) {
   1287 		void *defaults, *vaddr;
   1288 
   1289 		vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
   1290 		if (IS_ERR(vaddr)) {
   1291 			err = PTR_ERR(vaddr);
   1292 			goto err_obj;
   1293 		}
   1294 
   1295 		defaults = i915_gem_object_pin_map(engine->default_state,
   1296 						   I915_MAP_WB);
   1297 		if (IS_ERR(defaults)) {
   1298 			err = PTR_ERR(defaults);
   1299 			goto err_map;
   1300 		}
   1301 
   1302 		memcpy(vaddr, defaults, engine->context_size);
   1303 		i915_gem_object_unpin_map(engine->default_state);
   1304 
   1305 		i915_gem_object_flush_map(obj);
   1306 		i915_gem_object_unpin_map(obj);
   1307 	}
   1308 
   1309 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
   1310 	if (IS_ERR(vma)) {
   1311 		err = PTR_ERR(vma);
   1312 		goto err_obj;
   1313 	}
   1314 
   1315 	return vma;
   1316 
   1317 err_map:
   1318 	i915_gem_object_unpin_map(obj);
   1319 err_obj:
   1320 	i915_gem_object_put(obj);
   1321 	return ERR_PTR(err);
   1322 }
   1323 
   1324 static int ring_context_alloc(struct intel_context *ce)
   1325 {
   1326 	struct intel_engine_cs *engine = ce->engine;
   1327 
   1328 	/* One ringbuffer to rule them all */
   1329 	GEM_BUG_ON(!engine->legacy.ring);
   1330 	ce->ring = engine->legacy.ring;
   1331 	ce->timeline = intel_timeline_get(engine->legacy.timeline);
   1332 
   1333 	GEM_BUG_ON(ce->state);
   1334 	if (engine->context_size) {
   1335 		struct i915_vma *vma;
   1336 
   1337 		vma = alloc_context_vma(engine);
   1338 		if (IS_ERR(vma))
   1339 			return PTR_ERR(vma);
   1340 
   1341 		ce->state = vma;
   1342 		if (engine->default_state)
   1343 			__set_bit(CONTEXT_VALID_BIT, &ce->flags);
   1344 	}
   1345 
   1346 	return 0;
   1347 }
   1348 
   1349 static int ring_context_pin(struct intel_context *ce)
   1350 {
   1351 	return __context_pin_ppgtt(ce);
   1352 }
   1353 
   1354 static void ring_context_reset(struct intel_context *ce)
   1355 {
   1356 	intel_ring_reset(ce->ring, ce->ring->emit);
   1357 }
   1358 
   1359 static const struct intel_context_ops ring_context_ops = {
   1360 	.alloc = ring_context_alloc,
   1361 
   1362 	.pin = ring_context_pin,
   1363 	.unpin = ring_context_unpin,
   1364 
   1365 	.enter = intel_context_enter_engine,
   1366 	.exit = intel_context_exit_engine,
   1367 
   1368 	.reset = ring_context_reset,
   1369 	.destroy = ring_context_destroy,
   1370 };
   1371 
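         /*
          * Point the ring's PP_DIR_DCLV/PP_DIR_BASE registers at the given
          * ppgtt's page directory via MI_LOAD_REGISTER_IMM, read PP_DIR_BASE
          * back into the scratch page to serialise the update, ask INSTPM to
          * invalidate the TLBs, and finish with a flush.
          */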
   1372 static int load_pd_dir(struct i915_request *rq,
   1373 		       const struct i915_ppgtt *ppgtt,
   1374 		       u32 valid)
   1375 {
   1376 	const struct intel_engine_cs * const engine = rq->engine;
   1377 	u32 *cs;
   1378 
   1379 	cs = intel_ring_begin(rq, 12);
   1380 	if (IS_ERR(cs))
   1381 		return PTR_ERR(cs);
   1382 
   1383 	*cs++ = MI_LOAD_REGISTER_IMM(1);
   1384 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
   1385 	*cs++ = valid;
   1386 
   1387 	*cs++ = MI_LOAD_REGISTER_IMM(1);
   1388 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
   1389 	*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
   1390 
   1391 	/* Stall until the page table load is complete? */
   1392 	*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
   1393 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
   1394 	*cs++ = intel_gt_scratch_offset(engine->gt,
   1395 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
   1396 
   1397 	*cs++ = MI_LOAD_REGISTER_IMM(1);
   1398 	*cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base));
   1399 	*cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE);
   1400 
   1401 	intel_ring_advance(rq, cs);
   1402 
   1403 	return rq->engine->emit_flush(rq, EMIT_FLUSH);
   1404 }
   1405 
   1406 static inline int mi_set_context(struct i915_request *rq, u32 flags)
   1407 {
   1408 	struct drm_i915_private *i915 = rq->i915;
   1409 	struct intel_engine_cs *engine = rq->engine;
   1410 	enum intel_engine_id id;
   1411 	const int num_engines =
   1412 		IS_HASWELL(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
   1413 	bool force_restore = false;
   1414 	int len;
   1415 	u32 *cs;
   1416 
   1417 	len = 4;
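         	/*
         	 * Worst-case dword count: MI_NOOP + MI_SET_CONTEXT + descriptor +
         	 * trailing MI_NOOP (4), plus the gen7 MI_ARB_ON_OFF pair and
         	 * per-engine PSMI LRI/SRM sequences, the gen5 MI_SUSPEND_FLUSH
         	 * pair, and the optional forced-restore MI_SET_CONTEXT pair.
         	 */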
   1418 	if (IS_GEN(i915, 7))
   1419 		len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
   1420 	else if (IS_GEN(i915, 5))
   1421 		len += 2;
   1422 	if (flags & MI_FORCE_RESTORE) {
   1423 		GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
   1424 		flags &= ~MI_FORCE_RESTORE;
   1425 		force_restore = true;
   1426 		len += 2;
   1427 	}
   1428 
   1429 	cs = intel_ring_begin(rq, len);
   1430 	if (IS_ERR(cs))
   1431 		return PTR_ERR(cs);
   1432 
   1433 	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
   1434 	if (IS_GEN(i915, 7)) {
   1435 		*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   1436 		if (num_engines) {
   1437 			struct intel_engine_cs *signaller;
   1438 
   1439 			*cs++ = MI_LOAD_REGISTER_IMM(num_engines);
   1440 			for_each_engine(signaller, engine->gt, id) {
   1441 				if (signaller == engine)
   1442 					continue;
   1443 
   1444 				*cs++ = i915_mmio_reg_offset(
   1445 					   RING_PSMI_CTL(signaller->mmio_base));
   1446 				*cs++ = _MASKED_BIT_ENABLE(
   1447 						GEN6_PSMI_SLEEP_MSG_DISABLE);
   1448 			}
   1449 		}
   1450 	} else if (IS_GEN(i915, 5)) {
   1451 		/*
   1452 		 * This w/a is only listed for pre-production ilk a/b steppings,
   1453 		 * but is also mentioned for programming the powerctx. To be
   1454 		 * safe, just apply the workaround; we do not use SyncFlush so
   1455 		 * this should never take effect and so be a no-op!
   1456 		 */
   1457 		*cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
   1458 	}
   1459 
   1460 	if (force_restore) {
   1461 		/*
   1462 		 * The HW doesn't handle being told to restore the current
    1463 		 * context very well. Quite often it likes to go off and
   1464 		 * sulk, especially when it is meant to be reloading PP_DIR.
   1465 		 * A very simple fix to force the reload is to simply switch
   1466 		 * away from the current context and back again.
   1467 		 *
   1468 		 * Note that the kernel_context will contain random state
   1469 		 * following the INHIBIT_RESTORE. We accept this since we
   1470 		 * never use the kernel_context state; it is merely a
   1471 		 * placeholder we use to flush other contexts.
   1472 		 */
   1473 		*cs++ = MI_SET_CONTEXT;
   1474 		*cs++ = i915_ggtt_offset(engine->kernel_context->state) |
   1475 			MI_MM_SPACE_GTT |
   1476 			MI_RESTORE_INHIBIT;
   1477 	}
   1478 
   1479 	*cs++ = MI_NOOP;
   1480 	*cs++ = MI_SET_CONTEXT;
   1481 	*cs++ = i915_ggtt_offset(rq->context->state) | flags;
   1482 	/*
   1483 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
   1484 	 * WaMiSetContext_Hang:snb,ivb,vlv
   1485 	 */
   1486 	*cs++ = MI_NOOP;
   1487 
   1488 	if (IS_GEN(i915, 7)) {
   1489 		if (num_engines) {
   1490 			struct intel_engine_cs *signaller;
   1491 			i915_reg_t last_reg = {}; /* keep gcc quiet */
   1492 
   1493 			*cs++ = MI_LOAD_REGISTER_IMM(num_engines);
   1494 			for_each_engine(signaller, engine->gt, id) {
   1495 				if (signaller == engine)
   1496 					continue;
   1497 
   1498 				last_reg = RING_PSMI_CTL(signaller->mmio_base);
   1499 				*cs++ = i915_mmio_reg_offset(last_reg);
   1500 				*cs++ = _MASKED_BIT_DISABLE(
   1501 						GEN6_PSMI_SLEEP_MSG_DISABLE);
   1502 			}
   1503 
   1504 			/* Insert a delay before the next switch! */
   1505 			*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
   1506 			*cs++ = i915_mmio_reg_offset(last_reg);
   1507 			*cs++ = intel_gt_scratch_offset(engine->gt,
   1508 							INTEL_GT_SCRATCH_FIELD_DEFAULT);
   1509 			*cs++ = MI_NOOP;
   1510 		}
   1511 		*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   1512 	} else if (IS_GEN(i915, 5)) {
   1513 		*cs++ = MI_SUSPEND_FLUSH;
   1514 	}
   1515 
   1516 	intel_ring_advance(rq, cs);
   1517 
   1518 	return 0;
   1519 }
   1520 
   1521 static int remap_l3_slice(struct i915_request *rq, int slice)
   1522 {
   1523 	u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
   1524 	int i;
   1525 
   1526 	if (!remap_info)
   1527 		return 0;
   1528 
   1529 	cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
   1530 	if (IS_ERR(cs))
   1531 		return PTR_ERR(cs);
   1532 
   1533 	/*
   1534 	 * Note: We do not worry about the concurrent register cacheline hang
   1535 	 * here because no other code should access these registers other than
   1536 	 * at initialization time.
   1537 	 */
   1538 	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
   1539 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
   1540 		*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
   1541 		*cs++ = remap_info[i];
   1542 	}
   1543 	*cs++ = MI_NOOP;
   1544 	intel_ring_advance(rq, cs);
   1545 
   1546 	return 0;
   1547 }
   1548 
   1549 static int remap_l3(struct i915_request *rq)
   1550 {
   1551 	struct i915_gem_context *ctx = i915_request_gem_context(rq);
   1552 	int i, err;
   1553 
   1554 	if (!ctx || !ctx->remap_slice)
   1555 		return 0;
   1556 
   1557 	for (i = 0; i < MAX_L3_SLICES; i++) {
   1558 		if (!(ctx->remap_slice & BIT(i)))
   1559 			continue;
   1560 
   1561 		err = remap_l3_slice(rq, i);
   1562 		if (err)
   1563 			return err;
   1564 	}
   1565 
   1566 	ctx->remap_slice = 0;
   1567 	return 0;
   1568 }
   1569 
   1570 static int switch_mm(struct i915_request *rq, struct i915_address_space *vm)
   1571 {
   1572 	int ret;
   1573 
   1574 	if (!vm)
   1575 		return 0;
   1576 
   1577 	ret = rq->engine->emit_flush(rq, EMIT_FLUSH);
   1578 	if (ret)
   1579 		return ret;
   1580 
   1581 	/*
   1582 	 * Not only do we need a full barrier (post-sync write) after
   1583 	 * invalidating the TLBs, but we need to wait a little bit
   1584 	 * longer. Whether this is merely delaying us, or the
   1585 	 * subsequent flush is a key part of serialising with the
   1586 	 * post-sync op, this extra pass appears vital before a
   1587 	 * mm switch!
   1588 	 */
   1589 	ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G);
   1590 	if (ret)
   1591 		return ret;
   1592 
   1593 	return rq->engine->emit_flush(rq, EMIT_INVALIDATE);
   1594 }
   1595 
   1596 static int switch_context(struct i915_request *rq)
   1597 {
   1598 	struct intel_context *ce = rq->context;
   1599 	int ret;
   1600 
   1601 	GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
   1602 
   1603 	ret = switch_mm(rq, vm_alias(ce));
   1604 	if (ret)
   1605 		return ret;
   1606 
   1607 	if (ce->state) {
   1608 		u32 flags;
   1609 
   1610 		GEM_BUG_ON(rq->engine->id != RCS0);
   1611 
   1612 		/* For resource streamer on HSW+ and power context elsewhere */
   1613 		BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN);
   1614 		BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN);
   1615 
   1616 		flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT;
   1617 		if (test_bit(CONTEXT_VALID_BIT, &ce->flags))
   1618 			flags |= MI_RESTORE_EXT_STATE_EN;
   1619 		else
   1620 			flags |= MI_RESTORE_INHIBIT;
   1621 
   1622 		ret = mi_set_context(rq, flags);
   1623 		if (ret)
   1624 			return ret;
   1625 	}
   1626 
   1627 	ret = remap_l3(rq);
   1628 	if (ret)
   1629 		return ret;
   1630 
   1631 	return 0;
   1632 }
   1633 
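         /*
          * Per-request setup on the legacy ring: invalidate GPU caches and
          * emit the context switch before the caller adds its own commands.
          * Ring space is reserved up front so that any wait for space happens
          * here rather than midway through request construction.
          */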
   1634 static int ring_request_alloc(struct i915_request *request)
   1635 {
   1636 	int ret;
   1637 
   1638 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
   1639 	GEM_BUG_ON(i915_request_timeline(request)->has_initial_breadcrumb);
   1640 
   1641 	/*
    1642 	 * Reserve enough ring space up front to reduce the likelihood of
    1643 	 * waiting once we start building the request - in which case we
    1644 	 * would just have to repeat the work.
   1645 	 */
   1646 	request->reserved_space += LEGACY_REQUEST_SIZE;
   1647 
   1648 	/* Unconditionally invalidate GPU caches and TLBs. */
   1649 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
   1650 	if (ret)
   1651 		return ret;
   1652 
   1653 	ret = switch_context(request);
   1654 	if (ret)
   1655 		return ret;
   1656 
   1657 	request->reserved_space -= LEGACY_REQUEST_SIZE;
   1658 	return 0;
   1659 }
   1660 
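         /*
          * Tail writes on the gen6 BSD ring require the workaround sequence
          * below: take forcewake, keep the ring from sleeping while the tail
          * is updated, then restore its ability to enter rc6.
          */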
   1661 static void gen6_bsd_submit_request(struct i915_request *request)
   1662 {
   1663 	struct intel_uncore *uncore = request->engine->uncore;
   1664 
   1665 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
   1666 
    1667 	/* Every tail move must follow the sequence below */
   1668 
   1669 	/* Disable notification that the ring is IDLE. The GT
   1670 	 * will then assume that it is busy and bring it out of rc6.
   1671 	 */
   1672 	intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
   1673 			      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
   1674 
   1675 	/* Clear the context id. Here be magic! */
   1676 	intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
   1677 
   1678 	/* Wait for the ring not to be idle, i.e. for it to wake up. */
   1679 	if (__intel_wait_for_register_fw(uncore,
   1680 					 GEN6_BSD_SLEEP_PSMI_CONTROL,
   1681 					 GEN6_BSD_SLEEP_INDICATOR,
   1682 					 0,
   1683 					 1000, 0, NULL))
   1684 		DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
   1685 
   1686 	/* Now that the ring is fully powered up, update the tail */
   1687 	i9xx_submit_request(request);
   1688 
   1689 	/* Let the ring send IDLE messages to the GT again,
    1690 	 * so it can go back to sleep and conserve power when idle.
   1691 	 */
   1692 	intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
   1693 			      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
   1694 
   1695 	intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
   1696 }
   1697 
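         /*
          * Emit MI_FLUSH_DW with a post-sync dword write to the scratch slot
          * of the HWSP; callers pass any engine-specific invalidate bits in
          * @flags.
          */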
   1698 static int mi_flush_dw(struct i915_request *rq, u32 flags)
   1699 {
   1700 	u32 cmd, *cs;
   1701 
   1702 	cs = intel_ring_begin(rq, 4);
   1703 	if (IS_ERR(cs))
   1704 		return PTR_ERR(cs);
   1705 
   1706 	cmd = MI_FLUSH_DW;
   1707 
   1708 	/*
   1709 	 * We always require a command barrier so that subsequent
   1710 	 * commands, such as breadcrumb interrupts, are strictly ordered
   1711 	 * wrt the contents of the write cache being flushed to memory
   1712 	 * (and thus being coherent from the CPU).
   1713 	 */
   1714 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
   1715 
   1716 	/*
   1717 	 * Bspec vol 1c.3 - blitter engine command streamer:
   1718 	 * "If ENABLED, all TLBs will be invalidated once the flush
   1719 	 * operation is complete. This bit is only valid when the
   1720 	 * Post-Sync Operation field is a value of 1h or 3h."
   1721 	 */
   1722 	cmd |= flags;
   1723 
   1724 	*cs++ = cmd;
   1725 	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
   1726 	*cs++ = 0;
   1727 	*cs++ = MI_NOOP;
   1728 
   1729 	intel_ring_advance(rq, cs);
   1730 
   1731 	return 0;
   1732 }
   1733 
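         /* Translate EMIT_INVALIDATE into the engine-specific MI_FLUSH_DW bits. */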
   1734 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
   1735 {
   1736 	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
   1737 }
   1738 
   1739 static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
   1740 {
   1741 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
   1742 }
   1743 
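         /*
          * Haswell batch start: a secure dispatch runs privileged from the
          * GGTT, otherwise the batch is marked non-secure and executed from
          * the ppgtt.
          */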
   1744 static int
   1745 hsw_emit_bb_start(struct i915_request *rq,
   1746 		  u64 offset, u32 len,
   1747 		  unsigned int dispatch_flags)
   1748 {
   1749 	u32 *cs;
   1750 
   1751 	cs = intel_ring_begin(rq, 2);
   1752 	if (IS_ERR(cs))
   1753 		return PTR_ERR(cs);
   1754 
   1755 	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
   1756 		0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
    1757 	/* bits 0-7 are the length on GEN6+ */
   1758 	*cs++ = offset;
   1759 	intel_ring_advance(rq, cs);
   1760 
   1761 	return 0;
   1762 }
   1763 
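         /*
          * gen6+ batch start: mark the batch non-secure unless the caller
          * asked for a secure (privileged) dispatch.
          */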
   1764 static int
   1765 gen6_emit_bb_start(struct i915_request *rq,
   1766 		   u64 offset, u32 len,
   1767 		   unsigned int dispatch_flags)
   1768 {
   1769 	u32 *cs;
   1770 
   1771 	cs = intel_ring_begin(rq, 2);
   1772 	if (IS_ERR(cs))
   1773 		return PTR_ERR(cs);
   1774 
   1775 	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
   1776 		0 : MI_BATCH_NON_SECURE_I965);
    1777 	/* bits 0-7 are the length on GEN6+ */
   1778 	*cs++ = offset;
   1779 	intel_ring_advance(rq, cs);
   1780 
   1781 	return 0;
   1782 }
   1783 
   1784 /* Blitter support (SandyBridge+) */
   1785 
   1786 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
   1787 {
   1788 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
   1789 }
   1790 
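         /*
          * Legacy submission updates the ring tail directly from the request
          * path; no park/unpark hooks are installed.
          */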
   1791 static void i9xx_set_default_submission(struct intel_engine_cs *engine)
   1792 {
   1793 	engine->submit_request = i9xx_submit_request;
   1794 
   1795 	engine->park = NULL;
   1796 	engine->unpark = NULL;
   1797 }
   1798 
   1799 static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
   1800 {
   1801 	i9xx_set_default_submission(engine);
   1802 	engine->submit_request = gen6_bsd_submit_request;
   1803 }
   1804 
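         /*
          * Engine teardown: warn if the hardware has not gone idle, then
          * release the common engine state and drop the pins and references
          * on the ring and timeline taken in intel_ring_submission_setup().
          */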
   1805 static void ring_release(struct intel_engine_cs *engine)
   1806 {
   1807 	struct drm_i915_private *dev_priv = engine->i915;
   1808 
   1809 	WARN_ON(INTEL_GEN(dev_priv) > 2 &&
   1810 		(ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
   1811 
   1812 	intel_engine_cleanup_common(engine);
   1813 
   1814 	intel_ring_unpin(engine->legacy.ring);
   1815 	intel_ring_put(engine->legacy.ring);
   1816 
   1817 	intel_timeline_unpin(engine->legacy.timeline);
   1818 	intel_timeline_put(engine->legacy.timeline);
   1819 }
   1820 
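         /* Select the interrupt enable/disable hooks for this generation. */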
   1821 static void setup_irq(struct intel_engine_cs *engine)
   1822 {
   1823 	struct drm_i915_private *i915 = engine->i915;
   1824 
   1825 	if (INTEL_GEN(i915) >= 6) {
   1826 		engine->irq_enable = gen6_irq_enable;
   1827 		engine->irq_disable = gen6_irq_disable;
   1828 	} else if (INTEL_GEN(i915) >= 5) {
   1829 		engine->irq_enable = gen5_irq_enable;
   1830 		engine->irq_disable = gen5_irq_disable;
   1831 	} else if (INTEL_GEN(i915) >= 3) {
   1832 		engine->irq_enable = i9xx_irq_enable;
   1833 		engine->irq_disable = i9xx_irq_disable;
   1834 	} else {
   1835 		engine->irq_enable = i8xx_irq_enable;
   1836 		engine->irq_disable = i8xx_irq_disable;
   1837 	}
   1838 }
   1839 
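         /*
          * Defaults shared by all legacy ring engines; the per-class setup_*()
          * functions below refine the flush, breadcrumb, interrupt and
          * batch-start details.
          */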
   1840 static void setup_common(struct intel_engine_cs *engine)
   1841 {
   1842 	struct drm_i915_private *i915 = engine->i915;
   1843 
   1844 	/* gen8+ are only supported with execlists */
   1845 	GEM_BUG_ON(INTEL_GEN(i915) >= 8);
   1846 
   1847 	setup_irq(engine);
   1848 
   1849 	engine->resume = xcs_resume;
   1850 	engine->reset.prepare = reset_prepare;
   1851 	engine->reset.rewind = reset_rewind;
   1852 	engine->reset.cancel = reset_cancel;
   1853 	engine->reset.finish = reset_finish;
   1854 
   1855 	engine->cops = &ring_context_ops;
   1856 	engine->request_alloc = ring_request_alloc;
   1857 
   1858 	/*
   1859 	 * Using a global execution timeline; the previous final breadcrumb is
    1860 	 * equivalent to our next initial breadcrumb, so we can elide
    1861 	 * engine->emit_init_breadcrumb().
   1862 	 */
   1863 	engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
   1864 	if (IS_GEN(i915, 5))
   1865 		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
   1866 
   1867 	engine->set_default_submission = i9xx_set_default_submission;
   1868 
   1869 	if (INTEL_GEN(i915) >= 6)
   1870 		engine->emit_bb_start = gen6_emit_bb_start;
   1871 	else if (INTEL_GEN(i915) >= 4)
   1872 		engine->emit_bb_start = i965_emit_bb_start;
   1873 	else if (IS_I830(i915) || IS_I845G(i915))
   1874 		engine->emit_bb_start = i830_emit_bb_start;
   1875 	else
   1876 		engine->emit_bb_start = i915_emit_bb_start;
   1877 }
   1878 
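         /*
          * Render engine specifics: per-generation flush and breadcrumb
          * emitters, the L3 parity interrupt, and Haswell's batch-buffer
          * start variant.
          */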
   1879 static void setup_rcs(struct intel_engine_cs *engine)
   1880 {
   1881 	struct drm_i915_private *i915 = engine->i915;
   1882 
   1883 	if (HAS_L3_DPF(i915))
   1884 		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
   1885 
   1886 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
   1887 
   1888 	if (INTEL_GEN(i915) >= 7) {
   1889 		engine->emit_flush = gen7_render_ring_flush;
   1890 		engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
   1891 	} else if (IS_GEN(i915, 6)) {
   1892 		engine->emit_flush = gen6_render_ring_flush;
   1893 		engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
   1894 	} else if (IS_GEN(i915, 5)) {
   1895 		engine->emit_flush = gen4_render_ring_flush;
   1896 	} else {
   1897 		if (INTEL_GEN(i915) < 4)
   1898 			engine->emit_flush = gen2_render_ring_flush;
   1899 		else
   1900 			engine->emit_flush = gen4_render_ring_flush;
   1901 		engine->irq_enable_mask = I915_USER_INTERRUPT;
   1902 	}
   1903 
   1904 	if (IS_HASWELL(i915))
   1905 		engine->emit_bb_start = hsw_emit_bb_start;
   1906 
   1907 	engine->resume = rcs_resume;
   1908 }
   1909 
   1910 static void setup_vcs(struct intel_engine_cs *engine)
   1911 {
   1912 	struct drm_i915_private *i915 = engine->i915;
   1913 
   1914 	if (INTEL_GEN(i915) >= 6) {
    1915 		/* gen6 bsd needs a special workaround for tail updates */
   1916 		if (IS_GEN(i915, 6))
   1917 			engine->set_default_submission = gen6_bsd_set_default_submission;
   1918 		engine->emit_flush = gen6_bsd_ring_flush;
   1919 		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
   1920 
   1921 		if (IS_GEN(i915, 6))
   1922 			engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
   1923 		else
   1924 			engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
   1925 	} else {
   1926 		engine->emit_flush = bsd_ring_flush;
   1927 		if (IS_GEN(i915, 5))
   1928 			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
   1929 		else
   1930 			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
   1931 	}
   1932 }
   1933 
   1934 static void setup_bcs(struct intel_engine_cs *engine)
   1935 {
   1936 	struct drm_i915_private *i915 = engine->i915;
   1937 
   1938 	engine->emit_flush = gen6_ring_flush;
   1939 	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
   1940 
   1941 	if (IS_GEN(i915, 6))
   1942 		engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
   1943 	else
   1944 		engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
   1945 }
   1946 
   1947 static void setup_vecs(struct intel_engine_cs *engine)
   1948 {
   1949 	struct drm_i915_private *i915 = engine->i915;
   1950 
   1951 	GEM_BUG_ON(INTEL_GEN(i915) < 7);
   1952 
   1953 	engine->emit_flush = gen6_ring_flush;
   1954 	engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
   1955 	engine->irq_enable = hsw_vebox_irq_enable;
   1956 	engine->irq_disable = hsw_vebox_irq_disable;
   1957 
   1958 	engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
   1959 }
   1960 
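         /*
          * Build a legacy ringbuffer engine: install the common and per-class
          * vfuncs, create and pin a timeline backed by the status page and a
          * 16KiB ring, and only then take ownership of cleanup via
          * ring_release().
          */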
   1961 int intel_ring_submission_setup(struct intel_engine_cs *engine)
   1962 {
   1963 	struct intel_timeline *timeline;
   1964 	struct intel_ring *ring;
   1965 	int err;
   1966 
   1967 	setup_common(engine);
   1968 
   1969 	switch (engine->class) {
   1970 	case RENDER_CLASS:
   1971 		setup_rcs(engine);
   1972 		break;
   1973 	case VIDEO_DECODE_CLASS:
   1974 		setup_vcs(engine);
   1975 		break;
   1976 	case COPY_ENGINE_CLASS:
   1977 		setup_bcs(engine);
   1978 		break;
   1979 	case VIDEO_ENHANCEMENT_CLASS:
   1980 		setup_vecs(engine);
   1981 		break;
   1982 	default:
   1983 		MISSING_CASE(engine->class);
   1984 		return -ENODEV;
   1985 	}
   1986 
   1987 	timeline = intel_timeline_create(engine->gt, engine->status_page.vma);
   1988 	if (IS_ERR(timeline)) {
   1989 		err = PTR_ERR(timeline);
   1990 		goto err;
   1991 	}
   1992 	GEM_BUG_ON(timeline->has_initial_breadcrumb);
   1993 
   1994 	err = intel_timeline_pin(timeline);
   1995 	if (err)
   1996 		goto err_timeline;
   1997 
   1998 	ring = intel_engine_create_ring(engine, SZ_16K);
   1999 	if (IS_ERR(ring)) {
   2000 		err = PTR_ERR(ring);
   2001 		goto err_timeline_unpin;
   2002 	}
   2003 
   2004 	err = intel_ring_pin(ring);
   2005 	if (err)
   2006 		goto err_ring;
   2007 
   2008 	GEM_BUG_ON(engine->legacy.ring);
   2009 	engine->legacy.ring = ring;
   2010 	engine->legacy.timeline = timeline;
   2011 
   2012 	GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);
   2013 
   2014 	/* Finally, take ownership and responsibility for cleanup! */
   2015 	engine->release = ring_release;
   2016 
   2017 	return 0;
   2018 
   2019 err_ring:
   2020 	intel_ring_put(ring);
   2021 err_timeline_unpin:
   2022 	intel_timeline_unpin(timeline);
   2023 err_timeline:
   2024 	intel_timeline_put(timeline);
   2025 err:
   2026 	intel_engine_cleanup_common(engine);
   2027 	return err;
   2028 }
   2029