      1 /*	$NetBSD: scheduler.c,v 1.2 2021/12/18 23:45:31 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright(c) 2011-2016 Intel Corporation. All rights reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Zhi Wang <zhi.a.wang (at) intel.com>
     27  *
     28  * Contributors:
     29  *    Ping Gao <ping.a.gao (at) intel.com>
     30  *    Tina Zhang <tina.zhang (at) intel.com>
     31  *    Chanbin Du <changbin.du (at) intel.com>
     32  *    Min He <min.he (at) intel.com>
     33  *    Bing Niu <bing.niu (at) intel.com>
     34  *    Zhenyu Wang <zhenyuw (at) linux.intel.com>
     35  *
     36  */
     37 
     38 #include <sys/cdefs.h>
     39 __KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.2 2021/12/18 23:45:31 riastradh Exp $");
     40 
     41 #include <linux/kthread.h>
     42 
     43 #include "gem/i915_gem_pm.h"
     44 #include "gt/intel_context.h"
     45 #include "gt/intel_ring.h"
     46 
     47 #include "i915_drv.h"
     48 #include "i915_gem_gtt.h"
     49 #include "gvt.h"
     50 
     51 #define RING_CTX_OFF(x) \
     52 	offsetof(struct execlist_ring_context, x)
     53 
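         /*
          * Write the shadow PDP root pointers into the execlist ring context.
          * The ring context stores the PDP registers in descending order
          * (PDP3..PDP0) while the input array is ascending, hence pdp[7 - i].
          */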
     54 static void set_context_pdp_root_pointer(
     55 		struct execlist_ring_context *ring_context,
     56 		u32 pdp[8])
     57 {
     58 	int i;
     59 
     60 	for (i = 0; i < 8; i++)
     61 		ring_context->pdps[i].val = pdp[7 - i];
     62 }
     63 
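         /*
          * Copy the root pointers of the pinned shadow PPGTT into the shadow
          * ring context, so the context executes against the shadow page
          * tables rather than the guest-provided ones.
          */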
     64 static void update_shadow_pdps(struct intel_vgpu_workload *workload)
     65 {
     66 	struct drm_i915_gem_object *ctx_obj =
     67 		workload->req->context->state->obj;
     68 	struct execlist_ring_context *shadow_ring_context;
     69 	struct page *page;
     70 
     71 	if (WARN_ON(!workload->shadow_mm))
     72 		return;
     73 
     74 	if (WARN_ON(!atomic_read(&workload->shadow_mm->pincount)))
     75 		return;
     76 
     77 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
     78 	shadow_ring_context = kmap(page);
     79 	set_context_pdp_root_pointer(shadow_ring_context,
     80 			(void *)workload->shadow_mm->ppgtt_mm.shadow_pdps);
     81 	kunmap(page);
     82 }
     83 
     84 /*
      85  * When populating the shadow ctx from the guest, we should not override the
      86  * OA-related registers, so that they are not clobbered by guest OA configs.
      87  * This makes it possible to capture OA data on the host for host and guests.
     88  */
     89 static void sr_oa_regs(struct intel_vgpu_workload *workload,
     90 		u32 *reg_state, bool save)
     91 {
     92 	struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
     93 	u32 ctx_oactxctrl = dev_priv->perf.ctx_oactxctrl_offset;
     94 	u32 ctx_flexeu0 = dev_priv->perf.ctx_flexeu0_offset;
     95 	int i = 0;
     96 	u32 flex_mmio[] = {
     97 		i915_mmio_reg_offset(EU_PERF_CNTL0),
     98 		i915_mmio_reg_offset(EU_PERF_CNTL1),
     99 		i915_mmio_reg_offset(EU_PERF_CNTL2),
    100 		i915_mmio_reg_offset(EU_PERF_CNTL3),
    101 		i915_mmio_reg_offset(EU_PERF_CNTL4),
    102 		i915_mmio_reg_offset(EU_PERF_CNTL5),
    103 		i915_mmio_reg_offset(EU_PERF_CNTL6),
    104 	};
    105 
    106 	if (workload->ring_id != RCS0)
    107 		return;
    108 
    109 	if (save) {
    110 		workload->oactxctrl = reg_state[ctx_oactxctrl + 1];
    111 
    112 		for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
    113 			u32 state_offset = ctx_flexeu0 + i * 2;
    114 
    115 			workload->flex_mmio[i] = reg_state[state_offset + 1];
    116 		}
    117 	} else {
    118 		reg_state[ctx_oactxctrl] =
    119 			i915_mmio_reg_offset(GEN8_OACTXCONTROL);
    120 		reg_state[ctx_oactxctrl + 1] = workload->oactxctrl;
    121 
    122 		for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
    123 			u32 state_offset = ctx_flexeu0 + i * 2;
    124 			u32 mmio = flex_mmio[i];
    125 
    126 			reg_state[state_offset] = mmio;
    127 			reg_state[state_offset + 1] = workload->flex_mmio[i];
    128 		}
    129 	}
    130 }
    131 
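         /*
          * Copy the guest ring context from guest memory into the shadow
          * context image: selected registers are copied individually (with
          * the OA registers preserved via sr_oa_regs), and the remaining
          * context pages are copied wholesale unless the context is
          * restore-inhibited.
          */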
    132 static int populate_shadow_context(struct intel_vgpu_workload *workload)
    133 {
    134 	struct intel_vgpu *vgpu = workload->vgpu;
    135 	struct intel_gvt *gvt = vgpu->gvt;
    136 	int ring_id = workload->ring_id;
    137 	struct drm_i915_gem_object *ctx_obj =
    138 		workload->req->context->state->obj;
    139 	struct execlist_ring_context *shadow_ring_context;
    140 	struct page *page;
    141 	void *dst;
    142 	unsigned long context_gpa, context_page_num;
    143 	int i;
    144 
    145 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
    146 	shadow_ring_context = kmap(page);
    147 
    148 	sr_oa_regs(workload, (u32 *)shadow_ring_context, true);
    149 #define COPY_REG(name) \
    150 	intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
    151 		+ RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
    152 #define COPY_REG_MASKED(name) {\
    153 		intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
    154 					      + RING_CTX_OFF(name.val),\
    155 					      &shadow_ring_context->name.val, 4);\
    156 		shadow_ring_context->name.val |= 0xffff << 16;\
    157 	}
    158 
    159 	COPY_REG_MASKED(ctx_ctrl);
    160 	COPY_REG(ctx_timestamp);
    161 
    162 	if (ring_id == RCS0) {
    163 		COPY_REG(bb_per_ctx_ptr);
    164 		COPY_REG(rcs_indirect_ctx);
    165 		COPY_REG(rcs_indirect_ctx_offset);
    166 	}
    167 #undef COPY_REG
    168 #undef COPY_REG_MASKED
    169 
    170 	intel_gvt_hypervisor_read_gpa(vgpu,
    171 			workload->ring_context_gpa +
    172 			sizeof(*shadow_ring_context),
    173 			(void *)shadow_ring_context +
    174 			sizeof(*shadow_ring_context),
    175 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
    176 
    177 	sr_oa_regs(workload, (u32 *)shadow_ring_context, false);
    178 	kunmap(page);
    179 
    180 	if (IS_RESTORE_INHIBIT(shadow_ring_context->ctx_ctrl.val))
    181 		return 0;
    182 
    183 	gvt_dbg_sched("ring id %d workload lrca %x", ring_id,
    184 			workload->ctx_desc.lrca);
    185 
    186 	context_page_num = gvt->dev_priv->engine[ring_id]->context_size;
    187 
    188 	context_page_num = context_page_num >> PAGE_SHIFT;
    189 
    190 	if (IS_BROADWELL(gvt->dev_priv) && ring_id == RCS0)
    191 		context_page_num = 19;
    192 
    193 	i = 2;
    194 	while (i < context_page_num) {
    195 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
    196 				(u32)((workload->ctx_desc.lrca + i) <<
    197 				I915_GTT_PAGE_SHIFT));
    198 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
    199 			gvt_vgpu_err("Invalid guest context descriptor\n");
    200 			return -EFAULT;
    201 		}
    202 
    203 		page = i915_gem_object_get_page(ctx_obj, i);
    204 		dst = kmap(page);
    205 		intel_gvt_hypervisor_read_gpa(vgpu, context_gpa, dst,
    206 				I915_GTT_PAGE_SIZE);
    207 		kunmap(page);
    208 		i++;
    209 	}
    210 	return 0;
    211 }
    212 
    213 static inline bool is_gvt_request(struct i915_request *rq)
    214 {
    215 	return intel_context_force_single_submission(rq->context);
    216 }
    217 
    218 static void save_ring_hw_state(struct intel_vgpu *vgpu, int ring_id)
    219 {
    220 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
    221 	u32 ring_base = dev_priv->engine[ring_id]->mmio_base;
    222 	i915_reg_t reg;
    223 
    224 	reg = RING_INSTDONE(ring_base);
    225 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
    226 	reg = RING_ACTHD(ring_base);
    227 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
    228 	reg = RING_ACTHD_UDW(ring_base);
    229 	vgpu_vreg(vgpu, i915_mmio_reg_offset(reg)) = I915_READ_FW(reg);
    230 }
    231 
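         /*
          * Context status notifier, called by i915 when a context is
          * scheduled in, scheduled out or preempted on an engine. For GVT
          * requests it switches the engine MMIO state between the host and
          * the owning vGPU and tracks whether the shadow context is active
          * on hardware.
          */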
    232 static int shadow_context_status_change(struct notifier_block *nb,
    233 		unsigned long action, void *data)
    234 {
    235 	struct i915_request *req = data;
    236 	struct intel_gvt *gvt = container_of(nb, struct intel_gvt,
    237 				shadow_ctx_notifier_block[req->engine->id]);
    238 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
    239 	enum intel_engine_id ring_id = req->engine->id;
    240 	struct intel_vgpu_workload *workload;
    241 	unsigned long flags;
    242 
    243 	if (!is_gvt_request(req)) {
    244 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
    245 		if (action == INTEL_CONTEXT_SCHEDULE_IN &&
    246 		    scheduler->engine_owner[ring_id]) {
    247 			/* Switch ring from vGPU to host. */
    248 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
    249 					      NULL, ring_id);
    250 			scheduler->engine_owner[ring_id] = NULL;
    251 		}
    252 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
    253 
    254 		return NOTIFY_OK;
    255 	}
    256 
    257 	workload = scheduler->current_workload[ring_id];
    258 	if (unlikely(!workload))
    259 		return NOTIFY_OK;
    260 
    261 	switch (action) {
    262 	case INTEL_CONTEXT_SCHEDULE_IN:
    263 		spin_lock_irqsave(&scheduler->mmio_context_lock, flags);
    264 		if (workload->vgpu != scheduler->engine_owner[ring_id]) {
    265 			/* Switch ring from host to vGPU or vGPU to vGPU. */
    266 			intel_gvt_switch_mmio(scheduler->engine_owner[ring_id],
    267 					      workload->vgpu, ring_id);
    268 			scheduler->engine_owner[ring_id] = workload->vgpu;
    269 		} else
    270 			gvt_dbg_sched("skip ring %d mmio switch for vgpu%d\n",
    271 				      ring_id, workload->vgpu->id);
    272 		spin_unlock_irqrestore(&scheduler->mmio_context_lock, flags);
    273 		atomic_set(&workload->shadow_ctx_active, 1);
    274 		break;
    275 	case INTEL_CONTEXT_SCHEDULE_OUT:
    276 		save_ring_hw_state(workload->vgpu, ring_id);
    277 		atomic_set(&workload->shadow_ctx_active, 0);
    278 		break;
    279 	case INTEL_CONTEXT_SCHEDULE_PREEMPTED:
    280 		save_ring_hw_state(workload->vgpu, ring_id);
    281 		break;
    282 	default:
    283 		WARN_ON(1);
    284 		return NOTIFY_OK;
    285 	}
    286 	wake_up(&workload->shadow_ctx_status_wq);
    287 	return NOTIFY_OK;
    288 }
    289 
    290 static void
    291 shadow_context_descriptor_update(struct intel_context *ce,
    292 				 struct intel_vgpu_workload *workload)
    293 {
    294 	u64 desc = ce->lrc_desc;
    295 
    296 	/*
    297 	 * Update bits 0-11 of the context descriptor which includes flags
    298 	 * like GEN8_CTX_* cached in desc_template
    299 	 */
    300 	desc &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
    301 	desc |= workload->ctx_desc.addressing_mode <<
    302 		GEN8_CTX_ADDRESSING_MODE_SHIFT;
    303 
    304 	ce->lrc_desc = desc;
    305 }
    306 
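         /*
          * Copy the scanned guest ring buffer contents into the ring of the
          * shadow request, emitting an init breadcrumb first so that the
          * request's progress can be tracked on hardware.
          */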
    307 static int copy_workload_to_ring_buffer(struct intel_vgpu_workload *workload)
    308 {
    309 	struct intel_vgpu *vgpu = workload->vgpu;
    310 	struct i915_request *req = workload->req;
    311 	void *shadow_ring_buffer_va;
    312 	u32 *cs;
    313 	int err;
    314 
    315 	if (IS_GEN(req->i915, 9) && is_inhibit_context(req->context))
    316 		intel_vgpu_restore_inhibit_context(vgpu, req);
    317 
    318 	/*
    319 	 * To track whether a request has started on HW, we can emit a
    320 	 * breadcrumb at the beginning of the request and check its
    321 	 * timeline's HWSP to see if the breadcrumb has advanced past the
     322 	 * start of this request. The request must have the init_breadcrumb
     323 	 * if its timeline has has_init_breadcrumb set, or the scheduler
     324 	 * might read a wrong state for it during reset. Since requests
     325 	 * from GVT always set the has_init_breadcrumb flag, we need to
     326 	 * do the emit_init_breadcrumb for all of these requests.
    327 	 */
    328 	if (req->engine->emit_init_breadcrumb) {
    329 		err = req->engine->emit_init_breadcrumb(req);
    330 		if (err) {
    331 			gvt_vgpu_err("fail to emit init breadcrumb\n");
    332 			return err;
    333 		}
    334 	}
    335 
    336 	/* allocate shadow ring buffer */
    337 	cs = intel_ring_begin(workload->req, workload->rb_len / sizeof(u32));
    338 	if (IS_ERR(cs)) {
    339 		gvt_vgpu_err("fail to alloc size =%ld shadow  ring buffer\n",
    340 			workload->rb_len);
    341 		return PTR_ERR(cs);
    342 	}
    343 
    344 	shadow_ring_buffer_va = workload->shadow_ring_buffer_va;
    345 
    346 	/* get shadow ring buffer va */
    347 	workload->shadow_ring_buffer_va = cs;
    348 
    349 	memcpy(cs, shadow_ring_buffer_va,
    350 			workload->rb_len);
    351 
    352 	cs += workload->rb_len / sizeof(u32);
    353 	intel_ring_advance(workload->req, cs);
    354 
    355 	return 0;
    356 }
    357 
    358 static void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
    359 {
    360 	if (!wa_ctx->indirect_ctx.obj)
    361 		return;
    362 
    363 	i915_gem_object_unpin_map(wa_ctx->indirect_ctx.obj);
    364 	i915_gem_object_put(wa_ctx->indirect_ctx.obj);
    365 
    366 	wa_ctx->indirect_ctx.obj = NULL;
    367 	wa_ctx->indirect_ctx.shadow_va = NULL;
    368 }
    369 
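         /*
          * Point the shadow context's PPGTT at the shadow page tables: a
          * single PML4 root for a 4-level PPGTT, or the four PDP entries
          * for a 3-level one.
          */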
    370 static void set_context_ppgtt_from_shadow(struct intel_vgpu_workload *workload,
    371 					  struct intel_context *ce)
    372 {
    373 	struct intel_vgpu_mm *mm = workload->shadow_mm;
    374 	struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm);
    375 	int i = 0;
    376 
    377 	if (mm->ppgtt_mm.root_entry_type == GTT_TYPE_PPGTT_ROOT_L4_ENTRY) {
    378 		px_dma(ppgtt->pd) = mm->ppgtt_mm.shadow_pdps[0];
    379 	} else {
    380 		for (i = 0; i < GVT_RING_CTX_NR_PDPS; i++) {
    381 			struct i915_page_directory * const pd =
    382 				i915_pd_entry(ppgtt->pd, i);
    383 
    384 			px_dma(pd) = mm->ppgtt_mm.shadow_pdps[i];
    385 		}
    386 	}
    387 }
    388 
    389 static int
    390 intel_gvt_workload_req_alloc(struct intel_vgpu_workload *workload)
    391 {
    392 	struct intel_vgpu *vgpu = workload->vgpu;
    393 	struct intel_vgpu_submission *s = &vgpu->submission;
    394 	struct i915_request *rq;
    395 
    396 	if (workload->req)
    397 		return 0;
    398 
    399 	rq = i915_request_create(s->shadow[workload->ring_id]);
    400 	if (IS_ERR(rq)) {
    401 		gvt_vgpu_err("fail to allocate gem request\n");
    402 		return PTR_ERR(rq);
    403 	}
    404 
    405 	workload->req = i915_request_get(rq);
    406 	return 0;
    407 }
    408 
    409 /**
    410  * intel_gvt_scan_and_shadow_workload - audit the workload by scanning and
     411  * shadowing it as well, including the ring buffer, wa_ctx and ctx.
    412  * @workload: an abstract entity for each execlist submission.
    413  *
     414  * This function is called before the workload is submitted to i915, to make
    415  * sure the content of the workload is valid.
    416  */
    417 int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload)
    418 {
    419 	struct intel_vgpu *vgpu = workload->vgpu;
    420 	struct intel_vgpu_submission *s = &vgpu->submission;
    421 	int ret;
    422 
    423 	lockdep_assert_held(&vgpu->vgpu_lock);
    424 
    425 	if (workload->shadow)
    426 		return 0;
    427 
    428 	if (!test_and_set_bit(workload->ring_id, s->shadow_ctx_desc_updated))
    429 		shadow_context_descriptor_update(s->shadow[workload->ring_id],
    430 						 workload);
    431 
    432 	ret = intel_gvt_scan_and_shadow_ringbuffer(workload);
    433 	if (ret)
    434 		return ret;
    435 
    436 	if (workload->ring_id == RCS0 && workload->wa_ctx.indirect_ctx.size) {
    437 		ret = intel_gvt_scan_and_shadow_wa_ctx(&workload->wa_ctx);
    438 		if (ret)
    439 			goto err_shadow;
    440 	}
    441 
    442 	workload->shadow = true;
    443 	return 0;
    444 err_shadow:
    445 	release_shadow_wa_ctx(&workload->wa_ctx);
    446 	return ret;
    447 }
    448 
    449 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload);
    450 
    451 static int prepare_shadow_batch_buffer(struct intel_vgpu_workload *workload)
    452 {
    453 	struct intel_gvt *gvt = workload->vgpu->gvt;
    454 	const int gmadr_bytes = gvt->device_info.gmadr_bytes_in_cmd;
    455 	struct intel_vgpu_shadow_bb *bb;
    456 	int ret;
    457 
    458 	list_for_each_entry(bb, &workload->shadow_bb, list) {
     459 		/* For a privileged batch buffer (not wa_ctx), bb_start_cmd_va
     460 		 * only points into ring_scan_buffer, not the real ring address
     461 		 * allocated later in copy_workload_to_ring_buffer. Note that
     462 		 * shadow_ring_buffer_va points to the real ring buffer va
     463 		 * after copy_workload_to_ring_buffer.
    464 		 */
    465 
    466 		if (bb->bb_offset)
    467 			bb->bb_start_cmd_va = workload->shadow_ring_buffer_va
    468 				+ bb->bb_offset;
    469 
    470 		if (bb->ppgtt) {
     471 			/* For a non-privileged bb, scan & shadow is done only
     472 			 * for debugging purposes, so the content of the shadow
     473 			 * bb is the same as the original bb. Therefore, rather
     474 			 * than switching to the shadow bb's gma address, we
     475 			 * directly use the original batch buffer's gma address
     476 			 * and send the original bb to the hardware
     477 			 * directly.
    478 			 */
    479 			if (bb->clflush & CLFLUSH_AFTER) {
    480 				drm_clflush_virt_range(bb->va,
    481 						bb->obj->base.size);
    482 				bb->clflush &= ~CLFLUSH_AFTER;
    483 			}
    484 			i915_gem_object_finish_access(bb->obj);
    485 			bb->accessing = false;
    486 
    487 		} else {
    488 			bb->vma = i915_gem_object_ggtt_pin(bb->obj,
    489 					NULL, 0, 0, 0);
    490 			if (IS_ERR(bb->vma)) {
    491 				ret = PTR_ERR(bb->vma);
    492 				goto err;
    493 			}
    494 
    495 			/* relocate shadow batch buffer */
    496 			bb->bb_start_cmd_va[1] = i915_ggtt_offset(bb->vma);
    497 			if (gmadr_bytes == 8)
    498 				bb->bb_start_cmd_va[2] = 0;
    499 
    500 			/* No one is going to touch shadow bb from now on. */
    501 			if (bb->clflush & CLFLUSH_AFTER) {
    502 				drm_clflush_virt_range(bb->va,
    503 						bb->obj->base.size);
    504 				bb->clflush &= ~CLFLUSH_AFTER;
    505 			}
    506 
    507 			ret = i915_gem_object_set_to_gtt_domain(bb->obj,
    508 								false);
    509 			if (ret)
    510 				goto err;
    511 
    512 			ret = i915_vma_move_to_active(bb->vma,
    513 						      workload->req,
    514 						      0);
    515 			if (ret)
    516 				goto err;
    517 
    518 			i915_gem_object_finish_access(bb->obj);
    519 			bb->accessing = false;
    520 		}
    521 	}
    522 	return 0;
    523 err:
    524 	release_shadow_batch_buffer(workload);
    525 	return ret;
    526 }
    527 
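         /*
          * Patch the per-context and indirect-context pointers in the shadow
          * ring context so that they reference the shadowed WA buffers in
          * the GGTT.
          */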
    528 static void update_wa_ctx_2_shadow_ctx(struct intel_shadow_wa_ctx *wa_ctx)
    529 {
    530 	struct intel_vgpu_workload *workload =
    531 		container_of(wa_ctx, struct intel_vgpu_workload, wa_ctx);
    532 	struct i915_request *rq = workload->req;
    533 	struct execlist_ring_context *shadow_ring_context =
    534 		(struct execlist_ring_context *)rq->context->lrc_reg_state;
    535 
    536 	shadow_ring_context->bb_per_ctx_ptr.val =
    537 		(shadow_ring_context->bb_per_ctx_ptr.val &
    538 		(~PER_CTX_ADDR_MASK)) | wa_ctx->per_ctx.shadow_gma;
    539 	shadow_ring_context->rcs_indirect_ctx.val =
    540 		(shadow_ring_context->rcs_indirect_ctx.val &
    541 		(~INDIRECT_CTX_ADDR_MASK)) | wa_ctx->indirect_ctx.shadow_gma;
    542 }
    543 
    544 static int prepare_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
    545 {
    546 	struct i915_vma *vma;
    547 	unsigned char *per_ctx_va =
    548 		(unsigned char *)wa_ctx->indirect_ctx.shadow_va +
    549 		wa_ctx->indirect_ctx.size;
    550 
    551 	if (wa_ctx->indirect_ctx.size == 0)
    552 		return 0;
    553 
    554 	vma = i915_gem_object_ggtt_pin(wa_ctx->indirect_ctx.obj, NULL,
    555 				       0, CACHELINE_BYTES, 0);
    556 	if (IS_ERR(vma))
    557 		return PTR_ERR(vma);
    558 
     559 	/* FIXME: we are not tracking our pinned VMA, leaving it
    560 	 * up to the core to fix up the stray pin_count upon
    561 	 * free.
    562 	 */
    563 
    564 	wa_ctx->indirect_ctx.shadow_gma = i915_ggtt_offset(vma);
    565 
    566 	wa_ctx->per_ctx.shadow_gma = *((unsigned int *)per_ctx_va + 1);
    567 	memset(per_ctx_va, 0, CACHELINE_BYTES);
    568 
    569 	update_wa_ctx_2_shadow_ctx(wa_ctx);
    570 	return 0;
    571 }
    572 
    573 static void update_vreg_in_ctx(struct intel_vgpu_workload *workload)
    574 {
    575 	struct intel_vgpu *vgpu = workload->vgpu;
    576 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
    577 	u32 ring_base;
    578 
    579 	ring_base = dev_priv->engine[workload->ring_id]->mmio_base;
    580 	vgpu_vreg_t(vgpu, RING_START(ring_base)) = workload->rb_start;
    581 }
    582 
    583 static void release_shadow_batch_buffer(struct intel_vgpu_workload *workload)
    584 {
    585 	struct intel_vgpu_shadow_bb *bb, *pos;
    586 
    587 	if (list_empty(&workload->shadow_bb))
    588 		return;
    589 
    590 	bb = list_first_entry(&workload->shadow_bb,
    591 			struct intel_vgpu_shadow_bb, list);
    592 
    593 	list_for_each_entry_safe(bb, pos, &workload->shadow_bb, list) {
    594 		if (bb->obj) {
    595 			if (bb->accessing)
    596 				i915_gem_object_finish_access(bb->obj);
    597 
    598 			if (bb->va && !IS_ERR(bb->va))
    599 				i915_gem_object_unpin_map(bb->obj);
    600 
    601 			if (bb->vma && !IS_ERR(bb->vma)) {
    602 				i915_vma_unpin(bb->vma);
    603 				i915_vma_close(bb->vma);
    604 			}
    605 			i915_gem_object_put(bb->obj);
    606 		}
    607 		list_del(&bb->list);
    608 		kfree(bb);
    609 	}
    610 }
    611 
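         /*
          * Final preparation before dispatch: pin the shadow PPGTT and
          * install its root pointers, sync out-of-sync guest pages, flush
          * post-shadow writes, copy the ring buffer and pin the shadow
          * batch buffers and wa_ctx.
          */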
    612 static int prepare_workload(struct intel_vgpu_workload *workload)
    613 {
    614 	struct intel_vgpu *vgpu = workload->vgpu;
    615 	struct intel_vgpu_submission *s = &vgpu->submission;
    616 	int ring = workload->ring_id;
    617 	int ret = 0;
    618 
    619 	ret = intel_vgpu_pin_mm(workload->shadow_mm);
    620 	if (ret) {
    621 		gvt_vgpu_err("fail to vgpu pin mm\n");
    622 		return ret;
    623 	}
    624 
    625 	if (workload->shadow_mm->type != INTEL_GVT_MM_PPGTT ||
    626 	    !workload->shadow_mm->ppgtt_mm.shadowed) {
    627 		gvt_vgpu_err("workload shadow ppgtt isn't ready\n");
    628 		return -EINVAL;
    629 	}
    630 
    631 	update_shadow_pdps(workload);
    632 
    633 	set_context_ppgtt_from_shadow(workload, s->shadow[ring]);
    634 
    635 	ret = intel_vgpu_sync_oos_pages(workload->vgpu);
    636 	if (ret) {
    637 		gvt_vgpu_err("fail to vgpu sync oos pages\n");
    638 		goto err_unpin_mm;
    639 	}
    640 
    641 	ret = intel_vgpu_flush_post_shadow(workload->vgpu);
    642 	if (ret) {
    643 		gvt_vgpu_err("fail to flush post shadow\n");
    644 		goto err_unpin_mm;
    645 	}
    646 
    647 	ret = copy_workload_to_ring_buffer(workload);
    648 	if (ret) {
    649 		gvt_vgpu_err("fail to generate request\n");
    650 		goto err_unpin_mm;
    651 	}
    652 
    653 	ret = prepare_shadow_batch_buffer(workload);
    654 	if (ret) {
    655 		gvt_vgpu_err("fail to prepare_shadow_batch_buffer\n");
    656 		goto err_unpin_mm;
    657 	}
    658 
    659 	ret = prepare_shadow_wa_ctx(&workload->wa_ctx);
    660 	if (ret) {
    661 		gvt_vgpu_err("fail to prepare_shadow_wa_ctx\n");
    662 		goto err_shadow_batch;
    663 	}
    664 
    665 	if (workload->prepare) {
    666 		ret = workload->prepare(workload);
    667 		if (ret)
    668 			goto err_shadow_wa_ctx;
    669 	}
    670 
    671 	return 0;
    672 err_shadow_wa_ctx:
    673 	release_shadow_wa_ctx(&workload->wa_ctx);
    674 err_shadow_batch:
    675 	release_shadow_batch_buffer(workload);
    676 err_unpin_mm:
    677 	intel_vgpu_unpin_mm(workload->shadow_mm);
    678 	return ret;
    679 }
    680 
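         /*
          * Allocate the shadow request, scan and shadow the workload,
          * populate the shadow context and hand the request over to i915
          * for execution.
          */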
    681 static int dispatch_workload(struct intel_vgpu_workload *workload)
    682 {
    683 	struct intel_vgpu *vgpu = workload->vgpu;
    684 	struct i915_request *rq;
    685 	int ring_id = workload->ring_id;
    686 	int ret;
    687 
    688 	gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n",
    689 		ring_id, workload);
    690 
    691 	mutex_lock(&vgpu->vgpu_lock);
    692 
    693 	ret = intel_gvt_workload_req_alloc(workload);
    694 	if (ret)
    695 		goto err_req;
    696 
    697 	ret = intel_gvt_scan_and_shadow_workload(workload);
    698 	if (ret)
    699 		goto out;
    700 
    701 	ret = populate_shadow_context(workload);
    702 	if (ret) {
    703 		release_shadow_wa_ctx(&workload->wa_ctx);
    704 		goto out;
    705 	}
    706 
    707 	ret = prepare_workload(workload);
    708 out:
    709 	if (ret) {
     710 		/* We might still need to add a request with a
     711 		 * clean ctx to retire it properly.
    712 		 */
    713 		rq = fetch_and_zero(&workload->req);
    714 		i915_request_put(rq);
    715 	}
    716 
    717 	if (!IS_ERR_OR_NULL(workload->req)) {
    718 		gvt_dbg_sched("ring id %d submit workload to i915 %p\n",
    719 				ring_id, workload->req);
    720 		i915_request_add(workload->req);
    721 		workload->dispatched = true;
    722 	}
    723 err_req:
    724 	if (ret)
    725 		workload->status = ret;
    726 	mutex_unlock(&vgpu->vgpu_lock);
    727 	return ret;
    728 }
    729 
    730 static struct intel_vgpu_workload *pick_next_workload(
    731 		struct intel_gvt *gvt, int ring_id)
    732 {
    733 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
    734 	struct intel_vgpu_workload *workload = NULL;
    735 
    736 	mutex_lock(&gvt->sched_lock);
    737 
    738 	/*
    739 	 * no current vgpu / will be scheduled out / no workload
    740 	 * bail out
    741 	 */
    742 	if (!scheduler->current_vgpu) {
    743 		gvt_dbg_sched("ring id %d stop - no current vgpu\n", ring_id);
    744 		goto out;
    745 	}
    746 
    747 	if (scheduler->need_reschedule) {
    748 		gvt_dbg_sched("ring id %d stop - will reschedule\n", ring_id);
    749 		goto out;
    750 	}
    751 
    752 	if (!scheduler->current_vgpu->active ||
    753 	    list_empty(workload_q_head(scheduler->current_vgpu, ring_id)))
    754 		goto out;
    755 
    756 	/*
     757 	 * There is still a current workload; perhaps the workload dispatcher
     758 	 * failed to submit it for some reason, so resubmit it.
    759 	 */
    760 	if (scheduler->current_workload[ring_id]) {
    761 		workload = scheduler->current_workload[ring_id];
    762 		gvt_dbg_sched("ring id %d still have current workload %p\n",
    763 				ring_id, workload);
    764 		goto out;
    765 	}
    766 
    767 	/*
     768 	 * Pick a workload as the current workload.
     769 	 * Once the current workload is set, the scheduling policy routines
     770 	 * will wait until it is finished when trying to
     771 	 * schedule out a vgpu.
    772 	 */
    773 	scheduler->current_workload[ring_id] = container_of(
    774 			workload_q_head(scheduler->current_vgpu, ring_id)->next,
    775 			struct intel_vgpu_workload, list);
    776 
    777 	workload = scheduler->current_workload[ring_id];
    778 
    779 	gvt_dbg_sched("ring id %d pick new workload %p\n", ring_id, workload);
    780 
    781 	atomic_inc(&workload->vgpu->submission.running_workload_num);
    782 out:
    783 	mutex_unlock(&gvt->sched_lock);
    784 	return workload;
    785 }
    786 
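         /*
          * Write the shadow context image back to guest memory after the
          * workload completes and update the guest's view of the ring head
          * and tail registers.
          */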
    787 static void update_guest_context(struct intel_vgpu_workload *workload)
    788 {
    789 	struct i915_request *rq = workload->req;
    790 	struct intel_vgpu *vgpu = workload->vgpu;
    791 	struct intel_gvt *gvt = vgpu->gvt;
    792 	struct drm_i915_gem_object *ctx_obj = rq->context->state->obj;
    793 	struct execlist_ring_context *shadow_ring_context;
    794 	struct page *page;
    795 	void *src;
    796 	unsigned long context_gpa, context_page_num;
    797 	int i;
    798 	struct drm_i915_private *dev_priv = gvt->dev_priv;
    799 	u32 ring_base;
    800 	u32 head, tail;
    801 	u16 wrap_count;
    802 
    803 	gvt_dbg_sched("ring id %d workload lrca %x\n", rq->engine->id,
    804 		      workload->ctx_desc.lrca);
    805 
    806 	head = workload->rb_head;
    807 	tail = workload->rb_tail;
    808 	wrap_count = workload->guest_rb_head >> RB_HEAD_WRAP_CNT_OFF;
    809 
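         	/*
         	 * The guest sees the ring head as a wrap count in the high bits
         	 * plus an offset. If the ring wrapped during this workload
         	 * (tail < head), bump the wrap count before reporting the new
         	 * head (equal to tail) back to the guest.
         	 */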
    810 	if (tail < head) {
    811 		if (wrap_count == RB_HEAD_WRAP_CNT_MAX)
    812 			wrap_count = 0;
    813 		else
    814 			wrap_count += 1;
    815 	}
    816 
    817 	head = (wrap_count << RB_HEAD_WRAP_CNT_OFF) | tail;
    818 
    819 	ring_base = dev_priv->engine[workload->ring_id]->mmio_base;
    820 	vgpu_vreg_t(vgpu, RING_TAIL(ring_base)) = tail;
    821 	vgpu_vreg_t(vgpu, RING_HEAD(ring_base)) = head;
    822 
    823 	context_page_num = rq->engine->context_size;
    824 	context_page_num = context_page_num >> PAGE_SHIFT;
    825 
    826 	if (IS_BROADWELL(gvt->dev_priv) && rq->engine->id == RCS0)
    827 		context_page_num = 19;
    828 
    829 	i = 2;
    830 
    831 	while (i < context_page_num) {
    832 		context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
    833 				(u32)((workload->ctx_desc.lrca + i) <<
    834 					I915_GTT_PAGE_SHIFT));
    835 		if (context_gpa == INTEL_GVT_INVALID_ADDR) {
    836 			gvt_vgpu_err("invalid guest context descriptor\n");
    837 			return;
    838 		}
    839 
    840 		page = i915_gem_object_get_page(ctx_obj, i);
    841 		src = kmap(page);
    842 		intel_gvt_hypervisor_write_gpa(vgpu, context_gpa, src,
    843 				I915_GTT_PAGE_SIZE);
    844 		kunmap(page);
    845 		i++;
    846 	}
    847 
    848 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa +
    849 		RING_CTX_OFF(ring_header.val), &workload->rb_tail, 4);
    850 
    851 	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
    852 	shadow_ring_context = kmap(page);
    853 
    854 #define COPY_REG(name) \
    855 	intel_gvt_hypervisor_write_gpa(vgpu, workload->ring_context_gpa + \
    856 		RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
    857 
    858 	COPY_REG(ctx_ctrl);
    859 	COPY_REG(ctx_timestamp);
    860 
    861 #undef COPY_REG
    862 
    863 	intel_gvt_hypervisor_write_gpa(vgpu,
    864 			workload->ring_context_gpa +
    865 			sizeof(*shadow_ring_context),
    866 			(void *)shadow_ring_context +
    867 			sizeof(*shadow_ring_context),
    868 			I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
    869 
    870 	kunmap(page);
    871 }
    872 
    873 void intel_vgpu_clean_workloads(struct intel_vgpu *vgpu,
    874 				intel_engine_mask_t engine_mask)
    875 {
    876 	struct intel_vgpu_submission *s = &vgpu->submission;
    877 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
    878 	struct intel_engine_cs *engine;
    879 	struct intel_vgpu_workload *pos, *n;
    880 	intel_engine_mask_t tmp;
    881 
     882 	/* free the unsubmitted workloads in the queues. */
    883 	for_each_engine_masked(engine, &dev_priv->gt, engine_mask, tmp) {
    884 		list_for_each_entry_safe(pos, n,
    885 			&s->workload_q_head[engine->id], list) {
    886 			list_del_init(&pos->list);
    887 			intel_vgpu_destroy_workload(pos);
    888 		}
    889 		clear_bit(engine->id, s->shadow_ctx_desc_updated);
    890 	}
    891 }
    892 
    893 static void complete_current_workload(struct intel_gvt *gvt, int ring_id)
    894 {
    895 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
    896 	struct intel_vgpu_workload *workload =
    897 		scheduler->current_workload[ring_id];
    898 	struct intel_vgpu *vgpu = workload->vgpu;
    899 	struct intel_vgpu_submission *s = &vgpu->submission;
    900 	struct i915_request *rq = workload->req;
    901 	int event;
    902 
    903 	mutex_lock(&vgpu->vgpu_lock);
    904 	mutex_lock(&gvt->sched_lock);
    905 
     906 	/* For a workload with a request, wait for the context
     907 	 * switch to make sure the request is completed.
     908 	 * For a workload without a request, complete it directly.
    909 	 */
    910 	if (rq) {
    911 		wait_event(workload->shadow_ctx_status_wq,
    912 			   !atomic_read(&workload->shadow_ctx_active));
    913 
     914 		/* If this request caused a GPU hang, req->fence.error will
     915 		 * be set to -EIO. Propagate -EIO to the workload status so
     916 		 * that no context switch interrupt is delivered to the
     917 		 * guest for a request that hung the GPU.
    918 		 */
    919 		if (likely(workload->status == -EINPROGRESS)) {
    920 			if (workload->req->fence.error == -EIO)
    921 				workload->status = -EIO;
    922 			else
    923 				workload->status = 0;
    924 		}
    925 
    926 		if (!workload->status &&
    927 		    !(vgpu->resetting_eng & BIT(ring_id))) {
    928 			update_guest_context(workload);
    929 
    930 			for_each_set_bit(event, workload->pending_events,
    931 					 INTEL_GVT_EVENT_MAX)
    932 				intel_vgpu_trigger_virtual_event(vgpu, event);
    933 		}
    934 
    935 		i915_request_put(fetch_and_zero(&workload->req));
    936 	}
    937 
    938 	gvt_dbg_sched("ring id %d complete workload %p status %d\n",
    939 			ring_id, workload, workload->status);
    940 
    941 	scheduler->current_workload[ring_id] = NULL;
    942 
    943 	list_del_init(&workload->list);
    944 
    945 	if (workload->status || vgpu->resetting_eng & BIT(ring_id)) {
     946 		/* If workload->status is not successful, the hardware GPU
     947 		 * hung or something went wrong in i915/GVT, and GVT won't
     948 		 * inject a context switch interrupt to the guest. To the
     949 		 * guest this error is effectively a vGPU hang, so we should
     950 		 * emulate a vGPU hang accordingly. If there are pending
     951 		 * workloads that were already submitted by the guest, we
     952 		 * should clean them up just as the hardware GPU would.
     953 		 *
     954 		 * If we are in the middle of an engine reset, the pending
     955 		 * workloads won't be submitted to the hardware GPU and will
     956 		 * be cleaned up later during the reset, so doing the
     957 		 * workload cleanup here has no impact.
     958 		 */
    959 		intel_vgpu_clean_workloads(vgpu, BIT(ring_id));
    960 	}
    961 
    962 	workload->complete(workload);
    963 
    964 	atomic_dec(&s->running_workload_num);
    965 	wake_up(&scheduler->workload_complete_wq);
    966 
    967 	if (gvt->scheduler.need_reschedule)
    968 		intel_gvt_request_service(gvt, INTEL_GVT_REQUEST_EVENT_SCHED);
    969 
    970 	mutex_unlock(&gvt->sched_lock);
    971 	mutex_unlock(&vgpu->vgpu_lock);
    972 }
    973 
    974 struct workload_thread_param {
    975 	struct intel_gvt *gvt;
    976 	int ring_id;
    977 };
    978 
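         /*
          * Per-engine worker thread: waits for a workload to appear on the
          * queue, dispatches it to i915, waits for the request to complete
          * and then retires the workload.
          */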
    979 static int workload_thread(void *priv)
    980 {
    981 	struct workload_thread_param *p = (struct workload_thread_param *)priv;
    982 	struct intel_gvt *gvt = p->gvt;
    983 	int ring_id = p->ring_id;
    984 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
    985 	struct intel_vgpu_workload *workload = NULL;
    986 	struct intel_vgpu *vgpu = NULL;
    987 	int ret;
    988 	bool need_force_wake = (INTEL_GEN(gvt->dev_priv) >= 9);
    989 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
    990 	struct intel_runtime_pm *rpm = &gvt->dev_priv->runtime_pm;
    991 
    992 	kfree(p);
    993 
    994 	gvt_dbg_core("workload thread for ring %d started\n", ring_id);
    995 
    996 	while (!kthread_should_stop()) {
    997 		add_wait_queue(&scheduler->waitq[ring_id], &wait);
    998 		do {
    999 			workload = pick_next_workload(gvt, ring_id);
   1000 			if (workload)
   1001 				break;
   1002 			wait_woken(&wait, TASK_INTERRUPTIBLE,
   1003 				   MAX_SCHEDULE_TIMEOUT);
   1004 		} while (!kthread_should_stop());
   1005 		remove_wait_queue(&scheduler->waitq[ring_id], &wait);
   1006 
   1007 		if (!workload)
   1008 			break;
   1009 
   1010 		gvt_dbg_sched("ring id %d next workload %p vgpu %d\n",
   1011 				workload->ring_id, workload,
   1012 				workload->vgpu->id);
   1013 
   1014 		intel_runtime_pm_get(rpm);
   1015 
   1016 		gvt_dbg_sched("ring id %d will dispatch workload %p\n",
   1017 				workload->ring_id, workload);
   1018 
   1019 		if (need_force_wake)
   1020 			intel_uncore_forcewake_get(&gvt->dev_priv->uncore,
   1021 					FORCEWAKE_ALL);
   1022 		/*
   1023 		 * Update the vReg of the vGPU which submitted this
   1024 		 * workload. The vGPU may use these registers for checking
   1025 		 * the context state. The value comes from GPU commands
   1026 		 * in this workload.
   1027 		 */
   1028 		update_vreg_in_ctx(workload);
   1029 
   1030 		ret = dispatch_workload(workload);
   1031 
   1032 		if (ret) {
   1033 			vgpu = workload->vgpu;
   1034 			gvt_vgpu_err("fail to dispatch workload, skip\n");
   1035 			goto complete;
   1036 		}
   1037 
   1038 		gvt_dbg_sched("ring id %d wait workload %p\n",
   1039 				workload->ring_id, workload);
   1040 		i915_request_wait(workload->req, 0, MAX_SCHEDULE_TIMEOUT);
   1041 
   1042 complete:
   1043 		gvt_dbg_sched("will complete workload %p, status: %d\n",
   1044 				workload, workload->status);
   1045 
   1046 		complete_current_workload(gvt, ring_id);
   1047 
   1048 		if (need_force_wake)
   1049 			intel_uncore_forcewake_put(&gvt->dev_priv->uncore,
   1050 					FORCEWAKE_ALL);
   1051 
   1052 		intel_runtime_pm_put_unchecked(rpm);
   1053 		if (ret && (vgpu_is_vm_unhealthy(ret)))
   1054 			enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
   1055 	}
   1056 	return 0;
   1057 }
   1058 
   1059 void intel_gvt_wait_vgpu_idle(struct intel_vgpu *vgpu)
   1060 {
   1061 	struct intel_vgpu_submission *s = &vgpu->submission;
   1062 	struct intel_gvt *gvt = vgpu->gvt;
   1063 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
   1064 
   1065 	if (atomic_read(&s->running_workload_num)) {
   1066 		gvt_dbg_sched("wait vgpu idle\n");
   1067 
   1068 		wait_event(scheduler->workload_complete_wq,
   1069 				!atomic_read(&s->running_workload_num));
   1070 	}
   1071 }
   1072 
   1073 void intel_gvt_clean_workload_scheduler(struct intel_gvt *gvt)
   1074 {
   1075 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
   1076 	struct intel_engine_cs *engine;
   1077 	enum intel_engine_id i;
   1078 
   1079 	gvt_dbg_core("clean workload scheduler\n");
   1080 
   1081 	for_each_engine(engine, gvt->dev_priv, i) {
   1082 		atomic_notifier_chain_unregister(
   1083 					&engine->context_status_notifier,
   1084 					&gvt->shadow_ctx_notifier_block[i]);
   1085 		kthread_stop(scheduler->thread[i]);
   1086 	}
   1087 }
   1088 
   1089 int intel_gvt_init_workload_scheduler(struct intel_gvt *gvt)
   1090 {
   1091 	struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler;
   1092 	struct workload_thread_param *param = NULL;
   1093 	struct intel_engine_cs *engine;
   1094 	enum intel_engine_id i;
   1095 	int ret;
   1096 
   1097 	gvt_dbg_core("init workload scheduler\n");
   1098 
   1099 	init_waitqueue_head(&scheduler->workload_complete_wq);
   1100 
   1101 	for_each_engine(engine, gvt->dev_priv, i) {
   1102 		init_waitqueue_head(&scheduler->waitq[i]);
   1103 
   1104 		param = kzalloc(sizeof(*param), GFP_KERNEL);
   1105 		if (!param) {
   1106 			ret = -ENOMEM;
   1107 			goto err;
   1108 		}
   1109 
   1110 		param->gvt = gvt;
   1111 		param->ring_id = i;
   1112 
   1113 		scheduler->thread[i] = kthread_run(workload_thread, param,
   1114 			"gvt workload %d", i);
   1115 		if (IS_ERR(scheduler->thread[i])) {
   1116 			gvt_err("fail to create workload thread\n");
   1117 			ret = PTR_ERR(scheduler->thread[i]);
   1118 			goto err;
   1119 		}
   1120 
   1121 		gvt->shadow_ctx_notifier_block[i].notifier_call =
   1122 					shadow_context_status_change;
   1123 		atomic_notifier_chain_register(&engine->context_status_notifier,
   1124 					&gvt->shadow_ctx_notifier_block[i]);
   1125 	}
   1126 	return 0;
   1127 err:
   1128 	intel_gvt_clean_workload_scheduler(gvt);
   1129 	kfree(param);
   1130 	param = NULL;
   1131 	return ret;
   1132 }
   1133 
   1134 static void
   1135 i915_context_ppgtt_root_restore(struct intel_vgpu_submission *s,
   1136 				struct i915_ppgtt *ppgtt)
   1137 {
   1138 	int i;
   1139 
   1140 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
   1141 		px_dma(ppgtt->pd) = s->i915_context_pml4;
   1142 	} else {
   1143 		for (i = 0; i < GEN8_3LVL_PDPES; i++) {
   1144 			struct i915_page_directory * const pd =
   1145 				i915_pd_entry(ppgtt->pd, i);
   1146 
   1147 			px_dma(pd) = s->i915_context_pdps[i];
   1148 		}
   1149 	}
   1150 }
   1151 
   1152 /**
   1153  * intel_vgpu_clean_submission - free submission-related resource for vGPU
   1154  * @vgpu: a vGPU
   1155  *
   1156  * This function is called when a vGPU is being destroyed.
   1157  *
   1158  */
   1159 void intel_vgpu_clean_submission(struct intel_vgpu *vgpu)
   1160 {
   1161 	struct intel_vgpu_submission *s = &vgpu->submission;
   1162 	struct intel_engine_cs *engine;
   1163 	enum intel_engine_id id;
   1164 
   1165 	intel_vgpu_select_submission_ops(vgpu, ALL_ENGINES, 0);
   1166 
   1167 	i915_context_ppgtt_root_restore(s, i915_vm_to_ppgtt(s->shadow[0]->vm));
   1168 	for_each_engine(engine, vgpu->gvt->dev_priv, id)
   1169 		intel_context_unpin(s->shadow[id]);
   1170 
   1171 	kmem_cache_destroy(s->workloads);
   1172 }
   1173 
   1174 
   1175 /**
   1176  * intel_vgpu_reset_submission - reset submission-related resource for vGPU
   1177  * @vgpu: a vGPU
   1178  * @engine_mask: engines expected to be reset
   1179  *
    1180  * This function is called when the engines of a vGPU are being reset.
   1181  *
   1182  */
   1183 void intel_vgpu_reset_submission(struct intel_vgpu *vgpu,
   1184 				 intel_engine_mask_t engine_mask)
   1185 {
   1186 	struct intel_vgpu_submission *s = &vgpu->submission;
   1187 
   1188 	if (!s->active)
   1189 		return;
   1190 
   1191 	intel_vgpu_clean_workloads(vgpu, engine_mask);
   1192 	s->ops->reset(vgpu, engine_mask);
   1193 }
   1194 
   1195 static void
   1196 i915_context_ppgtt_root_save(struct intel_vgpu_submission *s,
   1197 			     struct i915_ppgtt *ppgtt)
   1198 {
   1199 	int i;
   1200 
   1201 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
   1202 		s->i915_context_pml4 = px_dma(ppgtt->pd);
   1203 	} else {
   1204 		for (i = 0; i < GEN8_3LVL_PDPES; i++) {
   1205 			struct i915_page_directory * const pd =
   1206 				i915_pd_entry(ppgtt->pd, i);
   1207 
   1208 			s->i915_context_pdps[i] = px_dma(pd);
   1209 		}
   1210 	}
   1211 }
   1212 
   1213 /**
   1214  * intel_vgpu_setup_submission - setup submission-related resource for vGPU
   1215  * @vgpu: a vGPU
   1216  *
   1217  * This function is called when a vGPU is being created.
   1218  *
   1219  * Returns:
   1220  * Zero on success, negative error code if failed.
   1221  *
   1222  */
   1223 int intel_vgpu_setup_submission(struct intel_vgpu *vgpu)
   1224 {
   1225 	struct drm_i915_private *i915 = vgpu->gvt->dev_priv;
   1226 	struct intel_vgpu_submission *s = &vgpu->submission;
   1227 	struct intel_engine_cs *engine;
   1228 	struct i915_ppgtt *ppgtt;
   1229 	enum intel_engine_id i;
   1230 	int ret;
   1231 
   1232 	ppgtt = i915_ppgtt_create(&i915->gt);
   1233 	if (IS_ERR(ppgtt))
   1234 		return PTR_ERR(ppgtt);
   1235 
   1236 	i915_context_ppgtt_root_save(s, ppgtt);
   1237 
   1238 	for_each_engine(engine, i915, i) {
   1239 		struct intel_context *ce;
   1240 
   1241 		INIT_LIST_HEAD(&s->workload_q_head[i]);
   1242 		s->shadow[i] = ERR_PTR(-EINVAL);
   1243 
   1244 		ce = intel_context_create(engine);
   1245 		if (IS_ERR(ce)) {
   1246 			ret = PTR_ERR(ce);
   1247 			goto out_shadow_ctx;
   1248 		}
   1249 
   1250 		i915_vm_put(ce->vm);
   1251 		ce->vm = i915_vm_get(&ppgtt->vm);
   1252 		intel_context_set_single_submission(ce);
   1253 
   1254 		if (!USES_GUC_SUBMISSION(i915)) { /* Max ring buffer size */
   1255 			const unsigned int ring_size = 512 * SZ_4K;
   1256 
   1257 			ce->ring = __intel_context_ring_size(ring_size);
   1258 		}
   1259 
   1260 		ret = intel_context_pin(ce);
   1261 		intel_context_put(ce);
   1262 		if (ret)
   1263 			goto out_shadow_ctx;
   1264 
   1265 		s->shadow[i] = ce;
   1266 	}
   1267 
   1268 	bitmap_zero(s->shadow_ctx_desc_updated, I915_NUM_ENGINES);
   1269 
   1270 	s->workloads = kmem_cache_create_usercopy("gvt-g_vgpu_workload",
   1271 						  sizeof(struct intel_vgpu_workload), 0,
   1272 						  SLAB_HWCACHE_ALIGN,
   1273 						  offsetof(struct intel_vgpu_workload, rb_tail),
   1274 						  sizeof_field(struct intel_vgpu_workload, rb_tail),
   1275 						  NULL);
   1276 
   1277 	if (!s->workloads) {
   1278 		ret = -ENOMEM;
   1279 		goto out_shadow_ctx;
   1280 	}
   1281 
   1282 	atomic_set(&s->running_workload_num, 0);
   1283 	bitmap_zero(s->tlb_handle_pending, I915_NUM_ENGINES);
   1284 
   1285 	i915_vm_put(&ppgtt->vm);
   1286 	return 0;
   1287 
   1288 out_shadow_ctx:
   1289 	i915_context_ppgtt_root_restore(s, ppgtt);
   1290 	for_each_engine(engine, i915, i) {
   1291 		if (IS_ERR(s->shadow[i]))
   1292 			break;
   1293 
   1294 		intel_context_unpin(s->shadow[i]);
   1295 		intel_context_put(s->shadow[i]);
   1296 	}
   1297 	i915_vm_put(&ppgtt->vm);
   1298 	return ret;
   1299 }
   1300 
   1301 /**
   1302  * intel_vgpu_select_submission_ops - select virtual submission interface
   1303  * @vgpu: a vGPU
   1304  * @engine_mask: either ALL_ENGINES or target engine mask
   1305  * @interface: expected vGPU virtual submission interface
   1306  *
   1307  * This function is called when guest configures submission interface.
   1308  *
   1309  * Returns:
   1310  * Zero on success, negative error code if failed.
   1311  *
   1312  */
   1313 int intel_vgpu_select_submission_ops(struct intel_vgpu *vgpu,
   1314 				     intel_engine_mask_t engine_mask,
   1315 				     unsigned int interface)
   1316 {
   1317 	struct intel_vgpu_submission *s = &vgpu->submission;
   1318 	const struct intel_vgpu_submission_ops *ops[] = {
   1319 		[INTEL_VGPU_EXECLIST_SUBMISSION] =
   1320 			&intel_vgpu_execlist_submission_ops,
   1321 	};
   1322 	int ret;
   1323 
   1324 	if (WARN_ON(interface >= ARRAY_SIZE(ops)))
   1325 		return -EINVAL;
   1326 
   1327 	if (WARN_ON(interface == 0 && engine_mask != ALL_ENGINES))
   1328 		return -EINVAL;
   1329 
   1330 	if (s->active)
   1331 		s->ops->clean(vgpu, engine_mask);
   1332 
   1333 	if (interface == 0) {
   1334 		s->ops = NULL;
   1335 		s->virtual_submission_interface = 0;
   1336 		s->active = false;
   1337 		gvt_dbg_core("vgpu%d: remove submission ops\n", vgpu->id);
   1338 		return 0;
   1339 	}
   1340 
   1341 	ret = ops[interface]->init(vgpu, engine_mask);
   1342 	if (ret)
   1343 		return ret;
   1344 
   1345 	s->ops = ops[interface];
   1346 	s->virtual_submission_interface = interface;
   1347 	s->active = true;
   1348 
   1349 	gvt_dbg_core("vgpu%d: activate ops [ %s ]\n",
   1350 			vgpu->id, s->ops->name);
   1351 
   1352 	return 0;
   1353 }
   1354 
   1355 /**
   1356  * intel_vgpu_destroy_workload - destroy a vGPU workload
   1357  * @workload: workload to destroy
   1358  *
    1359  * This function is called when destroying a vGPU workload.
   1360  *
   1361  */
   1362 void intel_vgpu_destroy_workload(struct intel_vgpu_workload *workload)
   1363 {
   1364 	struct intel_vgpu_submission *s = &workload->vgpu->submission;
   1365 
   1366 	release_shadow_batch_buffer(workload);
   1367 	release_shadow_wa_ctx(&workload->wa_ctx);
   1368 
   1369 	if (workload->shadow_mm)
   1370 		intel_vgpu_mm_put(workload->shadow_mm);
   1371 
   1372 	kmem_cache_free(s->workloads, workload);
   1373 }
   1374 
   1375 static struct intel_vgpu_workload *
   1376 alloc_workload(struct intel_vgpu *vgpu)
   1377 {
   1378 	struct intel_vgpu_submission *s = &vgpu->submission;
   1379 	struct intel_vgpu_workload *workload;
   1380 
   1381 	workload = kmem_cache_zalloc(s->workloads, GFP_KERNEL);
   1382 	if (!workload)
   1383 		return ERR_PTR(-ENOMEM);
   1384 
   1385 	INIT_LIST_HEAD(&workload->list);
   1386 	INIT_LIST_HEAD(&workload->shadow_bb);
   1387 
   1388 	init_waitqueue_head(&workload->shadow_ctx_status_wq);
   1389 	atomic_set(&workload->shadow_ctx_active, 0);
   1390 
   1391 	workload->status = -EINPROGRESS;
   1392 	workload->vgpu = vgpu;
   1393 
   1394 	return workload;
   1395 }
   1396 
   1397 #define RING_CTX_OFF(x) \
   1398 	offsetof(struct execlist_ring_context, x)
   1399 
   1400 static void read_guest_pdps(struct intel_vgpu *vgpu,
   1401 		u64 ring_context_gpa, u32 pdp[8])
   1402 {
   1403 	u64 gpa;
   1404 	int i;
   1405 
   1406 	gpa = ring_context_gpa + RING_CTX_OFF(pdps[0].val);
   1407 
   1408 	for (i = 0; i < 8; i++)
   1409 		intel_gvt_hypervisor_read_gpa(vgpu,
   1410 				gpa + i * 8, &pdp[7 - i], 4);
   1411 }
   1412 
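         /*
          * Build or look up the shadow PPGTT mm for this workload from the
          * guest PDPs in the ring context, according to the context's
          * addressing mode (legacy 32-bit vs. legacy 64-bit).
          */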
   1413 static int prepare_mm(struct intel_vgpu_workload *workload)
   1414 {
   1415 	struct execlist_ctx_descriptor_format *desc = &workload->ctx_desc;
   1416 	struct intel_vgpu_mm *mm;
   1417 	struct intel_vgpu *vgpu = workload->vgpu;
   1418 	enum intel_gvt_gtt_type root_entry_type;
   1419 	u64 pdps[GVT_RING_CTX_NR_PDPS];
   1420 
   1421 	switch (desc->addressing_mode) {
   1422 	case 1: /* legacy 32-bit */
   1423 		root_entry_type = GTT_TYPE_PPGTT_ROOT_L3_ENTRY;
   1424 		break;
   1425 	case 3: /* legacy 64-bit */
   1426 		root_entry_type = GTT_TYPE_PPGTT_ROOT_L4_ENTRY;
   1427 		break;
   1428 	default:
   1429 		gvt_vgpu_err("Advanced Context mode(SVM) is not supported!\n");
   1430 		return -EINVAL;
   1431 	}
   1432 
   1433 	read_guest_pdps(workload->vgpu, workload->ring_context_gpa, (void *)pdps);
   1434 
   1435 	mm = intel_vgpu_get_ppgtt_mm(workload->vgpu, root_entry_type, pdps);
   1436 	if (IS_ERR(mm))
   1437 		return PTR_ERR(mm);
   1438 
   1439 	workload->shadow_mm = mm;
   1440 	return 0;
   1441 }
   1442 
   1443 #define same_context(a, b) (((a)->context_id == (b)->context_id) && \
   1444 		((a)->lrca == (b)->lrca))
   1445 
   1446 /**
   1447  * intel_vgpu_create_workload - create a vGPU workload
   1448  * @vgpu: a vGPU
   1449  * @ring_id: ring index
   1450  * @desc: a guest context descriptor
   1451  *
   1452  * This function is called when creating a vGPU workload.
   1453  *
   1454  * Returns:
   1455  * struct intel_vgpu_workload * on success, negative error code in
   1456  * pointer if failed.
   1457  *
   1458  */
   1459 struct intel_vgpu_workload *
   1460 intel_vgpu_create_workload(struct intel_vgpu *vgpu, int ring_id,
   1461 			   struct execlist_ctx_descriptor_format *desc)
   1462 {
   1463 	struct intel_vgpu_submission *s = &vgpu->submission;
   1464 	struct list_head *q = workload_q_head(vgpu, ring_id);
   1465 	struct intel_vgpu_workload *last_workload = NULL;
   1466 	struct intel_vgpu_workload *workload = NULL;
   1467 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
   1468 	u64 ring_context_gpa;
   1469 	u32 head, tail, start, ctl, ctx_ctl, per_ctx, indirect_ctx;
   1470 	u32 guest_head;
   1471 	int ret;
   1472 
   1473 	ring_context_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm,
   1474 			(u32)((desc->lrca + 1) << I915_GTT_PAGE_SHIFT));
   1475 	if (ring_context_gpa == INTEL_GVT_INVALID_ADDR) {
   1476 		gvt_vgpu_err("invalid guest context LRCA: %x\n", desc->lrca);
   1477 		return ERR_PTR(-EINVAL);
   1478 	}
   1479 
   1480 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1481 			RING_CTX_OFF(ring_header.val), &head, 4);
   1482 
   1483 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1484 			RING_CTX_OFF(ring_tail.val), &tail, 4);
   1485 
   1486 	guest_head = head;
   1487 
   1488 	head &= RB_HEAD_OFF_MASK;
   1489 	tail &= RB_TAIL_OFF_MASK;
   1490 
   1491 	list_for_each_entry_reverse(last_workload, q, list) {
   1492 
   1493 		if (same_context(&last_workload->ctx_desc, desc)) {
   1494 			gvt_dbg_el("ring id %d cur workload == last\n",
   1495 					ring_id);
   1496 			gvt_dbg_el("ctx head %x real head %lx\n", head,
   1497 					last_workload->rb_tail);
   1498 			/*
   1499 			 * cannot use guest context head pointer here,
   1500 			 * as it might not be updated at this time
   1501 			 */
   1502 			head = last_workload->rb_tail;
   1503 			break;
   1504 		}
   1505 	}
   1506 
   1507 	gvt_dbg_el("ring id %d begin a new workload\n", ring_id);
   1508 
   1509 	/* record some ring buffer register values for scan and shadow */
   1510 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1511 			RING_CTX_OFF(rb_start.val), &start, 4);
   1512 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1513 			RING_CTX_OFF(rb_ctrl.val), &ctl, 4);
   1514 	intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1515 			RING_CTX_OFF(ctx_ctrl.val), &ctx_ctl, 4);
   1516 
   1517 	if (!intel_gvt_ggtt_validate_range(vgpu, start,
   1518 				_RING_CTL_BUF_SIZE(ctl))) {
   1519 		gvt_vgpu_err("context contain invalid rb at: 0x%x\n", start);
   1520 		return ERR_PTR(-EINVAL);
   1521 	}
   1522 
   1523 	workload = alloc_workload(vgpu);
   1524 	if (IS_ERR(workload))
   1525 		return workload;
   1526 
   1527 	workload->ring_id = ring_id;
   1528 	workload->ctx_desc = *desc;
   1529 	workload->ring_context_gpa = ring_context_gpa;
   1530 	workload->rb_head = head;
   1531 	workload->guest_rb_head = guest_head;
   1532 	workload->rb_tail = tail;
   1533 	workload->rb_start = start;
   1534 	workload->rb_ctl = ctl;
   1535 
   1536 	if (ring_id == RCS0) {
   1537 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1538 			RING_CTX_OFF(bb_per_ctx_ptr.val), &per_ctx, 4);
   1539 		intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa +
   1540 			RING_CTX_OFF(rcs_indirect_ctx.val), &indirect_ctx, 4);
   1541 
   1542 		workload->wa_ctx.indirect_ctx.guest_gma =
   1543 			indirect_ctx & INDIRECT_CTX_ADDR_MASK;
   1544 		workload->wa_ctx.indirect_ctx.size =
   1545 			(indirect_ctx & INDIRECT_CTX_SIZE_MASK) *
   1546 			CACHELINE_BYTES;
   1547 
   1548 		if (workload->wa_ctx.indirect_ctx.size != 0) {
   1549 			if (!intel_gvt_ggtt_validate_range(vgpu,
   1550 				workload->wa_ctx.indirect_ctx.guest_gma,
   1551 				workload->wa_ctx.indirect_ctx.size)) {
   1552 				gvt_vgpu_err("invalid wa_ctx at: 0x%lx\n",
   1553 				    workload->wa_ctx.indirect_ctx.guest_gma);
   1554 				kmem_cache_free(s->workloads, workload);
   1555 				return ERR_PTR(-EINVAL);
   1556 			}
   1557 		}
   1558 
   1559 		workload->wa_ctx.per_ctx.guest_gma =
   1560 			per_ctx & PER_CTX_ADDR_MASK;
   1561 		workload->wa_ctx.per_ctx.valid = per_ctx & 1;
   1562 		if (workload->wa_ctx.per_ctx.valid) {
   1563 			if (!intel_gvt_ggtt_validate_range(vgpu,
   1564 				workload->wa_ctx.per_ctx.guest_gma,
   1565 				CACHELINE_BYTES)) {
   1566 				gvt_vgpu_err("invalid per_ctx at: 0x%lx\n",
   1567 					workload->wa_ctx.per_ctx.guest_gma);
   1568 				kmem_cache_free(s->workloads, workload);
   1569 				return ERR_PTR(-EINVAL);
   1570 			}
   1571 		}
   1572 	}
   1573 
   1574 	gvt_dbg_el("workload %p ring id %d head %x tail %x start %x ctl %x\n",
   1575 			workload, ring_id, head, tail, start, ctl);
   1576 
   1577 	ret = prepare_mm(workload);
   1578 	if (ret) {
   1579 		kmem_cache_free(s->workloads, workload);
   1580 		return ERR_PTR(ret);
   1581 	}
   1582 
   1583 	/* Only scan and shadow the first workload in the queue
   1584 	 * as there is only one pre-allocated buf-obj for shadow.
   1585 	 */
   1586 	if (list_empty(workload_q_head(vgpu, ring_id))) {
   1587 		intel_runtime_pm_get(&dev_priv->runtime_pm);
   1588 		ret = intel_gvt_scan_and_shadow_workload(workload);
   1589 		intel_runtime_pm_put_unchecked(&dev_priv->runtime_pm);
   1590 	}
   1591 
   1592 	if (ret) {
   1593 		if (vgpu_is_vm_unhealthy(ret))
   1594 			enter_failsafe_mode(vgpu, GVT_FAILSAFE_GUEST_ERR);
   1595 		intel_vgpu_destroy_workload(workload);
   1596 		return ERR_PTR(ret);
   1597 	}
   1598 
   1599 	return workload;
   1600 }
   1601 
   1602 /**
    1603  * intel_vgpu_queue_workload - Queue a vGPU workload
   1604  * @workload: the workload to queue in
   1605  */
   1606 void intel_vgpu_queue_workload(struct intel_vgpu_workload *workload)
   1607 {
   1608 	list_add_tail(&workload->list,
   1609 		workload_q_head(workload->vgpu, workload->ring_id));
   1610 	intel_gvt_kick_schedule(workload->vgpu->gvt);
   1611 	wake_up(&workload->vgpu->gvt->scheduler.waitq[workload->ring_id]);
   1612 }
   1613