      1 /*	$NetBSD: sched_main.c,v 1.11 2021/12/19 12:42:58 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright 2015 Advanced Micro Devices, Inc.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     22  * OTHER DEALINGS IN THE SOFTWARE.
     23  *
     24  */
     25 
     26 /**
     27  * DOC: Overview
     28  *
      29  * The GPU scheduler provides entities which allow userspace to push jobs
      30  * into software queues which are then scheduled on a hardware run queue.
      31  * The software queues have a priority among them. The scheduler selects entities
      32  * from the run queue using a FIFO. The scheduler provides dependency handling
      33  * features among jobs. The driver is expected to provide callback functions for
      34  * backend operations to the scheduler, such as submitting a job to the hardware
      35  * run queue, returning the dependencies of a job, etc.
     36  *
     37  * The organisation of the scheduler is the following:
     38  *
     39  * 1. Each hw run queue has one scheduler
     40  * 2. Each scheduler has multiple run queues with different priorities
      41  *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
     42  * 3. Each scheduler run queue has a queue of entities to schedule
     43  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
     44  *    the hardware.
     45  *
      46  * The jobs in an entity are always scheduled in the order in which they were pushed.
     47  */
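
/*
 * Example (illustrative sketch only, not part of the original file): a driver
 * plugs into the scheduler by embedding struct drm_sched_job in its own job
 * structure and filling in a struct drm_sched_backend_ops.  The "foo_" names
 * below are hypothetical, and the callback signatures are assumed to match
 * this version of <drm/gpu_scheduler.h>.
 *
 *	static struct dma_fence *foo_run_job(struct drm_sched_job *sched_job)
 *	{
 *		struct foo_job *job = container_of(sched_job, struct foo_job, base);
 *
 *		// Push the job to the hardware ring and return the HW fence
 *		// that signals once the hardware has finished it.
 *		return foo_ring_submit(job);
 *	}
 *
 *	static void foo_timedout_job(struct drm_sched_job *sched_job)
 *	{
 *		// Called from the timeout (TDR) worker when a job exceeds
 *		// sched->timeout; typically triggers GPU reset and recovery.
 *	}
 *
 *	static void foo_free_job(struct drm_sched_job *sched_job)
 *	{
 *		drm_sched_job_cleanup(sched_job);
 *		// ...then free the driver's own job structure...
 *	}
 *
 *	static const struct drm_sched_backend_ops foo_sched_ops = {
 *		.dependency	= foo_dependency,	// optional: return unsignaled dependency fences
 *		.run_job	= foo_run_job,
 *		.timedout_job	= foo_timedout_job,
 *		.free_job	= foo_free_job,
 *	};
 */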
     48 
     49 #include <sys/cdefs.h>
     50 __KERNEL_RCSID(0, "$NetBSD: sched_main.c,v 1.11 2021/12/19 12:42:58 riastradh Exp $");
     51 
     52 #include <linux/kthread.h>
     53 #include <linux/wait.h>
     54 #include <linux/sched.h>
     55 #include <linux/completion.h>
     56 #include <uapi/linux/sched/types.h>
     57 
     58 #include <drm/drm_print.h>
     59 #include <drm/gpu_scheduler.h>
     60 #include <drm/spsc_queue.h>
     61 
     62 #define CREATE_TRACE_POINTS
     63 #include "gpu_scheduler_trace.h"
     64 
     65 #define to_drm_sched_job(sched_job)		\
     66 		container_of((sched_job), struct drm_sched_job, queue_node)
     67 
     68 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
     69 
     70 /**
     71  * drm_sched_rq_init - initialize a given run queue struct
      72  * @sched: scheduler instance the run queue belongs to
      73  * @rq: scheduler run queue
     74  *
     75  * Initializes a scheduler runqueue.
     76  */
     77 static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
     78 			      struct drm_sched_rq *rq)
     79 {
     80 	spin_lock_init(&rq->lock);
     81 	INIT_LIST_HEAD(&rq->entities);
     82 	rq->current_entity = NULL;
     83 	rq->sched = sched;
     84 }
     85 
     86 /**
     87  * drm_sched_rq_add_entity - add an entity
     88  *
     89  * @rq: scheduler run queue
     90  * @entity: scheduler entity
     91  *
     92  * Adds a scheduler entity to the run queue.
     93  */
     94 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
     95 			     struct drm_sched_entity *entity)
     96 {
     97 	if (!list_empty(&entity->list))
     98 		return;
     99 	spin_lock(&rq->lock);
    100 	atomic_inc(&rq->sched->score);
    101 	list_add_tail(&entity->list, &rq->entities);
    102 	spin_unlock(&rq->lock);
    103 }
    104 
    105 /**
    106  * drm_sched_rq_remove_entity - remove an entity
    107  *
    108  * @rq: scheduler run queue
    109  * @entity: scheduler entity
    110  *
    111  * Removes a scheduler entity from the run queue.
    112  */
    113 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
    114 				struct drm_sched_entity *entity)
    115 {
    116 	if (list_empty(&entity->list))
    117 		return;
    118 	spin_lock(&rq->lock);
    119 	atomic_dec(&rq->sched->score);
    120 	list_del_init(&entity->list);
    121 	if (rq->current_entity == entity)
    122 		rq->current_entity = NULL;
    123 	spin_unlock(&rq->lock);
    124 }
    125 
    126 /**
    127  * drm_sched_rq_select_entity - Select an entity which could provide a job to run
    128  *
    129  * @rq: scheduler run queue to check.
    130  *
     131  * Try to find a ready entity; returns NULL if none is found.
    132  */
    133 static struct drm_sched_entity *
    134 drm_sched_rq_select_entity(struct drm_sched_rq *rq)
    135 {
    136 	struct drm_sched_entity *entity;
    137 
    138 	spin_lock(&rq->lock);
    139 
    140 	entity = rq->current_entity;
    141 	if (entity) {
    142 		list_for_each_entry_continue(entity, &rq->entities, list) {
    143 			if (drm_sched_entity_is_ready(entity)) {
    144 				rq->current_entity = entity;
    145 				reinit_completion(&entity->entity_idle);
    146 				spin_unlock(&rq->lock);
    147 				return entity;
    148 			}
    149 		}
    150 	}
    151 
    152 	list_for_each_entry(entity, &rq->entities, list) {
    153 
    154 		if (drm_sched_entity_is_ready(entity)) {
    155 			rq->current_entity = entity;
    156 			reinit_completion(&entity->entity_idle);
    157 			spin_unlock(&rq->lock);
    158 			return entity;
    159 		}
    160 
    161 		if (entity == rq->current_entity)
    162 			break;
    163 	}
    164 
    165 	spin_unlock(&rq->lock);
    166 
    167 	return NULL;
    168 }
    169 
    170 /**
     171  * drm_sched_dependency_optimized - test if the dependency can be optimized
    172  *
    173  * @fence: the dependency fence
    174  * @entity: the entity which depends on the above fence
    175  *
    176  * Returns true if the dependency can be optimized and false otherwise
    177  */
    178 bool drm_sched_dependency_optimized(struct dma_fence* fence,
    179 				    struct drm_sched_entity *entity)
    180 {
    181 	struct drm_gpu_scheduler *sched = entity->rq->sched;
    182 	struct drm_sched_fence *s_fence;
    183 
    184 	if (!fence || dma_fence_is_signaled(fence))
    185 		return false;
    186 	if (fence->context == entity->fence_context)
    187 		return true;
    188 	s_fence = to_drm_sched_fence(fence);
    189 	if (s_fence && s_fence->sched == sched)
    190 		return true;
    191 
    192 	return false;
    193 }
    194 EXPORT_SYMBOL(drm_sched_dependency_optimized);
    195 
    196 /**
    197  * drm_sched_start_timeout - start timeout for reset worker
    198  *
    199  * @sched: scheduler instance to start the worker for
    200  *
    201  * Start the timeout for the given scheduler.
    202  */
    203 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
    204 {
    205 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
    206 	    !list_empty(&sched->ring_mirror_list))
    207 		schedule_delayed_work(&sched->work_tdr, sched->timeout);
    208 }
    209 
    210 /**
    211  * drm_sched_fault - immediately start timeout handler
    212  *
    213  * @sched: scheduler where the timeout handling should be started.
    214  *
    215  * Start timeout handling immediately when the driver detects a hardware fault.
    216  */
    217 void drm_sched_fault(struct drm_gpu_scheduler *sched)
    218 {
    219 	mod_delayed_work(system_wq, &sched->work_tdr, 0);
    220 }
    221 EXPORT_SYMBOL(drm_sched_fault);
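
/*
 * Example (illustrative sketch, hypothetical "foo_" names): a driver's fault
 * or error-interrupt handler can kick the timeout handler right away instead
 * of waiting for sched->timeout to expire:
 *
 *	static void foo_fault_irq(struct foo_ring *ring)
 *	{
 *		// ...latch the fault state from the hardware registers...
 *		drm_sched_fault(&ring->sched);
 *	}
 */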
    222 
    223 /**
    224  * drm_sched_suspend_timeout - Suspend scheduler job timeout
    225  *
    226  * @sched: scheduler instance for which to suspend the timeout
    227  *
    228  * Suspend the delayed work timeout for the scheduler. This is done by
     229  * modifying the delayed work timeout to an arbitrarily large value,
    230  * MAX_SCHEDULE_TIMEOUT in this case. Note that this function can be
    231  * called from an IRQ context.
    232  *
     233  * Returns the remaining timeout.
    234  *
    235  */
    236 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
    237 {
    238 #ifdef __NetBSD__
    239 	/* XXX Currently nothing uses the return value.  */
    240 	cancel_delayed_work(&sched->work_tdr);
    241 	return -123456789;
    242 #else
    243 	unsigned long sched_timeout, now = jiffies;
    244 
    245 	sched_timeout = sched->work_tdr.timer.expires;
    246 
    247 	/*
    248 	 * Modify the timeout to an arbitrarily large value. This also prevents
     249 	 * the timeout from being restarted when new submissions arrive.
    250 	 */
    251 	if (mod_delayed_work(system_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
    252 			&& time_after(sched_timeout, now))
    253 		return sched_timeout - now;
    254 	else
    255 		return sched->timeout;
    256 #endif
    257 }
    258 EXPORT_SYMBOL(drm_sched_suspend_timeout);
    259 
    260 /**
    261  * drm_sched_resume_timeout - Resume scheduler job timeout
    262  *
    263  * @sched: scheduler instance for which to resume the timeout
    264  * @remaining: remaining timeout
    265  *
    266  * Resume the delayed work timeout for the scheduler. Note that
    267  * this function can be called from an IRQ context.
    268  */
    269 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
    270 		unsigned long remaining)
    271 {
    272 	unsigned long flags;
    273 
    274 	spin_lock_irqsave(&sched->job_list_lock, flags);
    275 
    276 	if (list_empty(&sched->ring_mirror_list))
    277 		cancel_delayed_work(&sched->work_tdr);
    278 	else
    279 		mod_delayed_work(system_wq, &sched->work_tdr, remaining);
    280 
    281 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
    282 }
    283 EXPORT_SYMBOL(drm_sched_resume_timeout);
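
/*
 * Example (illustrative sketch, hypothetical "foo_" names): the suspend and
 * resume helpers above are meant to be used as a pair, e.g. around a window
 * in which the hardware is temporarily taken away from this scheduler and
 * jobs are expected to make no progress.  (On NetBSD the returned remaining
 * time is currently a dummy value, see above.)
 *
 *	static void foo_world_switch(struct foo_ring *ring)
 *	{
 *		unsigned long remaining;
 *
 *		remaining = drm_sched_suspend_timeout(&ring->sched);
 *		// ...hardware is owned by someone else for a while...
 *		drm_sched_resume_timeout(&ring->sched, remaining);
 *	}
 */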
    284 
    285 static void drm_sched_job_begin(struct drm_sched_job *s_job)
    286 {
    287 	struct drm_gpu_scheduler *sched = s_job->sched;
    288 	unsigned long flags;
    289 
    290 	spin_lock_irqsave(&sched->job_list_lock, flags);
    291 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
    292 	drm_sched_start_timeout(sched);
    293 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
    294 }
    295 
    296 static void drm_sched_job_timedout(struct work_struct *work)
    297 {
    298 	struct drm_gpu_scheduler *sched;
    299 	struct drm_sched_job *job;
    300 	unsigned long flags;
    301 
    302 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
    303 
    304 	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
    305 	spin_lock_irqsave(&sched->job_list_lock, flags);
    306 	job = list_first_entry_or_null(&sched->ring_mirror_list,
    307 				       struct drm_sched_job, node);
    308 
    309 	if (job) {
    310 		/*
     311 		 * Remove the bad job so it cannot be freed by a concurrent
     312 		 * drm_sched_get_cleanup_job. It will be reinserted after
     313 		 * sched->thread is parked, at which point it is safe.
    314 		 */
    315 		list_del_init(&job->node);
    316 		spin_unlock_irqrestore(&sched->job_list_lock, flags);
    317 
    318 		job->sched->ops->timedout_job(job);
    319 
    320 		/*
     321 		 * The guilty job completed and hence needs to be removed manually;
     322 		 * see the drm_sched_stop() documentation.
    323 		 */
    324 		if (sched->free_guilty) {
    325 			job->sched->ops->free_job(job);
    326 			sched->free_guilty = false;
    327 		}
    328 	} else {
    329 		spin_unlock_irqrestore(&sched->job_list_lock, flags);
    330 	}
    331 
    332 	spin_lock_irqsave(&sched->job_list_lock, flags);
    333 	drm_sched_start_timeout(sched);
    334 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
    335 }
    336 
     337 /**
     338  * drm_sched_increase_karma - Update sched_entity guilty flag
     339  *
     340  * @bad: The job guilty of the timeout
     341  *
     342  * Increments the karma of the 'bad' job for every hang it causes. If this
     343  * exceeds the hang limit of the scheduler, the respective sched entity is
     344  * marked guilty and jobs from it will not be scheduled further.
     345  */
    346 void drm_sched_increase_karma(struct drm_sched_job *bad)
    347 {
    348 	int i;
    349 	struct drm_sched_entity *tmp;
    350 	struct drm_sched_entity *entity;
    351 	struct drm_gpu_scheduler *sched = bad->sched;
    352 
     353 	/* Don't increase @bad's karma if it's from the KERNEL RQ, because a GPU
     354 	 * hang can sometimes corrupt kernel jobs (like VM updating jobs), but
     355 	 * keep in mind that kernel jobs are always considered good.
    356 	 */
    357 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
    358 		atomic_inc(&bad->karma);
    359 		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
    360 		     i++) {
    361 			struct drm_sched_rq *rq = &sched->sched_rq[i];
    362 
    363 			spin_lock(&rq->lock);
    364 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
    365 				if (bad->s_fence->scheduled.context ==
    366 				    entity->fence_context) {
    367 					if (atomic_read(&bad->karma) >
    368 					    bad->sched->hang_limit)
    369 						if (entity->guilty)
    370 							atomic_set(entity->guilty, 1);
    371 					break;
    372 				}
    373 			}
    374 			spin_unlock(&rq->lock);
    375 			if (&entity->list != &rq->entities)
    376 				break;
    377 		}
    378 	}
    379 }
    380 EXPORT_SYMBOL(drm_sched_increase_karma);
    381 
    382 /**
    383  * drm_sched_stop - stop the scheduler
    384  *
    385  * @sched: scheduler instance
    386  * @bad: job which caused the time out
    387  *
     388  * Stop the scheduler, and also remove and free all completed jobs.
     389  * Note: the bad job will not be freed, as it might be used later, so it is
     390  * the caller's responsibility to release it manually if it is no longer part
     391  * of the mirror list.
    392  *
    393  */
    394 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
    395 {
    396 	struct drm_sched_job *s_job, *tmp;
    397 	unsigned long flags;
    398 
    399 	kthread_park(sched->thread);
    400 
    401 	/*
     402 	 * Reinsert the bad job here - now it is safe, as
     403 	 * drm_sched_get_cleanup_job cannot race against us and release the
     404 	 * bad job at this point - we parked (waited for) any in-progress
     405 	 * (earlier) cleanups, and drm_sched_get_cleanup_job will not be
     406 	 * called again until the scheduler thread is unparked.
    407 	 */
    408 	if (bad && bad->sched == sched)
    409 		/*
    410 		 * Add at the head of the queue to reflect it was the earliest
    411 		 * job extracted.
    412 		 */
    413 		list_add(&bad->node, &sched->ring_mirror_list);
    414 
    415 	/*
     416 	 * Iterate over the job list from the latest to the earliest job and
     417 	 * either deactivate their HW callbacks or remove them from the mirror
     418 	 * list if they have already signaled.
     419 	 * This iteration is thread-safe as the sched thread is stopped.
    420 	 */
    421 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
    422 		if (s_job->s_fence->parent &&
    423 		    dma_fence_remove_callback(s_job->s_fence->parent,
    424 					      &s_job->cb)) {
    425 			atomic_dec(&sched->hw_rq_count);
    426 		} else {
    427 			/*
    428 			 * remove job from ring_mirror_list.
    429 			 * Locking here is for concurrent resume timeout
    430 			 */
    431 			spin_lock_irqsave(&sched->job_list_lock, flags);
    432 			list_del_init(&s_job->node);
    433 			spin_unlock_irqrestore(&sched->job_list_lock, flags);
    434 
    435 			/*
     436 			 * Wait for the job's HW fence callback to finish using
     437 			 * s_job before releasing it.
     438 			 *
     439 			 * The job is still alive, so the fence refcount is at least 1.
    440 			 */
    441 			dma_fence_wait(&s_job->s_fence->finished, false);
    442 
    443 			/*
    444 			 * We must keep bad job alive for later use during
    445 			 * recovery by some of the drivers but leave a hint
    446 			 * that the guilty job must be released.
    447 			 */
    448 			if (bad != s_job)
    449 				sched->ops->free_job(s_job);
    450 			else
    451 				sched->free_guilty = true;
    452 		}
    453 	}
    454 
    455 	/*
     456 	 * Stop the pending timer in flight, as we rearm it in drm_sched_start.
     457 	 * This prevents pending timeout work in progress from firing right after
     458 	 * this TDR finishes and before the newly restarted jobs have had a
     459 	 * chance to complete.
    460 	 */
    461 	cancel_delayed_work(&sched->work_tdr);
    462 }
    463 
    464 EXPORT_SYMBOL(drm_sched_stop);
    465 
    466 /**
     467  * drm_sched_start - recover jobs after a reset
    468  *
    469  * @sched: scheduler instance
    470  * @full_recovery: proceed with complete sched restart
    471  *
    472  */
    473 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
    474 {
    475 	struct drm_sched_job *s_job, *tmp;
    476 	unsigned long flags;
    477 	int r;
    478 
    479 	/*
     480 	 * Locking the list is not required here, as the sched thread is parked
     481 	 * and so no new jobs are being inserted or removed. Also, concurrent
     482 	 * GPU recoveries can't run in parallel.
    483 	 */
    484 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
    485 		struct dma_fence *fence = s_job->s_fence->parent;
    486 
    487 		atomic_inc(&sched->hw_rq_count);
    488 
    489 		if (!full_recovery)
    490 			continue;
    491 
    492 		if (fence) {
    493 			r = dma_fence_add_callback(fence, &s_job->cb,
    494 						   drm_sched_process_job);
    495 			if (r == -ENOENT)
    496 				drm_sched_process_job(fence, &s_job->cb);
    497 			else if (r)
    498 				DRM_ERROR("fence add callback failed (%d)\n",
    499 					  r);
    500 		} else
    501 			drm_sched_process_job(NULL, &s_job->cb);
    502 	}
    503 
    504 	if (full_recovery) {
    505 		spin_lock_irqsave(&sched->job_list_lock, flags);
    506 		drm_sched_start_timeout(sched);
    507 		spin_unlock_irqrestore(&sched->job_list_lock, flags);
    508 	}
    509 
    510 	kthread_unpark(sched->thread);
    511 }
    512 EXPORT_SYMBOL(drm_sched_start);
    513 
    514 /**
     515  * drm_sched_resubmit_jobs - helper to relaunch jobs from the mirror ring list
    516  *
    517  * @sched: scheduler instance
    518  *
    519  */
    520 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
    521 {
    522 	struct drm_sched_job *s_job, *tmp;
    523 	uint64_t guilty_context;
    524 	bool found_guilty = false;
    525 	struct dma_fence *fence;
    526 
    527 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
    528 		struct drm_sched_fence *s_fence = s_job->s_fence;
    529 
    530 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
    531 			found_guilty = true;
    532 			guilty_context = s_job->s_fence->scheduled.context;
    533 		}
    534 
    535 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
    536 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
    537 
    538 		dma_fence_put(s_job->s_fence->parent);
    539 		fence = sched->ops->run_job(s_job);
    540 
    541 		if (IS_ERR_OR_NULL(fence)) {
    542 			if (IS_ERR(fence))
    543 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
    544 
    545 			s_job->s_fence->parent = NULL;
    546 		} else {
    547 			s_job->s_fence->parent = fence;
    548 		}
    549 
    550 
    551 	}
    552 }
    553 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
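
/*
 * Example (illustrative sketch, hypothetical "foo_" names): drivers commonly
 * combine drm_sched_stop(), drm_sched_increase_karma(),
 * drm_sched_resubmit_jobs() and drm_sched_start() in their timeout/reset
 * path roughly as follows:
 *
 *	static void foo_timedout_job(struct drm_sched_job *bad)
 *	{
 *		struct foo_device *fdev = foo_device_from_sched(bad->sched);
 *
 *		// Park the scheduler thread and pick the mirror list apart.
 *		drm_sched_stop(bad->sched, bad);
 *
 *		// Record the hang against the offending entity.
 *		drm_sched_increase_karma(bad);
 *
 *		// Reset the hardware (driver specific).
 *		foo_hw_reset(fdev);
 *
 *		// Re-run everything still on the mirror list, then unpark the
 *		// scheduler thread and re-arm the timeout.
 *		drm_sched_resubmit_jobs(bad->sched);
 *		drm_sched_start(bad->sched, true);
 *	}
 */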
    554 
    555 /**
    556  * drm_sched_job_init - init a scheduler job
    557  *
    558  * @job: scheduler job to init
    559  * @entity: scheduler entity to use
    560  * @owner: job owner for debugging
    561  *
    562  * Refer to drm_sched_entity_push_job() documentation
    563  * for locking considerations.
    564  *
    565  * Returns 0 for success, negative error code otherwise.
    566  */
    567 int drm_sched_job_init(struct drm_sched_job *job,
    568 		       struct drm_sched_entity *entity,
    569 		       void *owner)
    570 {
    571 	struct drm_gpu_scheduler *sched;
    572 
    573 	drm_sched_entity_select_rq(entity);
    574 	if (!entity->rq)
    575 		return -ENOENT;
    576 
    577 	sched = entity->rq->sched;
    578 
    579 	job->sched = sched;
    580 	job->entity = entity;
    581 	job->s_priority = entity->rq - sched->sched_rq;
    582 	job->s_fence = drm_sched_fence_create(entity, owner);
    583 	if (!job->s_fence)
    584 		return -ENOMEM;
    585 	job->id = atomic64_inc_return(&sched->job_id_count);
    586 
    587 	INIT_LIST_HEAD(&job->node);
    588 
    589 	return 0;
    590 }
    591 EXPORT_SYMBOL(drm_sched_job_init);
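
/*
 * Example (illustrative sketch, hypothetical "foo_" names; the
 * drm_sched_entity_push_job() signature is assumed to match this version of
 * the scheduler): a typical submission path initializes the job against an
 * entity and then pushes it:
 *
 *	static int foo_submit(struct foo_job *job,
 *	    struct drm_sched_entity *entity, void *owner)
 *	{
 *		int ret;
 *
 *		ret = drm_sched_job_init(&job->base, entity, owner);
 *		if (ret)
 *			return ret;
 *
 *		// ...record dependencies so that the ->dependency callback
 *		// can hand them to the scheduler later...
 *
 *		// After this call the job is owned by the scheduler and must
 *		// not be touched directly any more.
 *		drm_sched_entity_push_job(&job->base, entity);
 *		return 0;
 *	}
 */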
    592 
    593 /**
    594  * drm_sched_job_cleanup - clean up scheduler job resources
    595  *
    596  * @job: scheduler job to clean up
    597  */
    598 void drm_sched_job_cleanup(struct drm_sched_job *job)
    599 {
    600 	dma_fence_put(&job->s_fence->finished);
    601 	job->s_fence = NULL;
    602 }
    603 EXPORT_SYMBOL(drm_sched_job_cleanup);
    604 
    605 /**
    606  * drm_sched_ready - is the scheduler ready
    607  *
    608  * @sched: scheduler instance
    609  *
    610  * Return true if we can push more jobs to the hw, otherwise false.
    611  */
    612 static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
    613 {
    614 	return atomic_read(&sched->hw_rq_count) <
    615 		sched->hw_submission_limit;
    616 }
    617 
    618 /**
    619  * drm_sched_wakeup - Wake up the scheduler when it is ready
    620  *
    621  * @sched: scheduler instance
    622  *
    623  */
    624 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
    625 {
    626 	assert_spin_locked(&sched->job_list_lock);
    627 	if (drm_sched_ready(sched))
    628 		DRM_SPIN_WAKEUP_ONE(&sched->wake_up_worker,
    629 		    &sched->job_list_lock);
    630 }
    631 
    632 /**
    633  * drm_sched_select_entity - Select next entity to process
    634  *
    635  * @sched: scheduler instance
    636  *
    637  * Returns the entity to process or NULL if none are found.
    638  */
    639 static struct drm_sched_entity *
    640 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
    641 {
    642 	struct drm_sched_entity *entity;
    643 	int i;
    644 
    645 	if (!drm_sched_ready(sched))
    646 		return NULL;
    647 
     648 	/* The kernel run queue has higher priority than the normal run queue. */
    649 	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
    650 		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
    651 		if (entity)
    652 			break;
    653 	}
    654 
    655 	return entity;
    656 }
    657 
    658 /**
    659  * drm_sched_process_job - process a job
    660  *
    661  * @f: fence
     662  * @cb: fence callback
     663  *
     664  * Called after the job has finished execution.
    665  */
    666 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
    667 {
    668 	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
    669 	struct drm_sched_fence *s_fence = s_job->s_fence;
    670 	struct drm_gpu_scheduler *sched = s_fence->sched;
    671 
    672 	atomic_dec(&sched->hw_rq_count);
    673 	atomic_dec(&sched->score);
    674 
    675 	trace_drm_sched_process_job(s_fence);
    676 
    677 	drm_sched_fence_finished(s_fence);
    678 	spin_lock(&sched->job_list_lock);
    679 	DRM_SPIN_WAKEUP_ONE(&sched->wake_up_worker, &sched->job_list_lock);
    680 	spin_unlock(&sched->job_list_lock);
    681 }
    682 
    683 /**
    684  * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
    685  *
    686  * @sched: scheduler instance
    687  *
     688  * Returns the next finished job from the mirror list (if there is one)
     689  * that is ready to be destroyed.
    690  */
    691 static struct drm_sched_job *
    692 drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
    693 {
    694 	struct drm_sched_job *job;
    695 
    696 	assert_spin_locked(&sched->job_list_lock);
    697 
    698 	/*
     699 	 * Don't destroy jobs while the timeout worker is running, OR while the
     700 	 * thread is being parked and hence assumed not to touch ring_mirror_list.
    701 	 */
    702 	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
    703 	    !cancel_delayed_work(&sched->work_tdr)) ||
    704 	    __kthread_should_park(sched->thread))
    705 		return NULL;
    706 
    707 	job = list_first_entry_or_null(&sched->ring_mirror_list,
    708 				       struct drm_sched_job, node);
    709 
    710 	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
    711 		/* remove job from ring_mirror_list */
    712 		list_del_init(&job->node);
    713 	} else {
    714 		job = NULL;
    715 		/* queue timeout for next job */
    716 		drm_sched_start_timeout(sched);
    717 	}
    718 
    719 	return job;
    720 }
    721 
    722 /**
    723  * drm_sched_blocked - check if the scheduler is blocked
    724  *
    725  * @sched: scheduler instance
    726  *
    727  * Returns true if blocked, otherwise false.
    728  */
    729 static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
    730 {
    731 	if (kthread_should_park()) {
    732 		kthread_parkme();
    733 		return true;
    734 	}
    735 
    736 	return false;
    737 }
    738 
    739 /**
    740  * drm_sched_main - main scheduler thread
    741  *
    742  * @param: scheduler instance
    743  *
    744  * Returns 0.
    745  */
    746 static int drm_sched_main(void *param)
    747 {
    748 	struct sched_param sparam = {.sched_priority = 1};
    749 	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
    750 	int r;
    751 
    752 	sched_setscheduler(current, SCHED_FIFO, &sparam);
    753 
    754 	/* Wait for sched->thread to be initialized by drm_sched_init.  */
    755 	spin_lock(&sched->job_list_lock);
    756 	DRM_SPIN_WAIT_UNTIL(r, &sched->wake_up_worker, &sched->job_list_lock,
    757 	    sched->thread != NULL);
    758 	spin_unlock(&sched->job_list_lock);
    759 
    760 	while (!kthread_should_stop()) {
    761 		struct drm_sched_entity *entity = NULL;
    762 		struct drm_sched_fence *s_fence;
    763 		struct drm_sched_job *sched_job;
    764 		struct dma_fence *fence;
    765 		struct drm_sched_job *cleanup_job = NULL;
    766 
    767 		spin_lock(&sched->job_list_lock);
    768 		DRM_SPIN_WAIT_UNTIL(r, &sched->wake_up_worker,
    769 		    &sched->job_list_lock,
    770 		    ((cleanup_job = drm_sched_get_cleanup_job(sched)) ||
    771 			(!drm_sched_blocked(sched) &&
    772 			    (entity = drm_sched_select_entity(sched))) ||
    773 			kthread_should_stop()));
    774 		spin_unlock(&sched->job_list_lock);
    775 
    776 		if (cleanup_job) {
    777 			sched->ops->free_job(cleanup_job);
    778 			/* queue timeout for next job */
    779 			drm_sched_start_timeout(sched);
    780 		}
    781 
    782 		if (!entity)
    783 			continue;
    784 
    785 		sched_job = drm_sched_entity_pop_job(entity);
    786 
    787 		complete(&entity->entity_idle);
    788 
    789 		if (!sched_job)
    790 			continue;
    791 
    792 		s_fence = sched_job->s_fence;
    793 
    794 		atomic_inc(&sched->hw_rq_count);
    795 		drm_sched_job_begin(sched_job);
    796 
    797 		fence = sched->ops->run_job(sched_job);
    798 		drm_sched_fence_scheduled(s_fence);
    799 
    800 		if (!IS_ERR_OR_NULL(fence)) {
    801 			s_fence->parent = dma_fence_get(fence);
    802 			r = dma_fence_add_callback(fence, &sched_job->cb,
    803 						   drm_sched_process_job);
    804 			if (r == -ENOENT)
    805 				drm_sched_process_job(fence, &sched_job->cb);
    806 			else if (r)
    807 				DRM_ERROR("fence add callback failed (%d)\n",
    808 					  r);
    809 			dma_fence_put(fence);
    810 		} else {
    811 			if (IS_ERR(fence))
    812 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
    813 
    814 			drm_sched_process_job(NULL, &sched_job->cb);
    815 		}
    816 
    817 		spin_lock(&sched->job_list_lock);
    818 		DRM_SPIN_WAKEUP_ONE(&sched->job_scheduled,
    819 		    &sched->job_list_lock);
    820 		spin_unlock(&sched->job_list_lock);
    821 	}
    822 	return 0;
    823 }
    824 
    825 /**
    826  * drm_sched_init - Init a gpu scheduler instance
    827  *
    828  * @sched: scheduler instance
    829  * @ops: backend operations for this scheduler
    830  * @hw_submission: number of hw submissions that can be in flight
    831  * @hang_limit: number of times to allow a job to hang before dropping it
    832  * @timeout: timeout value in jiffies for the scheduler
    833  * @name: name used for debugging
    834  *
    835  * Return 0 on success, otherwise error code.
    836  */
    837 int drm_sched_init(struct drm_gpu_scheduler *sched,
    838 		   const struct drm_sched_backend_ops *ops,
    839 		   unsigned hw_submission,
    840 		   unsigned hang_limit,
    841 		   long timeout,
    842 		   const char *name)
    843 {
    844 	int i, ret;
    845 	sched->ops = ops;
    846 	sched->hw_submission_limit = hw_submission;
    847 	sched->name = name;
    848 	sched->timeout = timeout;
    849 	sched->hang_limit = hang_limit;
    850 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_MAX; i++)
    851 		drm_sched_rq_init(sched, &sched->sched_rq[i]);
    852 
    853 	DRM_INIT_WAITQUEUE(&sched->wake_up_worker, "drmschedw");
    854 	DRM_INIT_WAITQUEUE(&sched->job_scheduled, "drmschedj");
    855 	INIT_LIST_HEAD(&sched->ring_mirror_list);
    856 	spin_lock_init(&sched->job_list_lock);
    857 	atomic_set(&sched->hw_rq_count, 0);
    858 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
    859 	atomic_set(&sched->score, 0);
    860 	atomic64_set(&sched->job_id_count, 0);
    861 
     862 	/* Each scheduler will run on a separate kernel thread. */
    863 	struct task_struct *thread =
    864 	    kthread_run(drm_sched_main, sched, sched->name,
    865 		&sched->job_list_lock, &sched->wake_up_worker);
    866 	if (IS_ERR(thread)) {
    867 		ret = PTR_ERR(thread);
    868 		sched->thread = NULL;
    869 		DRM_ERROR("Failed to create scheduler for %s.\n", name);
    870 		return ret;
    871 	}
    872 	spin_lock(&sched->job_list_lock);
    873 	sched->thread = thread;
    874 	DRM_SPIN_WAKEUP_ALL(&sched->wake_up_worker, &sched->job_list_lock);
    875 	spin_unlock(&sched->job_list_lock);
    876 
    877 	sched->ready = true;
    878 	return 0;
    879 }
    880 EXPORT_SYMBOL(drm_sched_init);
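
/*
 * Example (illustrative sketch, hypothetical "foo_" names and values): a
 * driver typically creates one scheduler per hardware ring, using a backend
 * ops table like the foo_sched_ops sketched near the top of this file, and
 * later attaches entities to it with drm_sched_entity_init() (whose exact
 * signature depends on the scheduler version and is not spelled out here).
 *
 *	static int foo_ring_sched_init(struct foo_ring *ring)
 *	{
 *		// Allow 2 jobs in flight on the hardware, a hang limit of 1,
 *		// and a 5 second job timeout; all values are illustrative.
 *		return drm_sched_init(&ring->sched, &foo_sched_ops, 2, 1,
 *		    msecs_to_jiffies(5000), ring->name);
 *	}
 *
 * The matching teardown on driver unload is drm_sched_fini().
 */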
    881 
    882 /**
    883  * drm_sched_fini - Destroy a gpu scheduler
    884  *
    885  * @sched: scheduler instance
    886  *
    887  * Tears down and cleans up the scheduler.
    888  */
    889 void drm_sched_fini(struct drm_gpu_scheduler *sched)
    890 {
    891 	if (sched->thread)
    892 		kthread_stop(sched->thread);
    893 
    894 	sched->ready = false;
    895 
    896 	spin_lock_destroy(&sched->job_list_lock);
    897 	DRM_DESTROY_WAITQUEUE(&sched->job_scheduled);
    898 	DRM_DESTROY_WAITQUEUE(&sched->wake_up_worker);
    899 
    900 	for (int i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_MAX; i++) {
    901 		struct drm_sched_rq *rq = &sched->sched_rq[i];
    902 		spin_lock_destroy(&rq->lock);
    903 	}
    904 }
    905 EXPORT_SYMBOL(drm_sched_fini);
    906