/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/UBO results.  The register pressure heuristic doesn't want to pick
 * an instruction that starts consuming texture results, because that
 * instruction usually won't be the only user, so scheduling it increases
 * pressure.
 *
 * If you instead always force consumption of texture results, then in a case
 * where a single sample is used for many outputs you'll end up picking every
 * other user and still expanding register pressure.  The
 * partially_evaluated_path flag helps tremendously: if you happen for
 * whatever reason to pick a texture sample's output, then you'll try to
 * finish off that sample.  Future work may include doing some local search
 * before locking in a choice, to more reliably find the cases where just a
 * few choices going against the heuristic can manage to free the whole
 * vector.
 */
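
/* A sketch of how a backend might invoke this pass (illustrative only; the
 * option fields referenced here are the ones used elsewhere in this file,
 * but the threshold value is made up and needs per-GPU tuning):
 *
 *    const nir_schedule_options schedule_options = {
 *       .threshold = 48, // e.g. 64 scalar registers minus payload/headroom
 *    };
 *    nir_schedule(s, &schedule_options);
 *
 * If register allocation later fails, a backend can schedule again with
 * .fallback = true to switch to the minimum-pressure fallback heuristic
 * below.
 */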

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Rough estimate of the delay between starting this instruction and its
    * results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in each class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * In the forward direction we add @after as a child of @before, so that the
 * DAG heads are the instructions whose dependencies have all been satisfied
 * and we make our scheduling decisions from the top of the block downward.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, NULL);
   else
      dag_add_edge(&after->dag, &before->dag, NULL);
}

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}
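
/* Illustrative sketch of what the helpers above do for a register r0 during
 * the forward walk:
 *
 *    write r0    <- add_write_dep(): ordered after the previous writer (if
 *                   any), then recorded as the new last writer (*before)
 *    read  r0    <- add_read_dep(): ordered after the last writer only, so
 *    read  r0       reads between two writes may reorder freely
 *    write r0    <- add_write_dep(): ordered after the previous writer
 *
 * Write-after-read ordering is not added on this walk; it falls out of the
 * reverse walk, where add_dep() swaps before/after.
 */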

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we can use to add a read_dep while handling
       *   nir_instr_type_tex
       *
       * * One on the unknown-intrinsic set, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move a load_shared past a following store_shared, as that
       * could change the loaded value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
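
/* Example of the intrinsic_cb hook above (illustrative, not from this file):
 * a backend that needs its accesses to a hypothetical tile buffer serialized
 * could classify them like this, with TILE_BUFFER_CLASS being any
 * backend-chosen integer:
 *
 *    static bool
 *    tile_buffer_deps_cb(nir_intrinsic_instr *instr,
 *                        nir_schedule_dependency *dep, void *user_data)
 *    {
 *       switch (instr->intrinsic) {
 *       case nir_intrinsic_load_output:
 *          dep->klass = TILE_BUFFER_CLASS;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       case nir_intrinsic_store_output:
 *          dep->klass = TILE_BUFFER_CLASS;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       default:
 *          return false;
 *       }
 *    }
 */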

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}
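
/* Between them, the two walks give register accesses all three classic
 * hazard edges: the forward walk adds the read-after-write and
 * write-after-write dependencies, and the reverse walk (with before/after
 * swapped inside add_dep()) adds the write-after-read ones.
 */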

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}
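
/* Worked example (illustrative): for a "vec4 ssa_2 = fadd ssa_0, ssa_1"
 * that is the last remaining use of both of its vec4 sources, regs_freed
 * is +4 +4 -4 = +4 channels, so scheduling it is a net win for pressure.
 * A source being read for the first time isn't credited here; that
 * pressure increase is accounted in nir_schedule_mark_use() below.
 */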

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible; otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready):    ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instruction
    * consuming a value will tend to block us from finding the chance to free
    * that value.  This had a massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
    */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
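
/* Example (illustrative): for a chain tex (delay 100) -> fmul (delay 1) ->
 * store_output (delay 1), this bottom-up traversal computes max_delay 1 for
 * the store, 2 for the fmul, and 102 for the tex, so the CSP heuristic
 * favors starting the texture fetch as early as possible.
 */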

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}