/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "util/rb_tree.h"
#include "util/u_math.h"
#include "ir3_shader.h"

/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */
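
/* For example (illustrative IR, not from any particular shader):
 *
 *    ssa_1 (wrmask=0xf) = collect ssa_2, ssa_3, ssa_4, ssa_5
 *
 * Only ssa_1's top-level interval is actually allocated; ssa_2..ssa_5 are
 * locked to fixed offsets inside it, so assigning ssa_1 a physreg implicitly
 * assigns all of them at once.
 */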

/* ir3_reg_ctx implementation. */

static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
{
   unsigned reg = *(const unsigned *)data;
   const struct ir3_reg_interval *interval =
      ir3_rb_node_to_interval_const(node);
   if (interval->reg->interval_start > reg)
      return -1;
   else if (interval->reg->interval_end <= reg)
      return 1;
   else
      return 0;
}

static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node =
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
{
   struct ir3_reg_interval *interval =
      ir3_reg_interval_search_sloppy(tree, offset);
   if (!interval) {
      return NULL;
   } else if (interval->reg->interval_end > offset) {
      return interval;
   } else {
      /* There is no interval covering reg, and
       * ir3_reg_interval_search_sloppy() returned the closest range to the
       * left, so the next interval to the right should be the closest to
       * the right.
       */
      return ir3_reg_interval_next_or_null(interval);
   }
}

static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
   return b->reg->interval_start - a->reg->interval_start;
}

static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
{
   struct ir3_reg_interval *right =
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
   if (right && right->reg->interval_start < interval->reg->interval_end) {
      /* We disallow trees where different members have different half-ness.
       * This means that we can't treat bitcasts as copies like normal
       * split/collect, so something like this would require an extra copy
       * in mergedregs mode, and count as 4 half-units of register pressure
       * instead of 2:
       *
       * f16vec2 foo = unpackFloat2x16(bar)
       * ... = foo.x
       * ... = bar
       *
       * However, relaxing this rule would open a huge can of worms. What
       * happens when there's a vector of 16 things, and the fifth element
       * has been bitcasted as a half-reg? Would that element alone have to
       * be small enough to be used as a half-reg source? Let's keep that
       * can of worms firmly shut for now.
       */
      assert((interval->reg->flags & IR3_REG_HALF) ==
             (right->reg->flags & IR3_REG_HALF));

      if (right->reg->interval_end <= interval->reg->interval_end &&
          right->reg->interval_start >= interval->reg->interval_start) {
         /* Check if we're inserting something that's already inserted */
         assert(interval != right);

         /* "right" is contained in "interval" and must become a child of
          * it. There may be further children too.
          */
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
              right && right->reg->interval_start < interval->reg->interval_end;
              right = next, next = ir3_reg_interval_next_or_null(next)) {
            /* "right" must be contained in "interval." */
            assert(right->reg->interval_end <= interval->reg->interval_end);
            assert((interval->reg->flags & IR3_REG_HALF) ==
                   (right->reg->flags & IR3_REG_HALF));
            if (!right->parent)
               ctx->interval_delete(ctx, right);
            right->parent = interval;
            rb_tree_remove(tree, &right->node);
            rb_tree_insert(&interval->children, &right->node,
                           ir3_reg_interval_insert_cmp);
         }
      } else {
         /* "right" must contain "interval," since intervals must form a
          * tree.
          */
         assert(right->reg->interval_start <= interval->reg->interval_start);
         interval->parent = right;
         interval_insert(ctx, &right->children, interval);
         return;
      }
   }

   if (!interval->parent)
      ctx->interval_add(ctx, interval);
   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
   interval->inserted = true;
}

void
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   rb_tree_init(&interval->children);
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
static void
ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
                          struct ir3_reg_interval *interval)
{
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

void
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   if (interval->parent) {
      rb_tree_remove(&interval->parent->children, &interval->node);
   } else {
      ctx->interval_delete(ctx, interval);
      rb_tree_remove(&ctx->intervals, &interval->node);
   }

   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
                         node) {
      rb_tree_remove(&interval->children, &child->node);
      child->parent = interval->parent;

      if (interval->parent) {
         rb_tree_insert(&child->parent->children, &child->node,
                        ir3_reg_interval_insert_cmp);
      } else {
         ctx->interval_readd(ctx, interval, child);
         rb_tree_insert(&ctx->intervals, &child->node,
                        ir3_reg_interval_insert_cmp);
      }
   }

   interval->inserted = false;
}

static void
_mark_free(struct ir3_reg_interval *interval)
{
   interval->inserted = false;
   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      _mark_free(child);
   }
}

/* Remove an interval and all its children from the tree. */
void
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
   _mark_free(interval);
}

/* Used when popping an interval to be shuffled around. Don't disturb children
 * so that it can be later reinserted.
 */
static void
ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
}

static void
interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
              unsigned indent)
{
   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
                          interval->reg->interval_start);

   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      interval_dump(stream, child, indent + 1);
   }

   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
                          interval->reg->interval_end);
}

void
ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
{
   interval_dump(stream, interval, 0);
}

/* These are the core data structures used by the register allocator. First
 * ra_interval and ra_file, which are used for intra-block tracking and use
 * the ir3_reg_ctx infrastructure:
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* True if this is a source of the current instruction which is entirely
    * killed. This means we can allocate the dest over it, but we can't break
    * it up.
    */
   bool is_killed;

   /* True if this interval cannot be moved from its position. This is only
    * used for precolored inputs to ensure that other inputs don't get
    * allocated on top of them.
    */
   bool frozen;
};

struct ra_file {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   unsigned size;
   unsigned start;
};

/* State for inter-block tracking. When we split a live range to make space
 * for a vector, we may need to insert fixup code when a block has multiple
 * predecessors that have moved the same live value to different registers.
 * This keeps track of state required to do that.
 */
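/* Illustrative scenario (register numbers made up): a value sits in r0.x at
 * the end of predecessor A, but was shuffled to r2.x at the end of
 * predecessor B to make room for a vector. On the edge from B we then have
 * to move r2.x back to whichever register the successor block expects the
 * value in.
 */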

struct ra_block_state {
   /* Map of defining ir3_register -> physreg it was allocated to at the end
    * of the block.
    */
   struct hash_table *renames;

   /* For loops, we need to process a block before all its predecessors have
    * been processed. In particular, we need to pick registers for values
    * without knowing if all the predecessors have been renamed. This keeps
    * track of the registers we chose so that when we visit the back-edge we
    * can move them appropriately. If all predecessors have been visited
    * before this block is visited then we don't need to fill this out. This
    * is a map from ir3_register -> physreg.
    */
   struct hash_table *entry_regs;

   /* True if the block has been visited and "renames" is complete.
    */
   bool visited;
};

struct ra_parallel_copy {
   struct ra_interval *interval;
   physreg_t src;
};

/* The main context: */

struct ra_ctx {
   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
    * half of this file too.
    */
   struct ra_file full;

   /* hr0.x - hr63.w, only used without merged-regs. */
   struct ra_file half;

   /* Shared regs. */
   struct ra_file shared;

   struct ir3_liveness *live;

   struct ir3_block *block;

   const struct ir3_compiler *compiler;
   gl_shader_stage stage;

   /* Pending moves of top-level intervals that will be emitted once we're
    * finished:
    */
   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);

   struct ra_interval *intervals;
   struct ra_block_state *blocks;

   bool merged_regs;
};

#define foreach_interval(interval, file)                                       \
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
                    physreg_node)
#define foreach_interval_rev(interval, file)                                   \
   rb_tree_foreach_rev (struct ra_interval, interval,                          \
                        &(file)->physreg_intervals, physreg_node)
#define foreach_interval_safe(interval, file)                                  \
   rb_tree_foreach_safe (struct ra_interval, interval,                         \
                         &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev_safe(interval, file)                              \
   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
                            &(file)->physreg_intervals, physreg_node)

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_file_search_right(struct ra_file *file, physreg_t reg)
{
   return ra_interval_search_right(&file->physreg_intervals, reg);
}

static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_file *
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_file, ctx, reg_ctx);
}

static void
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
    */
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
      BITSET_CLEAR(file->available_to_evict, i);
   }

   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
                  ra_interval_insert_cmp);
}

static void
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
}

static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);

   child->physreg_start =
      parent->physreg_start + (child->interval.reg->interval_start -
                               parent->interval.reg->interval_start);
   child->physreg_end =
      child->physreg_start +
      (child->interval.reg->interval_end - child->interval.reg->interval_start);

   interval_add(ctx, _child);
}

static void
ra_file_init(struct ra_file *file)
{
   for (unsigned i = 0; i < file->size; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_init(&file->reg_ctx.intervals);
   rb_tree_init(&file->physreg_intervals);

   file->reg_ctx.interval_add = interval_add;
   file->reg_ctx.interval_delete = interval_delete;
   file->reg_ctx.interval_readd = interval_readd;
}

static void
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
{
   assert(interval->physreg_start < interval->physreg_end);
   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
}

static void
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
{
   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
}

static void
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
   }

   interval->is_killed = true;
}

static void
ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
   }

   interval->is_killed = false;
}

static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
{
   unsigned child_start = interval->interval.reg->interval_start;

   while (interval->interval.parent) {
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   }

   return interval->physreg_start +
          (child_start - interval->interval.reg->interval_start);
}

static unsigned
ra_interval_get_num(const struct ra_interval *interval)
{
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
                            interval->interval.reg->flags);
}

static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
   interval->is_killed = false;
   interval->frozen = false;
}

static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
{
   mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);

   ir3_reg_interval_dump(stream, &interval->interval);
}

static void
ra_file_dump(struct log_stream *stream, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      ra_interval_dump(stream, interval);
   }

   unsigned start, end;
   mesa_log_stream_printf(stream, "available:\n");
   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");

   mesa_log_stream_printf(stream, "available to evict:\n");
   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");
   mesa_log_stream_printf(stream, "start: %u\n", file->start);
}

static void
ra_ctx_dump(struct ra_ctx *ctx)
{
   struct log_stream *stream = mesa_log_streami();
   mesa_log_stream_printf(stream, "full:\n");
   ra_file_dump(stream, &ctx->full);
   mesa_log_stream_printf(stream, "half:\n");
   ra_file_dump(stream, &ctx->half);
   mesa_log_stream_printf(stream, "shared:\n");
   ra_file_dump(stream, &ctx->shared);
   mesa_log_stream_destroy(stream);
}

static unsigned
reg_file_size(struct ra_file *file, struct ir3_register *reg)
{
   /* Half-regs can only take up the first half of the combined regfile */
   if (reg->flags & IR3_REG_HALF)
      return MIN2(file->size, RA_HALF_SIZE);
   else
      return file->size;
}

/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
 * top-level intervals at once. Pop multiple intervals, then push them back in
 * any order.
 */
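/* For example, try_evict_regs() below swaps two same-sized intervals by
 * popping both and then pushing each one back at the other's old position:
 *
 *    struct ra_removed_interval a = ra_pop_interval(ctx, file, i1);
 *    struct ra_removed_interval b = ra_pop_interval(ctx, file, i2);
 *    ra_push_interval(ctx, file, &a, i2_start);
 *    ra_push_interval(ctx, file, &b, i1_start);
 *
 * (i1/i2 and the *_start offsets are placeholder names for this sketch.)
 */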

struct ra_removed_interval {
   struct ra_interval *interval;
   unsigned size;
};

static struct ra_removed_interval
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   /* Check if we've already moved this reg before */
   unsigned pcopy_index;
   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
        pcopy_index++) {
      if (ctx->parallel_copies[pcopy_index].interval == interval)
         break;
   }

   if (pcopy_index == ctx->parallel_copies_count) {
      array_insert(ctx, ctx->parallel_copies,
                   (struct ra_parallel_copy){
                      .interval = interval,
                      .src = interval->physreg_start,
                   });
   }

   ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);

   return (struct ra_removed_interval){
      .interval = interval,
      .size = interval->physreg_end - interval->physreg_start,
   };
}

static void
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
                 const struct ra_removed_interval *removed, physreg_t dst)
{
   struct ra_interval *interval = removed->interval;

   interval->physreg_start = dst;
   interval->physreg_end = dst + removed->size;

   ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
}

/* Pick up the interval and place it at "dst". */
static void
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval, physreg_t dst)
{
   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
   ra_push_interval(ctx, file, &temp, dst);
}

static bool
get_reg_specified(struct ra_file *file, struct ir3_register *reg,
                  physreg_t physreg, bool is_source)
{
   for (unsigned i = 0; i < reg_size(reg); i++) {
      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
                       physreg + i))
         return false;
   }

   return true;
}

/* Try to evict any registers conflicting with the proposed spot "physreg" for
 * "reg". That is, move them to other places so that we can allocate "physreg"
 * here.
 */

static bool
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
               struct ir3_register *reg, physreg_t physreg,
               unsigned *_eviction_count, bool is_source, bool speculative)
{
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
   memcpy(available_to_evict, file->available_to_evict,
          sizeof(available_to_evict));

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   memcpy(available, file->available, sizeof(available));

   for (unsigned i = 0; i < reg_size(reg); i++) {
      BITSET_CLEAR(available_to_evict, physreg + i);
      BITSET_CLEAR(available, physreg + i);
   }

   unsigned eviction_count = 0;
   /* Iterate over each range conflicting with physreg */
   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
                           *next = ra_interval_next_or_null(conflicting);
        conflicting != NULL &&
        conflicting->physreg_start < physreg + reg_size(reg);
        conflicting = next, next = ra_interval_next_or_null(next)) {
      if (!is_source && conflicting->is_killed)
         continue;

      if (conflicting->frozen) {
         assert(speculative);
         return false;
      }

      unsigned conflicting_file_size =
         reg_file_size(file, conflicting->interval.reg);
      unsigned avail_start, avail_end;
      bool evicted = false;
      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
                            conflicting_file_size) {
         unsigned size = avail_end - avail_start;

         /* non-half registers must be aligned */
         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
             avail_start % 2 == 1) {
            avail_start++;
            size--;
         }

         if (size >= conflicting->physreg_end - conflicting->physreg_start) {
            for (unsigned i = 0;
                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
               BITSET_CLEAR(available_to_evict, avail_start + i);
            eviction_count +=
               conflicting->physreg_end - conflicting->physreg_start;
            if (!speculative)
               ra_move_interval(ctx, file, conflicting, avail_start);
            evicted = true;
            break;
         }
      }

      if (evicted)
         continue;

      /* If we couldn't evict this range, we may be able to swap it with a
       * killed range to achieve the same effect.
       */
      foreach_interval (killed, file) {
         if (!killed->is_killed)
            continue;

         if (killed->physreg_end - killed->physreg_start !=
             conflicting->physreg_end - conflicting->physreg_start)
            continue;

         if (killed->physreg_end > conflicting_file_size ||
             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
            continue;

         /* We can't swap the killed range if it partially/fully overlaps the
          * space we're trying to allocate or (in speculative mode) if it's
          * already been swapped and will overlap when we actually evict.
          */
         bool killed_available = true;
         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            if (!BITSET_TEST(available, i)) {
               killed_available = false;
               break;
            }
         }

         if (!killed_available)
            continue;

         /* Check for alignment if one is a full reg */
         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
             (killed->physreg_start % 2 != 0 ||
              conflicting->physreg_start % 2 != 0))
            continue;

         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            BITSET_CLEAR(available, i);
         }
         /* Because this will generate swaps instead of moves, multiply the
          * cost by 2.
          */
         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
         if (!speculative) {
            physreg_t killed_start = killed->physreg_start,
                      conflicting_start = conflicting->physreg_start;
            struct ra_removed_interval killed_removed =
               ra_pop_interval(ctx, file, killed);
            struct ra_removed_interval conflicting_removed =
               ra_pop_interval(ctx, file, conflicting);
            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
         }

         evicted = true;
         break;
      }

      if (!evicted)
         return false;
   }

   *_eviction_count = eviction_count;
   return true;
}

static int
removed_interval_cmp(const void *_i1, const void *_i2)
{
   const struct ra_removed_interval *i1 = _i1;
   const struct ra_removed_interval *i2 = _i2;

   /* We sort the registers as follows:
    *
    * |--------------------------------------------------------------------|
    * |                    |             |             |                   |
    * |  Half live-through | Half killed | Full killed | Full live-through |
    * |                    |             |             |                   |
    * |--------------------------------------------------------------------|
    *                        |                 |
    *                        |   Destination   |
    *                        |                 |
    *                        |-----------------|
    *
    * Half-registers have to be first so that they stay in the low half of
    * the register file. Then half and full killed must stay together so that
    * there's a contiguous range where we can put the register. With this
    * structure we should be able to accommodate any collection of intervals
    * such that the total number of half components is within the half limit
    * and the combined components are within the full limit.
    */

   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (i2->interval->is_killed)
         return -1;
      if (i1->interval->is_killed)
         return 1;
   } else {
      if (i2->interval->is_killed)
         return 1;
      if (i1->interval->is_killed)
         return -1;
   }

   return 0;
}

/* "Compress" all the live intervals so that there is enough space for the
 * destination register. As there can be gaps when a more-aligned interval
 * follows a less-aligned interval, this also sorts them to remove such
 * "padding", which may be required when space is very tight. This isn't
 * ideal, and is only used as a last resort when the register file is almost
 * full and badly fragmented.
 *
 * Return the physreg to use.
 */
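/* Illustrative example, in scalar (half-reg) units, with made-up numbers:
 * say full intervals are live-through at 0-2 and 4-6, with the 2-4 gap left
 * by earlier alignment, and a size-4 destination doesn't fit anywhere.
 * Popping both intervals and re-packing them at 0-2 and 2-4 frees 4-8 for
 * the destination, so 4 is returned.
 */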
static physreg_t
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
                   unsigned align, bool is_source)
{
   DECLARE_ARRAY(struct ra_removed_interval, intervals);
   intervals_count = intervals_sz = 0;
   intervals = NULL;

   unsigned removed_size = 0, removed_half_size = 0;
   unsigned file_size =
      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
   physreg_t start_reg = 0;

   foreach_interval_rev_safe (interval, file) {
      /* Check if we can sort the intervals *after* this one and have enough
       * space left over to accommodate "size" units. Also check that we have
       * enough space left over for half-registers, if we're inserting a
       * half-register (otherwise we only shift any half-registers down, so
       * they should be safe).
       */
      if (interval->physreg_end + size + removed_size <= file->size &&
          (align != 1 ||
           interval->physreg_end + size + removed_half_size <= file_size)) {
         start_reg = interval->physreg_end;
         break;
      }

      /* We assume that all frozen intervals are at the start and that we
       * can avoid popping them.
       */
      assert(!interval->frozen);

      /* Killed sources don't count because they go at the end and can
       * overlap the register we're trying to add, unless that register is
       * itself a source.
       */
      if (!interval->is_killed || is_source) {
         removed_size += interval->physreg_end - interval->physreg_start;
         if (interval->interval.reg->flags & IR3_REG_HALF) {
            removed_half_size += interval->physreg_end -
               interval->physreg_start;
         }
      }

      /* Now that we've done the accounting, pop this off */
      d("popping interval %u physreg %u\n", interval->interval.reg->name,
        interval->physreg_start);
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }

   /* TODO: In addition to skipping registers at the beginning that are
    * well-packed, we should try to skip registers at the end.
    */

   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);

   physreg_t physreg = start_reg;
   physreg_t ret_reg = (physreg_t)~0;
   for (unsigned i = 0; i < intervals_count; i++) {
      if (ret_reg == (physreg_t)~0 &&
          ((intervals[i].interval->is_killed && !is_source) ||
           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
         ret_reg = ALIGN(physreg, align);
      }

      if (ret_reg != (physreg_t)~0 &&
          (is_source || !intervals[i].interval->is_killed)) {
         physreg = MAX2(physreg, ret_reg + size);
      }

      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
         physreg = ALIGN(physreg, 2);
      }

      if (physreg + intervals[i].size >
          reg_file_size(file, intervals[i].interval->interval.reg)) {
         d("ran out of room for interval %u!\n",
           intervals[i].interval->interval.reg->name);
         unreachable("reg pressure calculation was wrong!");
         return 0;
      }

      d("pushing interval %u physreg %u\n",
        intervals[i].interval->interval.reg->name, physreg);
      ra_push_interval(ctx, file, &intervals[i], physreg);

      physreg += intervals[i].size;
   }

   if (ret_reg == (physreg_t)~0)
      ret_reg = physreg;

   ret_reg = ALIGN(ret_reg, align);
   if (ret_reg + size > file_size) {
      d("ran out of room for the new interval!\n");
      unreachable("reg pressure calculation was wrong!");
      return 0;
   }

   return ret_reg;
}

static void
update_affinity(struct ra_file *file, struct ir3_register *reg,
                physreg_t physreg)
{
   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
      return;

   if (physreg < reg->merge_set_offset)
      return;

   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
      return;

   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
}

/* Try to find free space for a register without shuffling anything. This uses
 * a round-robin algorithm to reduce false dependencies.
 */
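/* E.g. with file->start == 6 and a size-2, align-2 request (numbers made up
 * for illustration), the loop below tries candidates 6, 8, 10, ..., wraps
 * back to 0, and gives up once it returns to 6. On success file->start
 * advances past the new allocation, so the next search starts elsewhere and
 * allocations spread across the file.
 */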
static physreg_t
find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
              unsigned align, bool is_source)
{
   /* This can happen if we create a very large merge set. Just bail out in
    * that case.
    */
   if (size > file_size)
      return (physreg_t)~0;

   BITSET_WORD *available =
      is_source ? file->available_to_evict : file->available;

   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         file->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}

static struct ra_file *
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_SHARED)
      return &ctx->shared;
   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
      return &ctx->full;
   else
      return &ctx->half;
}

/* This is the main entrypoint for picking a register. Pick a free register
 * for "reg", shuffling around sources if necessary. In the normal case where
 * "is_source" is false, this register can overlap with killed sources
 * (intervals with "is_killed == true"). If "is_source" is true, then
 * is_killed is ignored and the register returned must not overlap with killed
 * sources. This must be used for tied registers, because we're actually
 * allocating the destination and the tied source at the same time.
 */
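/* The strategies below are tried in order: the merge set's preferred reg, a
 * fresh gap big enough for the whole merge set, reusing a source register of
 * the same ALU/SFU instruction, any free gap, evicting conflicting
 * intervals, and finally compress_regs_left() as a last resort.
 */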

static physreg_t
get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
        bool is_source)
{
   unsigned file_size = reg_file_size(file, reg);
   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
      physreg_t preferred_reg =
         reg->merge_set->preferred_reg + reg->merge_set_offset;
      if (preferred_reg < file_size &&
          preferred_reg % reg_elem_size(reg) == 0 &&
          get_reg_specified(file, reg, preferred_reg, is_source))
         return preferred_reg;
   }

   /* If this register is a subset of a merge set which we have not picked a
    * register for, first try to allocate enough space for the entire merge
    * set.
    */
   unsigned size = reg_size(reg);
   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
       size < reg->merge_set->size) {
      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
                                         reg->merge_set->alignment, is_source);
      if (best_reg != (physreg_t)~0u) {
         best_reg += reg->merge_set_offset;
         return best_reg;
      }
   }

   /* For ALU and SFU instructions, if the src reg is available to pick, use
    * it, since this doesn't introduce unnecessary dependencies and it
    * potentially avoids needing (ss) syncs for write-after-read hazards on
    * SFU instructions:
    */
   if (is_sfu(reg->instr) || is_alu(reg->instr)) {
      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
         struct ir3_register *src = reg->instr->srcs[i];
         if (!ra_reg_is_src(src))
            continue;
         if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
            if (src_physreg % reg_elem_size(reg) == 0 &&
                src_physreg + size <= file_size &&
                get_reg_specified(file, reg, src_physreg, is_source))
               return src_physreg;
         }
      }
   }

   physreg_t best_reg =
      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
   if (best_reg != (physreg_t)~0u) {
      return best_reg;
   }

   /* Ok, we couldn't find anything that fits. Here is where we have to start
    * moving things around to make stuff fit. First try solely evicting
    * registers in the way.
    */
   unsigned best_eviction_count = ~0;
   for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
      unsigned eviction_count;
      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
         if (eviction_count < best_eviction_count) {
            best_eviction_count = eviction_count;
            best_reg = i;
         }
      }
   }

   if (best_eviction_count != ~0) {
      ASSERTED bool result = try_evict_regs(
         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
      assert(result);
      return best_reg;
   }

   /* Use the dumb fallback only if try_evict_regs() fails. */
   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
                             is_source);
}

static void
assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
           unsigned num)
{
   if (reg->flags & IR3_REG_ARRAY) {
      reg->array.base = num;
      if (reg->flags & IR3_REG_RELATIV)
         reg->array.offset += num;
      else
         reg->num = num + reg->array.offset;
   } else {
      reg->num = num;
   }
}

static void
mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];

   if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
       interval->interval.parent ||
       !rb_tree_is_empty(&interval->interval.children))
      return;

   ra_file_mark_killed(ra_get_file(ctx, src), interval);
}

static void
insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);
   struct ra_interval *interval = &ctx->intervals[dst->name];

   d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));

   if (!(dst->flags & IR3_REG_UNUSED))
      ra_file_insert(file, interval);

   assign_reg(dst->instr, dst, ra_interval_get_num(interval));
}

static void
allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                   physreg_t physreg)
{
   struct ra_file *file = ra_get_file(ctx, dst);
   struct ra_interval *interval = &ctx->intervals[dst->name];
   update_affinity(file, dst, physreg);

   ra_interval_init(interval, dst);
   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(dst);
}

static void
allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
   struct ra_file *file = ra_get_file(ctx, dst);

   struct ir3_register *tied = dst->tied;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
      if (tied_interval->is_killed) {
         /* The easy case: the source is killed, so we can just reuse it
          * for the destination.
          */
         allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
      } else {
         /* The source is live-through, so we need to get a free register
          * (which is free for both the source and destination!), copy the
          * original source to it, then use that for the source and
          * destination.
          */
         physreg_t physreg = get_reg(ctx, file, dst, true);
         allocate_dst_fixed(ctx, dst, physreg);
         array_insert(ctx, ctx->parallel_copies,
                      (struct ra_parallel_copy){
                         .interval = dst_interval,
                         .src = tied_physreg,
                      });
      }

      return;
   }

   /* All the hard work is done by get_reg here. */
   physreg_t physreg = get_reg(ctx, file, dst, false);

   allocate_dst_fixed(ctx, dst, physreg);
}

static void
assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
{
   struct ra_interval *interval = &ctx->intervals[src->def->name];
   struct ra_file *file = ra_get_file(ctx, src);

   struct ir3_register *tied = src->tied;
   physreg_t physreg;
   if (tied) {
      struct ra_interval *tied_interval = &ctx->intervals[tied->name];
      physreg = ra_interval_get_physreg(tied_interval);
   } else {
      physreg = ra_interval_get_physreg(interval);
   }

   assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));

   if (src->flags & IR3_REG_FIRST_KILL)
      ra_file_remove(file, interval);
}

/* Insert a parallel copy instruction before the instruction with the parallel
 * copy entries we've built up.
 */
static void
insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->parallel_copies_count == 0)
      return;

   struct ir3_instruction *pcopy =
      ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
                       ctx->parallel_copies_count, ctx->parallel_copies_count);

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_dst_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
   }

   for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
      struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
      struct ir3_register *reg =
         ir3_src_create(pcopy, INVALID_REG,
                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
      reg->size = entry->interval->interval.reg->size;
      reg->wrmask = entry->interval->interval.reg->wrmask;
      assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
   }

   list_del(&pcopy->node);
   list_addtail(&pcopy->node, &instr->node);
   ctx->parallel_copies_count = 0;
}

static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* First, mark sources as going-to-be-killed while allocating the dest. */
   ra_foreach_src (src, instr) {
      mark_src_killed(ctx, src);
   }

   /* Allocate the destination. */
   ra_foreach_dst (dst, instr) {
      allocate_dst(ctx, dst);
   }

   /* Now handle sources. Go backward so that in case there are multiple
    * sources with the same def and that def is killed we only remove it at
    * the end.
    */
   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }

   /* Now finally insert the destination into the map. */
   ra_foreach_dst (dst, instr) {
      insert_dst(ctx, dst);
   }

   insert_parallel_copy_instr(ctx, instr);
}

static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_register *dst = instr->dsts[0];
   struct ir3_register *src = instr->srcs[0];

   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
      handle_normal_instr(ctx, instr);
      return;
   }

   struct ra_interval *src_interval = &ctx->intervals[src->def->name];

   physreg_t physreg = ra_interval_get_physreg(src_interval);
   assign_src(ctx, instr, src);

   allocate_dst_fixed(
      ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
   insert_dst(ctx, dst);
}

static void
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
   unsigned dst_offset = instr->dsts[0]->merge_set_offset;

   if (!dst_set || dst_set->regs_count == 1) {
      handle_normal_instr(ctx, instr);
      return;
   }

   /* We need to check if any of the sources are contained in an interval
    * that is at least as large as the vector. In this case, we should put
    * the vector inside that larger interval. (There should be one
    * unambiguous place to put it, because values sharing the same merge set
    * should be allocated together.) This can happen in a case like:
    *
    * ssa_1 (wrmask=0xf) = ...
    * ssa_2 = split ssa_1 off:0
    * ssa_3 = split ssa_1 off:1
    * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
    * ... = (kill)ssa_1
    * ... = (kill)ssa_4
    *
    * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
    */
   physreg_t dst_fixed = (physreg_t)~0u;

   ra_foreach_src (src, instr) {
      if (src->flags & IR3_REG_FIRST_KILL) {
         mark_src_killed(ctx, src);
      }

      struct ra_interval *interval = &ctx->intervals[src->def->name];

      if (src->def->merge_set != dst_set || interval->is_killed)
         continue;
      while (interval->interval.parent != NULL) {
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
      }
      if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
         dst_fixed = interval->physreg_start -
                     interval->interval.reg->merge_set_offset + dst_offset;
      } else {
         /* For sources whose root interval is smaller than the
          * destination (i.e. the normal case), we will shuffle them
          * around after allocating the destination. Mark them killed so
          * that the destination can be allocated over them, even if they
          * aren't actually killed.
          */
         ra_file_mark_killed(ra_get_file(ctx, src), interval);
      }
   }

   if (dst_fixed != (physreg_t)~0u)
      allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
   else
      allocate_dst(ctx, instr->dsts[0]);

   /* Remove the temporary is_killed we added */
   ra_foreach_src (src, instr) {
      struct ra_interval *interval = &ctx->intervals[src->def->name];
      while (interval->interval.parent != NULL) {
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
      }

      /* Filter out cases where it actually should be killed */
      if (interval != &ctx->intervals[src->def->name] ||
          !(src->flags & IR3_REG_KILL)) {
         ra_file_unmark_killed(ra_get_file(ctx, src), interval);
      }
   }

   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }

   /* We need to do this before insert_dst(), so that children of the
    * destination which got marked as killed and then shuffled around to make
    * space for the destination have the correct pcopy destination that
    * matches what we assign the source of the collect to in assign_src().
    *
    * TODO: In this case we'll wind up copying the value in the pcopy and
    * then again in the collect. We could avoid one of those by updating the
    * pcopy destination to match up with the final location of the source
    * after the collect and making the collect a no-op. However this doesn't
    * seem to happen often.
    */
   insert_parallel_copy_instr(ctx, instr);

   /* Note: insert_dst will automatically shuffle around any intervals
    * contained inside the destination by making them children of the
    * collect's interval.
    */

   insert_dst(ctx, instr->dsts[0]);
}

/* Parallel copies before RA should only be at the end of the block, for
 * phi's. For these we only need to fill in the sources, and then we fill in
 * the destinations in the successor block.
 */
static void
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   ra_foreach_src_rev (src, instr) {
      assign_src(ctx, instr, src);
   }
}

/* Some inputs may need to be precolored. We need to handle those first, so
 * that other non-precolored inputs don't accidentally get allocated over
 * them. Inputs are the very first thing in the shader, so it shouldn't be a
 * problem to allocate them to a specific physreg.
 */

static void
handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (instr->dsts[0]->num == INVALID_REG)
      return;

   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
   allocate_dst_fixed(ctx, instr->dsts[0], physreg);
   insert_dst(ctx, instr->dsts[0]);
   interval->frozen = true;
}

static void
handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   if (instr->dsts[0]->num != INVALID_REG)
      return;

   allocate_dst(ctx, instr->dsts[0]);

   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   ra_file_insert(file, interval);
}

static void
assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);

   if (instr->dsts[0]->num == INVALID_REG) {
      assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
   } else {
      interval->frozen = false;
   }

   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
      ra_file_remove(file, interval);

   ra_foreach_src_rev (src, instr)
      assign_src(ctx, instr, src);
}

/* chmask is a bit weird, because it has pre-colored sources due to the need
 * to pass some registers to the next stage. Fortunately there are only at
 * most two, and there should be no other live values by the time we get to
 * this instruction, so we only have to do the minimum and don't need any
 * fancy fallbacks.
 *
 * TODO: Add more complete handling of precolored sources, e.g. for function
 * argument handling. We'd need a way to mark sources as fixed so that they
 * don't get moved around when placing other sources in the fallback case, and
 * a duplication of much of the logic in get_reg(). This also opens another
 * can of worms, e.g. what if the precolored source is a split of a vector
 * which is still live -- this breaks our assumption that splits don't incur
 * any "extra" register requirements and we'd have to break it out of the
 * parent ra_interval.
 */

static void
handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
{
   struct ra_file *file = ra_get_file(ctx, src);
   struct ra_interval *interval = &ctx->intervals[src->def->name];
   physreg_t physreg = ra_reg_get_physreg(src);

   if (ra_interval_get_num(interval) == src->num)
      return;

   /* Try evicting stuff in our way if it isn't free. This won't move
    * anything unless it overlaps with our precolored physreg, so we don't
    * have to worry about evicting other precolored sources.
    */
   if (!get_reg_specified(file, src, physreg, true)) {
      unsigned eviction_count;
      if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
                          false)) {
         unreachable("failed to evict for precolored source!");
         return;
      }
   }

   ra_move_interval(ctx, file, interval, physreg);
}

static void
handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
1607   /* Note: we purposely don't mark sources as killed, so that we can reuse
1608    * some of the get_reg() machinery as-if the source is a destination.
1609    * Marking it as killed would make e.g. get_reg_specified() wouldn't work
1610    * correctly.
1611    */
1612   ra_foreach_src (src, instr) {
1613      assert(src->num != INVALID_REG);
1614      handle_precolored_source(ctx, src);
1615   }
1616
1617   ra_foreach_src (src, instr) {
1618      struct ra_file *file = ra_get_file(ctx, src);
1619      struct ra_interval *interval = &ctx->intervals[src->def->name];
1620      if (src->flags & IR3_REG_FIRST_KILL)
1621         ra_file_remove(file, interval);
1622   }
1623
1624   insert_parallel_copy_instr(ctx, instr);
1625}
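
/* Sketch (the register assignments are illustrative): for a chmask whose two
 * sources must end up in, say, r0.x and r0.y for the next stage,
 * handle_precolored_source() moves each source into its fixed register,
 * evicting only whatever overlaps it, and the resulting shuffles are then
 * realized by the parallel copy inserted above.
 */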

static physreg_t
read_register(struct ra_ctx *ctx, struct ir3_block *block,
              struct ir3_register *def)
{
   struct ra_block_state *state = &ctx->blocks[block->index];
   if (state->renames) {
      struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
      if (entry) {
         return (physreg_t)(uintptr_t)entry->data;
      }
   }

   return ra_reg_get_physreg(def);
}
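
/* Sketch of the rename lookup (register numbers are illustrative): if "a" was
 * assigned r2.x at its definition but a block had to move it to r3.x when
 * splitting a live range, the renames table for that block (filled in by
 * handle_live_out()/insert_live_in_move()) maps a's def to r3.x and
 * read_register() returns r3.x; with no entry it falls back to the
 * originally-assigned register.
 */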

static void
handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
{
   physreg_t physreg = ~0;
   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
      struct ir3_block *pred = ctx->block->predecessors[i];
      struct ra_block_state *pred_state = &ctx->blocks[pred->index];

      if (!pred_state->visited)
         continue;

      physreg = read_register(ctx, pred, def);
      break;
   }

   assert(physreg != (physreg_t)~0);

   struct ra_interval *interval = &ctx->intervals[def->name];
   struct ra_file *file = ra_get_file(ctx, def);
   ra_interval_init(interval, def);
   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(def);
   ra_file_insert(file, interval);
}

static void
handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
{
   /* Skip parallel copies which in the original program are only used as
    * phi arguments. Even though phi arguments are live out, they are only
    * assigned when the phi is.
    */
   if (def->instr->opc == OPC_META_PARALLEL_COPY)
      return;

   struct ra_block_state *state = &ctx->blocks[ctx->block->index];
   struct ra_interval *interval = &ctx->intervals[def->name];
   physreg_t physreg = ra_interval_get_physreg(interval);
   if (physreg != ra_reg_get_physreg(def)) {
      if (!state->renames)
         state->renames = _mesa_pointer_hash_table_create(ctx);
      _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
   }
}

static void
handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
{
   struct ra_file *file = ra_get_file(ctx, def);
   struct ra_interval *interval = &ctx->intervals[def->name];

   /* phis are always scalar, so they should already be the smallest possible
    * size. However they may be coalesced with other live-in values/phi
    * nodes, so check for that here.
    */
   struct ir3_reg_interval *parent_ir3 =
      ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
   physreg_t physreg;
   if (parent_ir3) {
      struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
      physreg = ra_interval_get_physreg(parent) +
                (def->interval_start - parent_ir3->reg->interval_start);
   } else {
      physreg = get_reg(ctx, file, def, false);
   }

   allocate_dst_fixed(ctx, def, physreg);

   ra_file_insert(file, interval);
}
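
/* For illustration (hypothetical values): if ssa_5 = phi(...) was coalesced
 * into the same merge set as a live-in vector ssa_4 that is already in the
 * file, the parent lookup above places ssa_5 at ssa_4's physreg plus ssa_5's
 * offset within the shared interval, instead of allocating a fresh register,
 * so no move is needed.
 */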

static void
assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
{
   struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
   struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
   assert(!interval->interval.parent);
   unsigned num = ra_interval_get_num(interval);
   assign_reg(phi, phi->dsts[0], num);

   /* Assign the parallelcopy sources of this phi */
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def) {
         assign_reg(phi, phi->srcs[i], num);
         assign_reg(phi, phi->srcs[i]->def, num);
      }
   }

   if (phi->dsts[0]->flags & IR3_REG_UNUSED)
      ra_file_remove(file, interval);
}

/* When we split a live range, we sometimes need to emit fixup code at the end
 * of a block. For example, something like:
 *
 * a = ...
 * if (...) {
 *    ...
 *    a' = a
 *    b = ... // a evicted to make room for b
 *    ...
 * }
 * ... = a
 *
 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
 * us to insert another copy "a = a'" at the end of the if. Normally this
 * would also entail adding a phi node, but since we're about to go out of
 * SSA anyway we just insert an extra move. Note, however, that "b" might be
 * used in a phi node at the end of the if and share registers with "a", so
 * we have to be careful to extend any preexisting parallelcopy instruction
 * instead of creating our own in order to guarantee that they properly get
 * swapped.
 */

static void
insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
                    struct ir3_register *reg)
{
   struct ir3_instruction *old_pcopy = NULL;
   if (!list_is_empty(&block->instr_list)) {
      struct ir3_instruction *last =
         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
      if (last->opc == OPC_META_PARALLEL_COPY)
         old_pcopy = last;
   }

   unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
   struct ir3_instruction *pcopy = ir3_instr_create(
      block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);

   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
      old_pcopy->dsts[i]->instr = pcopy;
      pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
   }

   struct ir3_register *dst_reg =
      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
   dst_reg->wrmask = reg->wrmask;
   dst_reg->size = reg->size;
   assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));

   for (unsigned i = 0; i < old_pcopy_srcs; i++) {
      pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
   }

   struct ir3_register *src_reg =
      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
   src_reg->wrmask = reg->wrmask;
   src_reg->size = reg->size;
   assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));

   if (old_pcopy)
      list_del(&old_pcopy->node);
}
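
/* E.g. (illustrative registers): if the block already ends with
 * "pcopy r1.x = r0.x" for a phi and the fixup needs "r0.x = r1.x", merging
 * them into "pcopy r1.x, r0.x = r0.x, r1.x" lets the parallel copy lowering
 * resolve it as a swap, whereas two sequential copies would clobber one of
 * the values.
 */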

static void
insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
{
   physreg_t physreg = ra_interval_get_physreg(interval);

   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
   struct ir3_block **predecessors =
      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
   unsigned predecessors_count = shared
                                    ? ctx->block->physical_predecessors_count
                                    : ctx->block->predecessors_count;

   for (unsigned i = 0; i < predecessors_count; i++) {
      struct ir3_block *pred = predecessors[i];
      struct ra_block_state *pred_state = &ctx->blocks[pred->index];

      if (!pred_state->visited)
         continue;

      physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
      if (pred_reg != physreg) {
         insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);

         /* This is a bit tricky, but when visiting the destination of a
          * physical-only edge, we have two predecessors (the if and the
          * header block) and both have multiple successors. We pick the
          * register for all live-ins from the normal edge, which should
          * guarantee that there's no need for shuffling things around in
          * the normal predecessor as long as there are no phi nodes, but
          * we still may need to insert fixup code in the physical
          * predecessor (i.e. the last block of the if), and that has
          * another successor (the block after the if) so we need to update
          * the renames state for when we process the other successor. This
          * crucially depends on the other successor getting processed
          * after this.
          *
          * For normal (non-physical) edges we disallow critical edges so
          * that hacks like this aren't necessary.
          */
         if (!pred_state->renames)
            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
                                 (void *)(uintptr_t)physreg);
      }
   }
}

static void
insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
{
   BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      /* Skip phi nodes. This needs to happen after phi nodes are allocated,
       * because we may have to move live-ins around to make space for phi
       * nodes, but we shouldn't be handling phi nodes here.
       */
      if (BITSET_TEST(live_in, interval->interval.reg->name))
         insert_live_in_move(ctx, interval);
   }
}

static void
insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
                              (void *)(uintptr_t)interval->physreg_start);
   }
}

static void
insert_live_in_moves(struct ra_ctx *ctx)
{
   insert_file_live_in_moves(ctx, &ctx->full);
   insert_file_live_in_moves(ctx, &ctx->half);
   insert_file_live_in_moves(ctx, &ctx->shared);

   /* If not all predecessors are visited, record the registers of the
    * live-in values (entry_regs) so that insert_live_out_moves() in the
    * not-yet-visited predecessors will work.
    */
   bool all_preds_visited = true;
   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
         all_preds_visited = false;
         break;
      }
   }

   if (!all_preds_visited) {
      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
      state->entry_regs = _mesa_pointer_hash_table_create(ctx);

      insert_entry_regs(state, &ctx->full);
      insert_entry_regs(state, &ctx->half);
      insert_entry_regs(state, &ctx->shared);
   }
}
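
/* E.g. for a loop header visited before its back-edge predecessor: the
 * header records its entry_regs here, and when the back-edge block is
 * processed later, insert_live_out_moves() compares each live-out placement
 * against those entry_regs and emits fixup copies for any mismatch.
 */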

static void
insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!ctx->block->successors[i])
         continue;

      struct ir3_block *succ = ctx->block->successors[i];
      struct ra_block_state *succ_state = &ctx->blocks[succ->index];

      if (!succ_state->visited)
         continue;

      struct hash_entry *entry = _mesa_hash_table_search(
         succ_state->entry_regs, interval->interval.reg);
      if (!entry)
         continue;

      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
      if (new_reg != interval->physreg_start) {
         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
                             interval->interval.reg);
      }
   }
}

static void
insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      insert_live_out_move(ctx, interval);
   }
}

static void
insert_live_out_moves(struct ra_ctx *ctx)
{
   insert_file_live_out_moves(ctx, &ctx->full);
   insert_file_live_out_moves(ctx, &ctx->half);
   insert_file_live_out_moves(ctx, &ctx->shared);
}

static void
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;

   /* Reset the register files from the last block */
   ra_file_init(&ctx->full);
   ra_file_init(&ctx->half);
   ra_file_init(&ctx->shared);

   /* Handle live-ins, phis, and input meta-instructions. These all appear
    * live at the beginning of the block, and interfere with each other, so
    * they need to be allocated "in parallel". This means that we have to
    * allocate all of them, inserting them into the file, and then delay
    * updating the IR until all of them are allocated.
    *
    * Handle precolored inputs first, because we need to make sure that other
    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
    * and inputs at the same time, because the first block doesn't have
    * predecessors. Therefore handle_live_in doesn't have to worry about
    * them.
    */

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_INPUT)
         handle_precolored_input(ctx, instr);
      else
         break;
   }

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_in(ctx, reg);
   }

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_PHI)
         handle_phi(ctx, instr->dsts[0]);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         handle_input(ctx, instr);
      else
         break;
   }

   /* After this point, every live-in/phi/input has an interval assigned to
    * it. We delay actually assigning values until everything has been
    * allocated, so we can simply ignore any parallel copy entries created
    * when shuffling them around.
    */
   ctx->parallel_copies_count = 0;

   insert_live_in_moves(ctx);

   if (RA_DEBUG) {
      d("after live-in block %u:\n", block->index);
      ra_ctx_dump(ctx);
   }

   /* Now we're done with processing live-ins, and can handle the body of the
    * block.
    */
   foreach_instr (instr, &block->instr_list) {
      di(instr, "processing");

      if (instr->opc == OPC_META_PHI)
         assign_phi(ctx, instr);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         assign_input(ctx, instr);
      else if (instr->opc == OPC_META_SPLIT)
         handle_split(ctx, instr);
      else if (instr->opc == OPC_META_COLLECT)
         handle_collect(ctx, instr);
      else if (instr->opc == OPC_META_PARALLEL_COPY)
         handle_pcopy(ctx, instr);
      else if (instr->opc == OPC_CHMASK)
         handle_chmask(ctx, instr);
      else
         handle_normal_instr(ctx, instr);

      if (RA_DEBUG)
         ra_ctx_dump(ctx);
   }

   insert_live_out_moves(ctx);

   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_out(ctx, reg);
   }

   ctx->blocks[block->index].visited = true;
}

static unsigned
calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
{
   /* Registers are allocated in units of vec4, so switch from units of
    * half-regs to vec4.
    */
   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);

   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);

   unsigned target = reg_count;
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      v->shader->compiler, reg_count, double_threadsize);
   unsigned target_waves =
      MIN2(reg_independent_max_waves, reg_dependent_max_waves);

   while (target <= RA_FULL_SIZE / (2 * 4) &&
          ir3_should_double_threadsize(v, target) == double_threadsize &&
          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
                                          double_threadsize) >= target_waves)
      target++;

   return (target - 1) * 2 * 4;
}
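
/* Worked example (numbers are illustrative): with pressure = 35 half-regs,
 * reg_count = DIV_ROUND_UP(35, 8) = 5 vec4s. If 6 vec4s still sustains
 * target_waves at the same threadsize but 7 does not, the loop stops with
 * target = 7 and we return (7 - 1) * 2 * 4 = 48 half-regs, giving RA extra
 * room at no cost in occupancy.
 */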

static void
add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
             bool merged_regs)
{
   unsigned size = reg_size(reg);
   if (reg->flags & IR3_REG_HALF)
      pressure->half += size;
   if (!(reg->flags & IR3_REG_HALF) || merged_regs)
      pressure->full += size;
}
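
/* E.g. a vec2 half-register value (size = 2 half-regs): with merged
 * registers it is counted against both limits (half and full), since half
 * regs share the file with full regs; with separate files it only counts
 * toward the half limit.
 */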

static void
dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
{
}

static void
dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
{
}

static void
dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
                     struct ir3_reg_interval *child)
{
}

/* Calculate the minimum possible limit on register pressure so that spilling
 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
 */

static void
calc_min_limit_pressure(struct ir3_shader_variant *v,
                        struct ir3_liveness *live,
                        struct ir3_pressure *limit)
{
   struct ir3_block *start = ir3_start_block(v->ir);
   struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
   struct ir3_reg_interval *intervals =
      rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);

   ctx->interval_add = dummy_interval_add;
   ctx->interval_delete = dummy_interval_delete;
   ctx->interval_readd = dummy_interval_readd;

   limit->full = limit->half = 0;

   struct ir3_pressure cur_pressure = {0};
   foreach_instr (input, &start->instr_list) {
      if (input->opc != OPC_META_INPUT &&
          input->opc != OPC_META_TEX_PREFETCH)
         break;

      add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
   }

   limit->full = MAX2(limit->full, cur_pressure.full);
   limit->half = MAX2(limit->half, cur_pressure.half);

   foreach_instr (input, &start->instr_list) {
      if (input->opc != OPC_META_INPUT &&
          input->opc != OPC_META_TEX_PREFETCH)
         break;

      /* Pre-colored inputs may have holes, which increases the pressure. */
      struct ir3_register *dst = input->dsts[0];
      if (dst->num != INVALID_REG) {
         unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
         if (dst->flags & IR3_REG_HALF)
            limit->half = MAX2(limit->half, physreg);
         if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
            limit->full = MAX2(limit->full, physreg);
      }
   }

   foreach_block (block, &v->ir->block_list) {
      rb_tree_init(&ctx->intervals);

      unsigned name;
      BITSET_FOREACH_SET (name, live->live_in[block->index],
                          live->definitions_count) {
         struct ir3_register *reg = live->definitions[name];
         ir3_reg_interval_init(&intervals[reg->name], reg);
         ir3_reg_interval_insert(ctx, &intervals[reg->name]);
      }

      foreach_instr (instr, &block->instr_list) {
         ra_foreach_dst (dst, instr) {
            ir3_reg_interval_init(&intervals[dst->name], dst);
         }
         /* phis and parallel copies can be deleted via spilling */

         if (instr->opc == OPC_META_PHI) {
            ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
            continue;
         }

         if (instr->opc == OPC_META_PARALLEL_COPY)
            continue;

         cur_pressure = (struct ir3_pressure) {0};

         ra_foreach_dst (dst, instr) {
            if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
               add_pressure(&cur_pressure, dst, v->mergedregs);
         }

         ra_foreach_src_rev (src, instr) {
            /* We currently don't support spilling the parent of a source when
             * making space for sources, so we have to keep track of the
             * intervals and figure out the root of the tree to figure out how
             * much space we need.
             *
             * TODO: We should probably support this in the spiller.
             */
            struct ir3_reg_interval *interval = &intervals[src->def->name];
            while (interval->parent)
               interval = interval->parent;
            add_pressure(&cur_pressure, interval->reg, v->mergedregs);

            if (src->flags & IR3_REG_FIRST_KILL)
               ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
         }

         limit->full = MAX2(limit->full, cur_pressure.full);
         limit->half = MAX2(limit->half, cur_pressure.half);

         cur_pressure = (struct ir3_pressure) {0};

         ra_foreach_dst (dst, instr) {
            ir3_reg_interval_init(&intervals[dst->name], dst);
            ir3_reg_interval_insert(ctx, &intervals[dst->name]);
            add_pressure(&cur_pressure, dst, v->mergedregs);
         }

         limit->full = MAX2(limit->full, cur_pressure.full);
         limit->half = MAX2(limit->half, cur_pressure.half);
      }
   }

   /* Account for the base register, which needs to be available everywhere. */
   limit->full += 2;

   ralloc_free(ctx);
}
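
/* E.g. (illustrative): for "mov dst, src" where src is the last use of a
 * vec4, the source pass counts the vec4's root interval (8 half-regs of
 * full pressure with merged regs), the destination pass counts dst after
 * src has been removed, and the limit is the max of the two across all
 * instructions, plus 2 for the base register.
 */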

int
ir3_ra(struct ir3_shader_variant *v)
{
   ir3_calc_dominance(v->ir);

   ir3_create_parallel_copies(v->ir);

   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);

   ctx->merged_regs = v->mergedregs;
   ctx->compiler = v->shader->compiler;
   ctx->stage = v->type;

   struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);

   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");

   ir3_merge_regs(live, v->ir);

   struct ir3_pressure max_pressure;
   ir3_calc_pressure(v, live, &max_pressure);
   d("max pressure:");
   d("\tfull: %u", max_pressure.full);
   d("\thalf: %u", max_pressure.half);
   d("\tshared: %u", max_pressure.shared);

   /* TODO: calculate half/full limit correctly for CS with barrier */
   struct ir3_pressure limit_pressure;
   limit_pressure.full = RA_FULL_SIZE;
   limit_pressure.half = RA_HALF_SIZE;
   limit_pressure.shared = RA_SHARED_SIZE;

   /* If requested, lower the limit so that spilling happens more often. */
   if (ir3_shader_debug & IR3_DBG_SPILLALL)
      calc_min_limit_pressure(v, live, &limit_pressure);

   if (max_pressure.shared > limit_pressure.shared) {
      /* TODO: shared reg -> normal reg spilling */
      d("shared max pressure exceeded!");
      goto fail;
   }

   bool spilled = false;
   if (max_pressure.full > limit_pressure.full ||
       max_pressure.half > limit_pressure.half) {
      if (!v->shader->compiler->has_pvtmem) {
         d("max pressure exceeded!");
         goto fail;
      }
      d("max pressure exceeded, spilling!");
      IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
      ir3_calc_pressure(v, live, &max_pressure);
      assert(max_pressure.full <= limit_pressure.full &&
             max_pressure.half <= limit_pressure.half);
      spilled = true;
   }

   ctx->live = live;
   ctx->intervals =
      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);

   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
   d("full size: %u", ctx->full.size);

   if (!v->mergedregs)
      ctx->half.size = RA_HALF_SIZE;

   ctx->shared.size = RA_SHARED_SIZE;

   ctx->full.start = ctx->half.start = ctx->shared.start = 0;

   foreach_block (block, &v->ir->block_list)
      handle_block(ctx, block);

   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);

   /* Strip array-ness and SSA-ness at the end, because various helpers still
    * need to work even on definitions that have already been assigned. For
    * example, we need to preserve array-ness so that array live-ins have the
    * right size.
    */
   foreach_block (block, &v->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            instr->dsts[i]->flags &= ~IR3_REG_SSA;

            /* Parallel copies of array registers copy the whole register, and
             * we need some way to let the parallel copy code know that this
             * was an array whose size is determined by reg->size. So keep the
             * array flag on those. spill/reload also need to work on the
             * entire array.
             */
            if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
         }

         for (unsigned i = 0; i < instr->srcs_count; i++) {
            instr->srcs[i]->flags &= ~IR3_REG_SSA;

            if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
         }
      }
   }

   ir3_debug_print(v->ir, "AFTER: register allocation");

   if (spilled) {
      IR3_PASS(v->ir, ir3_lower_spill);
   }

   ir3_lower_copies(v);

   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");

   ralloc_free(ctx);

   return 0;
fail:
   ralloc_free(ctx);
   return -1;
}