/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"
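
/* This pass lowers the meta instructions left after register allocation
 * (parallel copies, collects, splits, and phis) into actual moves and
 * swaps. It follows the usual parallel copy sequentialization scheme:
 * emit copies whose destinations are free, split partially blocked wide
 * copies, and resolve the remaining cycles with swaps.
 */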

struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

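/* Copy sizes are measured in half-reg (16-bit) units, the granularity of
 * physreg_t: 1 for a half copy, 2 for a full copy.
 */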
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction *xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}
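
/* For illustration, do_swap() uses do_xor() to expand a register swap into
 * the classic xor trick (register numbers hypothetical):
 *
 *    xor.b r0.x, r0.x, r1.y
 *    xor.b r1.y, r1.y, r0.x
 *    xor.b r0.x, r0.x, r1.y
 *
 * which exchanges the two values without needing a free temporary.
 */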

static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }
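
      /* To summarize the fallback above (register choices illustrative):
       * to swap an inaccessible half-reg S with an accessible one D, we
       * full-swap the register containing S with a low temporary, swap the
       * matching half of the temporary with D (now both accessible), and
       * full-swap the temporary back.
       */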

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which lets us swap two registers
    * in-place. On earlier generations we emulate it with the xor trick.
    */
   if (compiler->gen < 5) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}
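
/* Note on the swz path above: OPC_SWZ is a cat1 instruction with two
 * dst/src pairs and (rpt1), so both halves of the exchange retire as a
 * single instruction and no temporary is needed.
 */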

static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* As in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

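      /* If the source is a half-reg in the inaccessible range, read it
       * through the full register that contains it: the even half is the
       * low 16 bits (a plain narrowing cov) and the odd half is the high
       * 16 bits (a shift right by 16).
       */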
      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, the physreg is unblocked and can be
    * moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}
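
/* Example (physreg numbers illustrative): a full copy {dst=10, src=20}
 * blocked only on its low half splits into a half copy {dst=10, src=20}
 * and a new half copy {dst=11, src=21}; the unblocked half can then be
 * emitted by step 1 of _handle_copies(), possibly unblocking others.
 */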

static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */
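
      /* For example (illustrative): with pending copies r1 <- r0 and
       * r2 <- r1, nothing reads r2, so r2 <- r1 is unblocked and emitted
       * first; that drops r1's use count to zero, which unblocks r1 <- r0
       * on the next pass.
       */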

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked
       * on one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || (entry->flags & IR3_REG_HALF))
            continue;

         if ((ctx->physreg_use_count[entry->dst] == 0 ||
              ctx->physreg_use_count[entry->dst + 1] == 0) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be the only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */
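
   /* Worked example (illustrative, treating each copy as one physreg
    * wide): for the cycle r1 <- r0, r2 <- r1, r0 <- r2, the first entry
    * swaps r1 and r0. r1 now holds its final value, and the value that was
    * in r1 lives in r0, so the remaining copies are rewritten to r2 <- r0
    * and r0 <- r2. One more swap satisfies r2 <- r0 and leaves r0 <- r0,
    * which is then caught as a trivial copy.
    */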

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}

static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->shader->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we handle them
       * separately, since the two register files don't interfere with
       * each other.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   }
}

void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}