ir3.c revision 7ec681f3
1/*
2 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include "ir3.h"
25
26#include <assert.h>
27#include <errno.h>
28#include <stdbool.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33#include "util/bitscan.h"
34#include "util/half_float.h"
35#include "util/ralloc.h"
36#include "util/u_math.h"
37
38#include "instr-a3xx.h"
39#include "ir3_shader.h"
40
/* Simple allocation helper: every IR object is parented to the shader's
 * ralloc context, so the whole IR heap can be freed in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   /* TODO: stop relying on rzalloc for this */
   return rzalloc_size(shader, sz);
}
49
50struct ir3 *
51ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
52{
53   struct ir3 *shader = rzalloc(v, struct ir3);
54
55   shader->compiler = compiler;
56   shader->type = v->type;
57
58   list_inithead(&shader->block_list);
59   list_inithead(&shader->array_list);
60
61   return shader;
62}
63
void
ir3_destroy(struct ir3 *shader)
{
   /* Everything was carved out of the shader's ralloc context (see
    * ir3_alloc()), so a single free releases the entire IR.
    */
   ralloc_free(shader);
}
69
/* Accumulate register-footprint stats (max full reg, max half reg, max
 * const) for a single register access of an instruction into ir3_info.
 */
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* immediates are encoded inline, no register consumed: */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      /* without (r), repeat iterations re-read the same register, so
       * repeat does not extend the register footprint:
       */
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      /* relative access: footprint is the whole declared array range: */
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      /* consts are tracked in vec4 units, hence >> 2: */
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}
112
/* Decide whether this variant should run at the doubled wave size, given
 * the number of (vec4) registers it would use.
 */
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      /* compute falls through to the regfile-size check below: */
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gen's anyway.
       */
      return false;
   }
}
168
/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* If this is a compute shader, compute the limit based on shared size */
   if (v->type == MESA_SHADER_COMPUTE) {
      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         /* how many workgroups' shared allocations fit in local memory: */
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
         unsigned threads_per_wg =
            v->local_size[0] * v->local_size[1] * v->local_size[2];
         /* waves needed to cover one workgroup's threads: */
         unsigned waves_per_wg =
            DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                            (double_threadsize ? 2 : 1) *
                                            compiler->wave_granularity);
         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }
   }

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   return max_waves;
}
206
207/* Get the maximum number of waves that could be launched limited by reg size.
208 */
209unsigned
210ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
211                                unsigned reg_count, bool double_threadsize)
212{
213   return reg_count ? (compiler->reg_size_vec4 /
214                       (reg_count * (double_threadsize ? 2 : 1)) *
215                       compiler->wave_granularity)
216                    : compiler->max_waves;
217}
218
/* Walk the final IR and collect per-variant statistics (instruction counts,
 * register/const footprint, estimated stalls) plus the derived threadsize
 * and max-waves numbers.
 */
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* -1 means "none used"; collect_reg_info() MAX2's over these: */
   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   foreach_block (block, &shader->block_list) {
      /* modelled SFU result latency, for estimating (ss) stalls: */
      int sfu_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            /* srcs[2] holds the component count for ldp/stp: */
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         /* repeat and nop-slots count as extra issued instructions: */
         unsigned instrs_count = 1 + instr->repeat + instr->nop;
         unsigned nops_count = instr->nop;

         if (instr->opc == OPC_NOP) {
            nops_count = 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         } else {
            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         }

         if (instr->opc == OPC_MOV) {
            /* same src/dst type is a plain mov, otherwise a conversion: */
            if (instr->cat1.src_type == instr->cat1.dst_type) {
               info->mov_count += 1 + instr->repeat;
            } else {
               info->cov_count += 1 + instr->repeat;
            }
         }

         info->instrs_count += instrs_count;
         info->nops_count += nops_count;

         if (instr->flags & IR3_INSTR_SS) {
            info->ss++;
            /* remaining modelled SFU latency is charged as stall cycles: */
            info->sstall += sfu_delay;
            sfu_delay = 0;
         }

         if (instr->flags & IR3_INSTR_SY)
            info->sy++;

         if (is_sfu(instr)) {
            /* assumed SFU result latency (cycles) -- heuristic: */
            sfu_delay = 10;
         } else {
            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
            sfu_delay -= n;
         }
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}
334
335static struct ir3_register *
336reg_create(struct ir3 *shader, int num, int flags)
337{
338   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
339   reg->wrmask = 1;
340   reg->flags = flags;
341   reg->num = num;
342   return reg;
343}
344
345static void
346insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
347{
348   struct ir3 *shader = block->shader;
349
350   instr->serialno = ++shader->instr_count;
351
352   list_addtail(&instr->node, &block->instr_list);
353
354   if (is_input(instr))
355      array_insert(shader, shader->baryfs, instr);
356}
357
358struct ir3_block *
359ir3_block_create(struct ir3 *shader)
360{
361   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
362#ifdef DEBUG
363   block->serialno = ++shader->block_count;
364#endif
365   block->shader = shader;
366   list_inithead(&block->node);
367   list_inithead(&block->instr_list);
368   return block;
369}
370
/* Register 'pred' as a (logical CFG) predecessor of 'block'. */
void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}
376
/* Register 'pred' as a physical predecessor of 'block' (kept in a separate
 * list from the logical CFG predecessors).
 */
void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}
383
384void
385ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
386{
387   for (unsigned i = 0; i < block->predecessors_count; i++) {
388      if (block->predecessors[i] == pred) {
389         if (i < block->predecessors_count - 1) {
390            block->predecessors[i] =
391               block->predecessors[block->predecessors_count - 1];
392         }
393
394         block->predecessors_count--;
395         return;
396      }
397   }
398}
399
400void
401ir3_block_remove_physical_predecessor(struct ir3_block *block, struct ir3_block *pred)
402{
403   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
404      if (block->physical_predecessors[i] == pred) {
405         if (i < block->physical_predecessors_count - 1) {
406            block->physical_predecessors[i] =
407               block->physical_predecessors[block->physical_predecessors_count - 1];
408         }
409
410         block->physical_predecessors_count--;
411         return;
412      }
413   }
414}
415
416unsigned
417ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
418{
419   for (unsigned i = 0; i < block->predecessors_count; i++) {
420      if (block->predecessors[i] == pred) {
421         return i;
422      }
423   }
424
425   unreachable("ir3_block_get_pred_index() invalid predecessor");
426}
427
/* Allocate a bare instruction with room for ndst/nsrc register pointers.
 * Opcode/block are filled in by the caller (see ir3_instr_create()).
 */
static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   /* dst and src pointer arrays live in the same allocation, directly
    * after the instruction struct itself:
    */
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   /* capacity bookkeeping for the debug_assert()s in ir3_src/dst_create: */
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}
451
452struct ir3_instruction *
453ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
454{
455   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
456   instr->block = block;
457   instr->opc = opc;
458   insert_instr(block, instr);
459   return instr;
460}
461
/* Duplicate an instruction (including its registers), appending the clone
 * to the same block.
 */
struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   /* save the freshly-allocated register arrays: the struct assignment
    * below would otherwise clobber them with the original's pointers:
    */
   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      /* re-point the back-reference at the clone: */
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      /* assumes the address is the last src (the order produced by
       * ir3_instr_set_address()):
       */
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}
500
501/* Add a false dependency to instruction, to ensure it is scheduled first: */
502void
503ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
504{
505   for (unsigned i = 0; i < instr->deps_count; i++) {
506      if (instr->deps[i] == dep)
507         return;
508   }
509
510   array_insert(instr, instr->deps, dep);
511}
512
513struct ir3_register *
514ir3_src_create(struct ir3_instruction *instr, int num, int flags)
515{
516   struct ir3 *shader = instr->block->shader;
517#ifdef DEBUG
518   debug_assert(instr->srcs_count < instr->srcs_max);
519#endif
520   struct ir3_register *reg = reg_create(shader, num, flags);
521   instr->srcs[instr->srcs_count++] = reg;
522   return reg;
523}
524
525struct ir3_register *
526ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
527{
528   struct ir3 *shader = instr->block->shader;
529#ifdef DEBUG
530   debug_assert(instr->dsts_count < instr->dsts_max);
531#endif
532   struct ir3_register *reg = reg_create(shader, num, flags);
533   instr->dsts[instr->dsts_count++] = reg;
534   return reg;
535}
536
537struct ir3_register *
538ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
539{
540   struct ir3_register *new_reg = reg_create(shader, 0, 0);
541   *new_reg = *reg;
542   return new_reg;
543}
544
/* Add an extra src to 'instr' that is a copy of the array register 'reg'
 * but whose def points at 'last_write', and tie the two registers together.
 */
void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   /* copy everything from 'reg' first, then override the def: */
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}
555
/* Set the address-register (a0.x/a1.x) source of an instruction.  The
 * address is added as an extra src, and the instruction is recorded in the
 * shader's a0_users/a1_users list.
 */
void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      /* address register values can't be propagated across blocks
       * (see also ir3_valid_flags()):
       */
      debug_assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
      /* comp 0 -> a0 users, comp 1 -> a1 users: */
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      /* already set; must be the same address instruction: */
      debug_assert(instr->address->def->instr == addr);
   }
}
580
581void
582ir3_block_clear_mark(struct ir3_block *block)
583{
584   foreach_instr (instr, &block->instr_list)
585      instr->flags &= ~IR3_INSTR_MARK;
586}
587
588void
589ir3_clear_mark(struct ir3 *ir)
590{
591   foreach_block (block, &ir->block_list) {
592      ir3_block_clear_mark(block);
593   }
594}
595
596unsigned
597ir3_count_instructions(struct ir3 *ir)
598{
599   unsigned cnt = 1;
600   foreach_block (block, &ir->block_list) {
601      block->start_ip = cnt;
602      foreach_instr (instr, &block->instr_list) {
603         instr->ip = cnt++;
604      }
605      block->end_ip = cnt;
606   }
607   return cnt;
608}
609
610/* When counting instructions for RA, we insert extra fake instructions at the
611 * beginning of each block, where values become live, and at the end where
612 * values die. This prevents problems where values live-in at the beginning or
613 * live-out at the end of a block from being treated as if they were
614 * live-in/live-out at the first/last instruction, which would be incorrect.
615 * In ir3_legalize these ip's are assumed to be actual ip's of the final
616 * program, so it would be incorrect to use this everywhere.
617 */
618
619unsigned
620ir3_count_instructions_ra(struct ir3 *ir)
621{
622   unsigned cnt = 1;
623   foreach_block (block, &ir->block_list) {
624      block->start_ip = cnt++;
625      foreach_instr (instr, &block->instr_list) {
626         instr->ip = cnt++;
627      }
628      block->end_ip = cnt++;
629   }
630   return cnt;
631}
632
633struct ir3_array *
634ir3_lookup_array(struct ir3 *ir, unsigned id)
635{
636   foreach_array (arr, &ir->array_list)
637      if (arr->id == id)
638         return arr;
639   return NULL;
640}
641
/* Build the 'uses' set for every instruction: the set of instructions that
 * consume its result.  If 'falsedeps' is set, false (scheduling-only)
 * dependencies are counted as uses too.  Sets are allocated out of mem_ctx.
 */
void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted.  Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            /* lazily create the set the first time a use is seen: */
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}
665
666/**
667 * Set the destination type of an instruction, for example if a
668 * conversion is folded in, handling the special cases where the
669 * instruction's dest type or opcode needs to be fixed up.
670 */
671void
672ir3_set_dst_type(struct ir3_instruction *instr, bool half)
673{
674   if (half) {
675      instr->dsts[0]->flags |= IR3_REG_HALF;
676   } else {
677      instr->dsts[0]->flags &= ~IR3_REG_HALF;
678   }
679
680   switch (opc_cat(instr->opc)) {
681   case 1: /* move instructions */
682      if (half) {
683         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
684      } else {
685         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
686      }
687      break;
688   case 4:
689      if (half) {
690         instr->opc = cat4_half_opc(instr->opc);
691      } else {
692         instr->opc = cat4_full_opc(instr->opc);
693      }
694      break;
695   case 5:
696      if (half) {
697         instr->cat5.type = half_type(instr->cat5.type);
698      } else {
699         instr->cat5.type = full_type(instr->cat5.type);
700      }
701      break;
702   }
703}
704
705/**
706 * One-time fixup for instruction src-types.  Other than cov's that
707 * are folded, an instruction's src type does not change.
708 */
709void
710ir3_fixup_src_type(struct ir3_instruction *instr)
711{
712   switch (opc_cat(instr->opc)) {
713   case 1: /* move instructions */
714      if (instr->srcs[0]->flags & IR3_REG_HALF) {
715         instr->cat1.src_type = half_type(instr->cat1.src_type);
716      } else {
717         instr->cat1.src_type = full_type(instr->cat1.src_type);
718      }
719      break;
720   case 3:
721      if (instr->srcs[0]->flags & IR3_REG_HALF) {
722         instr->opc = cat3_half_opc(instr->opc);
723      } else {
724         instr->opc = cat3_full_opc(instr->opc);
725      }
726      break;
727   }
728}
729
730/**
731 * Map a floating point immed to FLUT (float lookup table) value,
732 * returns negative for immediates that cannot be mapped.
733 */
734int
735ir3_flut(struct ir3_register *src_reg)
736{
737   static const struct {
738      uint32_t f32;
739      uint16_t f16;
740   } flut[] = {
741         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
742         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
743         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
744         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
745         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
746         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
747         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
748         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
749         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
750         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
751         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
752         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
753   };
754
755   if (src_reg->flags & IR3_REG_HALF) {
756      /* Note that half-float immeds are already lowered to 16b in nir: */
757      uint32_t imm = src_reg->uim_val;
758      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
759         if (flut[i].f16 == imm) {
760            return i;
761         }
762      }
763   } else {
764      uint32_t imm = src_reg->uim_val;
765      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
766         if (flut[i].f32 == imm) {
767            return i;
768         }
769      }
770   }
771
772   return -1;
773}
774
775static unsigned
776cp_flags(unsigned flags)
777{
778   /* only considering these flags (at least for now): */
779   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
780             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
781             IR3_REG_SHARED);
782   return flags;
783}
784
/* Check whether src 'n' of 'instr' could legally have the given register
 * flags, i.e. whether the hw encoding for this opcode category permits that
 * kind of src in that position (used e.g. by copy-propagation to decide if
 * a src can be folded in).
 */
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   /* shared regs are only accepted by cat1-3 instructions: */
   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      if (instr->opc == OPC_SHLG_B16) {
         valid_flags |= IR3_REG_IMMED;
         /* shlg.b16 can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
      } else {
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_atomic(instr->opc) && (n != 0))
            return false;

         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}
981
982bool
983ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
984{
985   if (instr->opc == OPC_MOV || is_meta(instr))
986      return true;
987
988   if (is_mem(instr)) {
989      switch (instr->opc) {
990      /* Some load/store instructions have a 13-bit offset and size which must
991       * always be an immediate and the rest of the sources cannot be
992       * immediates, so the frontend is responsible for checking the size:
993       */
994      case OPC_LDL:
995      case OPC_STL:
996      case OPC_LDP:
997      case OPC_STP:
998      case OPC_LDG:
999      case OPC_STG:
1000      case OPC_SPILL_MACRO:
1001      case OPC_RELOAD_MACRO:
1002      case OPC_LDG_A:
1003      case OPC_STG_A:
1004      case OPC_LDLW:
1005      case OPC_STLW:
1006      case OPC_LDLV:
1007         return true;
1008      default:
1009         /* most cat6 src immediates can only encode 8 bits: */
1010         return !(immed & ~0xff);
1011      }
1012   }
1013
1014   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
1015   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
1016}
1017