ir3.h revision 7ec681f3
1/*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#ifndef IR3_H_
25#define IR3_H_
26
27#include <stdbool.h>
28#include <stdint.h>
29
30#include "compiler/shader_enums.h"
31
32#include "util/bitscan.h"
33#include "util/list.h"
34#include "util/set.h"
35#include "util/u_debug.h"
36
37#include "instr-a3xx.h"
38
39/* low level intermediate representation of an adreno shader program */
40
41struct ir3_compiler;
42struct ir3;
43struct ir3_instruction;
44struct ir3_block;
45
46struct ir3_info {
47   void *data; /* used internally in ir3 assembler */
48   /* Size in bytes of the shader binary, including NIR constants and
49    * padding
50    */
51   uint32_t size;
52   /* byte offset from start of the shader to the NIR constant data. */
53   uint32_t constant_data_offset;
54   /* Size in dwords of the instructions. */
55   uint16_t sizedwords;
56   uint16_t instrs_count; /* expanded to account for rpt's */
57   uint16_t nops_count;   /* # of nop instructions, including nopN */
58   uint16_t mov_count;
59   uint16_t cov_count;
60   uint16_t stp_count;
61   uint16_t ldp_count;
62   /* NOTE: max_reg, etc, does not include registers not touched
63    * by the shader (ie. vertex fetched via VFD_DECODE but not
64    * touched by shader)
65    */
66   int8_t max_reg; /* highest GPR # used by shader */
67   int8_t max_half_reg;
68   int16_t max_const;
69   /* This is the maximum # of waves that can be executed at once in one core,
70    * assuming that they are all executing this shader.
71    */
72   int8_t max_waves;
73   bool double_threadsize;
74   bool multi_dword_ldp_stp;
75
76   /* number of sync bits: */
77   uint16_t ss, sy;
78
79   /* estimate of number of cycles stalled on (ss) */
80   uint16_t sstall;
81
82   uint16_t last_baryf; /* instruction # of last varying fetch */
83
84   /* Number of instructions of a given category: */
85   uint16_t instrs_per_cat[8];
86};
87
88struct ir3_merge_set {
89   uint16_t preferred_reg;
90   uint16_t size;
91   uint16_t alignment;
92
93   unsigned interval_start;
94   unsigned spill_slot;
95
96   unsigned regs_count;
97   struct ir3_register **regs;
98};
99
100struct ir3_register {
101   enum {
102      IR3_REG_CONST = 0x001,
103      IR3_REG_IMMED = 0x002,
104      IR3_REG_HALF = 0x004,
105      /* Shared registers have the same value for all threads when read.
106       * They can only be written when one thread is active (that is, inside
107       * a "getone" block).
108       */
109      IR3_REG_SHARED = 0x008,
110      IR3_REG_RELATIV = 0x010,
111      IR3_REG_R = 0x020,
112      /* Most instructions, it seems, can do float abs/neg but not
113       * integer.  The CP pass needs to know what is intended (int or
114       * float) in order to do the right thing.  For this reason the
115       * abs/neg flags are split out into float and int variants.  In
116    * addition, for .b (bitwise) operations the negate is actually a
117    * bitwise not, so that is split out into its own flag to make it
118    * clearer.
119       */
120      IR3_REG_FNEG = 0x040,
121      IR3_REG_FABS = 0x080,
122      IR3_REG_SNEG = 0x100,
123      IR3_REG_SABS = 0x200,
124      IR3_REG_BNOT = 0x400,
125      /* (ei) flag, end-input?  Set on last bary, presumably to signal
126       * that the shader needs no more input:
127       */
128      IR3_REG_EI = 0x2000,
129      /* meta-flags, for intermediate stages of IR, ie.
130       * before register assignment is done:
131       */
132      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
133      IR3_REG_ARRAY = 0x8000,
134
135      /* Set on a use whenever the SSA value becomes dead after the current
136       * instruction.
137       */
138      IR3_REG_KILL = 0x10000,
139
140      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
141       * same SSA value in a single instruction, this is only set on the first
142       * use.
143       */
144      IR3_REG_FIRST_KILL = 0x20000,
145
146      /* Set when a destination doesn't have any uses and is dead immediately
147       * after the instruction. This can happen even after optimizations for
148       * corner cases such as destinations of atomic instructions.
149       */
150      IR3_REG_UNUSED = 0x40000,
151   } flags;
152
153   unsigned name;
154
155   /* used for cat5 instructions, but also for internal/IR level
156    * tracking of what registers are read/written by an instruction.
157    * wrmask may be a bad name since it is used to represent both
158    * src and dst that touch multiple adjacent registers.
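    * (For example, a vec4 value spanning four consecutive registers has
    * wrmask 0xf.)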
159    */
160   unsigned wrmask : 16; /* up to vec16 */
161
162   /* for relative addressing, 32bits for array size is too small,
163    * but otoh we don't need to deal with disjoint sets, so instead
164    * use a simple size field (number of scalar components).
165    *
166    * Note the size field isn't important for relative const (since
167    * we don't have to do register allocation for constants).
168    */
169   unsigned size : 16;
170
171   /* normal registers:
172    * the component is in the low two bits of the reg #, so
173    * rN.x becomes: (N << 2) | x
174    */
175   uint16_t num;
176   union {
177      /* immediate: */
178      int32_t iim_val;
179      uint32_t uim_val;
180      float fim_val;
181      /* relative: */
182      struct {
183         uint16_t id;
184         int16_t offset;
185         uint16_t base;
186      } array;
187   };
188
189   /* For destination registers, pointer back to the instruction containing
190    * this register.
191    */
192   struct ir3_instruction *instr;
193
194   /* For IR3_REG_SSA, src registers contain ptr back to assigning
195    * instruction.
196    *
197    * For IR3_REG_ARRAY, the pointer is back to the last dependent
198    * array access (although the net effect is the same, it points
199    * back to a previous instruction that we depend on).
200    */
201   struct ir3_register *def;
202
203   /* Pointer to another register in the instruction that must share the same
204    * physical register. Each destination can be tied with one source, and
205    * they must have "tied" pointing to each other.
206    */
207   struct ir3_register *tied;
208
209   unsigned spill_slot, next_use;
210
211   unsigned merge_set_offset;
212   struct ir3_merge_set *merge_set;
213   unsigned interval_start, interval_end;
214};
215
216/*
217 * Stupid/simple growable array implementation:
218 */
219#define DECLARE_ARRAY(type, name)                                              \
220   unsigned name##_count, name##_sz;                                           \
221   type *name;
222
223#define array_insert(ctx, arr, ...)                                            \
224   do {                                                                        \
225      if (arr##_count == arr##_sz) {                                           \
226         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
227         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
228      }                                                                        \
229      arr[arr##_count++] = __VA_ARGS__;                                        \
230   } while (0)
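
/*
 * Example (illustrative sketch): DECLARE_ARRAY() declares 'name',
 * 'name_count' and 'name_sz' fields, and array_insert() appends an element,
 * growing the array via reralloc as needed (the first argument is the
 * ralloc context owning the array).  E.g. appending to the 'keeps' array
 * declared in ir3_block below:
 *
 *    array_insert(block, block->keeps, instr);
 */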
231
232struct ir3_instruction {
233   struct ir3_block *block;
234   opc_t opc;
235   enum {
236      /* (sy) flag is set on first instruction, and after sample
237       * instructions (probably just on RAW hazard).
238       */
239      IR3_INSTR_SY = 0x001,
240      /* (ss) flag is set on first instruction, and first instruction
241       * to depend on the result of "long" instructions (RAW hazard):
242       *
243       *   rcp, rsq, log2, exp2, sin, cos, sqrt
244       *
245       * It seems to synchronize until all in-flight instructions are
246       * completed, for example:
247       *
248       *   rsq hr1.w, hr1.w
249       *   add.f hr2.z, (neg)hr2.z, hc0.y
250       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
251       *   rsq hr2.x, hr2.x
252       *   (rpt1)nop
253       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
254       *   nop
255       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
256       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
257       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
258       *
259       * The last mul.f does not have (ss) set, presumably because the
260       * (ss) on the previous instruction does the job.
261       *
262       * The blob driver also seems to set it on WAR hazards, although
263       * not really clear if this is needed or just blob compiler being
264       * sloppy.  So far I haven't found a case where removing the (ss)
265       * causes problems for WAR hazard, but I could just be getting
266       * lucky:
267       *
268       *   rcp r1.y, r3.y
269       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
270       *
271       */
272      IR3_INSTR_SS = 0x002,
273      /* (jp) flag is set on jump targets:
274       */
275      IR3_INSTR_JP = 0x004,
276      IR3_INSTR_UL = 0x008,
277      IR3_INSTR_3D = 0x010,
278      IR3_INSTR_A = 0x020,
279      IR3_INSTR_O = 0x040,
280      IR3_INSTR_P = 0x080,
281      IR3_INSTR_S = 0x100,
282      IR3_INSTR_S2EN = 0x200,
283      IR3_INSTR_G = 0x400,
284      IR3_INSTR_SAT = 0x800,
285      /* (cat5/cat6) Bindless */
286      IR3_INSTR_B = 0x1000,
287      /* (cat5/cat6) nonuniform */
288      IR3_INSTR_NONUNIF = 0x02000,
289      /* (cat5-only) Get some parts of the encoding from a1.x */
290      IR3_INSTR_A1EN = 0x04000,
291      /* meta-flags, for intermediate stages of IR, ie.
292       * before register assignment is done:
293       */
294      IR3_INSTR_MARK = 0x08000,
295      IR3_INSTR_UNUSED = 0x10000,
296   } flags;
297   uint8_t repeat;
298   uint8_t nop;
299#ifdef DEBUG
300   unsigned srcs_max, dsts_max;
301#endif
302   unsigned srcs_count, dsts_count;
303   struct ir3_register **dsts;
304   struct ir3_register **srcs;
305   union {
306      struct {
307         char inv1, inv2;
308         char comp1, comp2;
309         int immed;
310         struct ir3_block *target;
311         const char *target_label;
312         brtype_t brtype;
313         unsigned idx; /* for brac.N */
314      } cat0;
315      struct {
316         type_t src_type, dst_type;
317         round_t round;
318      } cat1;
319      struct {
320         enum {
321            IR3_COND_LT = 0,
322            IR3_COND_LE = 1,
323            IR3_COND_GT = 2,
324            IR3_COND_GE = 3,
325            IR3_COND_EQ = 4,
326            IR3_COND_NE = 5,
327         } condition;
328      } cat2;
329      struct {
330         unsigned samp, tex;
331         unsigned tex_base : 3;
332         type_t type;
333      } cat5;
334      struct {
335         type_t type;
336         /* TODO remove dst_offset and handle as a ir3_register
337          * which might be IMMED, similar to how src_offset is
338          * handled.
339          */
340         int dst_offset;
341         int iim_val   : 3; /* for ldgb/stgb, # of components */
342         unsigned d    : 3; /* for ldc, component offset */
343         bool typed    : 1;
344         unsigned base : 3;
345      } cat6;
346      struct {
347         unsigned w : 1; /* write */
348         unsigned r : 1; /* read */
349         unsigned l : 1; /* local */
350         unsigned g : 1; /* global */
351      } cat7;
352      /* for meta-instructions, just used to hold extra data
353       * before instruction scheduling, etc
354       */
355      struct {
356         int off; /* component/offset */
357      } split;
358      struct {
359         /* Per-source index back to the entry in the
360          * ir3_shader_variant::outputs table.
361          */
362         unsigned *outidxs;
363      } end;
364      struct {
365         /* used to temporarily hold reference to nir_phi_instr
366          * until we resolve the phi srcs
367          */
368         void *nphi;
369      } phi;
370      struct {
371         unsigned samp, tex;
372         unsigned input_offset;
373         unsigned samp_base : 3;
374         unsigned tex_base  : 3;
375      } prefetch;
376      struct {
377         /* maps back to entry in ir3_shader_variant::inputs table: */
378         int inidx;
379         /* for sysvals, identifies the sysval type.  Mostly so we can
380          * identify the special cases where a sysval should not be DCE'd
381          * (currently, just pre-fs texture fetch)
382          */
383         gl_system_value sysval;
384      } input;
385   };
386
387   /* For assigning jump offsets, we need instruction's position: */
388   uint32_t ip;
389
390   /* used for per-pass extra instruction data.
391    *
392    * TODO we should remove the per-pass data like this and 'use_count'
393    * and do something similar to what RA does w/ ir3_ra_instr_data..
394    * ie. use the ir3_count_instructions pass, and then use instr->ip
395    * to index into a table of pass-private data.
396    */
397   void *data;
398
399   /**
400    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
401    */
402   struct set *uses;
403
404   int use_count; /* currently just updated/used by cp */
405
406   /* an instruction can reference at most one address register among
407    * its src/dst registers.  Beyond that, you need to insert mov's.
408    *
409    * NOTE: do not write this directly, use ir3_instr_set_address()
410    */
411   struct ir3_register *address;
412
413   /* Tracking for additional dependent instructions.  Used to handle
414    * barriers, WAR hazards for arrays/SSBOs/etc.
415    */
416   DECLARE_ARRAY(struct ir3_instruction *, deps);
417
418   /*
419    * From PoV of instruction scheduling, not execution (ie. ignores global/
420    * local distinction):
421    *                            shared  image  atomic  SSBO  everything
422    *   barrier()/            -   R/W     R/W    R/W     R/W       X
423    *     groupMemoryBarrier()
424    *     memoryBarrier()
425    *     (but only images declared coherent?)
426    *   memoryBarrierAtomic() -                  R/W
427    *   memoryBarrierBuffer() -                          R/W
428    *   memoryBarrierImage()  -           R/W
429    *   memoryBarrierShared() -   R/W
430    *
431    * TODO I think for SSBO/image/shared, in cases where we can determine
432    * which variable is accessed, we don't need to care about accesses to
433    * different variables (unless declared coherent??)
434    */
435   enum {
436      IR3_BARRIER_EVERYTHING = 1 << 0,
437      IR3_BARRIER_SHARED_R = 1 << 1,
438      IR3_BARRIER_SHARED_W = 1 << 2,
439      IR3_BARRIER_IMAGE_R = 1 << 3,
440      IR3_BARRIER_IMAGE_W = 1 << 4,
441      IR3_BARRIER_BUFFER_R = 1 << 5,
442      IR3_BARRIER_BUFFER_W = 1 << 6,
443      IR3_BARRIER_ARRAY_R = 1 << 7,
444      IR3_BARRIER_ARRAY_W = 1 << 8,
445      IR3_BARRIER_PRIVATE_R = 1 << 9,
446      IR3_BARRIER_PRIVATE_W = 1 << 10,
447   } barrier_class,
448      barrier_conflict;
449
450   /* Entry in ir3_block's instruction list: */
451   struct list_head node;
452
453   uint32_t serialno;
454
455   // TODO only computerator/assembler:
456   int line;
457};
458
459struct ir3 {
460   struct ir3_compiler *compiler;
461   gl_shader_stage type;
462
463   DECLARE_ARRAY(struct ir3_instruction *, inputs);
464
465   /* Track bary.f (and ldlv) instructions.. this is needed in
466    * scheduling to ensure that all varying fetches happen before
467    * any potential kill instructions.  The hw gets grumpy if all
468    * threads in a group are killed before the last bary.f gets
469    * a chance to signal end of input (ei).
470    */
471   DECLARE_ARRAY(struct ir3_instruction *, baryfs);
472
473   /* Track all indirect instructions (read and write).  To avoid a
474    * deadlock scenario where an address register gets scheduled,
475    * but other dependent src instructions cannot be scheduled due
476    * to dependency on a *different* address register value, the
477    * scheduler needs to ensure that all of an instruction's other
478    * dependencies (besides the address register) are scheduled
479    * before the one that writes the address register.  Having a
480    * convenient list of instructions that reference some address
481    * register simplifies this.
482    */
483   DECLARE_ARRAY(struct ir3_instruction *, a0_users);
484
485   /* same for a1.x: */
486   DECLARE_ARRAY(struct ir3_instruction *, a1_users);
487
488   /* and same for instructions that consume predicate register: */
489   DECLARE_ARRAY(struct ir3_instruction *, predicates);
490
491   /* Track texture sample instructions which need texture state
492    * patched in (for astc-srgb workaround):
493    */
494   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
495
496   /* List of blocks: */
497   struct list_head block_list;
498
499   /* List of ir3_array's: */
500   struct list_head array_list;
501
502#ifdef DEBUG
503   unsigned block_count;
504#endif
505   unsigned instr_count;
506};
507
508struct ir3_array {
509   struct list_head node;
510   unsigned length;
511   unsigned id;
512
513   struct nir_register *r;
514
515   /* To avoid array writes from getting DCE'd, keep track of the
516    * most recent write.  Any array access depends on the most
517    * recent write.  This way, nothing depends on writes after the
518    * last read, but all the writes that happen before that have
519    * something depending on them.
520    */
521   struct ir3_register *last_write;
522
523   /* extra stuff used in RA pass: */
524   unsigned base; /* base vreg name */
525   unsigned reg;  /* base physical reg */
526   uint16_t start_ip, end_ip;
527
528   /* Indicates if half-precision */
529   bool half;
530
531   bool unused;
532};
533
534struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
535
536enum ir3_branch_type {
537   IR3_BRANCH_COND,   /* condition */
538   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
539   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
540   IR3_BRANCH_GETONE, /* subgroupElect() */
541};
542
543struct ir3_block {
544   struct list_head node;
545   struct ir3 *shader;
546
547   const struct nir_block *nblock;
548
549   struct list_head instr_list; /* list of ir3_instruction */
550
551   /* The actual branch condition, if there are two successors */
552   enum ir3_branch_type brtype;
553
554   /* each block has either one or two successors.. in case of two
555    * successors, 'condition' decides which one to follow.  A block preceding
556    * an if/else has two successors.
557    *
558    * In some cases the path that the machine actually takes through the
559    * program may not match the per-thread view of the CFG. In particular
560    * this is the case for if/else, where the machine jumps from the end of
561    * the if to the beginning of the else and switches active lanes. While
562    * most things only care about the per-thread view, we need to use the
563    * "physical" view when allocating shared registers. "successors" contains
564    * the per-thread successors, and "physical_successors" contains the
565    * physical successors which includes the fallthrough edge from the if to
566    * the else.
567    */
568   struct ir3_instruction *condition;
569   struct ir3_block *successors[2];
570   struct ir3_block *physical_successors[2];
571
572   DECLARE_ARRAY(struct ir3_block *, predecessors);
573   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
574
575   uint16_t start_ip, end_ip;
576
577   /* Track instructions which do not write a register but
578    * otherwise must not be discarded (such as kill, stg, etc)
579    */
580   DECLARE_ARRAY(struct ir3_instruction *, keeps);
581
582   /* used for per-pass extra block data.  Mainly used right
583    * now in RA step to track livein/liveout.
584    */
585   void *data;
586
587   uint32_t index;
588
589   struct ir3_block *imm_dom;
590   DECLARE_ARRAY(struct ir3_block *, dom_children);
591
592   uint32_t dom_pre_index;
593   uint32_t dom_post_index;
594
595   uint32_t loop_id;
596   uint32_t loop_depth;
597
598#ifdef DEBUG
599   uint32_t serialno;
600#endif
601};
602
603static inline uint32_t
604block_id(struct ir3_block *block)
605{
606#ifdef DEBUG
607   return block->serialno;
608#else
609   return (uint32_t)(unsigned long)block;
610#endif
611}
612
613static inline struct ir3_block *
614ir3_start_block(struct ir3 *ir)
615{
616   return list_first_entry(&ir->block_list, struct ir3_block, node);
617}
618
619void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
620void ir3_block_add_physical_predecessor(struct ir3_block *block,
621                                        struct ir3_block *pred);
622void ir3_block_remove_predecessor(struct ir3_block *block,
623                                  struct ir3_block *pred);
624void ir3_block_remove_physical_predecessor(struct ir3_block *block,
625                                           struct ir3_block *pred);
626unsigned ir3_block_get_pred_index(struct ir3_block *block,
627                                  struct ir3_block *pred);
628
629void ir3_calc_dominance(struct ir3 *ir);
630bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
631
632struct ir3_shader_variant;
633
634struct ir3 *ir3_create(struct ir3_compiler *compiler,
635                       struct ir3_shader_variant *v);
636void ir3_destroy(struct ir3 *shader);
637
638void ir3_collect_info(struct ir3_shader_variant *v);
639void *ir3_alloc(struct ir3 *shader, int sz);
640
641unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
642                                         unsigned reg_count,
643                                         bool double_threadsize);
644
645unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
646                                           bool double_threadsize);
647
648bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
649                                  unsigned regs_count);
650
651struct ir3_block *ir3_block_create(struct ir3 *shader);
652
653struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
654                                         int ndst, int nsrc);
655struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
656void ir3_instr_add_dep(struct ir3_instruction *instr,
657                       struct ir3_instruction *dep);
658const char *ir3_instr_name(struct ir3_instruction *instr);
659
660struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
661                                    int flags);
662struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
663                                    int flags);
664struct ir3_register *ir3_reg_clone(struct ir3 *shader,
665                                   struct ir3_register *reg);
666
667static inline void
668ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
669{
670   assert(!dst->tied && !src->tied);
671   dst->tied = src;
672   src->tied = dst;
673}
674
675void ir3_reg_set_last_array(struct ir3_instruction *instr,
676                            struct ir3_register *reg,
677                            struct ir3_register *last_write);
678
679void ir3_instr_set_address(struct ir3_instruction *instr,
680                           struct ir3_instruction *addr);
681
682static inline bool
683ir3_instr_check_mark(struct ir3_instruction *instr)
684{
685   if (instr->flags & IR3_INSTR_MARK)
686      return true; /* already visited */
687   instr->flags |= IR3_INSTR_MARK;
688   return false;
689}
690
691void ir3_block_clear_mark(struct ir3_block *block);
692void ir3_clear_mark(struct ir3 *shader);
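
/*
 * Example (sketch, with a hypothetical visit() helper): the mark flag is
 * typically used for visit-once traversals, e.g. a depth-first walk over
 * SSA sources:
 *
 *    static void visit(struct ir3_instruction *instr)
 *    {
 *       if (ir3_instr_check_mark(instr))
 *          return;  // already visited
 *       foreach_ssa_src (src, instr)
 *          visit(src);
 *       // ... process instr ...
 *    }
 *
 * with ir3_clear_mark(ir) called before starting the walk.
 */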
693
694unsigned ir3_count_instructions(struct ir3 *ir);
695unsigned ir3_count_instructions_ra(struct ir3 *ir);
696
697/**
698 * Move 'instr' to just before 'after'
699 */
700static inline void
701ir3_instr_move_before(struct ir3_instruction *instr,
702                      struct ir3_instruction *after)
703{
704   list_delinit(&instr->node);
705   list_addtail(&instr->node, &after->node);
706}
707
708/**
709 * Move 'instr' to just after 'before':
710 */
711static inline void
712ir3_instr_move_after(struct ir3_instruction *instr,
713                     struct ir3_instruction *before)
714{
715   list_delinit(&instr->node);
716   list_add(&instr->node, &before->node);
717}
718
719/**
720 * Move 'instr' to the beginning of the block:
721 */
722static inline void
723ir3_instr_move_before_block(struct ir3_instruction *instr,
724                            struct ir3_block *block)
725{
726   list_delinit(&instr->node);
727   list_add(&instr->node, &block->instr_list);
728}
729
730void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
731
732void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
733void ir3_fixup_src_type(struct ir3_instruction *instr);
734
735int ir3_flut(struct ir3_register *src_reg);
736
737bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
738
739bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
740
741#include "util/set.h"
742#define foreach_ssa_use(__use, __instr)                                        \
743   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
744        __use = NULL)                                                          \
745      set_foreach ((__instr)->uses, __entry)                                   \
746         if ((__use = (void *)__entry->key))
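
/*
 * Example (sketch): after a pass has called ir3_find_ssa_uses(ir, mem_ctx,
 * falsedeps), the instructions that consume 'instr' as an SSA source can be
 * visited with:
 *
 *    foreach_ssa_use (use, instr) {
 *       // 'use' reads a value defined by 'instr'
 *    }
 */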
747
748static inline uint32_t
749reg_num(const struct ir3_register *reg)
750{
751   return reg->num >> 2;
752}
753
754static inline uint32_t
755reg_comp(const struct ir3_register *reg)
756{
757   return reg->num & 0x3;
758}
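
/*
 * Illustrative example: with the num encoding described in ir3_register
 * (rN.c -> (N << 2) | c), r2.z is (2 << 2) | 2 = 10, so reg_num() returns 2
 * and reg_comp() returns 2.
 */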
759
760static inline bool
761is_flow(struct ir3_instruction *instr)
762{
763   return (opc_cat(instr->opc) == 0);
764}
765
766static inline bool
767is_kill_or_demote(struct ir3_instruction *instr)
768{
769   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
770}
771
772static inline bool
773is_nop(struct ir3_instruction *instr)
774{
775   return instr->opc == OPC_NOP;
776}
777
778static inline bool
779is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
780{
781   unsigned dst_type = (dst->flags & IR3_REG_HALF);
782   unsigned src_type = (src->flags & IR3_REG_HALF);
783
784   /* Treat shared->normal copies as same-type, because they can generally be
785    * folded, but not normal->shared copies.
786    */
787   if (dst_type != src_type ||
788       ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
789      return false;
790   else
791      return true;
792}
793
794/* Is it a non-transformative (ie. not type changing) mov?  This can
795 * also include absneg.s/absneg.f, which for the most part can be
796 * treated as a mov (single src argument).
797 */
798static inline bool
799is_same_type_mov(struct ir3_instruction *instr)
800{
801   struct ir3_register *dst;
802
803   switch (instr->opc) {
804   case OPC_MOV:
805      if (instr->cat1.src_type != instr->cat1.dst_type)
806         return false;
807      /* If the type of dest reg and src reg are different,
808       * it shouldn't be considered as same type mov
809       */
810      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
811         return false;
812      break;
813   case OPC_ABSNEG_F:
814   case OPC_ABSNEG_S:
815      if (instr->flags & IR3_INSTR_SAT)
816         return false;
817      /* If the type of dest reg and src reg are different,
818       * it shouldn't be considered as same type mov
819       */
820      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
821         return false;
822      break;
823   case OPC_META_PHI:
824      return instr->srcs_count == 1;
825   default:
826      return false;
827   }
828
829   dst = instr->dsts[0];
830
831   /* mov's that write to a0 or p0.x are special: */
832   if (dst->num == regid(REG_P0, 0))
833      return false;
834   if (reg_num(dst) == REG_A0)
835      return false;
836
837   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
838      return false;
839
840   return true;
841}
842
843/* A move from const, which changes size but not type, can also be
844 * folded into the dest instruction in some cases.
845 */
846static inline bool
847is_const_mov(struct ir3_instruction *instr)
848{
849   if (instr->opc != OPC_MOV)
850      return false;
851
852   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
853      return false;
854
855   type_t src_type = instr->cat1.src_type;
856   type_t dst_type = instr->cat1.dst_type;
857
858   return (type_float(src_type) && type_float(dst_type)) ||
859          (type_uint(src_type) && type_uint(dst_type)) ||
860          (type_sint(src_type) && type_sint(dst_type));
861}
862
863static inline bool
864is_alu(struct ir3_instruction *instr)
865{
866   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
867}
868
869static inline bool
870is_sfu(struct ir3_instruction *instr)
871{
872   return (opc_cat(instr->opc) == 4);
873}
874
875static inline bool
876is_tex(struct ir3_instruction *instr)
877{
878   return (opc_cat(instr->opc) == 5);
879}
880
881static inline bool
882is_tex_or_prefetch(struct ir3_instruction *instr)
883{
884   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
885}
886
887static inline bool
888is_mem(struct ir3_instruction *instr)
889{
890   return (opc_cat(instr->opc) == 6);
891}
892
893static inline bool
894is_barrier(struct ir3_instruction *instr)
895{
896   return (opc_cat(instr->opc) == 7);
897}
898
899static inline bool
900is_half(struct ir3_instruction *instr)
901{
902   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
903}
904
905static inline bool
906is_shared(struct ir3_instruction *instr)
907{
908   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
909}
910
911static inline bool
912is_store(struct ir3_instruction *instr)
913{
914   /* for these instructions, the "destination" register is
915    * actually a source, the address to store to.
916    */
917   switch (instr->opc) {
918   case OPC_STG:
919   case OPC_STG_A:
920   case OPC_STGB:
921   case OPC_STIB:
922   case OPC_STP:
923   case OPC_STL:
924   case OPC_STLW:
925   case OPC_L2G:
926   case OPC_G2L:
927      return true;
928   default:
929      return false;
930   }
931}
932
933static inline bool
934is_load(struct ir3_instruction *instr)
935{
936   switch (instr->opc) {
937   case OPC_LDG:
938   case OPC_LDG_A:
939   case OPC_LDGB:
940   case OPC_LDIB:
941   case OPC_LDL:
942   case OPC_LDP:
943   case OPC_L2G:
944   case OPC_LDLW:
945   case OPC_LDC:
946   case OPC_LDLV:
947      /* probably some others too.. */
948      return true;
949   default:
950      return false;
951   }
952}
953
954static inline bool
955is_input(struct ir3_instruction *instr)
956{
957   /* in some cases, ldlv is used to fetch varying without
958    * interpolation.. fortunately inloc is the first src
959    * register in either case
960    */
961   switch (instr->opc) {
962   case OPC_LDLV:
963   case OPC_BARY_F:
964      return true;
965   default:
966      return false;
967   }
968}
969
970static inline bool
971is_bool(struct ir3_instruction *instr)
972{
973   switch (instr->opc) {
974   case OPC_CMPS_F:
975   case OPC_CMPS_S:
976   case OPC_CMPS_U:
977      return true;
978   default:
979      return false;
980   }
981}
982
983static inline opc_t
984cat3_half_opc(opc_t opc)
985{
986   switch (opc) {
987   case OPC_MAD_F32:
988      return OPC_MAD_F16;
989   case OPC_SEL_B32:
990      return OPC_SEL_B16;
991   case OPC_SEL_S32:
992      return OPC_SEL_S16;
993   case OPC_SEL_F32:
994      return OPC_SEL_F16;
995   case OPC_SAD_S32:
996      return OPC_SAD_S16;
997   default:
998      return opc;
999   }
1000}
1001
1002static inline opc_t
1003cat3_full_opc(opc_t opc)
1004{
1005   switch (opc) {
1006   case OPC_MAD_F16:
1007      return OPC_MAD_F32;
1008   case OPC_SEL_B16:
1009      return OPC_SEL_B32;
1010   case OPC_SEL_S16:
1011      return OPC_SEL_S32;
1012   case OPC_SEL_F16:
1013      return OPC_SEL_F32;
1014   case OPC_SAD_S16:
1015      return OPC_SAD_S32;
1016   default:
1017      return opc;
1018   }
1019}
1020
1021static inline opc_t
1022cat4_half_opc(opc_t opc)
1023{
1024   switch (opc) {
1025   case OPC_RSQ:
1026      return OPC_HRSQ;
1027   case OPC_LOG2:
1028      return OPC_HLOG2;
1029   case OPC_EXP2:
1030      return OPC_HEXP2;
1031   default:
1032      return opc;
1033   }
1034}
1035
1036static inline opc_t
1037cat4_full_opc(opc_t opc)
1038{
1039   switch (opc) {
1040   case OPC_HRSQ:
1041      return OPC_RSQ;
1042   case OPC_HLOG2:
1043      return OPC_LOG2;
1044   case OPC_HEXP2:
1045      return OPC_EXP2;
1046   default:
1047      return opc;
1048   }
1049}
1050
1051static inline bool
1052is_meta(struct ir3_instruction *instr)
1053{
1054   return (opc_cat(instr->opc) == -1);
1055}
1056
1057static inline unsigned
1058reg_elems(const struct ir3_register *reg)
1059{
1060   if (reg->flags & IR3_REG_ARRAY)
1061      return reg->size;
1062   else
1063      return util_last_bit(reg->wrmask);
1064}
1065
1066static inline unsigned
1067reg_elem_size(const struct ir3_register *reg)
1068{
1069   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1070}
1071
1072static inline unsigned
1073reg_size(const struct ir3_register *reg)
1074{
1075   return reg_elems(reg) * reg_elem_size(reg);
1076}
1077
1078static inline unsigned
1079dest_regs(struct ir3_instruction *instr)
1080{
1081   if (instr->dsts_count == 0)
1082      return 0;
1083
1084   debug_assert(instr->dsts_count == 1);
1085   return util_last_bit(instr->dsts[0]->wrmask);
1086}
1087
1088/* is dst a normal temp register: */
1089static inline bool
1090is_dest_gpr(struct ir3_register *dst)
1091{
1092   if (dst->wrmask == 0)
1093      return false;
1094   if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1095      return false;
1096   return true;
1097}
1098
1099static inline bool
1100writes_gpr(struct ir3_instruction *instr)
1101{
1102   if (dest_regs(instr) == 0)
1103      return false;
1104   return is_dest_gpr(instr->dsts[0]);
1105}
1106
1107static inline bool
1108writes_addr0(struct ir3_instruction *instr)
1109{
1110   /* Note: only the first dest can write to a0.x */
1111   if (instr->dsts_count > 0) {
1112      struct ir3_register *dst = instr->dsts[0];
1113      return dst->num == regid(REG_A0, 0);
1114   }
1115   return false;
1116}
1117
1118static inline bool
1119writes_addr1(struct ir3_instruction *instr)
1120{
1121   /* Note: only the first dest can write to a1.x */
1122   if (instr->dsts_count > 0) {
1123      struct ir3_register *dst = instr->dsts[0];
1124      return dst->num == regid(REG_A0, 1);
1125   }
1126   return false;
1127}
1128
1129static inline bool
1130writes_pred(struct ir3_instruction *instr)
1131{
1132   /* Note: only the first dest can write to p0.x */
1133   if (instr->dsts_count > 0) {
1134      struct ir3_register *dst = instr->dsts[0];
1135      return reg_num(dst) == REG_P0;
1136   }
1137   return false;
1138}
1139
1140/* Is it something other than a normal register? Shared regs, p0, and a0/a1
1141 * are considered special here. Special registers are always accessed with one
1142 * size and never alias normal registers, even though a naive calculation
1143 * would sometimes make it seem like e.g. r30.z aliases a0.x.
1144 */
1145static inline bool
1146is_reg_special(const struct ir3_register *reg)
1147{
1148   return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
1149          (reg_num(reg) == REG_P0);
1150}
1151
1152/* Same as above but in cases where we don't have a register. r48.x and above
1153 * are shared/special.
1154 */
1155static inline bool
1156is_reg_num_special(unsigned num)
1157{
1158   return num >= 48 * 4;
1159}
1160
1161/* returns defining instruction for reg */
1162/* TODO better name */
1163static inline struct ir3_instruction *
1164ssa(struct ir3_register *reg)
1165{
1166   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1167      return reg->def->instr;
1168   return NULL;
1169}
1170
1171static inline bool
1172conflicts(struct ir3_register *a, struct ir3_register *b)
1173{
1174   return (a && b) && (a->def != b->def);
1175}
1176
1177static inline bool
1178reg_gpr(struct ir3_register *r)
1179{
1180   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1181      return false;
1182   if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1183      return false;
1184   return true;
1185}
1186
1187static inline type_t
1188half_type(type_t type)
1189{
1190   switch (type) {
1191   case TYPE_F32:
1192      return TYPE_F16;
1193   case TYPE_U32:
1194      return TYPE_U16;
1195   case TYPE_S32:
1196      return TYPE_S16;
1197   case TYPE_F16:
1198   case TYPE_U16:
1199   case TYPE_S16:
1200      return type;
1201   default:
1202      assert(0);
1203      return ~0;
1204   }
1205}
1206
1207static inline type_t
1208full_type(type_t type)
1209{
1210   switch (type) {
1211   case TYPE_F16:
1212      return TYPE_F32;
1213   case TYPE_U16:
1214      return TYPE_U32;
1215   case TYPE_S16:
1216      return TYPE_S32;
1217   case TYPE_F32:
1218   case TYPE_U32:
1219   case TYPE_S32:
1220      return type;
1221   default:
1222      assert(0);
1223      return ~0;
1224   }
1225}
1226
1227/* some cat2 instructions (ie. those which are not float) can embed an
1228 * immediate:
1229 */
1230static inline bool
1231ir3_cat2_int(opc_t opc)
1232{
1233   switch (opc) {
1234   case OPC_ADD_U:
1235   case OPC_ADD_S:
1236   case OPC_SUB_U:
1237   case OPC_SUB_S:
1238   case OPC_CMPS_U:
1239   case OPC_CMPS_S:
1240   case OPC_MIN_U:
1241   case OPC_MIN_S:
1242   case OPC_MAX_U:
1243   case OPC_MAX_S:
1244   case OPC_CMPV_U:
1245   case OPC_CMPV_S:
1246   case OPC_MUL_U24:
1247   case OPC_MUL_S24:
1248   case OPC_MULL_U:
1249   case OPC_CLZ_S:
1250   case OPC_ABSNEG_S:
1251   case OPC_AND_B:
1252   case OPC_OR_B:
1253   case OPC_NOT_B:
1254   case OPC_XOR_B:
1255   case OPC_BFREV_B:
1256   case OPC_CLZ_B:
1257   case OPC_SHL_B:
1258   case OPC_SHR_B:
1259   case OPC_ASHR_B:
1260   case OPC_MGEN_B:
1261   case OPC_GETBIT_B:
1262   case OPC_CBITS_B:
1263   case OPC_BARY_F:
1264      return true;
1265
1266   default:
1267      return false;
1268   }
1269}
1270
1271/* map cat2 instruction to valid abs/neg flags: */
1272static inline unsigned
1273ir3_cat2_absneg(opc_t opc)
1274{
1275   switch (opc) {
1276   case OPC_ADD_F:
1277   case OPC_MIN_F:
1278   case OPC_MAX_F:
1279   case OPC_MUL_F:
1280   case OPC_SIGN_F:
1281   case OPC_CMPS_F:
1282   case OPC_ABSNEG_F:
1283   case OPC_CMPV_F:
1284   case OPC_FLOOR_F:
1285   case OPC_CEIL_F:
1286   case OPC_RNDNE_F:
1287   case OPC_RNDAZ_F:
1288   case OPC_TRUNC_F:
1289   case OPC_BARY_F:
1290      return IR3_REG_FABS | IR3_REG_FNEG;
1291
1292   case OPC_ADD_U:
1293   case OPC_ADD_S:
1294   case OPC_SUB_U:
1295   case OPC_SUB_S:
1296   case OPC_CMPS_U:
1297   case OPC_CMPS_S:
1298   case OPC_MIN_U:
1299   case OPC_MIN_S:
1300   case OPC_MAX_U:
1301   case OPC_MAX_S:
1302   case OPC_CMPV_U:
1303   case OPC_CMPV_S:
1304   case OPC_MUL_U24:
1305   case OPC_MUL_S24:
1306   case OPC_MULL_U:
1307   case OPC_CLZ_S:
1308      return 0;
1309
1310   case OPC_ABSNEG_S:
1311      return IR3_REG_SABS | IR3_REG_SNEG;
1312
1313   case OPC_AND_B:
1314   case OPC_OR_B:
1315   case OPC_NOT_B:
1316   case OPC_XOR_B:
1317   case OPC_BFREV_B:
1318   case OPC_CLZ_B:
1319   case OPC_SHL_B:
1320   case OPC_SHR_B:
1321   case OPC_ASHR_B:
1322   case OPC_MGEN_B:
1323   case OPC_GETBIT_B:
1324   case OPC_CBITS_B:
1325      return IR3_REG_BNOT;
1326
1327   default:
1328      return 0;
1329   }
1330}
1331
1332/* map cat3 instructions to valid abs/neg flags: */
1333static inline unsigned
1334ir3_cat3_absneg(opc_t opc)
1335{
1336   switch (opc) {
1337   case OPC_MAD_F16:
1338   case OPC_MAD_F32:
1339   case OPC_SEL_F16:
1340   case OPC_SEL_F32:
1341      return IR3_REG_FNEG;
1342
1343   case OPC_MAD_U16:
1344   case OPC_MADSH_U16:
1345   case OPC_MAD_S16:
1346   case OPC_MADSH_M16:
1347   case OPC_MAD_U24:
1348   case OPC_MAD_S24:
1349   case OPC_SEL_S16:
1350   case OPC_SEL_S32:
1351   case OPC_SAD_S16:
1352   case OPC_SAD_S32:
1353      /* neg *may* work on 3rd src.. */
1354
1355   case OPC_SEL_B16:
1356   case OPC_SEL_B32:
1357
1358   case OPC_SHLG_B16:
1359
1360   default:
1361      return 0;
1362   }
1363}
1364
1365/* Return the type (float, int, or uint) the op uses when converting from the
1366 * internal result of the op (which is assumed to be the same size as the
1367 * sources) to the destination when they are not the same size. If F32 it does
1368 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
1369 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
1370 * doesn't do anything sensible or is unknown.
1371 */
1372static inline type_t
1373ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
1374{
1375   *can_fold = true;
1376   switch (instr->opc) {
1377   case OPC_ADD_F:
1378   case OPC_MUL_F:
1379   case OPC_BARY_F:
1380   case OPC_MAD_F32:
1381   case OPC_MAD_F16:
1382      return TYPE_F32;
1383
1384   case OPC_ADD_U:
1385   case OPC_SUB_U:
1386   case OPC_MIN_U:
1387   case OPC_MAX_U:
1388   case OPC_AND_B:
1389   case OPC_OR_B:
1390   case OPC_NOT_B:
1391   case OPC_XOR_B:
1392   case OPC_MUL_U24:
1393   case OPC_MULL_U:
1394   case OPC_SHL_B:
1395   case OPC_SHR_B:
1396   case OPC_ASHR_B:
1397   case OPC_MAD_U24:
1398   /* Comparison ops zero-extend/truncate their results, so consider them as
1399    * unsigned here.
1400    */
1401   case OPC_CMPS_F:
1402   case OPC_CMPV_F:
1403   case OPC_CMPS_U:
1404   case OPC_CMPS_S:
1405      return TYPE_U32;
1406
1407   case OPC_ADD_S:
1408   case OPC_SUB_S:
1409   case OPC_MIN_S:
1410   case OPC_MAX_S:
1411   case OPC_ABSNEG_S:
1412   case OPC_MUL_S24:
1413   case OPC_MAD_S24:
1414      return TYPE_S32;
1415
1416   /* We assume that any move->move folding that could be done was done by
1417    * NIR.
1418    */
1419   case OPC_MOV:
1420   default:
1421      *can_fold = false;
1422      return TYPE_U32;
1423   }
1424}
1425
1426/* Return the src and dst types for the conversion which is already folded
1427 * into the op. We can assume that instr has folded in a conversion from
1428 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
1429 * to call if ir3_output_conv_type() returns can_fold = true.
1430 */
1431static inline type_t
1432ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
1433{
1434   switch (instr->opc) {
1435   case OPC_CMPS_F:
1436   case OPC_CMPV_F:
1437   case OPC_CMPS_U:
1438   case OPC_CMPS_S:
1439      /* Comparisons only return 0/1 and the size of the comparison sources
1440       * is irrelevant, so never consider them as having an output conversion
1441       * by returning a type with the dest size here:
1442       */
1443      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1444                                                    : full_type(base_type);
1445
1446   case OPC_BARY_F:
1447      /* bary.f doesn't have an explicit source, but we can assume here that
1448       * the varying data it reads is in fp32.
1449       *
1450       * This may be fp16 on older gen's depending on some register
1451       * settings, but it's probably not worth plumbing that through for a
1452       * small improvement that NIR would hopefully handle for us anyway.
1453       */
1454      return TYPE_F32;
1455
1456   default:
1457      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1458                                                    : full_type(base_type);
1459   }
1460}
1461
1462static inline type_t
1463ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1464{
1465   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1466                                                 : full_type(base_type);
1467}
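
/*
 * Illustrative example: for "add.u hr0.x, r1.x, r2.x" (full srcs, half dst)
 * the 32b result is truncated/zero-extended into the dest, so
 * ir3_output_conv_type() returns TYPE_U32 with can_fold = true, and (with
 * base_type = TYPE_U32) ir3_output_conv_src_type() gives TYPE_U32 while
 * ir3_output_conv_dst_type() gives TYPE_U16.
 */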
1468
1469/* Some instructions have signed/unsigned variants which are identical except
1470 * for whether the folded conversion sign-extends or zero-extends, and we can
1471 * fold in a mismatching move by rewriting the opcode. Return the opcode to
1472 * switch signedness, and whether one exists.
1473 */
1474static inline opc_t
1475ir3_try_swap_signedness(opc_t opc, bool *can_swap)
1476{
1477   switch (opc) {
1478#define PAIR(u, s)                                                             \
1479   case OPC_##u:                                                               \
1480      return OPC_##s;                                                          \
1481   case OPC_##s:                                                               \
1482      return OPC_##u;
1483      PAIR(ADD_U, ADD_S)
1484      PAIR(SUB_U, SUB_S)
1485      /* Note: these are only identical when the sources are half, but that's
1486       * the only case we call this function for anyway.
1487       */
1488      PAIR(MUL_U24, MUL_S24)
1489
1490   default:
1491      *can_swap = false;
1492      return opc;
1493   }
1494}
1495
1496#define MASK(n) ((1 << (n)) - 1)
1497
1498/* iterator for an instruction's sources (reg), also returns src #: */
1499#define foreach_src_n(__srcreg, __n, __instr)                                  \
1500   if ((__instr)->srcs_count)                                                  \
1501      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
1502           __srcreg = NULL)                                                    \
1503         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
1504              __n++)                                                           \
1505            if ((__srcreg = (__instr)->srcs[__n]))
1506
1507/* iterator for an instruction's sources (reg): */
1508#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
1509
1510/* iterator for an instruction's destinations (reg), also returns dst #: */
1511#define foreach_dst_n(__dstreg, __n, __instr)                                  \
1512   if ((__instr)->dsts_count)                                                  \
1513      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
1514           __dstreg = NULL)                                                    \
1515         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
1516              __n++)                                                           \
1517            if ((__dstreg = (__instr)->dsts[__n]))
1518
1519/* iterator for an instruction's destinations (reg): */
1520#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1521
1522static inline unsigned
1523__ssa_src_cnt(struct ir3_instruction *instr)
1524{
1525   return instr->srcs_count + instr->deps_count;
1526}
1527
1528static inline bool
1529__is_false_dep(struct ir3_instruction *instr, unsigned n)
1530{
1531   if (n >= instr->srcs_count)
1532      return true;
1533   return false;
1534}
1535
1536static inline struct ir3_instruction **
1537__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
1538{
1539   if (__is_false_dep(instr, n))
1540      return &instr->deps[n - instr->srcs_count];
1541   if (ssa(instr->srcs[n]))
1542      return &instr->srcs[n]->def->instr;
1543   return NULL;
1544}
1545
1546#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
1547   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
1548      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
1549           __n++)                                                              \
1550         if ((__srcp = __ssa_srcp_n(__instr, __n)))
1551
1552#define foreach_ssa_srcp(__srcp, __instr)                                      \
1553   foreach_ssa_srcp_n (__srcp, __i, __instr)
1554
1555/* iterator for an instruction's SSA sources (instr), also returns src #: */
1556#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
1557   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
1558        __srcinst = NULL)                                                      \
1559      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
1560         if ((__srcinst = *__srcp))
1561
1562/* iterator for an instruction's SSA sources (instr): */
1563#define foreach_ssa_src(__srcinst, __instr)                                    \
1564   foreach_ssa_src_n (__srcinst, __i, __instr)
1565
1566/* iterators for shader inputs: */
1567#define foreach_input_n(__ininstr, __cnt, __ir)                                \
1568   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
1569        __ininstr = NULL)                                                      \
1570      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
1571         if ((__ininstr = (__ir)->inputs[__cnt]))
1572#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1573
1574/* iterators for instructions: */
1575#define foreach_instr(__instr, __list)                                         \
1576   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
1577#define foreach_instr_rev(__instr, __list)                                     \
1578   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
1579#define foreach_instr_safe(__instr, __list)                                    \
1580   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
1581#define foreach_instr_from_safe(__instr, __start, __list)                      \
1582   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
1583                                 __list, node)
1584
1585/* iterators for blocks: */
1586#define foreach_block(__block, __list)                                         \
1587   list_for_each_entry (struct ir3_block, __block, __list, node)
1588#define foreach_block_safe(__block, __list)                                    \
1589   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
1590#define foreach_block_rev(__block, __list)                                     \
1591   list_for_each_entry_rev (struct ir3_block, __block, __list, node)
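
/*
 * Example (sketch): the typical nested traversal over a shader, visiting
 * every instruction and each of its src registers:
 *
 *    foreach_block (block, &ir->block_list) {
 *       foreach_instr (instr, &block->instr_list) {
 *          foreach_src (src, instr) {
 *             // inspect or rewrite src
 *          }
 *       }
 *    }
 */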
1592
1593/* iterators for arrays: */
1594#define foreach_array(__array, __list)                                         \
1595   list_for_each_entry (struct ir3_array, __array, __list, node)
1596#define foreach_array_safe(__array, __list)                                    \
1597   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
1598
1599#define IR3_PASS(ir, pass, ...)                                                \
1600   ({                                                                          \
1601      bool progress = pass(ir, ##__VA_ARGS__);                                 \
1602      if (progress) {                                                          \
1603         ir3_debug_print(ir, "AFTER: " #pass);                                 \
1604         ir3_validate(ir);                                                     \
1605      }                                                                        \
1606      progress;                                                                \
1607   })
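
/*
 * Example (sketch): how passes are typically invoked, so that the IR gets
 * dumped (under the ir3 debug options) and validated after any pass that
 * reports progress:
 *
 *    progress |= IR3_PASS(ir, ir3_cp, so);
 *    progress |= IR3_PASS(ir, ir3_dce, so);
 */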
1608
1609/* validate: */
1610void ir3_validate(struct ir3 *ir);
1611
1612/* dump: */
1613void ir3_print(struct ir3 *ir);
1614void ir3_print_instr(struct ir3_instruction *instr);
1615
1616struct log_stream;
1617void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
1618
1619/* delay calculation: */
1620int ir3_delayslots(struct ir3_instruction *assigner,
1621                   struct ir3_instruction *consumer, unsigned n, bool soft);
1622unsigned ir3_delay_calc_prera(struct ir3_block *block,
1623                              struct ir3_instruction *instr);
1624unsigned ir3_delay_calc_postra(struct ir3_block *block,
1625                               struct ir3_instruction *instr, bool soft,
1626                               bool mergedregs);
1627unsigned ir3_delay_calc_exact(struct ir3_block *block,
1628                              struct ir3_instruction *instr, bool mergedregs);
1629void ir3_remove_nops(struct ir3 *ir);
1630
1631/* unreachable block elimination: */
1632bool ir3_remove_unreachable(struct ir3 *ir);
1633
1634/* dead code elimination: */
1635struct ir3_shader_variant;
1636bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
1637
1638/* fp16 conversion folding */
1639bool ir3_cf(struct ir3 *ir);
1640
1641/* copy-propagate: */
1642bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
1643bool ir3_cp_postsched(struct ir3 *ir);
1644
1645/* common subexpression elimination: */
1646bool ir3_cse(struct ir3 *ir);
1647
1648/* Make arrays SSA */
1649bool ir3_array_to_ssa(struct ir3 *ir);
1650
1651/* scheduling: */
1652bool ir3_sched_add_deps(struct ir3 *ir);
1653int ir3_sched(struct ir3 *ir);
1654
1655struct ir3_context;
1656bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
1657
1658/* register assignment: */
1659int ir3_ra(struct ir3_shader_variant *v);
1660
1661/* lower subgroup ops: */
1662bool ir3_lower_subgroups(struct ir3 *ir);
1663
1664/* legalize: */
1665bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
1666
1667static inline bool
1668ir3_has_latency_to_hide(struct ir3 *ir)
1669{
1670   /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
1671    * know the nature of the fragment shader.  Just assume it will have
1672    * latency to hide:
1673    */
1674   if (ir->type != MESA_SHADER_FRAGMENT)
1675      return true;
1676
1677   foreach_block (block, &ir->block_list) {
1678      foreach_instr (instr, &block->instr_list) {
1679         if (is_tex_or_prefetch(instr))
1680            return true;
1681
1682         if (is_load(instr)) {
1683            switch (instr->opc) {
1684            case OPC_LDLV:
1685            case OPC_LDL:
1686            case OPC_LDLW:
1687               break;
1688            default:
1689               return true;
1690            }
1691         }
1692      }
1693   }
1694
1695   return false;
1696}
1697
1698/* ************************************************************************* */
1699/* instruction helpers */
1700
1701/* creates SSA src of correct type (ie. half vs full precision) */
1702static inline struct ir3_register *
1703__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
1704          unsigned flags)
1705{
1706   struct ir3_register *reg;
1707   if (src->dsts[0]->flags & IR3_REG_HALF)
1708      flags |= IR3_REG_HALF;
1709   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
1710   reg->def = src->dsts[0];
1711   reg->wrmask = src->dsts[0]->wrmask;
1712   return reg;
1713}
1714
1715static inline struct ir3_register *
1716__ssa_dst(struct ir3_instruction *instr)
1717{
1718   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
1719   reg->instr = instr;
1720   return reg;
1721}
1722
1723static inline struct ir3_instruction *
1724create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1725{
1726   struct ir3_instruction *mov;
1727   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1728
1729   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1730   mov->cat1.src_type = type;
1731   mov->cat1.dst_type = type;
1732   __ssa_dst(mov)->flags |= flags;
1733   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
1734
1735   return mov;
1736}
1737
1738static inline struct ir3_instruction *
1739create_immed(struct ir3_block *block, uint32_t val)
1740{
1741   return create_immed_typed(block, val, TYPE_U32);
1742}
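
/* Usage sketch ('block' being the block under construction):
 *
 *    struct ir3_instruction *one  = create_immed(block, 1);
 *    struct ir3_instruction *hone = create_immed_typed(block, 1, TYPE_U16);
 *
 * The TYPE_U16 variant gets IR3_REG_HALF on its dst and immediate src,
 * the default u32 variant does not.
 */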
1743
1744static inline struct ir3_instruction *
1745create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
1746{
1747   struct ir3_instruction *mov;
1748   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1749
1750   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1751   mov->cat1.src_type = type;
1752   mov->cat1.dst_type = type;
1753   __ssa_dst(mov)->flags |= flags;
1754   ir3_src_create(mov, n, IR3_REG_CONST | flags);
1755
1756   return mov;
1757}
1758
1759static inline struct ir3_instruction *
1760create_uniform(struct ir3_block *block, unsigned n)
1761{
1762   return create_uniform_typed(block, n, TYPE_F32);
1763}
1764
1765static inline struct ir3_instruction *
1766create_uniform_indirect(struct ir3_block *block, int n, type_t type,
1767                        struct ir3_instruction *address)
1768{
1769   struct ir3_instruction *mov;
1770
1771   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1772   mov->cat1.src_type = type;
1773   mov->cat1.dst_type = type;
1774   __ssa_dst(mov);
1775   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1776
1777   ir3_instr_set_address(mov, address);
1778
1779   return mov;
1780}
1781
1782static inline struct ir3_instruction *
1783ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
1784{
1785   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
1786   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1787
1788   __ssa_dst(instr)->flags |= flags;
1789   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
1790      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
1791      src_reg->array = src->dsts[0]->array;
1792   } else {
1793      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
1794   }
1795   debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
1796   instr->cat1.src_type = type;
1797   instr->cat1.dst_type = type;
1798   return instr;
1799}
1800
1801static inline struct ir3_instruction *
1802ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
1803        type_t dst_type)
1804{
1805   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
1806   unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
1807   unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
1808
1809   debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
1810
1811   __ssa_dst(instr)->flags |= dst_flags;
1812   __ssa_src(instr, src, 0);
1813   instr->cat1.src_type = src_type;
1814   instr->cat1.dst_type = dst_type;
1815   debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
1816   return instr;
1817}
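
/* Conversion sketch ('val' being an existing full-precision SSA value):
 *
 *    struct ir3_instruction *f = ir3_COV(block, val, TYPE_U32, TYPE_F32);
 *
 * Per the debug_assert above, the src precision must already match
 * src_type; only the dst precision follows dst_type.
 */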
1818
1819static inline struct ir3_instruction *
1820ir3_MOVMSK(struct ir3_block *block, unsigned components)
1821{
1822   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
1823
1824   struct ir3_register *dst = __ssa_dst(instr);
1825   dst->flags |= IR3_REG_SHARED;
1826   dst->wrmask = (1 << components) - 1;
1827   instr->repeat = components - 1;
1828   return instr;
1829}
1830
1831static inline struct ir3_instruction *
1832ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
1833                 unsigned components)
1834{
1835   struct ir3_instruction *instr =
1836      ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
1837
1838   struct ir3_register *dst = __ssa_dst(instr);
1839   dst->flags |= IR3_REG_SHARED;
1840   dst->wrmask = (1 << components) - 1;
1841
1842   __ssa_src(instr, src, 0);
1843
1844   return instr;
1845}
1846
1847static inline struct ir3_instruction *
1848ir3_NOP(struct ir3_block *block)
1849{
1850   return ir3_instr_create(block, OPC_NOP, 0, 0);
1851}
1852
1853#define IR3_INSTR_0 0
1854
1855/* clang-format off */
1856#define __INSTR0(flag, name, opc)                                              \
1857static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
1858{                                                                              \
1859   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
1860   instr->flags |= flag;                                                       \
1861   return instr;                                                               \
1862}
1863/* clang-format on */
1864#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
1865#define INSTR0(name)     __INSTR0(0, name, OPC_##name)
1866
1867/* clang-format off */
1868#define __INSTR1(flag, dst_count, name, opc)                                   \
1869static inline struct ir3_instruction *ir3_##name(                              \
1870   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
1871{                                                                              \
1872   struct ir3_instruction *instr =                                             \
1873      ir3_instr_create(block, opc, dst_count, 1);                              \
1874   for (unsigned i = 0; i < dst_count; i++)                                    \
1875      __ssa_dst(instr);                                                        \
1876   __ssa_src(instr, a, aflags);                                                \
1877   instr->flags |= flag;                                                       \
1878   return instr;                                                               \
1879}
1880/* clang-format on */
1881#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
1882#define INSTR1(name)      __INSTR1(0, 1, name, OPC_##name)
1883#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
1884
1885/* clang-format off */
1886#define __INSTR2(flag, name, opc)                                              \
1887static inline struct ir3_instruction *ir3_##name(                              \
1888   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
1889   struct ir3_instruction *b, unsigned bflags)                                 \
1890{                                                                              \
1891   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2);         \
1892   __ssa_dst(instr);                                                           \
1893   __ssa_src(instr, a, aflags);                                                \
1894   __ssa_src(instr, b, bflags);                                                \
1895   instr->flags |= flag;                                                       \
1896   return instr;                                                               \
1897}
1898/* clang-format on */
1899#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
1900#define INSTR2(name)     __INSTR2(0, name, OPC_##name)
1901
1902/* clang-format off */
1903#define __INSTR3(flag, dst_count, name, opc)                                   \
1904static inline struct ir3_instruction *ir3_##name(                              \
1905   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
1906   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
1907   unsigned cflags)                                                            \
1908{                                                                              \
1909   struct ir3_instruction *instr =                                             \
1910      ir3_instr_create(block, opc, dst_count, 3);                              \
1911   for (unsigned i = 0; i < dst_count; i++)                                    \
1912      __ssa_dst(instr);                                                        \
1913   __ssa_src(instr, a, aflags);                                                \
1914   __ssa_src(instr, b, bflags);                                                \
1915   __ssa_src(instr, c, cflags);                                                \
1916   instr->flags |= flag;                                                       \
1917   return instr;                                                               \
1918}
1919/* clang-format on */
1920#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
1921#define INSTR3(name)      __INSTR3(0, 1, name, OPC_##name)
1922#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
1923
1924/* clang-format off */
1925#define __INSTR4(flag, dst_count, name, opc)                                   \
1926static inline struct ir3_instruction *ir3_##name(                              \
1927   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
1928   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
1929   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
1930{                                                                              \
1931   struct ir3_instruction *instr =                                             \
1932      ir3_instr_create(block, opc, dst_count, 4);                              \
1933   for (unsigned i = 0; i < dst_count; i++)                                    \
1934      __ssa_dst(instr);                                                        \
1935   __ssa_src(instr, a, aflags);                                                \
1936   __ssa_src(instr, b, bflags);                                                \
1937   __ssa_src(instr, c, cflags);                                                \
1938   __ssa_src(instr, d, dflags);                                                \
1939   instr->flags |= flag;                                                       \
1940   return instr;                                                               \
1941}
1942/* clang-format on */
1943#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
1944#define INSTR4(name)      __INSTR4(0, 1, name, OPC_##name)
1945#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
1946
1947/* clang-format off */
1948#define __INSTR5(flag, name, opc)                                              \
1949static inline struct ir3_instruction *ir3_##name(                              \
1950   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
1951   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
1952   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
1953   struct ir3_instruction *e, unsigned eflags)                                 \
1954{                                                                              \
1955   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
1956   __ssa_dst(instr);                                                           \
1957   __ssa_src(instr, a, aflags);                                                \
1958   __ssa_src(instr, b, bflags);                                                \
1959   __ssa_src(instr, c, cflags);                                                \
1960   __ssa_src(instr, d, dflags);                                                \
1961   __ssa_src(instr, e, eflags);                                                \
1962   instr->flags |= flag;                                                       \
1963   return instr;                                                               \
1964}
1965/* clang-format on */
1966#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
1967#define INSTR5(name)     __INSTR5(0, name, OPC_##name)
1968
1969/* clang-format off */
1970#define __INSTR6(flag, dst_count, name, opc)                                   \
1971static inline struct ir3_instruction *ir3_##name(                              \
1972   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
1973   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
1974   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
1975   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
1976   unsigned fflags)                                                            \
1977{                                                                              \
1978   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 6); \
1979   for (unsigned i = 0; i < dst_count; i++)                                    \
1980      __ssa_dst(instr);                                                        \
1981   __ssa_src(instr, a, aflags);                                                \
1982   __ssa_src(instr, b, bflags);                                                \
1983   __ssa_src(instr, c, cflags);                                                \
1984   __ssa_src(instr, d, dflags);                                                \
1985   __ssa_src(instr, e, eflags);                                                \
1986   __ssa_src(instr, f, fflags);                                                \
1987   instr->flags |= flag;                                                       \
1988   return instr;                                                               \
1989}
1990/* clang-format on */
1991#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
1992#define INSTR6(name)      __INSTR6(0, 1, name, OPC_##name)
1993#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
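
/* The __INSTRn/INSTRn{F,NODST} macros above stamp out inline builders of the
 * form ir3_<NAME>(block, src, srcflags, ...) used for the per-category opcode
 * lists below.  A sketch ('a' and 'b' being previously created values; the
 * IR3_REG_SNEG src modifier is just illustrative):
 *
 *    struct ir3_instruction *sum = ir3_ADD_U(block, a, 0, b, 0);
 *    struct ir3_instruction *neg = ir3_ABSNEG_S(block, sum, IR3_REG_SNEG);
 */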
1994
1995/* cat0 instructions: */
1996INSTR1NODST(B)
1997INSTR0(JUMP)
1998INSTR1NODST(KILL)
1999INSTR1NODST(DEMOTE)
2000INSTR0(END)
2001INSTR0(CHSH)
2002INSTR0(CHMASK)
2003INSTR1NODST(PREDT)
2004INSTR0(PREDF)
2005INSTR0(PREDE)
2006INSTR0(GETONE)
2007
2008/* cat1 macros */
2009INSTR1(ANY_MACRO)
2010INSTR1(ALL_MACRO)
2011INSTR1(READ_FIRST_MACRO)
2012INSTR2(READ_COND_MACRO)
2013
2014static inline struct ir3_instruction *
2015ir3_ELECT_MACRO(struct ir3_block *block)
2016{
2017   struct ir3_instruction *instr =
2018      ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2019   __ssa_dst(instr);
2020   return instr;
2021}
2022
2023/* cat2 instructions: most take 2 srcs, but some take only 1: */
2024INSTR2(ADD_F)
2025INSTR2(MIN_F)
2026INSTR2(MAX_F)
2027INSTR2(MUL_F)
2028INSTR1(SIGN_F)
2029INSTR2(CMPS_F)
2030INSTR1(ABSNEG_F)
2031INSTR2(CMPV_F)
2032INSTR1(FLOOR_F)
2033INSTR1(CEIL_F)
2034INSTR1(RNDNE_F)
2035INSTR1(RNDAZ_F)
2036INSTR1(TRUNC_F)
2037INSTR2(ADD_U)
2038INSTR2(ADD_S)
2039INSTR2(SUB_U)
2040INSTR2(SUB_S)
2041INSTR2(CMPS_U)
2042INSTR2(CMPS_S)
2043INSTR2(MIN_U)
2044INSTR2(MIN_S)
2045INSTR2(MAX_U)
2046INSTR2(MAX_S)
2047INSTR1(ABSNEG_S)
2048INSTR2(AND_B)
2049INSTR2(OR_B)
2050INSTR1(NOT_B)
2051INSTR2(XOR_B)
2052INSTR2(CMPV_U)
2053INSTR2(CMPV_S)
2054INSTR2(MUL_U24)
2055INSTR2(MUL_S24)
2056INSTR2(MULL_U)
2057INSTR1(BFREV_B)
2058INSTR1(CLZ_S)
2059INSTR1(CLZ_B)
2060INSTR2(SHL_B)
2061INSTR2(SHR_B)
2062INSTR2(ASHR_B)
2063INSTR2(BARY_F)
2064INSTR2(MGEN_B)
2065INSTR2(GETBIT_B)
2066INSTR1(SETRM)
2067INSTR1(CBITS_B)
2068INSTR2(SHB)
2069INSTR2(MSAD)
2070
2071/* cat3 instructions: */
2072INSTR3(MAD_U16)
2073INSTR3(MADSH_U16)
2074INSTR3(MAD_S16)
2075INSTR3(MADSH_M16)
2076INSTR3(MAD_U24)
2077INSTR3(MAD_S24)
2078INSTR3(MAD_F16)
2079INSTR3(MAD_F32)
2080/* NOTE: SEL_B32 checks for zero vs nonzero */
2081INSTR3(SEL_B16)
2082INSTR3(SEL_B32)
2083INSTR3(SEL_S16)
2084INSTR3(SEL_S32)
2085INSTR3(SEL_F16)
2086INSTR3(SEL_F32)
2087INSTR3(SAD_S16)
2088INSTR3(SAD_S32)
2089
2090/* cat4 instructions: */
2091INSTR1(RCP)
2092INSTR1(RSQ)
2093INSTR1(HRSQ)
2094INSTR1(LOG2)
2095INSTR1(HLOG2)
2096INSTR1(EXP2)
2097INSTR1(HEXP2)
2098INSTR1(SIN)
2099INSTR1(COS)
2100INSTR1(SQRT)
2101
2102/* cat5 instructions: */
2103INSTR1(DSX)
2104INSTR1(DSXPP_MACRO)
2105INSTR1(DSY)
2106INSTR1(DSYPP_MACRO)
2107INSTR1F(3D, DSX)
2108INSTR1F(3D, DSY)
2109INSTR1(RGETPOS)
2110
2111static inline struct ir3_instruction *
2112ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2113        unsigned flags, struct ir3_instruction *samp_tex,
2114        struct ir3_instruction *src0, struct ir3_instruction *src1)
2115{
2116   struct ir3_instruction *sam;
2117   unsigned nreg = 0;
2118
2119   if (flags & IR3_INSTR_S2EN) {
2120      nreg++;
2121   }
2122   if (src0) {
2123      nreg++;
2124   }
2125   if (src1) {
2126      nreg++;
2127   }
2128
2129   sam = ir3_instr_create(block, opc, 1, nreg);
2130   sam->flags |= flags;
2131   __ssa_dst(sam)->wrmask = wrmask;
2132   if (flags & IR3_INSTR_S2EN) {
2133      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2134   }
2135   if (src0) {
2136      __ssa_src(sam, src0, 0);
2137   }
2138   if (src1) {
2139      __ssa_src(sam, src1, 0);
2140   }
2141   sam->cat5.type = type;
2142
2143   return sam;
2144}
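
/* ir3_SAM usage sketch (illustrative only): a basic texture sample with the
 * sampler/texture state encoded in the instruction (no IR3_INSTR_S2EN),
 * where 'coord' is an assumed collect of the coordinate components:
 *
 *    struct ir3_instruction *sam =
 *       ir3_SAM(block, OPC_SAM, TYPE_F32, 0xf, 0, NULL, coord, NULL);
 */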
2145
2146/* cat6 instructions: */
2147INSTR2(LDLV)
2148INSTR3(LDG)
2149INSTR3(LDL)
2150INSTR3(LDLW)
2151INSTR3(LDP)
2152INSTR4NODST(STG)
2153INSTR3NODST(STL)
2154INSTR3NODST(STLW)
2155INSTR3NODST(STP)
2156INSTR1(RESINFO)
2157INSTR1(RESFMT)
2158INSTR2(ATOMIC_ADD)
2159INSTR2(ATOMIC_SUB)
2160INSTR2(ATOMIC_XCHG)
2161INSTR2(ATOMIC_INC)
2162INSTR2(ATOMIC_DEC)
2163INSTR2(ATOMIC_CMPXCHG)
2164INSTR2(ATOMIC_MIN)
2165INSTR2(ATOMIC_MAX)
2166INSTR2(ATOMIC_AND)
2167INSTR2(ATOMIC_OR)
2168INSTR2(ATOMIC_XOR)
2169INSTR2(LDC)
2170#if GPU >= 600
2171INSTR3NODST(STIB)
2172INSTR2(LDIB)
2173INSTR5(LDG_A)
2174INSTR6NODST(STG_A)
2175INSTR3F(G, ATOMIC_ADD)
2176INSTR3F(G, ATOMIC_SUB)
2177INSTR3F(G, ATOMIC_XCHG)
2178INSTR3F(G, ATOMIC_INC)
2179INSTR3F(G, ATOMIC_DEC)
2180INSTR3F(G, ATOMIC_CMPXCHG)
2181INSTR3F(G, ATOMIC_MIN)
2182INSTR3F(G, ATOMIC_MAX)
2183INSTR3F(G, ATOMIC_AND)
2184INSTR3F(G, ATOMIC_OR)
2185INSTR3F(G, ATOMIC_XOR)
2186#elif GPU >= 400
2187INSTR3(LDGB)
2188#if GPU >= 500
2189INSTR3(LDIB)
2190#endif
2191INSTR4NODST(STGB)
2192INSTR4NODST(STIB)
2193INSTR4F(G, ATOMIC_ADD)
2194INSTR4F(G, ATOMIC_SUB)
2195INSTR4F(G, ATOMIC_XCHG)
2196INSTR4F(G, ATOMIC_INC)
2197INSTR4F(G, ATOMIC_DEC)
2198INSTR4F(G, ATOMIC_CMPXCHG)
2199INSTR4F(G, ATOMIC_MIN)
2200INSTR4F(G, ATOMIC_MAX)
2201INSTR4F(G, ATOMIC_AND)
2202INSTR4F(G, ATOMIC_OR)
2203INSTR4F(G, ATOMIC_XOR)
2204#endif
2205
2206/* cat7 instructions: */
2207INSTR0(BAR)
2208INSTR0(FENCE)
2209
2210/* ************************************************************************* */
2211#include "bitset.h"
2212
2213#define MAX_REG 256
2214
2215typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
2216
2217typedef struct {
2218   bool mergedregs;
2219   regmaskstate_t mask;
2220} regmask_t;
2221
2222static inline bool
2223__regmask_get(regmask_t *regmask, bool half, unsigned n)
2224{
2225   if (regmask->mergedregs) {
2226      /* a6xx+ case, with merged register file, we track things in terms
2227       * of half-precision registers, with a full-precision register
2228       * using two half-precision slots.
2229       *
2230       * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2231       * avoid having them alias normal full regs.
2232       */
2233      if (half && !is_reg_num_special(n)) {
2234         return BITSET_TEST(regmask->mask, n);
2235      } else {
2236         n *= 2;
2237         return BITSET_TEST(regmask->mask, n) ||
2238                BITSET_TEST(regmask->mask, n + 1);
2239      }
2240   } else {
2241      /* pre a6xx case, with separate register file for half and full
2242       * precision:
2243       */
2244      if (half)
2245         n += MAX_REG;
2246      return BITSET_TEST(regmask->mask, n);
2247   }
2248}
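
/* In other words: with mergedregs, full-precision slot n occupies bits 2n
 * and 2n+1 while half-precision slot n occupies bit n, so e.g. full slot 3
 * conflicts with half slots 6 and 7.  Without mergedregs the half file is
 * simply tracked at a MAX_REG offset in the same bitset.
 */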
2249
2250static inline void
2251__regmask_set(regmask_t *regmask, bool half, unsigned n)
2252{
2253   if (regmask->mergedregs) {
2254      /* a6xx+ case, with merged register file, we track things in terms
2255       * of half-precision registers, with a full-precision register
2256       * using two half-precision slots:
2257       */
2258      if (half && !is_reg_num_special(n)) {
2259         BITSET_SET(regmask->mask, n);
2260      } else {
2261         n *= 2;
2262         BITSET_SET(regmask->mask, n);
2263         BITSET_SET(regmask->mask, n + 1);
2264      }
2265   } else {
2266      /* pre a6xx case, with separate register file for half and full
2267       * precision:
2268       */
2269      if (half)
2270         n += MAX_REG;
2271      BITSET_SET(regmask->mask, n);
2272   }
2273}
2274
2275static inline void
2276__regmask_clear(regmask_t *regmask, bool half, unsigned n)
2277{
2278   if (regmask->mergedregs) {
2279      /* a6xx+ case, with merged register file, we track things in terms
2280       * of half-precision registers, with a full-precision register
2281       * using two half-precision slots:
2282       */
2283      if (half && !is_reg_num_special(n)) {
2284         BITSET_CLEAR(regmask->mask, n);
2285      } else {
2286         n *= 2;
2287         BITSET_CLEAR(regmask->mask, n);
2288         BITSET_CLEAR(regmask->mask, n + 1);
2289      }
2290   } else {
2291      /* pre a6xx case, with separate register file for half and full
2292       * precision:
2293       */
2294      if (half)
2295         n += MAX_REG;
2296      BITSET_CLEAR(regmask->mask, n);
2297   }
2298}
2299
2300static inline void
2301regmask_init(regmask_t *regmask, bool mergedregs)
2302{
2303   memset(&regmask->mask, 0, sizeof(regmask->mask));
2304   regmask->mergedregs = mergedregs;
2305}
2306
2307static inline void
2308regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2309{
2310   assert(dst->mergedregs == a->mergedregs);
2311   assert(dst->mergedregs == b->mergedregs);
2312
2313   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2314      dst->mask[i] = a->mask[i] | b->mask[i];
2315}
2316
2318static inline void
2319regmask_set(regmask_t *regmask, struct ir3_register *reg)
2320{
2321   bool half = reg->flags & IR3_REG_HALF;
2322   if (reg->flags & IR3_REG_RELATIV) {
2323      for (unsigned i = 0; i < reg->size; i++)
2324         __regmask_set(regmask, half, reg->array.base + i);
2325   } else {
2326      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2327         if (mask & 1)
2328            __regmask_set(regmask, half, n);
2329   }
2330}
2331
2332static inline bool
2333regmask_get(regmask_t *regmask, struct ir3_register *reg)
2334{
2335   bool half = reg->flags & IR3_REG_HALF;
2336   if (reg->flags & IR3_REG_RELATIV) {
2337      for (unsigned i = 0; i < reg->size; i++)
2338         if (__regmask_get(regmask, half, reg->array.base + i))
2339            return true;
2340   } else {
2341      for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2342         if (mask & 1)
2343            if (__regmask_get(regmask, half, n))
2344               return true;
2345   }
2346   return false;
2347}
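
/* Typical use, a sketch of the kind of hazard tracking done in legalize
 * ('producer', 'consumer' and the mergedregs flag are illustrative,
 * supplied by the caller):
 *
 *    regmask_t needs_ss;
 *    regmask_init(&needs_ss, mergedregs);
 *
 *    regmask_set(&needs_ss, producer->dsts[0]);
 *    ...
 *    if (regmask_get(&needs_ss, consumer->srcs[0]))
 *       consumer->flags |= IR3_INSTR_SS;
 */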
2348/* ************************************************************************* */
2349
2350#endif /* IR3_H_ */
2351