Home | History | Annotate | Line # | Download | only in ir3
/*
 * Copyright (c) 2013 Rob Clark <robdclark (at) gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
     23 
     24 #ifndef IR3_H_
     25 #define IR3_H_
     26 
     27 #include <stdbool.h>
     28 #include <stdint.h>
     29 
     30 #include "compiler/shader_enums.h"
     31 
     32 #include "util/bitscan.h"
     33 #include "util/list.h"
     34 #include "util/set.h"
     35 #include "util/u_debug.h"
     36 
     37 #include "instr-a3xx.h"
     38 
/* Low-level intermediate representation of an adreno shader program. */

/* Forward declarations for types that are only used through pointers
 * before their full definition (or that are defined in other headers).
 */
struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;
     45 
/* Size, instruction-count and register-usage statistics for one shader
 * binary.
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
     87 
/* A group of registers, referenced from ir3_register::merge_set together
 * with a per-register offset (ir3_register::merge_set_offset).
 *
 * NOTE(review): the exact allocation semantics of the individual fields are
 * defined by the passes that use this struct, which are outside this header.
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* Number of entries in 'regs', and the member registers themselves: */
   unsigned regs_count;
   struct ir3_register **regs;
};
     99 
/* A single source or destination operand of an ir3_instruction. */
struct ir3_register {
   /* Bitmask of operand flags.  Note that bits 0x800 and 0x1000 are not
    * assigned in this enum.
    */
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.  In
       * addition, for .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,
   } flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For destination registers, pointer back to the instruction containing
    * this register.  (The original comment names IR3_REG_DEST, which is not
    * a flag in the enum above.)
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
    215 
/*
 * Stupid/simple growable array implementation.
 *
 * DECLARE_ARRAY(type, name) declares three members/variables:
 *   name        - pointer to the elements
 *   name_count  - number of elements currently in use
 *   name_sz     - allocated capacity, in elements
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/*
 * Append the value given by the trailing argument(s) to an array declared
 * with DECLARE_ARRAY.
 *
 * When the array is full, its capacity is doubled (with a minimum of 16
 * elements) and the storage is resized with reralloc_size() using 'ctx'.
 *
 * Caveats:
 *  - '_count' and '_sz' are token-pasted onto the last token of 'arr', so
 *    'arr' must end in the declared name (eg. 'ir->baryfs').
 *  - 'arr' is expanded several times, so it must not have side effects.
 *  - The result of reralloc_size() is not checked here.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
    231 
    232 struct ir3_instruction {
    233    struct ir3_block *block;
    234    opc_t opc;
    235    enum {
    236       /* (sy) flag is set on first instruction, and after sample
    237        * instructions (probably just on RAW hazard).
    238        */
    239       IR3_INSTR_SY = 0x001,
    240       /* (ss) flag is set on first instruction, and first instruction
    241        * to depend on the result of "long" instructions (RAW hazard):
    242        *
    243        *   rcp, rsq, log2, exp2, sin, cos, sqrt
    244        *
    245        * It seems to synchronize until all in-flight instructions are
    246        * completed, for example:
    247        *
    248        *   rsq hr1.w, hr1.w
    249        *   add.f hr2.z, (neg)hr2.z, hc0.y
    250        *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    251        *   rsq hr2.x, hr2.x
    252        *   (rpt1)nop
    253        *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    254        *   nop
    255        *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    256        *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    257        *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    258        *
    259        * The last mul.f does not have (ss) set, presumably because the
    260        * (ss) on the previous instruction does the job.
    261        *
    262        * The blob driver also seems to set it on WAR hazards, although
    263        * not really clear if this is needed or just blob compiler being
    264        * sloppy.  So far I haven't found a case where removing the (ss)
    265        * causes problems for WAR hazard, but I could just be getting
    266        * lucky:
    267        *
    268        *   rcp r1.y, r3.y
    269        *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    270        *
    271        */
    272       IR3_INSTR_SS = 0x002,
    273       /* (jp) flag is set on jump targets:
    274        */
    275       IR3_INSTR_JP = 0x004,
    276       IR3_INSTR_UL = 0x008,
    277       IR3_INSTR_3D = 0x010,
    278       IR3_INSTR_A = 0x020,
    279       IR3_INSTR_O = 0x040,
    280       IR3_INSTR_P = 0x080,
    281       IR3_INSTR_S = 0x100,
    282       IR3_INSTR_S2EN = 0x200,
    283       IR3_INSTR_G = 0x400,
    284       IR3_INSTR_SAT = 0x800,
    285       /* (cat5/cat6) Bindless */
    286       IR3_INSTR_B = 0x1000,
    287       /* (cat5/cat6) nonuniform */
    288       IR3_INSTR_NONUNIF = 0x02000,
    289       /* (cat5-only) Get some parts of the encoding from a1.x */
    290       IR3_INSTR_A1EN = 0x04000,
    291       /* meta-flags, for intermediate stages of IR, ie.
    292        * before register assignment is done:
    293        */
    294       IR3_INSTR_MARK = 0x08000,
    295       IR3_INSTR_UNUSED = 0x10000,
    296    } flags;
    297    uint8_t repeat;
    298    uint8_t nop;
    299 #ifdef DEBUG
    300    unsigned srcs_max, dsts_max;
    301 #endif
    302    unsigned srcs_count, dsts_count;
    303    struct ir3_register **dsts;
    304    struct ir3_register **srcs;
    305    union {
    306       struct {
    307          char inv1, inv2;
    308          char comp1, comp2;
    309          int immed;
    310          struct ir3_block *target;
    311          const char *target_label;
    312          brtype_t brtype;
    313          unsigned idx; /* for brac.N */
    314       } cat0;
    315       struct {
    316          type_t src_type, dst_type;
    317          round_t round;
    318       } cat1;
    319       struct {
    320          enum {
    321             IR3_COND_LT = 0,
    322             IR3_COND_LE = 1,
    323             IR3_COND_GT = 2,
    324             IR3_COND_GE = 3,
    325             IR3_COND_EQ = 4,
    326             IR3_COND_NE = 5,
    327          } condition;
    328       } cat2;
    329       struct {
    330          unsigned samp, tex;
    331          unsigned tex_base : 3;
    332          type_t type;
    333       } cat5;
    334       struct {
    335          type_t type;
    336          /* TODO remove dst_offset and handle as a ir3_register
    337           * which might be IMMED, similar to how src_offset is
    338           * handled.
    339           */
    340          int dst_offset;
    341          int iim_val   : 3; /* for ldgb/stgb, # of components */
    342          unsigned d    : 3; /* for ldc, component offset */
    343          bool typed    : 1;
    344          unsigned base : 3;
    345       } cat6;
    346       struct {
    347          unsigned w : 1; /* write */
    348          unsigned r : 1; /* read */
    349          unsigned l : 1; /* local */
    350          unsigned g : 1; /* global */
    351       } cat7;
    352       /* for meta-instructions, just used to hold extra data
    353        * before instruction scheduling, etc
    354        */
    355       struct {
    356          int off; /* component/offset */
    357       } split;
    358       struct {
    359          /* Per-source index back to the entry in the
    360           * ir3_shader_variant::outputs table.
    361           */
    362          unsigned *outidxs;
    363       } end;
    364       struct {
    365          /* used to temporarily hold reference to nir_phi_instr
    366           * until we resolve the phi srcs
    367           */
    368          void *nphi;
    369       } phi;
    370       struct {
    371          unsigned samp, tex;
    372          unsigned input_offset;
    373          unsigned samp_base : 3;
    374          unsigned tex_base  : 3;
    375       } prefetch;
    376       struct {
    377          /* maps back to entry in ir3_shader_variant::inputs table: */
    378          int inidx;
    379          /* for sysvals, identifies the sysval type.  Mostly so we can
    380           * identify the special cases where a sysval should not be DCE'd
    381           * (currently, just pre-fs texture fetch)
    382           */
    383          gl_system_value sysval;
    384       } input;
    385    };
    386 
    387    /* For assigning jump offsets, we need instruction's position: */
    388    uint32_t ip;
    389 
    390    /* used for per-pass extra instruction data.
    391     *
    392     * TODO we should remove the per-pass data like this and 'use_count'
    393     * and do something similar to what RA does w/ ir3_ra_instr_data..
    394     * ie. use the ir3_count_instructions pass, and then use instr->ip
    395     * to index into a table of pass-private data.
    396     */
    397    void *data;
    398 
    399    /**
    400     * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    401     */
    402    struct set *uses;
    403 
    404    int use_count; /* currently just updated/used by cp */
    405 
    406    /* an instruction can reference at most one address register amongst
    407     * it's src/dst registers.  Beyond that, you need to insert mov's.
    408     *
    409     * NOTE: do not write this directly, use ir3_instr_set_address()
    410     */
    411    struct ir3_register *address;
    412 
    413    /* Tracking for additional dependent instructions.  Used to handle
    414     * barriers, WAR hazards for arrays/SSBOs/etc.
    415     */
    416    DECLARE_ARRAY(struct ir3_instruction *, deps);
    417 
    418    /*
    419     * From PoV of instruction scheduling, not execution (ie. ignores global/
    420     * local distinction):
    421     *                            shared  image  atomic  SSBO  everything
    422     *   barrier()/            -   R/W     R/W    R/W     R/W       X
    423     *     groupMemoryBarrier()
    424     *     memoryBarrier()
    425     *     (but only images declared coherent?)
    426     *   memoryBarrierAtomic() -                  R/W
    427     *   memoryBarrierBuffer() -                          R/W
    428     *   memoryBarrierImage()  -           R/W
    429     *   memoryBarrierShared() -   R/W
    430     *
    431     * TODO I think for SSBO/image/shared, in cases where we can determine
    432     * which variable is accessed, we don't need to care about accesses to
    433     * different variables (unless declared coherent??)
    434     */
    435    enum {
    436       IR3_BARRIER_EVERYTHING = 1 << 0,
    437       IR3_BARRIER_SHARED_R = 1 << 1,
    438       IR3_BARRIER_SHARED_W = 1 << 2,
    439       IR3_BARRIER_IMAGE_R = 1 << 3,
    440       IR3_BARRIER_IMAGE_W = 1 << 4,
    441       IR3_BARRIER_BUFFER_R = 1 << 5,
    442       IR3_BARRIER_BUFFER_W = 1 << 6,
    443       IR3_BARRIER_ARRAY_R = 1 << 7,
    444       IR3_BARRIER_ARRAY_W = 1 << 8,
    445       IR3_BARRIER_PRIVATE_R = 1 << 9,
    446       IR3_BARRIER_PRIVATE_W = 1 << 10,
    447    } barrier_class,
    448       barrier_conflict;
    449 
    450    /* Entry in ir3_block's instruction list: */
    451    struct list_head node;
    452 
    453    uint32_t serialno;
    454 
    455    // TODO only computerator/assembler:
    456    int line;
    457 };
    458 
    459 struct ir3 {
    460    struct ir3_compiler *compiler;
    461    gl_shader_stage type;
    462 
    463    DECLARE_ARRAY(struct ir3_instruction *, inputs);
    464 
    465    /* Track bary.f (and ldlv) instructions.. this is needed in
    466     * scheduling to ensure that all varying fetches happen before
    467     * any potential kill instructions.  The hw gets grumpy if all
    468     * threads in a group are killed before the last bary.f gets
    469     * a chance to signal end of input (ei).
    470     */
    471    DECLARE_ARRAY(struct ir3_instruction *, baryfs);
    472 
    473    /* Track all indirect instructions (read and write).  To avoid
    474     * deadlock scenario where an address register gets scheduled,
    475     * but other dependent src instructions cannot be scheduled due
    476     * to dependency on a *different* address register value, the
    477     * scheduler needs to ensure that all dependencies other than
    478     * the instruction other than the address register are scheduled
    479     * before the one that writes the address register.  Having a
    480     * convenient list of instructions that reference some address
    481     * register simplifies this.
    482     */
    483    DECLARE_ARRAY(struct ir3_instruction *, a0_users);
    484 
    485    /* same for a1.x: */
    486    DECLARE_ARRAY(struct ir3_instruction *, a1_users);
    487 
    488    /* and same for instructions that consume predicate register: */
    489    DECLARE_ARRAY(struct ir3_instruction *, predicates);
    490 
    491    /* Track texture sample instructions which need texture state
    492     * patched in (for astc-srgb workaround):
    493     */
    494    DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
    495 
    496    /* List of blocks: */
    497    struct list_head block_list;
    498 
    499    /* List of ir3_array's: */
    500    struct list_head array_list;
    501 
    502 #ifdef DEBUG
    503    unsigned block_count;
    504 #endif
    505    unsigned instr_count;
    506 };
    507 
    508 struct ir3_array {
    509    struct list_head node;
    510    unsigned length;
    511    unsigned id;
    512 
    513    struct nir_register *r;
    514 
    515    /* To avoid array write's from getting DCE'd, keep track of the
    516     * most recent write.  Any array access depends on the most
    517     * recent write.  This way, nothing depends on writes after the
    518     * last read.  But all the writes that happen before that have
    519     * something depending on them
    520     */
    521    struct ir3_register *last_write;
    522 
    523    /* extra stuff used in RA pass: */
    524    unsigned base; /* base vreg name */
    525    unsigned reg;  /* base physical reg */
    526    uint16_t start_ip, end_ip;
    527 
    528    /* Indicates if half-precision */
    529    bool half;
    530 
    531    bool unused;
    532 };
    533 
/* Find an array of 'ir' by its id.  Defined outside this header.
 * NOTE(review): behaviour for an unknown id is not visible here.
 */
struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
    535 
/* Kind of condition used to choose between a block's two successors
 * (see ir3_block::brtype).
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
};
    542 
    543 struct ir3_block {
    544    struct list_head node;
    545    struct ir3 *shader;
    546 
    547    const struct nir_block *nblock;
    548 
    549    struct list_head instr_list; /* list of ir3_instruction */
    550 
    551    /* The actual branch condition, if there are two successors */
    552    enum ir3_branch_type brtype;
    553 
    554    /* each block has either one or two successors.. in case of two
    555     * successors, 'condition' decides which one to follow.  A block preceding
    556     * an if/else has two successors.
    557     *
    558     * In some cases the path that the machine actually takes through the
    559     * program may not match the per-thread view of the CFG. In particular
    560     * this is the case for if/else, where the machine jumps from the end of
    561     * the if to the beginning of the else and switches active lanes. While
    562     * most things only care about the per-thread view, we need to use the
    563     * "physical" view when allocating shared registers. "successors" contains
    564     * the per-thread successors, and "physical_successors" contains the
    565     * physical successors which includes the fallthrough edge from the if to
    566     * the else.
    567     */
    568    struct ir3_instruction *condition;
    569    struct ir3_block *successors[2];
    570    struct ir3_block *physical_successors[2];
    571 
    572    DECLARE_ARRAY(struct ir3_block *, predecessors);
    573    DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
    574 
    575    uint16_t start_ip, end_ip;
    576 
    577    /* Track instructions which do not write a register but other-
    578     * wise must not be discarded (such as kill, stg, etc)
    579     */
    580    DECLARE_ARRAY(struct ir3_instruction *, keeps);
    581 
    582    /* used for per-pass extra block data.  Mainly used right
    583     * now in RA step to track livein/liveout.
    584     */
    585    void *data;
    586 
    587    uint32_t index;
    588 
    589    struct ir3_block *imm_dom;
    590    DECLARE_ARRAY(struct ir3_block *, dom_children);
    591 
    592    uint32_t dom_pre_index;
    593    uint32_t dom_post_index;
    594 
    595    uint32_t loop_id;
    596    uint32_t loop_depth;
    597 
    598 #ifdef DEBUG
    599    uint32_t serialno;
    600 #endif
    601 };
    602 
/* Return a 32-bit identifier for 'block'.
 *
 * In DEBUG builds this is the block's serial number.  Otherwise it is the
 * block's address truncated to 32 bits; the conversion goes through
 * uintptr_t because 'unsigned long' is narrower than a pointer on LLP64
 * targets.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   return (uint32_t)(uintptr_t)block;
#endif
}
    612 
    613 static inline struct ir3_block *
    614 ir3_start_block(struct ir3 *ir)
    615 {
    616    return list_first_entry(&ir->block_list, struct ir3_block, node);
    617 }
    618 
    619 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
    620 void ir3_block_add_physical_predecessor(struct ir3_block *block,
    621                                         struct ir3_block *pred);
    622 void ir3_block_remove_predecessor(struct ir3_block *block,
    623                                   struct ir3_block *pred);
    624 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
    625                                            struct ir3_block *pred);
    626 unsigned ir3_block_get_pred_index(struct ir3_block *block,
    627                                   struct ir3_block *pred);
    628 
    629 void ir3_calc_dominance(struct ir3 *ir);
    630 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
    631 
    632 struct ir3_shader_variant;
    633 
    634 struct ir3 *ir3_create(struct ir3_compiler *compiler,
    635                        struct ir3_shader_variant *v);
    636 void ir3_destroy(struct ir3 *shader);
    637 
    638 void ir3_collect_info(struct ir3_shader_variant *v);
    639 void *ir3_alloc(struct ir3 *shader, int sz);
    640 
    641 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
    642                                          unsigned reg_count,
    643                                          bool double_threadsize);
    644 
    645 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
    646                                            bool double_threadsize);
    647 
    648 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
    649                                   unsigned regs_count);
    650 
    651 struct ir3_block *ir3_block_create(struct ir3 *shader);
    652 
    653 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
    654                                          int ndst, int nsrc);
    655 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
    656 void ir3_instr_add_dep(struct ir3_instruction *instr,
    657                        struct ir3_instruction *dep);
    658 const char *ir3_instr_name(struct ir3_instruction *instr);
    659 
    660 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
    661                                     int flags);
    662 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
    663                                     int flags);
    664 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
    665                                    struct ir3_register *reg);
    666 
    667 static inline void
    668 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
    669 {
    670    assert(!dst->tied && !src->tied);
    671    dst->tied = src;
    672    src->tied = dst;
    673 }
    674 
/* Defined outside this header. */
void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

/* Setter for ir3_instruction::address, which must not be written directly
 * (see the note on that field).  Defined outside this header.
 */
void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);
    681 
    682 static inline bool
    683 ir3_instr_check_mark(struct ir3_instruction *instr)
    684 {
    685    if (instr->flags & IR3_INSTR_MARK)
    686       return true; /* already visited */
    687    instr->flags |= IR3_INSTR_MARK;
    688    return false;
    689 }
    690 
/* Defined outside this header.
 * NOTE(review): presumably these clear IR3_INSTR_MARK per block / per
 * shader and number the instructions; confirm against the definitions.
 */
void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);
    696 
    697 /**
    698  * Move 'instr' to just before 'after'
    699  */
    700 static inline void
    701 ir3_instr_move_before(struct ir3_instruction *instr,
    702                       struct ir3_instruction *after)
    703 {
    704    list_delinit(&instr->node);
    705    list_addtail(&instr->node, &after->node);
    706 }
    707 
    708 /**
    709  * Move 'instr' to just after 'before':
    710  */
    711 static inline void
    712 ir3_instr_move_after(struct ir3_instruction *instr,
    713                      struct ir3_instruction *before)
    714 {
    715    list_delinit(&instr->node);
    716    list_add(&instr->node, &before->node);
    717 }
    718 
    719 /**
    720  * Move 'instr' to the beginning of the block:
    721  */
    722 static inline void
    723 ir3_instr_move_before_block(struct ir3_instruction *instr,
    724                             struct ir3_block *block)
    725 {
    726    list_delinit(&instr->node);
    727    list_add(&instr->node, &block->instr_list);
    728 }
    729 
/* Populates ir3_instruction::uses (see the note on that field).
 * Defined outside this header.
 */
void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);

/* The remaining prototypes are defined outside this header. */
void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

int ir3_flut(struct ir3_register *src_reg);

bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
    740 
    741 #include "util/set.h"
    742 #define foreach_ssa_use(__use, __instr)                                        \
    743    for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
    744         __use = NULL)                                                          \
    745       set_foreach ((__instr)->uses, __entry)                                   \
    746          if ((__use = (void *)__entry->key))
    747 
    748 static inline uint32_t
    749 reg_num(const struct ir3_register *reg)
    750 {
    751    return reg->num >> 2;
    752 }
    753 
    754 static inline uint32_t
    755 reg_comp(const struct ir3_register *reg)
    756 {
    757    return reg->num & 0x3;
    758 }
    759 
    760 static inline bool
    761 is_flow(struct ir3_instruction *instr)
    762 {
    763    return (opc_cat(instr->opc) == 0);
    764 }
    765 
    766 static inline bool
    767 is_kill_or_demote(struct ir3_instruction *instr)
    768 {
    769    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
    770 }
    771 
    772 static inline bool
    773 is_nop(struct ir3_instruction *instr)
    774 {
    775    return instr->opc == OPC_NOP;
    776 }
    777 
    778 static inline bool
    779 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
    780 {
    781    unsigned dst_type = (dst->flags & IR3_REG_HALF);
    782    unsigned src_type = (src->flags & IR3_REG_HALF);
    783 
    784    /* Treat shared->normal copies as same-type, because they can generally be
    785     * folded, but not normal->shared copies.
    786     */
    787    if (dst_type != src_type ||
    788        ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
    789       return false;
    790    else
    791       return true;
    792 }
    793 
    794 /* Is it a non-transformative (ie. not type changing) mov?  This can
    795  * also include absneg.s/absneg.f, which for the most part can be
    796  * treated as a mov (single src argument).
    797  */
    798 static inline bool
    799 is_same_type_mov(struct ir3_instruction *instr)
    800 {
    801    struct ir3_register *dst;
    802 
    803    switch (instr->opc) {
    804    case OPC_MOV:
    805       if (instr->cat1.src_type != instr->cat1.dst_type)
    806          return false;
    807       /* If the type of dest reg and src reg are different,
    808        * it shouldn't be considered as same type mov
    809        */
    810       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
    811          return false;
    812       break;
    813    case OPC_ABSNEG_F:
    814    case OPC_ABSNEG_S:
    815       if (instr->flags & IR3_INSTR_SAT)
    816          return false;
    817       /* If the type of dest reg and src reg are different,
    818        * it shouldn't be considered as same type mov
    819        */
    820       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
    821          return false;
    822       break;
    823    case OPC_META_PHI:
    824       return instr->srcs_count == 1;
    825    default:
    826       return false;
    827    }
    828 
    829    dst = instr->dsts[0];
    830 
    831    /* mov's that write to a0 or p0.x are special: */
    832    if (dst->num == regid(REG_P0, 0))
    833       return false;
    834    if (reg_num(dst) == REG_A0)
    835       return false;
    836 
    837    if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
    838       return false;
    839 
    840    return true;
    841 }
    842 
    843 /* A move from const, which changes size but not type, can also be
    844  * folded into dest instruction in some cases.
    845  */
    846 static inline bool
    847 is_const_mov(struct ir3_instruction *instr)
    848 {
    849    if (instr->opc != OPC_MOV)
    850       return false;
    851 
    852    if (!(instr->srcs[0]->flags & IR3_REG_CONST))
    853       return false;
    854 
    855    type_t src_type = instr->cat1.src_type;
    856    type_t dst_type = instr->cat1.dst_type;
    857 
    858    return (type_float(src_type) && type_float(dst_type)) ||
    859           (type_uint(src_type) && type_uint(dst_type)) ||
    860           (type_sint(src_type) && type_sint(dst_type));
    861 }
    862 
    863 static inline bool
    864 is_alu(struct ir3_instruction *instr)
    865 {
    866    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
    867 }
    868 
    869 static inline bool
    870 is_sfu(struct ir3_instruction *instr)
    871 {
    872    return (opc_cat(instr->opc) == 4);
    873 }
    874 
    875 static inline bool
    876 is_tex(struct ir3_instruction *instr)
    877 {
    878    return (opc_cat(instr->opc) == 5);
    879 }
    880 
    881 static inline bool
    882 is_tex_or_prefetch(struct ir3_instruction *instr)
    883 {
    884    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
    885 }
    886 
    887 static inline bool
    888 is_mem(struct ir3_instruction *instr)
    889 {
    890    return (opc_cat(instr->opc) == 6);
    891 }
    892 
    893 static inline bool
    894 is_barrier(struct ir3_instruction *instr)
    895 {
    896    return (opc_cat(instr->opc) == 7);
    897 }
    898 
    899 static inline bool
    900 is_half(struct ir3_instruction *instr)
    901 {
    902    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
    903 }
    904 
    905 static inline bool
    906 is_shared(struct ir3_instruction *instr)
    907 {
    908    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
    909 }
    910 
    911 static inline bool
    912 is_store(struct ir3_instruction *instr)
    913 {
    914    /* these instructions, the "destination" register is
    915     * actually a source, the address to store to.
    916     */
    917    switch (instr->opc) {
    918    case OPC_STG:
    919    case OPC_STG_A:
    920    case OPC_STGB:
    921    case OPC_STIB:
    922    case OPC_STP:
    923    case OPC_STL:
    924    case OPC_STLW:
    925    case OPC_L2G:
    926    case OPC_G2L:
    927       return true;
    928    default:
    929       return false;
    930    }
    931 }
    932 
    933 static inline bool
    934 is_load(struct ir3_instruction *instr)
    935 {
    936    switch (instr->opc) {
    937    case OPC_LDG:
    938    case OPC_LDG_A:
    939    case OPC_LDGB:
    940    case OPC_LDIB:
    941    case OPC_LDL:
    942    case OPC_LDP:
    943    case OPC_L2G:
    944    case OPC_LDLW:
    945    case OPC_LDC:
    946    case OPC_LDLV:
    947       /* probably some others too.. */
    948       return true;
    949    default:
    950       return false;
    951    }
    952 }
    953 
    954 static inline bool
    955 is_input(struct ir3_instruction *instr)
    956 {
    957    /* in some cases, ldlv is used to fetch varying without
    958     * interpolation.. fortunately inloc is the first src
    959     * register in either case
    960     */
    961    switch (instr->opc) {
    962    case OPC_LDLV:
    963    case OPC_BARY_F:
    964       return true;
    965    default:
    966       return false;
    967    }
    968 }
    969 
    970 static inline bool
    971 is_bool(struct ir3_instruction *instr)
    972 {
    973    switch (instr->opc) {
    974    case OPC_CMPS_F:
    975    case OPC_CMPS_S:
    976    case OPC_CMPS_U:
    977       return true;
    978    default:
    979       return false;
    980    }
    981 }
    982 
    983 static inline opc_t
    984 cat3_half_opc(opc_t opc)
    985 {
    986    switch (opc) {
    987    case OPC_MAD_F32:
    988       return OPC_MAD_F16;
    989    case OPC_SEL_B32:
    990       return OPC_SEL_B16;
    991    case OPC_SEL_S32:
    992       return OPC_SEL_S16;
    993    case OPC_SEL_F32:
    994       return OPC_SEL_F16;
    995    case OPC_SAD_S32:
    996       return OPC_SAD_S16;
    997    default:
    998       return opc;
    999    }
   1000 }
   1001 
   1002 static inline opc_t
   1003 cat3_full_opc(opc_t opc)
   1004 {
   1005    switch (opc) {
   1006    case OPC_MAD_F16:
   1007       return OPC_MAD_F32;
   1008    case OPC_SEL_B16:
   1009       return OPC_SEL_B32;
   1010    case OPC_SEL_S16:
   1011       return OPC_SEL_S32;
   1012    case OPC_SEL_F16:
   1013       return OPC_SEL_F32;
   1014    case OPC_SAD_S16:
   1015       return OPC_SAD_S32;
   1016    default:
   1017       return opc;
   1018    }
   1019 }
   1020 
   1021 static inline opc_t
   1022 cat4_half_opc(opc_t opc)
   1023 {
   1024    switch (opc) {
   1025    case OPC_RSQ:
   1026       return OPC_HRSQ;
   1027    case OPC_LOG2:
   1028       return OPC_HLOG2;
   1029    case OPC_EXP2:
   1030       return OPC_HEXP2;
   1031    default:
   1032       return opc;
   1033    }
   1034 }
   1035 
   1036 static inline opc_t
   1037 cat4_full_opc(opc_t opc)
   1038 {
   1039    switch (opc) {
   1040    case OPC_HRSQ:
   1041       return OPC_RSQ;
   1042    case OPC_HLOG2:
   1043       return OPC_LOG2;
   1044    case OPC_HEXP2:
   1045       return OPC_EXP2;
   1046    default:
   1047       return opc;
   1048    }
   1049 }
   1050 
   1051 static inline bool
   1052 is_meta(struct ir3_instruction *instr)
   1053 {
   1054    return (opc_cat(instr->opc) == -1);
   1055 }
   1056 
   1057 static inline unsigned
   1058 reg_elems(const struct ir3_register *reg)
   1059 {
   1060    if (reg->flags & IR3_REG_ARRAY)
   1061       return reg->size;
   1062    else
   1063       return util_last_bit(reg->wrmask);
   1064 }
   1065 
   1066 static inline unsigned
   1067 reg_elem_size(const struct ir3_register *reg)
   1068 {
   1069    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
   1070 }
   1071 
/* Total register size: element count times per-element size. */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   const unsigned count = reg_elems(reg);
   const unsigned each = reg_elem_size(reg);

   return count * each;
}
   1077 
   1078 static inline unsigned
   1079 dest_regs(struct ir3_instruction *instr)
   1080 {
   1081    if (instr->dsts_count == 0)
   1082       return 0;
   1083 
   1084    debug_assert(instr->dsts_count == 1);
   1085    return util_last_bit(instr->dsts[0]->wrmask);
   1086 }
   1087 
   1088 /* is dst a normal temp register: */
   1089 static inline bool
   1090 is_dest_gpr(struct ir3_register *dst)
   1091 {
   1092    if (dst->wrmask == 0)
   1093       return false;
   1094    if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
   1095       return false;
   1096    return true;
   1097 }
   1098 
   1099 static inline bool
   1100 writes_gpr(struct ir3_instruction *instr)
   1101 {
   1102    if (dest_regs(instr) == 0)
   1103       return false;
   1104    return is_dest_gpr(instr->dsts[0]);
   1105 }
   1106 
   1107 static inline bool
   1108 writes_addr0(struct ir3_instruction *instr)
   1109 {
   1110    /* Note: only the first dest can write to a0.x */
   1111    if (instr->dsts_count > 0) {
   1112       struct ir3_register *dst = instr->dsts[0];
   1113       return dst->num == regid(REG_A0, 0);
   1114    }
   1115    return false;
   1116 }
   1117 
   1118 static inline bool
   1119 writes_addr1(struct ir3_instruction *instr)
   1120 {
   1121    /* Note: only the first dest can write to a1.x */
   1122    if (instr->dsts_count > 0) {
   1123       struct ir3_register *dst = instr->dsts[0];
   1124       return dst->num == regid(REG_A0, 1);
   1125    }
   1126    return false;
   1127 }
   1128 
   1129 static inline bool
   1130 writes_pred(struct ir3_instruction *instr)
   1131 {
   1132    /* Note: only the first dest can write to p0.x */
   1133    if (instr->dsts_count > 0) {
   1134       struct ir3_register *dst = instr->dsts[0];
   1135       return reg_num(dst) == REG_P0;
   1136    }
   1137    return false;
   1138 }
   1139 
   1140 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
   1141  * are considered special here. Special registers are always accessed with one
   1142  * size and never alias normal registers, even though a naive calculation
   1143  * would sometimes make it seem like e.g. r30.z aliases a0.x.
   1144  */
   1145 static inline bool
   1146 is_reg_special(const struct ir3_register *reg)
   1147 {
   1148    return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
   1149           (reg_num(reg) == REG_P0);
   1150 }
   1151 
/* Variant of is_reg_special() for when only a packed register number is
 * available: r48.x and above are shared/special.
 */
static inline bool
is_reg_num_special(unsigned num)
{
   /* packed number of r48.x: four components per register */
   const unsigned first_special = 48 * 4;

   return !(num < first_special);
}
   1160 
   1161 /* returns defining instruction for reg */
   1162 /* TODO better name */
   1163 static inline struct ir3_instruction *
   1164 ssa(struct ir3_register *reg)
   1165 {
   1166    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
   1167       return reg->def->instr;
   1168    return NULL;
   1169 }
   1170 
   1171 static inline bool
   1172 conflicts(struct ir3_register *a, struct ir3_register *b)
   1173 {
   1174    return (a && b) && (a->def != b->def);
   1175 }
   1176 
   1177 static inline bool
   1178 reg_gpr(struct ir3_register *r)
   1179 {
   1180    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
   1181       return false;
   1182    if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
   1183       return false;
   1184    return true;
   1185 }
   1186 
   1187 static inline type_t
   1188 half_type(type_t type)
   1189 {
   1190    switch (type) {
   1191    case TYPE_F32:
   1192       return TYPE_F16;
   1193    case TYPE_U32:
   1194       return TYPE_U16;
   1195    case TYPE_S32:
   1196       return TYPE_S16;
   1197    case TYPE_F16:
   1198    case TYPE_U16:
   1199    case TYPE_S16:
   1200       return type;
   1201    default:
   1202       assert(0);
   1203       return ~0;
   1204    }
   1205 }
   1206 
   1207 static inline type_t
   1208 full_type(type_t type)
   1209 {
   1210    switch (type) {
   1211    case TYPE_F16:
   1212       return TYPE_F32;
   1213    case TYPE_U16:
   1214       return TYPE_U32;
   1215    case TYPE_S16:
   1216       return TYPE_S32;
   1217    case TYPE_F32:
   1218    case TYPE_U32:
   1219    case TYPE_S32:
   1220       return type;
   1221    default:
   1222       assert(0);
   1223       return ~0;
   1224    }
   1225 }
   1226 
   1227 /* some cat2 instructions (ie. those which are not float) can embed an
   1228  * immediate:
   1229  */
   1230 static inline bool
   1231 ir3_cat2_int(opc_t opc)
   1232 {
   1233    switch (opc) {
   1234    case OPC_ADD_U:
   1235    case OPC_ADD_S:
   1236    case OPC_SUB_U:
   1237    case OPC_SUB_S:
   1238    case OPC_CMPS_U:
   1239    case OPC_CMPS_S:
   1240    case OPC_MIN_U:
   1241    case OPC_MIN_S:
   1242    case OPC_MAX_U:
   1243    case OPC_MAX_S:
   1244    case OPC_CMPV_U:
   1245    case OPC_CMPV_S:
   1246    case OPC_MUL_U24:
   1247    case OPC_MUL_S24:
   1248    case OPC_MULL_U:
   1249    case OPC_CLZ_S:
   1250    case OPC_ABSNEG_S:
   1251    case OPC_AND_B:
   1252    case OPC_OR_B:
   1253    case OPC_NOT_B:
   1254    case OPC_XOR_B:
   1255    case OPC_BFREV_B:
   1256    case OPC_CLZ_B:
   1257    case OPC_SHL_B:
   1258    case OPC_SHR_B:
   1259    case OPC_ASHR_B:
   1260    case OPC_MGEN_B:
   1261    case OPC_GETBIT_B:
   1262    case OPC_CBITS_B:
   1263    case OPC_BARY_F:
   1264       return true;
   1265 
   1266    default:
   1267       return false;
   1268    }
   1269 }
   1270 
   1271 /* map cat2 instruction to valid abs/neg flags: */
   1272 static inline unsigned
   1273 ir3_cat2_absneg(opc_t opc)
   1274 {
   1275    switch (opc) {
   1276    case OPC_ADD_F:
   1277    case OPC_MIN_F:
   1278    case OPC_MAX_F:
   1279    case OPC_MUL_F:
   1280    case OPC_SIGN_F:
   1281    case OPC_CMPS_F:
   1282    case OPC_ABSNEG_F:
   1283    case OPC_CMPV_F:
   1284    case OPC_FLOOR_F:
   1285    case OPC_CEIL_F:
   1286    case OPC_RNDNE_F:
   1287    case OPC_RNDAZ_F:
   1288    case OPC_TRUNC_F:
   1289    case OPC_BARY_F:
   1290       return IR3_REG_FABS | IR3_REG_FNEG;
   1291 
   1292    case OPC_ADD_U:
   1293    case OPC_ADD_S:
   1294    case OPC_SUB_U:
   1295    case OPC_SUB_S:
   1296    case OPC_CMPS_U:
   1297    case OPC_CMPS_S:
   1298    case OPC_MIN_U:
   1299    case OPC_MIN_S:
   1300    case OPC_MAX_U:
   1301    case OPC_MAX_S:
   1302    case OPC_CMPV_U:
   1303    case OPC_CMPV_S:
   1304    case OPC_MUL_U24:
   1305    case OPC_MUL_S24:
   1306    case OPC_MULL_U:
   1307    case OPC_CLZ_S:
   1308       return 0;
   1309 
   1310    case OPC_ABSNEG_S:
   1311       return IR3_REG_SABS | IR3_REG_SNEG;
   1312 
   1313    case OPC_AND_B:
   1314    case OPC_OR_B:
   1315    case OPC_NOT_B:
   1316    case OPC_XOR_B:
   1317    case OPC_BFREV_B:
   1318    case OPC_CLZ_B:
   1319    case OPC_SHL_B:
   1320    case OPC_SHR_B:
   1321    case OPC_ASHR_B:
   1322    case OPC_MGEN_B:
   1323    case OPC_GETBIT_B:
   1324    case OPC_CBITS_B:
   1325       return IR3_REG_BNOT;
   1326 
   1327    default:
   1328       return 0;
   1329    }
   1330 }
   1331 
   1332 /* map cat3 instructions to valid abs/neg flags: */
   1333 static inline unsigned
   1334 ir3_cat3_absneg(opc_t opc)
   1335 {
   1336    switch (opc) {
   1337    case OPC_MAD_F16:
   1338    case OPC_MAD_F32:
   1339    case OPC_SEL_F16:
   1340    case OPC_SEL_F32:
   1341       return IR3_REG_FNEG;
   1342 
   1343    case OPC_MAD_U16:
   1344    case OPC_MADSH_U16:
   1345    case OPC_MAD_S16:
   1346    case OPC_MADSH_M16:
   1347    case OPC_MAD_U24:
   1348    case OPC_MAD_S24:
   1349    case OPC_SEL_S16:
   1350    case OPC_SEL_S32:
   1351    case OPC_SAD_S16:
   1352    case OPC_SAD_S32:
   1353       /* neg *may* work on 3rd src.. */
   1354 
   1355    case OPC_SEL_B16:
   1356    case OPC_SEL_B32:
   1357 
   1358    case OPC_SHLG_B16:
   1359 
   1360    default:
   1361       return 0;
   1362    }
   1363 }
   1364 
   1365 /* Return the type (float, int, or uint) the op uses when converting from the
   1366  * internal result of the op (which is assumed to be the same size as the
   1367  * sources) to the destination when they are not the same size. If F32 it does
   1368  * a floating-point conversion, if U32 it does a truncation/zero-extension, if
   1369  * S32 it does a truncation/sign-extension. "can_fold" will be false if it
   1370  * doesn't do anything sensible or is unknown.
   1371  */
   1372 static inline type_t
   1373 ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
   1374 {
   1375    *can_fold = true;
   1376    switch (instr->opc) {
   1377    case OPC_ADD_F:
   1378    case OPC_MUL_F:
   1379    case OPC_BARY_F:
   1380    case OPC_MAD_F32:
   1381    case OPC_MAD_F16:
   1382       return TYPE_F32;
   1383 
   1384    case OPC_ADD_U:
   1385    case OPC_SUB_U:
   1386    case OPC_MIN_U:
   1387    case OPC_MAX_U:
   1388    case OPC_AND_B:
   1389    case OPC_OR_B:
   1390    case OPC_NOT_B:
   1391    case OPC_XOR_B:
   1392    case OPC_MUL_U24:
   1393    case OPC_MULL_U:
   1394    case OPC_SHL_B:
   1395    case OPC_SHR_B:
   1396    case OPC_ASHR_B:
   1397    case OPC_MAD_U24:
   1398    /* Comparison ops zero-extend/truncate their results, so consider them as
   1399     * unsigned here.
   1400     */
   1401    case OPC_CMPS_F:
   1402    case OPC_CMPV_F:
   1403    case OPC_CMPS_U:
   1404    case OPC_CMPS_S:
   1405       return TYPE_U32;
   1406 
   1407    case OPC_ADD_S:
   1408    case OPC_SUB_S:
   1409    case OPC_MIN_S:
   1410    case OPC_MAX_S:
   1411    case OPC_ABSNEG_S:
   1412    case OPC_MUL_S24:
   1413    case OPC_MAD_S24:
   1414       return TYPE_S32;
   1415 
   1416    /* We assume that any move->move folding that could be done was done by
   1417     * NIR.
   1418     */
   1419    case OPC_MOV:
   1420    default:
   1421       *can_fold = false;
   1422       return TYPE_U32;
   1423    }
   1424 }
   1425 
   1426 /* Return the src and dst types for the conversion which is already folded
   1427  * into the op. We can assume that instr has folded in a conversion from
   1428  * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
   1429  * to call if ir3_output_conv_type() returns can_fold = true.
   1430  */
   1431 static inline type_t
   1432 ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
   1433 {
   1434    switch (instr->opc) {
   1435    case OPC_CMPS_F:
   1436    case OPC_CMPV_F:
   1437    case OPC_CMPS_U:
   1438    case OPC_CMPS_S:
   1439       /* Comparisons only return 0/1 and the size of the comparison sources
   1440        * is irrelevant, never consider them as having an output conversion
   1441        * by returning a type with the dest size here:
   1442        */
   1443       return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
   1444                                                     : full_type(base_type);
   1445 
   1446    case OPC_BARY_F:
   1447       /* bary.f doesn't have an explicit source, but we can assume here that
   1448        * the varying data it reads is in fp32.
   1449        *
   1450        * This may be fp16 on older gen's depending on some register
   1451        * settings, but it's probably not worth plumbing that through for a
   1452        * small improvement that NIR would hopefully handle for us anyway.
   1453        */
   1454       return TYPE_F32;
   1455 
   1456    default:
   1457       return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
   1458                                                     : full_type(base_type);
   1459    }
   1460 }
   1461 
   1462 static inline type_t
   1463 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
   1464 {
   1465    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
   1466                                                  : full_type(base_type);
   1467 }
   1468 
   1469 /* Some instructions have signed/unsigned variants which are identical except
   1470  * for whether the folded conversion sign-extends or zero-extends, and we can
   1471  * fold in a mismatching move by rewriting the opcode. Return the opcode to
   1472  * switch signedness, and whether one exists.
   1473  */
   1474 static inline opc_t
   1475 ir3_try_swap_signedness(opc_t opc, bool *can_swap)
   1476 {
   1477    switch (opc) {
   1478 #define PAIR(u, s)                                                             \
   1479    case OPC_##u:                                                               \
   1480       return OPC_##s;                                                          \
   1481    case OPC_##s:                                                               \
   1482       return OPC_##u;
   1483       PAIR(ADD_U, ADD_S)
   1484       PAIR(SUB_U, SUB_S)
   1485       /* Note: these are only identical when the sources are half, but that's
   1486        * the only case we call this function for anyway.
   1487        */
   1488       PAIR(MUL_U24, MUL_S24)
   1489 
   1490    default:
   1491       *can_swap = false;
   1492       return opc;
   1493    }
   1494 }
   1495 
/* Bitmask with the low n bits set.  Computed in unsigned arithmetic so that
 * n == 31 does not shift into the sign bit; n must be smaller than the width
 * of unsigned int.
 */
#define MASK(n) ((1u << (n)) - 1u)
   1497 
/* iterator for an instruction's sources (reg), also returns src #.  NULL
 * entries in srcs[] are skipped and the source count is read once up front:
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count != 0)                                             \
      for (struct ir3_register *__srcreg = (void *)~0; __srcreg != NULL;       \
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              ++__n)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]) != NULL)

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
   1509 
/* iterator for an instruction's destinations (reg), also returns dst #.
 * NULL entries in dsts[] are skipped and the destination count is read once
 * up front:
 */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count != 0)                                             \
      for (struct ir3_register *__dstreg = (void *)~0; __dstreg != NULL;       \
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              ++__n)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]) != NULL)

/* iterator for an instruction's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
   1521 
   1522 static inline unsigned
   1523 __ssa_src_cnt(struct ir3_instruction *instr)
   1524 {
   1525    return instr->srcs_count + instr->deps_count;
   1526 }
   1527 
   1528 static inline bool
   1529 __is_false_dep(struct ir3_instruction *instr, unsigned n)
   1530 {
   1531    if (n >= instr->srcs_count)
   1532       return true;
   1533    return false;
   1534 }
   1535 
   1536 static inline struct ir3_instruction **
   1537 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
   1538 {
   1539    if (__is_false_dep(instr, n))
   1540       return &instr->deps[n - instr->srcs_count];
   1541    if (ssa(instr->srcs[n]))
   1542       return &instr->srcs[n]->def->instr;
   1543    return NULL;
   1544 }
   1545 
/* iterator over pointers to an instruction's SSA sources (register sources
 * followed by false deps), also returns slot #.  Slots for which
 * __ssa_srcp_n() yields NULL are skipped:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp != NULL;          \
        __srcp = NULL)                                                         \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           ++__n)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)) != NULL)

/* same, without exposing the slot #: */
#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)
   1554 
/* iterator for an instruction's SSA sources (instr), also returns src #.
 * Slots whose instruction pointer is NULL are skipped:
 */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst != NULL;     \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp) != NULL)

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)
   1565 
/* iterators for shader inputs; NULL entries in inputs[] are skipped and
 * inputs_count is re-read on every iteration:
 */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr != NULL;     \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; ++__cnt)          \
         if ((__ininstr = (__ir)->inputs[__cnt]) != NULL)
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
   1573 
/* iterators for instructions: thin wrappers around the list_for_each_entry
 * family that walk struct ir3_instruction entries linked through their
 * `node` member.  The _rev, _safe and _from_safe variants forward to the
 * list macro of the same suffix.
 */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)
   1584 
/* iterators for blocks: wrappers around the list_for_each_entry family that
 * walk struct ir3_block entries linked through their `node` member.
 */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)
   1592 
/* iterators for arrays: wrappers around the list_for_each_entry family that
 * walk struct ir3_array entries linked through their `node` member.
 */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
   1598 
/* Run `pass` on `ir`, forwarding any extra arguments, and evaluate to the
 * bool it returned.  When the pass reports progress the IR is printed via
 * ir3_debug_print() with an "AFTER: <pass>" label and then re-checked with
 * ir3_validate().  Relies on the GNU statement-expression and
 * `##__VA_ARGS__` extensions.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
   1608 
/* validate (invoked by IR3_PASS after a pass reports progress): */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

/* same as ir3_print_instr() but directed at a caller-supplied log stream: */
struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delay_calc_prera(struct ir3_block *block,
                              struct ir3_instruction *instr);
unsigned ir3_delay_calc_postra(struct ir3_block *block,
                               struct ir3_instruction *instr, bool soft,
                               bool mergedregs);
unsigned ir3_delay_calc_exact(struct ir3_block *block,
                              struct ir3_instruction *instr, bool mergedregs);
void ir3_remove_nops(struct ir3 *ir);

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
bool ir3_cp_postsched(struct ir3 *ir);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

/* post-RA scheduling (presumably; inferred from the name): */
struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
   1666 
   1667 static inline bool
   1668 ir3_has_latency_to_hide(struct ir3 *ir)
   1669 {
   1670    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
   1671     * know the nature of the fragment shader.  Just assume it will have
   1672     * latency to hide:
   1673     */
   1674    if (ir->type != MESA_SHADER_FRAGMENT)
   1675       return true;
   1676 
   1677    foreach_block (block, &ir->block_list) {
   1678       foreach_instr (instr, &block->instr_list) {
   1679          if (is_tex_or_prefetch(instr))
   1680             return true;
   1681 
   1682          if (is_load(instr)) {
   1683             switch (instr->opc) {
   1684             case OPC_LDLV:
   1685             case OPC_LDL:
   1686             case OPC_LDLW:
   1687                break;
   1688             default:
   1689                return true;
   1690             }
   1691          }
   1692       }
   1693    }
   1694 
   1695    return false;
   1696 }
   1697 
   1698 /* ************************************************************************* */
   1699 /* instruction helpers */
   1700 
   1701 /* creates SSA src of correct type (ie. half vs full precision) */
   1702 static inline struct ir3_register *
   1703 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
   1704           unsigned flags)
   1705 {
   1706    struct ir3_register *reg;
   1707    if (src->dsts[0]->flags & IR3_REG_HALF)
   1708       flags |= IR3_REG_HALF;
   1709    reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   1710    reg->def = src->dsts[0];
   1711    reg->wrmask = src->dsts[0]->wrmask;
   1712    return reg;
   1713 }
   1714 
   1715 static inline struct ir3_register *
   1716 __ssa_dst(struct ir3_instruction *instr)
   1717 {
   1718    struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   1719    reg->instr = instr;
   1720    return reg;
   1721 }
   1722 
   1723 static inline struct ir3_instruction *
   1724 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
   1725 {
   1726    struct ir3_instruction *mov;
   1727    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
   1728 
   1729    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   1730    mov->cat1.src_type = type;
   1731    mov->cat1.dst_type = type;
   1732    __ssa_dst(mov)->flags |= flags;
   1733    ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
   1734 
   1735    return mov;
   1736 }
   1737 
   1738 static inline struct ir3_instruction *
   1739 create_immed(struct ir3_block *block, uint32_t val)
   1740 {
   1741    return create_immed_typed(block, val, TYPE_U32);
   1742 }
   1743 
   1744 static inline struct ir3_instruction *
   1745 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
   1746 {
   1747    struct ir3_instruction *mov;
   1748    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
   1749 
   1750    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   1751    mov->cat1.src_type = type;
   1752    mov->cat1.dst_type = type;
   1753    __ssa_dst(mov)->flags |= flags;
   1754    ir3_src_create(mov, n, IR3_REG_CONST | flags);
   1755 
   1756    return mov;
   1757 }
   1758 
   1759 static inline struct ir3_instruction *
   1760 create_uniform(struct ir3_block *block, unsigned n)
   1761 {
   1762    return create_uniform_typed(block, n, TYPE_F32);
   1763 }
   1764 
   1765 static inline struct ir3_instruction *
   1766 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
   1767                         struct ir3_instruction *address)
   1768 {
   1769    struct ir3_instruction *mov;
   1770 
   1771    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   1772    mov->cat1.src_type = type;
   1773    mov->cat1.dst_type = type;
   1774    __ssa_dst(mov);
   1775    ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
   1776 
   1777    ir3_instr_set_address(mov, address);
   1778 
   1779    return mov;
   1780 }
   1781 
   1782 static inline struct ir3_instruction *
   1783 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
   1784 {
   1785    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   1786    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
   1787 
   1788    __ssa_dst(instr)->flags |= flags;
   1789    if (src->dsts[0]->flags & IR3_REG_ARRAY) {
   1790       struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
   1791       src_reg->array = src->dsts[0]->array;
   1792    } else {
   1793       __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
   1794    }
   1795    debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   1796    instr->cat1.src_type = type;
   1797    instr->cat1.dst_type = type;
   1798    return instr;
   1799 }
   1800 
   1801 static inline struct ir3_instruction *
   1802 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
   1803         type_t dst_type)
   1804 {
   1805    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   1806    unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
   1807    unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
   1808 
   1809    debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
   1810 
   1811    __ssa_dst(instr)->flags |= dst_flags;
   1812    __ssa_src(instr, src, 0);
   1813    instr->cat1.src_type = src_type;
   1814    instr->cat1.dst_type = dst_type;
   1815    debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   1816    return instr;
   1817 }
   1818 
   1819 static inline struct ir3_instruction *
   1820 ir3_MOVMSK(struct ir3_block *block, unsigned components)
   1821 {
   1822    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
   1823 
   1824    struct ir3_register *dst = __ssa_dst(instr);
   1825    dst->flags |= IR3_REG_SHARED;
   1826    dst->wrmask = (1 << components) - 1;
   1827    instr->repeat = components - 1;
   1828    return instr;
   1829 }
   1830 
   1831 static inline struct ir3_instruction *
   1832 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
   1833                  unsigned components)
   1834 {
   1835    struct ir3_instruction *instr =
   1836       ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
   1837 
   1838    struct ir3_register *dst = __ssa_dst(instr);
   1839    dst->flags |= IR3_REG_SHARED;
   1840    dst->wrmask = (1 << components) - 1;
   1841 
   1842    __ssa_src(instr, src, 0);
   1843 
   1844    return instr;
   1845 }
   1846 
   1847 static inline struct ir3_instruction *
   1848 ir3_NOP(struct ir3_block *block)
   1849 {
   1850    return ir3_instr_create(block, OPC_NOP, 0, 0);
   1851 }
   1852 
   1853 #define IR3_INSTR_0 0
   1854 
   1855 /* clang-format off */
   1856 #define __INSTR0(flag, name, opc)                                              \
   1857 static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
   1858 {                                                                              \
   1859    struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   1860    instr->flags |= flag;                                                       \
   1861    return instr;                                                               \
   1862 }
   1863 /* clang-format on */
   1864 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
   1865 #define INSTR0(name)     __INSTR0(0, name, OPC_##name)
   1866 
   1867 /* clang-format off */
   1868 #define __INSTR1(flag, dst_count, name, opc)                                   \
   1869 static inline struct ir3_instruction *ir3_##name(                              \
   1870    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
   1871 {                                                                              \
   1872    struct ir3_instruction *instr =                                             \
   1873       ir3_instr_create(block, opc, dst_count, 1);                              \
   1874    for (unsigned i = 0; i < dst_count; i++)                                    \
   1875       __ssa_dst(instr);                                                        \
   1876    __ssa_src(instr, a, aflags);                                                \
   1877    instr->flags |= flag;                                                       \
   1878    return instr;                                                               \
   1879 }
   1880 /* clang-format on */
   1881 #define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
   1882 #define INSTR1(name)      __INSTR1(0, 1, name, OPC_##name)
   1883 #define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
   1884 
   1885 /* clang-format off */
   1886 #define __INSTR2(flag, name, opc)                                              \
   1887 static inline struct ir3_instruction *ir3_##name(                              \
   1888    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   1889    struct ir3_instruction *b, unsigned bflags)                                 \
   1890 {                                                                              \
   1891    struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2);         \
   1892    __ssa_dst(instr);                                                           \
   1893    __ssa_src(instr, a, aflags);                                                \
   1894    __ssa_src(instr, b, bflags);                                                \
   1895    instr->flags |= flag;                                                       \
   1896    return instr;                                                               \
   1897 }
   1898 /* clang-format on */
   1899 #define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
   1900 #define INSTR2(name)     __INSTR2(0, name, OPC_##name)
   1901 
   1902 /* clang-format off */
   1903 #define __INSTR3(flag, dst_count, name, opc)                                   \
   1904 static inline struct ir3_instruction *ir3_##name(                              \
   1905    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   1906    struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   1907    unsigned cflags)                                                            \
   1908 {                                                                              \
   1909    struct ir3_instruction *instr =                                             \
   1910       ir3_instr_create(block, opc, dst_count, 3);                              \
   1911    for (unsigned i = 0; i < dst_count; i++)                                    \
   1912       __ssa_dst(instr);                                                        \
   1913    __ssa_src(instr, a, aflags);                                                \
   1914    __ssa_src(instr, b, bflags);                                                \
   1915    __ssa_src(instr, c, cflags);                                                \
   1916    instr->flags |= flag;                                                       \
   1917    return instr;                                                               \
   1918 }
   1919 /* clang-format on */
   1920 #define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
   1921 #define INSTR3(name)      __INSTR3(0, 1, name, OPC_##name)
   1922 #define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
   1923 
   1924 /* clang-format off */
   1925 #define __INSTR4(flag, dst_count, name, opc)                                   \
   1926 static inline struct ir3_instruction *ir3_##name(                              \
   1927    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   1928    struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   1929    unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
   1930 {                                                                              \
   1931    struct ir3_instruction *instr =                                             \
   1932       ir3_instr_create(block, opc, dst_count, 4);                              \
   1933    for (unsigned i = 0; i < dst_count; i++)                                    \
   1934       __ssa_dst(instr);                                                        \
   1935    __ssa_src(instr, a, aflags);                                                \
   1936    __ssa_src(instr, b, bflags);                                                \
   1937    __ssa_src(instr, c, cflags);                                                \
   1938    __ssa_src(instr, d, dflags);                                                \
   1939    instr->flags |= flag;                                                       \
   1940    return instr;                                                               \
   1941 }
   1942 /* clang-format on */
   1943 #define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
   1944 #define INSTR4(name)      __INSTR4(0, 1, name, OPC_##name)
   1945 #define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
   1946 
   1947 /* clang-format off */
   1948 #define __INSTR5(flag, name, opc)                                              \
   1949 static inline struct ir3_instruction *ir3_##name(                              \
   1950    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   1951    struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   1952    unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   1953    struct ir3_instruction *e, unsigned eflags)                                 \
   1954 {                                                                              \
   1955    struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   1956    __ssa_dst(instr);                                                           \
   1957    __ssa_src(instr, a, aflags);                                                \
   1958    __ssa_src(instr, b, bflags);                                                \
   1959    __ssa_src(instr, c, cflags);                                                \
   1960    __ssa_src(instr, d, dflags);                                                \
   1961    __ssa_src(instr, e, eflags);                                                \
   1962    instr->flags |= flag;                                                       \
   1963    return instr;                                                               \
   1964 }
   1965 /* clang-format on */
   1966 #define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
   1967 #define INSTR5(name)     __INSTR5(0, name, OPC_##name)
   1968 
   1969 /* clang-format off */
   1970 #define __INSTR6(flag, dst_count, name, opc)                                   \
   1971 static inline struct ir3_instruction *ir3_##name(                              \
   1972    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   1973    struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   1974    unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   1975    struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   1976    unsigned fflags)                                                            \
   1977 {                                                                              \
   1978    struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 6);         \
   1979    for (unsigned i = 0; i < dst_count; i++)                                    \
   1980       __ssa_dst(instr);                                                        \
   1981    __ssa_src(instr, a, aflags);                                                \
   1982    __ssa_src(instr, b, bflags);                                                \
   1983    __ssa_src(instr, c, cflags);                                                \
   1984    __ssa_src(instr, d, dflags);                                                \
   1985    __ssa_src(instr, e, eflags);                                                \
   1986    __ssa_src(instr, f, fflags);                                                \
   1987    instr->flags |= flag;                                                       \
   1988    return instr;                                                               \
   1989 }
   1990 /* clang-format on */
   1991 #define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
   1992 #define INSTR6(name)      __INSTR6(0, 1, name, OPC_##name)
   1993 #define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
   1994 
   1995 /* cat0 instructions:
         *
         * Each line expands to a static inline ir3_<NAME>() builder that creates
         * an OPC_<NAME> instruction in the given block.  INSTR0 builders take
         * only the block and create neither sources nor a destination;
         * INSTR1NODST builders additionally take one SSA source (plus its flags)
         * and create no destination register.
         */
   1996 INSTR1NODST(B)
   1997 INSTR0(JUMP)
   1998 INSTR1NODST(KILL)
   1999 INSTR1NODST(DEMOTE)
   2000 INSTR0(END)
   2001 INSTR0(CHSH)
   2002 INSTR0(CHMASK)
   2003 INSTR1NODST(PREDT)
   2004 INSTR0(PREDF)
   2005 INSTR0(PREDE)
   2006 INSTR0(GETONE)
   2007 
   2008 /* cat1 macros: INSTR1/INSTR2 builders create one SSA destination and take
         * one or two SSA sources (each followed by its flags) respectively.
         */
   2009 INSTR1(ANY_MACRO)
   2010 INSTR1(ALL_MACRO)
   2011 INSTR1(READ_FIRST_MACRO)
   2012 INSTR2(READ_COND_MACRO)
   2013 
   2014 static inline struct ir3_instruction *
   2015 ir3_ELECT_MACRO(struct ir3_block *block)
   2016 {
   2017    struct ir3_instruction *instr =
   2018       ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
   2019    __ssa_dst(instr);
   2020    return instr;
   2021 }
   2022 
   2023 /* cat2 instructions, most 2 src but some 1 src:
         *
         * Each line expands to a static inline ir3_<NAME>() builder for
         * OPC_<NAME> with one SSA destination.  INSTR1 builders take
         * (block, a, aflags); INSTR2 builders take (block, a, aflags, b, bflags).
         */
   2024 INSTR2(ADD_F)
   2025 INSTR2(MIN_F)
   2026 INSTR2(MAX_F)
   2027 INSTR2(MUL_F)
   2028 INSTR1(SIGN_F)
   2029 INSTR2(CMPS_F)
   2030 INSTR1(ABSNEG_F)
   2031 INSTR2(CMPV_F)
   2032 INSTR1(FLOOR_F)
   2033 INSTR1(CEIL_F)
   2034 INSTR1(RNDNE_F)
   2035 INSTR1(RNDAZ_F)
   2036 INSTR1(TRUNC_F)
   2037 INSTR2(ADD_U)
   2038 INSTR2(ADD_S)
   2039 INSTR2(SUB_U)
   2040 INSTR2(SUB_S)
   2041 INSTR2(CMPS_U)
   2042 INSTR2(CMPS_S)
   2043 INSTR2(MIN_U)
   2044 INSTR2(MIN_S)
   2045 INSTR2(MAX_U)
   2046 INSTR2(MAX_S)
   2047 INSTR1(ABSNEG_S)
   2048 INSTR2(AND_B)
   2049 INSTR2(OR_B)
   2050 INSTR1(NOT_B)
   2051 INSTR2(XOR_B)
   2052 INSTR2(CMPV_U)
   2053 INSTR2(CMPV_S)
   2054 INSTR2(MUL_U24)
   2055 INSTR2(MUL_S24)
   2056 INSTR2(MULL_U)
   2057 INSTR1(BFREV_B)
   2058 INSTR1(CLZ_S)
   2059 INSTR1(CLZ_B)
   2060 INSTR2(SHL_B)
   2061 INSTR2(SHR_B)
   2062 INSTR2(ASHR_B)
   2063 INSTR2(BARY_F)
   2064 INSTR2(MGEN_B)
   2065 INSTR2(GETBIT_B)
   2066 INSTR1(SETRM)
   2067 INSTR1(CBITS_B)
   2068 INSTR2(SHB)
   2069 INSTR2(MSAD)
   2070 
   2071 /* cat3 instructions:
         *
         * Each line expands to a static inline ir3_<NAME>() builder for
         * OPC_<NAME> taking (block, a, aflags, b, bflags, c, cflags) and creating
         * one SSA destination plus three SSA sources in that order.
         */
   2072 INSTR3(MAD_U16)
   2073 INSTR3(MADSH_U16)
   2074 INSTR3(MAD_S16)
   2075 INSTR3(MADSH_M16)
   2076 INSTR3(MAD_U24)
   2077 INSTR3(MAD_S24)
   2078 INSTR3(MAD_F16)
   2079 INSTR3(MAD_F32)
   2080 /* NOTE: SEL_B32 checks for zero vs nonzero */
   2081 INSTR3(SEL_B16)
   2082 INSTR3(SEL_B32)
   2083 INSTR3(SEL_S16)
   2084 INSTR3(SEL_S32)
   2085 INSTR3(SEL_F16)
   2086 INSTR3(SEL_F32)
   2087 INSTR3(SAD_S16)
   2088 INSTR3(SAD_S32)
   2089 
   2090 /* cat4 instructions:
         *
         * Each line expands to a static inline ir3_<NAME>() builder for
         * OPC_<NAME> taking (block, a, aflags) and creating one SSA destination
         * plus one SSA source.
         */
   2091 INSTR1(RCP)
   2092 INSTR1(RSQ)
   2093 INSTR1(HRSQ)
   2094 INSTR1(LOG2)
   2095 INSTR1(HLOG2)
   2096 INSTR1(EXP2)
   2097 INSTR1(HEXP2)
   2098 INSTR1(SIN)
   2099 INSTR1(COS)
   2100 INSTR1(SQRT)
   2101 
   2102 /* cat5 instructions:
         *
         * Each line expands to a static inline builder taking (block, a, aflags)
         * and creating one SSA destination plus one SSA source.  The
         * INSTR1F(3D, X) lines define ir3_X_3D(), which builds the same OPC_X
         * instruction with IR3_INSTR_3D set in its flags.
         */
   2103 INSTR1(DSX)
   2104 INSTR1(DSXPP_MACRO)
   2105 INSTR1(DSY)
   2106 INSTR1(DSYPP_MACRO)
   2107 INSTR1F(3D, DSX)
   2108 INSTR1F(3D, DSY)
   2109 INSTR1(RGETPOS)
   2110 
   2111 static inline struct ir3_instruction *
   2112 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
   2113         unsigned flags, struct ir3_instruction *samp_tex,
   2114         struct ir3_instruction *src0, struct ir3_instruction *src1)
   2115 {
   2116    struct ir3_instruction *sam;
   2117    unsigned nreg = 0;
   2118 
   2119    if (flags & IR3_INSTR_S2EN) {
   2120       nreg++;
   2121    }
   2122    if (src0) {
   2123       nreg++;
   2124    }
   2125    if (src1) {
   2126       nreg++;
   2127    }
   2128 
   2129    sam = ir3_instr_create(block, opc, 1, nreg);
   2130    sam->flags |= flags;
   2131    __ssa_dst(sam)->wrmask = wrmask;
   2132    if (flags & IR3_INSTR_S2EN) {
   2133       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
   2134    }
   2135    if (src0) {
   2136       __ssa_src(sam, src0, 0);
   2137    }
   2138    if (src1) {
   2139       __ssa_src(sam, src1, 0);
   2140    }
   2141    sam->cat5.type = type;
   2142 
   2143    return sam;
   2144 }
   2145 
   2146 /* cat6 instructions: */
   2147 INSTR2(LDLV)
   2148 INSTR3(LDG)
   2149 INSTR3(LDL)
   2150 INSTR3(LDLW)
   2151 INSTR3(LDP)
   2152 INSTR4NODST(STG)
   2153 INSTR3NODST(STL)
   2154 INSTR3NODST(STLW)
   2155 INSTR3NODST(STP)
   2156 INSTR1(RESINFO)
   2157 INSTR1(RESFMT)
   2158 INSTR2(ATOMIC_ADD)
   2159 INSTR2(ATOMIC_SUB)
   2160 INSTR2(ATOMIC_XCHG)
   2161 INSTR2(ATOMIC_INC)
   2162 INSTR2(ATOMIC_DEC)
   2163 INSTR2(ATOMIC_CMPXCHG)
   2164 INSTR2(ATOMIC_MIN)
   2165 INSTR2(ATOMIC_MAX)
   2166 INSTR2(ATOMIC_AND)
   2167 INSTR2(ATOMIC_OR)
   2168 INSTR2(ATOMIC_XOR)
   2169 INSTR2(LDC)
   2170 #if GPU >= 600
   2171 INSTR3NODST(STIB);
   2172 INSTR2(LDIB);
   2173 INSTR5(LDG_A);
   2174 INSTR6NODST(STG_A);
   2175 INSTR3F(G, ATOMIC_ADD)
   2176 INSTR3F(G, ATOMIC_SUB)
   2177 INSTR3F(G, ATOMIC_XCHG)
   2178 INSTR3F(G, ATOMIC_INC)
   2179 INSTR3F(G, ATOMIC_DEC)
   2180 INSTR3F(G, ATOMIC_CMPXCHG)
   2181 INSTR3F(G, ATOMIC_MIN)
   2182 INSTR3F(G, ATOMIC_MAX)
   2183 INSTR3F(G, ATOMIC_AND)
   2184 INSTR3F(G, ATOMIC_OR)
   2185 INSTR3F(G, ATOMIC_XOR)
   2186 #elif GPU >= 400
   2187 INSTR3(LDGB)
   2188 #if GPU >= 500
   2189 INSTR3(LDIB)
   2190 #endif
   2191 INSTR4NODST(STGB)
   2192 INSTR4NODST(STIB)
   2193 INSTR4F(G, ATOMIC_ADD)
   2194 INSTR4F(G, ATOMIC_SUB)
   2195 INSTR4F(G, ATOMIC_XCHG)
   2196 INSTR4F(G, ATOMIC_INC)
   2197 INSTR4F(G, ATOMIC_DEC)
   2198 INSTR4F(G, ATOMIC_CMPXCHG)
   2199 INSTR4F(G, ATOMIC_MIN)
   2200 INSTR4F(G, ATOMIC_MAX)
   2201 INSTR4F(G, ATOMIC_AND)
   2202 INSTR4F(G, ATOMIC_OR)
   2203 INSTR4F(G, ATOMIC_XOR)
   2204 #endif
   2205 
   2206 /* cat7 instructions:
         *
         * ir3_BAR(block) and ir3_FENCE(block) builders; both create an
         * instruction without sources or a destination register.
         */
   2207 INSTR0(BAR)
   2208 INSTR0(FENCE)
   2209 
   2210 /* ************************************************************************* */
   2211 #include "bitset.h"
   2212 
   2213 #define MAX_REG 256 /* offset of the half-register range in a non-merged regmask */
   2214 
        /* Raw bit storage for a register mask: 2 * MAX_REG bits.  The helpers
         * below index it in two different ways depending on regmask_t.mergedregs.
         */
   2215 typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
   2216 
        /* Set of registers, tracked one bit per slot.
         *
         * mergedregs == false: full register n uses bit n and half register n
         *    uses bit n + MAX_REG (separate full/half register files).
         * mergedregs == true: bits count half-precision slots; half register n
         *    uses bit n, while a full (or special) register n uses bits 2n and
         *    2n + 1.
         */
   2217 typedef struct {
   2218    bool mergedregs;
   2219    regmaskstate_t mask;
   2220 } regmask_t;
   2221 
   2222 static inline bool
   2223 __regmask_get(regmask_t *regmask, bool half, unsigned n)
   2224 {
   2225    if (regmask->mergedregs) {
   2226       /* a6xx+ case, with merged register file, we track things in terms
   2227        * of half-precision registers, with a full precisions register
   2228        * using two half-precision slots.
   2229        *
   2230        * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
   2231        * avoid having them alias normal full regs.
   2232        */
   2233       if (half && !is_reg_num_special(n)) {
   2234          return BITSET_TEST(regmask->mask, n);
   2235       } else {
   2236          n *= 2;
   2237          return BITSET_TEST(regmask->mask, n) ||
   2238                 BITSET_TEST(regmask->mask, n + 1);
   2239       }
   2240    } else {
   2241       /* pre a6xx case, with separate register file for half and full
   2242        * precision:
   2243        */
   2244       if (half)
   2245          n += MAX_REG;
   2246       return BITSET_TEST(regmask->mask, n);
   2247    }
   2248 }
   2249 
   2250 static inline void
   2251 __regmask_set(regmask_t *regmask, bool half, unsigned n)
   2252 {
   2253    if (regmask->mergedregs) {
   2254       /* a6xx+ case, with merged register file, we track things in terms
   2255        * of half-precision registers, with a full precisions register
   2256        * using two half-precision slots:
   2257        */
   2258       if (half && !is_reg_num_special(n)) {
   2259          BITSET_SET(regmask->mask, n);
   2260       } else {
   2261          n *= 2;
   2262          BITSET_SET(regmask->mask, n);
   2263          BITSET_SET(regmask->mask, n + 1);
   2264       }
   2265    } else {
   2266       /* pre a6xx case, with separate register file for half and full
   2267        * precision:
   2268        */
   2269       if (half)
   2270          n += MAX_REG;
   2271       BITSET_SET(regmask->mask, n);
   2272    }
   2273 }
   2274 
   2275 static inline void
   2276 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
   2277 {
   2278    if (regmask->mergedregs) {
   2279       /* a6xx+ case, with merged register file, we track things in terms
   2280        * of half-precision registers, with a full precisions register
   2281        * using two half-precision slots:
   2282        */
   2283       if (half && !is_reg_num_special(n)) {
   2284          BITSET_CLEAR(regmask->mask, n);
   2285       } else {
   2286          n *= 2;
   2287          BITSET_CLEAR(regmask->mask, n);
   2288          BITSET_CLEAR(regmask->mask, n + 1);
   2289       }
   2290    } else {
   2291       /* pre a6xx case, with separate register file for half and full
   2292        * precision:
   2293        */
   2294       if (half)
   2295          n += MAX_REG;
   2296       BITSET_CLEAR(regmask->mask, n);
   2297    }
   2298 }
   2299 
   2300 static inline void
   2301 regmask_init(regmask_t *regmask, bool mergedregs)
   2302 {
   2303    memset(&regmask->mask, 0, sizeof(regmask->mask));
   2304    regmask->mergedregs = mergedregs;
   2305 }
   2306 
   2307 static inline void
   2308 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
   2309 {
   2310    assert(dst->mergedregs == a->mergedregs);
   2311    assert(dst->mergedregs == b->mergedregs);
   2312 
   2313    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
   2314       dst->mask[i] = a->mask[i] | b->mask[i];
   2315 }
   2316 
   2317 
   2318 static inline void
   2319 regmask_set(regmask_t *regmask, struct ir3_register *reg)
   2320 {
   2321    bool half = reg->flags & IR3_REG_HALF;
   2322    if (reg->flags & IR3_REG_RELATIV) {
   2323       for (unsigned i = 0; i < reg->size; i++)
   2324          __regmask_set(regmask, half, reg->array.base + i);
   2325    } else {
   2326       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
   2327          if (mask & 1)
   2328             __regmask_set(regmask, half, n);
   2329    }
   2330 }
   2331 
   2332 static inline bool
   2333 regmask_get(regmask_t *regmask, struct ir3_register *reg)
   2334 {
   2335    bool half = reg->flags & IR3_REG_HALF;
   2336    if (reg->flags & IR3_REG_RELATIV) {
   2337       for (unsigned i = 0; i < reg->size; i++)
   2338          if (__regmask_get(regmask, half, reg->array.base + i))
   2339             return true;
   2340    } else {
   2341       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
   2342          if (mask & 1)
   2343             if (__regmask_get(regmask, half, n))
   2344                return true;
   2345    }
   2346    return false;
   2347 }
   2348 /* ************************************************************************* */
   2349 
   2350 #endif /* IR3_H_ */
   2351