Home | History | Annotate | Line # | Download | only in ir3
      1 /*
      2  * Copyright (c) 2012 Rob Clark <robdclark (at) gmail.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     21  * SOFTWARE.
     22  */
     23 
     24 #include "ir3.h"
     25 
     26 #include <assert.h>
     27 #include <errno.h>
     28 #include <stdbool.h>
     29 #include <stdio.h>
     30 #include <stdlib.h>
     31 #include <string.h>
     32 
     33 #include "util/bitscan.h"
     34 #include "util/half_float.h"
     35 #include "util/ralloc.h"
     36 #include "util/u_math.h"
     37 
     38 #include "instr-a3xx.h"
     39 #include "ir3_shader.h"
     40 
/**
 * Simple allocator: hand out 'sz' bytes obtained via rzalloc_size() with
 * 'shader' as the ralloc parent, so that everything allocated for a shader
 * can be freed easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   /* TODO: don't use rzalloc */
   void *mem = rzalloc_size(shader, sz);

   return mem;
}
     49 
     50 struct ir3 *
     51 ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
     52 {
     53    struct ir3 *shader = rzalloc(v, struct ir3);
     54 
     55    shader->compiler = compiler;
     56    shader->type = v->type;
     57 
     58    list_inithead(&shader->block_list);
     59    list_inithead(&shader->array_list);
     60 
     61    return shader;
     62 }
     63 
/**
 * Free the ir3 container by handing it to ralloc_free().
 */
void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}
     69 
     70 static void
     71 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
     72                  struct ir3_info *info)
     73 {
     74    struct ir3_shader_variant *v = info->data;
     75    unsigned repeat = instr->repeat;
     76 
     77    if (reg->flags & IR3_REG_IMMED) {
     78       /* nothing to do */
     79       return;
     80    }
     81 
     82    if (!(reg->flags & IR3_REG_R)) {
     83       repeat = 0;
     84    }
     85 
     86    unsigned components;
     87    int16_t max;
     88 
     89    if (reg->flags & IR3_REG_RELATIV) {
     90       components = reg->size;
     91       max = (reg->array.base + components - 1);
     92    } else {
     93       components = util_last_bit(reg->wrmask);
     94       max = (reg->num + repeat + components - 1);
     95    }
     96 
     97    if (reg->flags & IR3_REG_CONST) {
     98       info->max_const = MAX2(info->max_const, max >> 2);
     99    } else if (max < regid(48, 0)) {
    100       if (reg->flags & IR3_REG_HALF) {
    101          if (v->mergedregs) {
    102             /* starting w/ a6xx, half regs conflict with full regs: */
    103             info->max_reg = MAX2(info->max_reg, max >> 3);
    104          } else {
    105             info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
    106          }
    107       } else {
    108          info->max_reg = MAX2(info->max_reg, max >> 2);
    109       }
    110    }
    111 }
    112 
    113 bool
    114 ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
    115 {
    116    const struct ir3_compiler *compiler = v->shader->compiler;
    117 
    118    /* We can't support more than compiler->branchstack_size diverging threads
    119     * in a wave. Thus, doubling the threadsize is only possible if we don't
    120     * exceed the branchstack size limit.
    121     */
    122    if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
    123        compiler->branchstack_size) {
    124       return false;
    125    }
    126 
    127    switch (v->type) {
    128    case MESA_SHADER_COMPUTE: {
    129       unsigned threads_per_wg =
    130          v->local_size[0] * v->local_size[1] * v->local_size[2];
    131 
    132       /* For a5xx, if the workgroup size is greater than the maximum number
    133        * of threads per core with 32 threads per wave (512) then we have to
    134        * use the doubled threadsize because otherwise the workgroup wouldn't
    135        * fit. For smaller workgroup sizes, we follow the blob and use the
    136        * smaller threadsize.
    137        */
    138       if (compiler->gen < 6) {
    139          return v->local_size_variable ||
    140                 threads_per_wg >
    141                    compiler->threadsize_base * compiler->max_waves;
    142       }
    143 
    144       /* On a6xx, we prefer the larger threadsize unless the workgroup is
    145        * small enough that it would be useless. Note that because
    146        * threadsize_base is bumped to 64, we don't have to worry about the
    147        * workgroup fitting, unlike the a5xx case.
    148        */
    149       if (!v->local_size_variable) {
    150          if (threads_per_wg <= compiler->threadsize_base)
    151             return false;
    152       }
    153    }
    154       FALLTHROUGH;
    155    case MESA_SHADER_FRAGMENT: {
    156       /* Check that doubling the threadsize wouldn't exceed the regfile size */
    157       return regs_count * 2 <= compiler->reg_size_vec4;
    158    }
    159 
    160    default:
    161       /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
    162        * stages - the bit doesn't exist. The blob never used it for the VS
    163        * on earlier gen's anyway.
    164        */
    165       return false;
    166    }
    167 }
    168 
    169 /* Get the maximum number of waves that could be used even if this shader
    170  * didn't use any registers.
    171  */
    172 unsigned
    173 ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
    174                                   bool double_threadsize)
    175 {
    176    const struct ir3_compiler *compiler = v->shader->compiler;
    177    unsigned max_waves = compiler->max_waves;
    178 
    179    /* If this is a compute shader, compute the limit based on shared size */
    180    if (v->type == MESA_SHADER_COMPUTE) {
    181       /* Shared is allocated in chunks of 1k */
    182       unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
    183       if (shared_per_wg > 0 && !v->local_size_variable) {
    184          unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
    185          unsigned threads_per_wg =
    186             v->local_size[0] * v->local_size[1] * v->local_size[2];
    187          unsigned waves_per_wg =
    188             DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
    189                                             (double_threadsize ? 2 : 1) *
    190                                             compiler->wave_granularity);
    191          max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
    192                                         compiler->wave_granularity);
    193       }
    194    }
    195 
    196    /* Compute the limit based on branchstack */
    197    if (v->branchstack > 0) {
    198       unsigned branchstack_max_waves = compiler->branchstack_size /
    199                                        v->branchstack *
    200                                        compiler->wave_granularity;
    201       max_waves = MIN2(max_waves, branchstack_max_waves);
    202    }
    203 
    204    return max_waves;
    205 }
    206 
    207 /* Get the maximum number of waves that could be launched limited by reg size.
    208  */
    209 unsigned
    210 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
    211                                 unsigned reg_count, bool double_threadsize)
    212 {
    213    return reg_count ? (compiler->reg_size_vec4 /
    214                        (reg_count * (double_threadsize ? 2 : 1)) *
    215                        compiler->wave_granularity)
    216                     : compiler->max_waves;
    217 }
    218 
    219 void
    220 ir3_collect_info(struct ir3_shader_variant *v)
    221 {
    222    struct ir3_info *info = &v->info;
    223    struct ir3 *shader = v->ir;
    224    const struct ir3_compiler *compiler = v->shader->compiler;
    225 
    226    memset(info, 0, sizeof(*info));
    227    info->data = v;
    228    info->max_reg = -1;
    229    info->max_half_reg = -1;
    230    info->max_const = -1;
    231    info->multi_dword_ldp_stp = false;
    232 
    233    uint32_t instr_count = 0;
    234    foreach_block (block, &shader->block_list) {
    235       foreach_instr (instr, &block->instr_list) {
    236          instr_count++;
    237       }
    238    }
    239 
    240    v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
    241 
    242    /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    243     * doesn't try to decode the following data as instructions (such as the
    244     * next stage's shader in turnip)
    245     */
    246    info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
    247    info->sizedwords = info->size / 4;
    248 
    249    foreach_block (block, &shader->block_list) {
    250       int sfu_delay = 0;
    251 
    252       foreach_instr (instr, &block->instr_list) {
    253 
    254          foreach_src (reg, instr) {
    255             collect_reg_info(instr, reg, info);
    256          }
    257 
    258          foreach_dst (reg, instr) {
    259             if (is_dest_gpr(reg)) {
    260                collect_reg_info(instr, reg, info);
    261             }
    262          }
    263 
    264          if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
    265             unsigned components = instr->srcs[2]->uim_val;
    266             if (components * type_size(instr->cat6.type) > 32) {
    267                info->multi_dword_ldp_stp = true;
    268             }
    269 
    270             if (instr->opc == OPC_STP)
    271                info->stp_count += components;
    272             else
    273                info->ldp_count += components;
    274          }
    275 
    276          if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
    277             info->last_baryf = info->instrs_count;
    278 
    279          unsigned instrs_count = 1 + instr->repeat + instr->nop;
    280          unsigned nops_count = instr->nop;
    281 
    282          if (instr->opc == OPC_NOP) {
    283             nops_count = 1 + instr->repeat;
    284             info->instrs_per_cat[0] += nops_count;
    285          } else {
    286             info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
    287             info->instrs_per_cat[0] += nops_count;
    288          }
    289 
    290          if (instr->opc == OPC_MOV) {
    291             if (instr->cat1.src_type == instr->cat1.dst_type) {
    292                info->mov_count += 1 + instr->repeat;
    293             } else {
    294                info->cov_count += 1 + instr->repeat;
    295             }
    296          }
    297 
    298          info->instrs_count += instrs_count;
    299          info->nops_count += nops_count;
    300 
    301          if (instr->flags & IR3_INSTR_SS) {
    302             info->ss++;
    303             info->sstall += sfu_delay;
    304             sfu_delay = 0;
    305          }
    306 
    307          if (instr->flags & IR3_INSTR_SY)
    308             info->sy++;
    309 
    310          if (is_sfu(instr)) {
    311             sfu_delay = 10;
    312          } else {
    313             int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
    314             sfu_delay -= n;
    315          }
    316       }
    317    }
    318 
    319    /* TODO: for a5xx and below, is there a separate regfile for
    320     * half-registers?
    321     */
    322    unsigned regs_count =
    323       info->max_reg + 1 +
    324       (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
    325 
    326    info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
    327    unsigned reg_independent_max_waves =
    328       ir3_get_reg_independent_max_waves(v, info->double_threadsize);
    329    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
    330       compiler, regs_count, info->double_threadsize);
    331    info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
    332    assert(info->max_waves <= v->shader->compiler->max_waves);
    333 }
    334 
    335 static struct ir3_register *
    336 reg_create(struct ir3 *shader, int num, int flags)
    337 {
    338    struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
    339    reg->wrmask = 1;
    340    reg->flags = flags;
    341    reg->num = num;
    342    return reg;
    343 }
    344 
    345 static void
    346 insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
    347 {
    348    struct ir3 *shader = block->shader;
    349 
    350    instr->serialno = ++shader->instr_count;
    351 
    352    list_addtail(&instr->node, &block->instr_list);
    353 
    354    if (is_input(instr))
    355       array_insert(shader, shader->baryfs, instr);
    356 }
    357 
    358 struct ir3_block *
    359 ir3_block_create(struct ir3 *shader)
    360 {
    361    struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
    362 #ifdef DEBUG
    363    block->serialno = ++shader->block_count;
    364 #endif
    365    block->shader = shader;
    366    list_inithead(&block->node);
    367    list_inithead(&block->instr_list);
    368    return block;
    369 }
    370 
    371 void
    372 ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
    373 {
    374    array_insert(block, block->predecessors, pred);
    375 }
    376 
    377 void
    378 ir3_block_add_physical_predecessor(struct ir3_block *block,
    379                                    struct ir3_block *pred)
    380 {
    381    array_insert(block, block->physical_predecessors, pred);
    382 }
    383 
    384 void
    385 ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
    386 {
    387    for (unsigned i = 0; i < block->predecessors_count; i++) {
    388       if (block->predecessors[i] == pred) {
    389          if (i < block->predecessors_count - 1) {
    390             block->predecessors[i] =
    391                block->predecessors[block->predecessors_count - 1];
    392          }
    393 
    394          block->predecessors_count--;
    395          return;
    396       }
    397    }
    398 }
    399 
    400 void
    401 ir3_block_remove_physical_predecessor(struct ir3_block *block, struct ir3_block *pred)
    402 {
    403    for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
    404       if (block->physical_predecessors[i] == pred) {
    405          if (i < block->physical_predecessors_count - 1) {
    406             block->physical_predecessors[i] =
    407                block->physical_predecessors[block->physical_predecessors_count - 1];
    408          }
    409 
    410          block->physical_predecessors_count--;
    411          return;
    412       }
    413    }
    414 }
    415 
    416 unsigned
    417 ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
    418 {
    419    for (unsigned i = 0; i < block->predecessors_count; i++) {
    420       if (block->predecessors[i] == pred) {
    421          return i;
    422       }
    423    }
    424 
    425    unreachable("ir3_block_get_pred_index() invalid predecessor");
    426 }
    427 
    428 static struct ir3_instruction *
    429 instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
    430 {
    431    /* Add extra sources for array destinations and the address reg */
    432    if (1 <= opc_cat(opc))
    433       nsrc += 2;
    434    struct ir3_instruction *instr;
    435    unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
    436                  (nsrc * sizeof(instr->srcs[0]));
    437    char *ptr = ir3_alloc(block->shader, sz);
    438 
    439    instr = (struct ir3_instruction *)ptr;
    440    ptr += sizeof(*instr);
    441    instr->dsts = (struct ir3_register **)ptr;
    442    instr->srcs = instr->dsts + ndst;
    443 
    444 #ifdef DEBUG
    445    instr->dsts_max = ndst;
    446    instr->srcs_max = nsrc;
    447 #endif
    448 
    449    return instr;
    450 }
    451 
    452 struct ir3_instruction *
    453 ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
    454 {
    455    struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
    456    instr->block = block;
    457    instr->opc = opc;
    458    insert_instr(block, instr);
    459    return instr;
    460 }
    461 
    462 struct ir3_instruction *
    463 ir3_instr_clone(struct ir3_instruction *instr)
    464 {
    465    struct ir3_instruction *new_instr = instr_create(
    466       instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
    467    struct ir3_register **dsts, **srcs;
    468 
    469    dsts = new_instr->dsts;
    470    srcs = new_instr->srcs;
    471    *new_instr = *instr;
    472    new_instr->dsts = dsts;
    473    new_instr->srcs = srcs;
    474 
    475    insert_instr(instr->block, new_instr);
    476 
    477    /* clone registers: */
    478    new_instr->dsts_count = 0;
    479    new_instr->srcs_count = 0;
    480    foreach_dst (reg, instr) {
    481       struct ir3_register *new_reg =
    482          ir3_dst_create(new_instr, reg->num, reg->flags);
    483       *new_reg = *reg;
    484       if (new_reg->instr)
    485          new_reg->instr = new_instr;
    486    }
    487    foreach_src (reg, instr) {
    488       struct ir3_register *new_reg =
    489          ir3_src_create(new_instr, reg->num, reg->flags);
    490       *new_reg = *reg;
    491    }
    492 
    493    if (instr->address) {
    494       assert(instr->srcs_count > 0);
    495       new_instr->address = new_instr->srcs[instr->srcs_count - 1];
    496    }
    497 
    498    return new_instr;
    499 }
    500 
    501 /* Add a false dependency to instruction, to ensure it is scheduled first: */
    502 void
    503 ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
    504 {
    505    for (unsigned i = 0; i < instr->deps_count; i++) {
    506       if (instr->deps[i] == dep)
    507          return;
    508    }
    509 
    510    array_insert(instr, instr->deps, dep);
    511 }
    512 
    513 struct ir3_register *
    514 ir3_src_create(struct ir3_instruction *instr, int num, int flags)
    515 {
    516    struct ir3 *shader = instr->block->shader;
    517 #ifdef DEBUG
    518    debug_assert(instr->srcs_count < instr->srcs_max);
    519 #endif
    520    struct ir3_register *reg = reg_create(shader, num, flags);
    521    instr->srcs[instr->srcs_count++] = reg;
    522    return reg;
    523 }
    524 
    525 struct ir3_register *
    526 ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
    527 {
    528    struct ir3 *shader = instr->block->shader;
    529 #ifdef DEBUG
    530    debug_assert(instr->dsts_count < instr->dsts_max);
    531 #endif
    532    struct ir3_register *reg = reg_create(shader, num, flags);
    533    instr->dsts[instr->dsts_count++] = reg;
    534    return reg;
    535 }
    536 
    537 struct ir3_register *
    538 ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
    539 {
    540    struct ir3_register *new_reg = reg_create(shader, 0, 0);
    541    *new_reg = *reg;
    542    return new_reg;
    543 }
    544 
    545 void
    546 ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
    547                        struct ir3_register *last_write)
    548 {
    549    assert(reg->flags & IR3_REG_ARRAY);
    550    struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
    551    *new_reg = *reg;
    552    new_reg->def = last_write;
    553    ir3_reg_tie(reg, new_reg);
    554 }
    555 
    556 void
    557 ir3_instr_set_address(struct ir3_instruction *instr,
    558                       struct ir3_instruction *addr)
    559 {
    560    if (!instr->address) {
    561       struct ir3 *ir = instr->block->shader;
    562 
    563       debug_assert(instr->block == addr->block);
    564 
    565       instr->address =
    566          ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
    567       instr->address->def = addr->dsts[0];
    568       debug_assert(reg_num(addr->dsts[0]) == REG_A0);
    569       unsigned comp = reg_comp(addr->dsts[0]);
    570       if (comp == 0) {
    571          array_insert(ir, ir->a0_users, instr);
    572       } else {
    573          debug_assert(comp == 1);
    574          array_insert(ir, ir->a1_users, instr);
    575       }
    576    } else {
    577       debug_assert(instr->address->def->instr == addr);
    578    }
    579 }
    580 
    581 void
    582 ir3_block_clear_mark(struct ir3_block *block)
    583 {
    584    foreach_instr (instr, &block->instr_list)
    585       instr->flags &= ~IR3_INSTR_MARK;
    586 }
    587 
    588 void
    589 ir3_clear_mark(struct ir3 *ir)
    590 {
    591    foreach_block (block, &ir->block_list) {
    592       ir3_block_clear_mark(block);
    593    }
    594 }
    595 
    596 unsigned
    597 ir3_count_instructions(struct ir3 *ir)
    598 {
    599    unsigned cnt = 1;
    600    foreach_block (block, &ir->block_list) {
    601       block->start_ip = cnt;
    602       foreach_instr (instr, &block->instr_list) {
    603          instr->ip = cnt++;
    604       }
    605       block->end_ip = cnt;
    606    }
    607    return cnt;
    608 }
    609 
    610 /* When counting instructions for RA, we insert extra fake instructions at the
    611  * beginning of each block, where values become live, and at the end where
    612  * values die. This prevents problems where values live-in at the beginning or
    613  * live-out at the end of a block from being treated as if they were
    614  * live-in/live-out at the first/last instruction, which would be incorrect.
    615  * In ir3_legalize these ip's are assumed to be actual ip's of the final
    616  * program, so it would be incorrect to use this everywhere.
    617  */
    618 
    619 unsigned
    620 ir3_count_instructions_ra(struct ir3 *ir)
    621 {
    622    unsigned cnt = 1;
    623    foreach_block (block, &ir->block_list) {
    624       block->start_ip = cnt++;
    625       foreach_instr (instr, &block->instr_list) {
    626          instr->ip = cnt++;
    627       }
    628       block->end_ip = cnt++;
    629    }
    630    return cnt;
    631 }
    632 
    633 struct ir3_array *
    634 ir3_lookup_array(struct ir3 *ir, unsigned id)
    635 {
    636    foreach_array (arr, &ir->array_list)
    637       if (arr->id == id)
    638          return arr;
    639    return NULL;
    640 }
    641 
    642 void
    643 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
    644 {
    645    /* We could do this in a single pass if we can assume instructions
    646     * are always sorted.  Which currently might not always be true.
    647     * (In particular after ir3_group pass, but maybe other places.)
    648     */
    649    foreach_block (block, &ir->block_list)
    650       foreach_instr (instr, &block->instr_list)
    651          instr->uses = NULL;
    652 
    653    foreach_block (block, &ir->block_list) {
    654       foreach_instr (instr, &block->instr_list) {
    655          foreach_ssa_src_n (src, n, instr) {
    656             if (__is_false_dep(instr, n) && !falsedeps)
    657                continue;
    658             if (!src->uses)
    659                src->uses = _mesa_pointer_set_create(mem_ctx);
    660             _mesa_set_add(src->uses, instr);
    661          }
    662       }
    663    }
    664 }
    665 
    666 /**
    667  * Set the destination type of an instruction, for example if a
    668  * conversion is folded in, handling the special cases where the
    669  * instruction's dest type or opcode needs to be fixed up.
    670  */
    671 void
    672 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
    673 {
    674    if (half) {
    675       instr->dsts[0]->flags |= IR3_REG_HALF;
    676    } else {
    677       instr->dsts[0]->flags &= ~IR3_REG_HALF;
    678    }
    679 
    680    switch (opc_cat(instr->opc)) {
    681    case 1: /* move instructions */
    682       if (half) {
    683          instr->cat1.dst_type = half_type(instr->cat1.dst_type);
    684       } else {
    685          instr->cat1.dst_type = full_type(instr->cat1.dst_type);
    686       }
    687       break;
    688    case 4:
    689       if (half) {
    690          instr->opc = cat4_half_opc(instr->opc);
    691       } else {
    692          instr->opc = cat4_full_opc(instr->opc);
    693       }
    694       break;
    695    case 5:
    696       if (half) {
    697          instr->cat5.type = half_type(instr->cat5.type);
    698       } else {
    699          instr->cat5.type = full_type(instr->cat5.type);
    700       }
    701       break;
    702    }
    703 }
    704 
    705 /**
    706  * One-time fixup for instruction src-types.  Other than cov's that
    707  * are folded, an instruction's src type does not change.
    708  */
    709 void
    710 ir3_fixup_src_type(struct ir3_instruction *instr)
    711 {
    712    switch (opc_cat(instr->opc)) {
    713    case 1: /* move instructions */
    714       if (instr->srcs[0]->flags & IR3_REG_HALF) {
    715          instr->cat1.src_type = half_type(instr->cat1.src_type);
    716       } else {
    717          instr->cat1.src_type = full_type(instr->cat1.src_type);
    718       }
    719       break;
    720    case 3:
    721       if (instr->srcs[0]->flags & IR3_REG_HALF) {
    722          instr->opc = cat3_half_opc(instr->opc);
    723       } else {
    724          instr->opc = cat3_full_opc(instr->opc);
    725       }
    726       break;
    727    }
    728 }
    729 
    730 /**
    731  * Map a floating point immed to FLUT (float lookup table) value,
    732  * returns negative for immediates that cannot be mapped.
    733  */
    734 int
    735 ir3_flut(struct ir3_register *src_reg)
    736 {
    737    static const struct {
    738       uint32_t f32;
    739       uint16_t f16;
    740    } flut[] = {
    741          { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
    742          { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
    743          { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
    744          { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
    745          { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
    746          { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
    747          { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
    748          { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
    749          { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
    750          { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
    751          { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
    752          { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
    753    };
    754 
    755    if (src_reg->flags & IR3_REG_HALF) {
    756       /* Note that half-float immeds are already lowered to 16b in nir: */
    757       uint32_t imm = src_reg->uim_val;
    758       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
    759          if (flut[i].f16 == imm) {
    760             return i;
    761          }
    762       }
    763    } else {
    764       uint32_t imm = src_reg->uim_val;
    765       for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
    766          if (flut[i].f32 == imm) {
    767             return i;
    768          }
    769       }
    770    }
    771 
    772    return -1;
    773 }
    774 
    775 static unsigned
    776 cp_flags(unsigned flags)
    777 {
    778    /* only considering these flags (at least for now): */
    779    flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
    780              IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
    781              IR3_REG_SHARED);
    782    return flags;
    783 }
    784 
    785 bool
    786 ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
    787 {
    788    struct ir3_compiler *compiler = instr->block->shader->compiler;
    789    unsigned valid_flags;
    790 
    791    if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
    792       return false;
    793 
    794    flags = cp_flags(flags);
    795 
    796    /* If destination is indirect, then source cannot be.. at least
    797     * I don't think so..
    798     */
    799    if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
    800        (flags & IR3_REG_RELATIV))
    801       return false;
    802 
    803    if (flags & IR3_REG_RELATIV) {
    804       /* TODO need to test on earlier gens.. pretty sure the earlier
    805        * problem was just that we didn't check that the src was from
    806        * same block (since we can't propagate address register values
    807        * across blocks currently)
    808        */
    809       if (compiler->gen < 6)
    810          return false;
    811 
    812       /* NOTE in the special try_swap_mad_two_srcs() case we can be
    813        * called on a src that has already had an indirect load folded
    814        * in, in which case ssa() returns NULL
    815        */
    816       if (instr->srcs[n]->flags & IR3_REG_SSA) {
    817          struct ir3_instruction *src = ssa(instr->srcs[n]);
    818          if (src->address->def->instr->block != instr->block)
    819             return false;
    820       }
    821    }
    822 
    823    if (is_meta(instr)) {
    824       /* collect and phi nodes support const/immed sources, which will be
    825        * turned into move instructions, but not anything else.
    826        */
    827       if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
    828          return false;
    829 
    830       if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
    831          return false;
    832 
    833       return true;
    834    }
    835 
    836    switch (opc_cat(instr->opc)) {
    837    case 0: /* end, chmask */
    838       return flags == 0;
    839    case 1:
    840       switch (instr->opc) {
    841       case OPC_MOVMSK:
    842       case OPC_SWZ:
    843       case OPC_SCT:
    844       case OPC_GAT:
    845          valid_flags = IR3_REG_SHARED;
    846          break;
    847       default:
    848          valid_flags =
    849             IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
    850       }
    851       if (flags & ~valid_flags)
    852          return false;
    853       break;
    854    case 2:
    855       valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
    856                     IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;
    857 
    858       if (flags & ~valid_flags)
    859          return false;
    860 
    861       if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
    862          unsigned m = n ^ 1;
    863          /* cannot deal w/ const or shared in both srcs:
    864           * (note that some cat2 actually only have a single src)
    865           */
    866          if (m < instr->srcs_count) {
    867             struct ir3_register *reg = instr->srcs[m];
    868             if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
    869                 (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
    870                return false;
    871             if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
    872                return false;
    873          }
    874       }
    875       break;
    876    case 3:
    877       valid_flags =
    878          ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;
    879 
    880       if (instr->opc == OPC_SHLG_B16) {
    881          valid_flags |= IR3_REG_IMMED;
    882          /* shlg.b16 can be RELATIV+CONST but not CONST: */
    883          if (flags & IR3_REG_RELATIV)
    884             valid_flags |= IR3_REG_CONST;
    885       } else {
    886          valid_flags |= IR3_REG_CONST;
    887       }
    888 
    889       if (flags & ~valid_flags)
    890          return false;
    891 
    892       if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
    893          /* cannot deal w/ const/shared/relativ in 2nd src: */
    894          if (n == 1)
    895             return false;
    896       }
    897 
    898       break;
    899    case 4:
    900       /* seems like blob compiler avoids const as src.. */
    901       /* TODO double check if this is still the case on a4xx */
    902       if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
    903          return false;
    904       if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
    905          return false;
    906       break;
    907    case 5:
    908       /* no flags allowed */
    909       if (flags)
    910          return false;
    911       break;
    912    case 6:
    913       valid_flags = IR3_REG_IMMED;
    914       if (flags & ~valid_flags)
    915          return false;
    916 
    917       if (flags & IR3_REG_IMMED) {
    918          /* doesn't seem like we can have immediate src for store
    919           * instructions:
    920           *
    921           * TODO this restriction could also apply to load instructions,
    922           * but for load instructions this arg is the address (and not
    923           * really sure any good way to test a hard-coded immed addr src)
    924           */
    925          if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
    926             return false;
    927 
    928          if ((instr->opc == OPC_LDL) && (n == 0))
    929             return false;
    930 
    931          if ((instr->opc == OPC_STL) && (n != 2))
    932             return false;
    933 
    934          if ((instr->opc == OPC_LDP) && (n == 0))
    935             return false;
    936 
    937          if ((instr->opc == OPC_STP) && (n != 2))
    938             return false;
    939 
    940          if (instr->opc == OPC_STLW && n == 0)
    941             return false;
    942 
    943          if (instr->opc == OPC_LDLW && n == 0)
    944             return false;
    945 
    946          /* disallow immediates in anything but the SSBO slot argument for
    947           * cat6 instructions:
    948           */
    949          if (is_atomic(instr->opc) && (n != 0))
    950             return false;
    951 
    952          if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
    953             return false;
    954 
    955          if (instr->opc == OPC_STG && (n == 2))
    956             return false;
    957 
    958          if (instr->opc == OPC_STG_A && (n == 4))
    959             return false;
    960 
    961          /* as with atomics, these cat6 instrs can only have an immediate
    962           * for SSBO/IBO slot argument
    963           */
    964          switch (instr->opc) {
    965          case OPC_LDIB:
    966          case OPC_STIB:
    967          case OPC_RESINFO:
    968             if (n != 0)
    969                return false;
    970             break;
    971          default:
    972             break;
    973          }
    974       }
    975 
    976       break;
    977    }
    978 
    979    return true;
    980 }
    981 
    982 bool
    983 ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
    984 {
    985    if (instr->opc == OPC_MOV || is_meta(instr))
    986       return true;
    987 
    988    if (is_mem(instr)) {
    989       switch (instr->opc) {
    990       /* Some load/store instructions have a 13-bit offset and size which must
    991        * always be an immediate and the rest of the sources cannot be
    992        * immediates, so the frontend is responsible for checking the size:
    993        */
    994       case OPC_LDL:
    995       case OPC_STL:
    996       case OPC_LDP:
    997       case OPC_STP:
    998       case OPC_LDG:
    999       case OPC_STG:
   1000       case OPC_SPILL_MACRO:
   1001       case OPC_RELOAD_MACRO:
   1002       case OPC_LDG_A:
   1003       case OPC_STG_A:
   1004       case OPC_LDLW:
   1005       case OPC_STLW:
   1006       case OPC_LDLV:
   1007          return true;
   1008       default:
   1009          /* most cat6 src immediates can only encode 8 bits: */
   1010          return !(immed & ~0xff);
   1011       }
   1012    }
   1013 
   1014    /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   1015    return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
   1016 }
   1017