/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */
267ec681f3Smrg
277ec681f3Smrg#include "main/mtypes.h"
287ec681f3Smrg#include "compiler/glsl/glsl_to_nir.h"
297ec681f3Smrg#include "compiler/nir_types.h"
307ec681f3Smrg#include "compiler/nir/nir_builder.h"
317ec681f3Smrg#include "util/u_debug.h"
327ec681f3Smrg
337ec681f3Smrg#include "disassemble.h"
347ec681f3Smrg#include "bifrost_compile.h"
357ec681f3Smrg#include "compiler.h"
367ec681f3Smrg#include "bi_quirks.h"
377ec681f3Smrg#include "bi_builder.h"
387ec681f3Smrg#include "bifrost_nir.h"
397ec681f3Smrg
407ec681f3Smrgstatic const struct debug_named_value bifrost_debug_options[] = {
417ec681f3Smrg        {"msgs",      BIFROST_DBG_MSGS,		"Print debug messages"},
427ec681f3Smrg        {"shaders",   BIFROST_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
437ec681f3Smrg        {"shaderdb",  BIFROST_DBG_SHADERDB,	"Print statistics"},
447ec681f3Smrg        {"verbose",   BIFROST_DBG_VERBOSE,	"Disassemble verbosely"},
457ec681f3Smrg        {"internal",  BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
467ec681f3Smrg        {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
477ec681f3Smrg        {"inorder",   BIFROST_DBG_INORDER, 	"Force in-order bundling"},
487ec681f3Smrg        {"novalidate",BIFROST_DBG_NOVALIDATE,   "Skip IR validation"},
497ec681f3Smrg        {"noopt",     BIFROST_DBG_NOOPT,        "Skip optimization passes"},
507ec681f3Smrg        DEBUG_NAMED_VALUE_END
517ec681f3Smrg};
527ec681f3Smrg
537ec681f3SmrgDEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
547ec681f3Smrg
/* How many bytes are prefetched by the Bifrost shader core. From the final
 * clause of the shader, this range must be valid instructions or zero. */
#define BIFROST_SHADER_PREFETCH 128

/* Bitmask of enabled BIFROST_DBG_* flags, initialized from the
 * BIFROST_MESA_DEBUG option table above */
int bifrost_debug = 0;

/* Print a message to stderr, tagged with the emitting function and line,
 * only when the "msgs" debug flag is set. Uses the standard C99 __func__
 * predefined identifier instead of the GNU-specific __FUNCTION__ alias. */
#define DBG(fmt, ...) \
		do { if (bifrost_debug & BIFROST_DBG_MSGS) \
			fprintf(stderr, "%s:%d: "fmt, \
				__func__, __LINE__, ##__VA_ARGS__); } while (0)
657ec681f3Smrg
667ec681f3Smrgstatic bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
677ec681f3Smrg
687ec681f3Smrgstatic void
697ec681f3Smrgbi_block_add_successor(bi_block *block, bi_block *successor)
707ec681f3Smrg{
717ec681f3Smrg        assert(block != NULL && successor != NULL);
727ec681f3Smrg
737ec681f3Smrg        /* Cull impossible edges */
747ec681f3Smrg        if (block->unconditional_jumps)
757ec681f3Smrg                return;
767ec681f3Smrg
777ec681f3Smrg        for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
787ec681f3Smrg                if (block->successors[i]) {
797ec681f3Smrg                       if (block->successors[i] == successor)
807ec681f3Smrg                               return;
817ec681f3Smrg                       else
827ec681f3Smrg                               continue;
837ec681f3Smrg                }
847ec681f3Smrg
857ec681f3Smrg                block->successors[i] = successor;
867ec681f3Smrg                _mesa_set_add(successor->predecessors, block);
877ec681f3Smrg                return;
887ec681f3Smrg        }
897ec681f3Smrg
907ec681f3Smrg        unreachable("Too many successors");
917ec681f3Smrg}
927ec681f3Smrg
937ec681f3Smrgstatic void
947ec681f3Smrgbi_emit_jump(bi_builder *b, nir_jump_instr *instr)
957ec681f3Smrg{
967ec681f3Smrg        bi_instr *branch = bi_jump(b, bi_zero());
977ec681f3Smrg
987ec681f3Smrg        switch (instr->type) {
997ec681f3Smrg        case nir_jump_break:
1007ec681f3Smrg                branch->branch_target = b->shader->break_block;
1017ec681f3Smrg                break;
1027ec681f3Smrg        case nir_jump_continue:
1037ec681f3Smrg                branch->branch_target = b->shader->continue_block;
1047ec681f3Smrg                break;
1057ec681f3Smrg        default:
1067ec681f3Smrg                unreachable("Unhandled jump type");
1077ec681f3Smrg        }
1087ec681f3Smrg
1097ec681f3Smrg        bi_block_add_successor(b->shader->current_block, branch->branch_target);
1107ec681f3Smrg        b->shader->current_block->unconditional_jumps = true;
1117ec681f3Smrg}
1127ec681f3Smrg
/* Compute the sample-position source (src0) for a varying load from the
 * barycentric intrinsic qualifying it. Returns a don't-care index when the
 * hardware needs no explicit position (center interpolation). */
static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
        switch (intr->intrinsic) {
        /* Positions for centroid/sample interpolation are preloaded in r61
         * -- NOTE(review): register convention assumed from usage here,
         * confirm against the ISA docs */
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
                return bi_register(61);

        /* Need to put the sample ID in the top 16-bits */
        case nir_intrinsic_load_barycentric_at_sample:
                return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
                                bi_half(bi_src_index(&intr->src[0]), false));

        /* Interpret as 8:8 signed fixed point positions in pixels along X and
         * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
         * is the center of the pixel so we first fixup and then convert. For
         * fp16 input:
         *
         * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
         * f2i16((256 * (x, y)) + (128, 128)) =
         * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
         *
         * For fp32 input, that lacks enough precision for MSAA 16x, but the
         * idea is the same. FIXME: still doesn't pass
         */
        case nir_intrinsic_load_barycentric_at_offset: {
                bi_index offset = bi_src_index(&intr->src[0]);
                bi_index f16 = bi_null();
                unsigned sz = nir_src_bit_size(intr->src[0]);

                if (sz == 16) {
                        /* Single FMA does both the +0.5 fixup and the 2**8
                         * scale, per the identity above */
                        f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
                                        bi_imm_f16(128.0), BI_ROUND_NONE);
                } else {
                        assert(sz == 32);
                        bi_index f[2];
                        /* Per-channel (x + 0.5) * 2**8 via a scaled add, then
                         * pack the two f32 results down to v2f16 */
                        for (unsigned i = 0; i < 2; ++i) {
                                f[i] = bi_fadd_rscale_f32(b,
                                                bi_word(offset, i),
                                                bi_imm_f32(0.5), bi_imm_u32(8),
                                                BI_ROUND_NONE, BI_SPECIAL_NONE);
                        }

                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
                }

                /* Convert the packed fp16 positions to 8:8 signed fixed point */
                return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
        }

        case nir_intrinsic_load_barycentric_pixel:
        default:
                return bi_dontcare();
        }
}
1677ec681f3Smrg
1687ec681f3Smrgstatic enum bi_sample
1697ec681f3Smrgbi_interp_for_intrinsic(nir_intrinsic_op op)
1707ec681f3Smrg{
1717ec681f3Smrg        switch (op) {
1727ec681f3Smrg        case nir_intrinsic_load_barycentric_centroid:
1737ec681f3Smrg                return BI_SAMPLE_CENTROID;
1747ec681f3Smrg        case nir_intrinsic_load_barycentric_sample:
1757ec681f3Smrg        case nir_intrinsic_load_barycentric_at_sample:
1767ec681f3Smrg                return BI_SAMPLE_SAMPLE;
1777ec681f3Smrg        case nir_intrinsic_load_barycentric_at_offset:
1787ec681f3Smrg                return BI_SAMPLE_EXPLICIT;
1797ec681f3Smrg        case nir_intrinsic_load_barycentric_pixel:
1807ec681f3Smrg        default:
1817ec681f3Smrg                return BI_SAMPLE_CENTER;
1827ec681f3Smrg        }
1837ec681f3Smrg}
1847ec681f3Smrg
1857ec681f3Smrg/* auto, 64-bit omitted */
1867ec681f3Smrgstatic enum bi_register_format
1877ec681f3Smrgbi_reg_fmt_for_nir(nir_alu_type T)
1887ec681f3Smrg{
1897ec681f3Smrg        switch (T) {
1907ec681f3Smrg        case nir_type_float16: return BI_REGISTER_FORMAT_F16;
1917ec681f3Smrg        case nir_type_float32: return BI_REGISTER_FORMAT_F32;
1927ec681f3Smrg        case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
1937ec681f3Smrg        case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
1947ec681f3Smrg        case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
1957ec681f3Smrg        case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
1967ec681f3Smrg        default: unreachable("Invalid type for register format");
1977ec681f3Smrg        }
1987ec681f3Smrg}
1997ec681f3Smrg
2007ec681f3Smrg/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
2017ec681f3Smrg * immediate to be used (which applies even if _IMM can't be used) */
2027ec681f3Smrg
2037ec681f3Smrgstatic bool
2047ec681f3Smrgbi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
2057ec681f3Smrg{
2067ec681f3Smrg        nir_src *offset = nir_get_io_offset_src(instr);
2077ec681f3Smrg
2087ec681f3Smrg        if (!nir_src_is_const(*offset))
2097ec681f3Smrg                return false;
2107ec681f3Smrg
2117ec681f3Smrg        *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
2127ec681f3Smrg        return (*immediate) < max;
2137ec681f3Smrg}
2147ec681f3Smrg
2157ec681f3Smrgstatic void
2167ec681f3Smrgbi_make_vec_to(bi_builder *b, bi_index final_dst,
2177ec681f3Smrg                bi_index *src,
2187ec681f3Smrg                unsigned *channel,
2197ec681f3Smrg                unsigned count,
2207ec681f3Smrg                unsigned bitsize);
2217ec681f3Smrg
2227ec681f3Smrg/* Bifrost's load instructions lack a component offset despite operating in
2237ec681f3Smrg * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
2247ec681f3Smrg * but they may be unavoidable with separate shaders in use. To solve this, we
2257ec681f3Smrg * lower to a larger load and an explicit copy of the desired components. */
2267ec681f3Smrg
2277ec681f3Smrgstatic void
2287ec681f3Smrgbi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
2297ec681f3Smrg{
2307ec681f3Smrg        unsigned component = nir_intrinsic_component(instr);
2317ec681f3Smrg
2327ec681f3Smrg        if (component == 0)
2337ec681f3Smrg                return;
2347ec681f3Smrg
2357ec681f3Smrg        bi_index srcs[] = { tmp, tmp, tmp, tmp };
2367ec681f3Smrg        unsigned channels[] = { component, component + 1, component + 2 };
2377ec681f3Smrg
2387ec681f3Smrg        bi_make_vec_to(b,
2397ec681f3Smrg                        bi_dest_index(&instr->dest),
2407ec681f3Smrg                        srcs, channels, instr->num_components,
2417ec681f3Smrg                        nir_dest_bit_size(instr->dest));
2427ec681f3Smrg}
2437ec681f3Smrg
/* Emit an attribute load (LD_ATTR), preferring the immediate-indexed variant
 * when the attribute index is a constant below 16. Nonzero destination
 * components are handled by loading into a temporary and copying out. */
static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_alu_type T = nir_intrinsic_dest_type(instr);
        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned component = nir_intrinsic_component(instr);
        /* The encoded vector size spans from the start of the vec4 slot, so
         * it must cover the skipped leading components as well */
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        unsigned imm_index = 0;
        unsigned base = nir_intrinsic_base(instr);
        bool constant = nir_src_is_const(*offset);
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
        /* Loads lack a component offset, so go through a temporary when the
         * desired components don't start at 0 (see bi_copy_component) */
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        if (immediate) {
                /* r61/r62 are passed as the non-index sources -- NOTE(review):
                 * presumably vertex/instance IDs per ABI, confirm */
                bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
                                regfmt, vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(&instr->src[0]);

                /* A constant index too large for the _IMM encoding still
                 * folds base+offset into a single immediate operand */
                if (constant)
                        idx = bi_imm_u32(imm_index);
                else if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
                                idx, regfmt, vecsize);
        }

        bi_copy_component(b, instr, dest);
}
2757ec681f3Smrg
/* Emit a varying load (LD_VAR family). Interpolated ("smooth") inputs take a
 * sample mode and position source derived from the qualifying barycentric
 * intrinsic; flat inputs use the _FLAT variants. Immediate-indexed encodings
 * are used when the varying index is a constant below 20. */
static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum bi_sample sample = BI_SAMPLE_CENTER;
        enum bi_update update = BI_UPDATE_STORE;
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
        bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
        bi_index src0 = bi_null();

        unsigned component = nir_intrinsic_component(instr);
        /* Vector size is encoded from the start of the vec4 slot, covering
         * the leading components copied out afterwards */
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        /* Loads lack a component offset; use a temporary for component > 0
         * (see bi_copy_component) */
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        unsigned sz = nir_dest_bit_size(instr->dest);

        if (smooth) {
                /* The barycentric qualifier arrives as the intrinsic feeding
                 * src[0]; it determines the sample mode and position */
                nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
                assert(parent);

                sample = bi_interp_for_intrinsic(parent->intrinsic);
                src0 = bi_varying_src0_for_barycentric(b, parent);

                assert(sz == 16 || sz == 32);
                regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
                        : BI_REGISTER_FORMAT_F32;
        } else {
                /* Flat inputs are loaded as raw 32-bit words */
                assert(sz == 32);
                regfmt = BI_REGISTER_FORMAT_U32;
        }

        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);

        if (immediate && smooth) {
                bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                vecsize, imm_index);
        } else if (immediate && !smooth) {
                bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
                                vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(offset);
                unsigned base = nir_intrinsic_base(instr);

                /* Fold the intrinsic base into the dynamic index */
                if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                if (smooth) {
                        bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                        update, vecsize);
                } else {
                        bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
                                        regfmt, vecsize);
                }
        }

        bi_copy_component(b, instr, dest);
}
3347ec681f3Smrg
/* Assemble a 16-bit vector dst from (src[i], channel[i]) half-word selects,
 * two halves per iteration since each 32-bit word of dst packs a pair. A NULL
 * channel array means channel 0 everywhere. Odd trailing elements are padded
 * with zero. */
static void
bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
                unsigned *channel, unsigned count)
{
        for (unsigned i = 0; i < count; i += 2) {
                bool next = (i + 1) < count;

                unsigned chan = channel ? channel[i] : 0;
                unsigned nextc = next && channel ? channel[i + 1] : 0;

                /* Locate the 32-bit word and half holding each 16-bit source */
                bi_index w0 = bi_word(src[i], chan >> 1);
                bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();

                bi_index h0 = bi_half(w0, chan & 1);
                bi_index h1 = bi_half(w1, nextc & 1);

                bi_index to = bi_word(dst, i >> 1);

                /* Same source word with an identity (lo, hi) select: plain
                 * 32-bit move. Same word, other selects: one swizzle. Halves
                 * from different words need MKVEC to combine them. */
                if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
                        bi_mov_i32_to(b, to, w0);
                else if (bi_is_word_equiv(w0, w1))
                        bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
                else
                        bi_mkvec_v2i16_to(b, to, h0, h1);
        }
}
3617ec681f3Smrg
/* Assemble a vector final_dst from per-element (src[i], channel[i]) selects
 * of the given bit size. A NULL channel array selects channel 0 for every
 * element. */
static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize)
{
        /* If we read our own output, we need a temporary move to allow for
         * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
         * vectors */
        bool reads_self = false;

        for (unsigned i = 0; i < count; ++i)
                reads_self |= bi_is_equiv(final_dst, src[i]);

        /* SSA can't read itself */
        assert(!reads_self || final_dst.reg);

        bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;

        if (bitsize == 32) {
                /* 32-bit elements map 1:1 onto destination words */
                for (unsigned i = 0; i < count; ++i) {
                        bi_mov_i32_to(b, bi_word(dst, i),
                                        bi_word(src[i], channel ? channel[i] : 0));
                }
        } else if (bitsize == 16) {
                bi_make_vec16_to(b, dst, src, channel, count);
        } else if (bitsize == 8 && count == 1) {
                /* Single 8-bit element: one byte swizzle */
                bi_swz_v4i8_to(b, dst, bi_byte(
                                        bi_word(src[0], channel[0] >> 2),
                                        channel[0] & 3));
        } else {
                unreachable("8-bit mkvec not yet supported");
        }

        /* Emit an explicit copy if needed: one 32-bit move per destination
         * word, where `vec` elements of this bitsize share a word */
        if (!bi_is_equiv(dst, final_dst)) {
                unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0;
                unsigned vec = (1 << shift);

                for (unsigned i = 0; i < count; i += vec) {
                        bi_mov_i32_to(b, bi_word(final_dst, i >> shift),
                                        bi_word(dst, i >> shift));
                }
        }
}
4087ec681f3Smrg
4097ec681f3Smrgstatic bi_instr *
4107ec681f3Smrgbi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
4117ec681f3Smrg                unsigned nr_components, unsigned offset)
4127ec681f3Smrg{
4137ec681f3Smrg        unsigned sysval_ubo =
4147ec681f3Smrg                MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
4157ec681f3Smrg        unsigned uniform =
4167ec681f3Smrg                pan_lookup_sysval(b->shader->sysval_to_id,
4177ec681f3Smrg                                  &b->shader->info->sysvals,
4187ec681f3Smrg                                  sysval);
4197ec681f3Smrg        unsigned idx = (uniform * 16) + offset;
4207ec681f3Smrg
4217ec681f3Smrg        return bi_load_to(b, nr_components * 32, dest,
4227ec681f3Smrg                        bi_imm_u32(idx),
4237ec681f3Smrg                        bi_imm_u32(sysval_ubo), BI_SEG_UBO);
4247ec681f3Smrg}
4257ec681f3Smrg
4267ec681f3Smrgstatic void
4277ec681f3Smrgbi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
4287ec681f3Smrg                unsigned nr_components, unsigned offset)
4297ec681f3Smrg{
4307ec681f3Smrg        bi_load_sysval_to(b, bi_dest_index(&intr->dest),
4317ec681f3Smrg                        panfrost_sysval_for_instr(&intr->instr, NULL),
4327ec681f3Smrg                        nr_components, offset);
4337ec681f3Smrg}
4347ec681f3Smrg
4357ec681f3Smrgstatic bi_index
4367ec681f3Smrgbi_load_sysval(bi_builder *b, int sysval,
4377ec681f3Smrg                unsigned nr_components, unsigned offset)
4387ec681f3Smrg{
4397ec681f3Smrg        bi_index tmp = bi_temp(b->shader);
4407ec681f3Smrg        bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
4417ec681f3Smrg        return tmp;
4427ec681f3Smrg}
4437ec681f3Smrg
4447ec681f3Smrgstatic void
4457ec681f3Smrgbi_load_sample_id_to(bi_builder *b, bi_index dst)
4467ec681f3Smrg{
4477ec681f3Smrg        /* r61[16:23] contains the sampleID, mask it out. Upper bits
4487ec681f3Smrg         * seem to read garbage (despite being architecturally defined
4497ec681f3Smrg         * as zero), so use a 5-bit mask instead of 8-bits */
4507ec681f3Smrg
4517ec681f3Smrg        bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
4527ec681f3Smrg                                bi_imm_u8(16));
4537ec681f3Smrg}
4547ec681f3Smrg
4557ec681f3Smrgstatic bi_index
4567ec681f3Smrgbi_load_sample_id(bi_builder *b)
4577ec681f3Smrg{
4587ec681f3Smrg        bi_index sample_id = bi_temp(b->shader);
4597ec681f3Smrg        bi_load_sample_id_to(b, sample_id);
4607ec681f3Smrg        return sample_id;
4617ec681f3Smrg}
4627ec681f3Smrg
4637ec681f3Smrgstatic bi_index
4647ec681f3Smrgbi_pixel_indices(bi_builder *b, unsigned rt)
4657ec681f3Smrg{
4667ec681f3Smrg        /* We want to load the current pixel. */
4677ec681f3Smrg        struct bifrost_pixel_indices pix = {
4687ec681f3Smrg                .y = BIFROST_CURRENT_PIXEL,
4697ec681f3Smrg                .rt = rt
4707ec681f3Smrg        };
4717ec681f3Smrg
4727ec681f3Smrg        uint32_t indices_u32 = 0;
4737ec681f3Smrg        memcpy(&indices_u32, &pix, sizeof(indices_u32));
4747ec681f3Smrg        bi_index indices = bi_imm_u32(indices_u32);
4757ec681f3Smrg
4767ec681f3Smrg        /* Sample index above is left as zero. For multisampling, we need to
4777ec681f3Smrg         * fill in the actual sample ID in the lower byte */
4787ec681f3Smrg
4797ec681f3Smrg        if (b->shader->inputs->blend.nr_samples > 1)
4807ec681f3Smrg                indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
4817ec681f3Smrg
4827ec681f3Smrg        return indices;
4837ec681f3Smrg}
4847ec681f3Smrg
4857ec681f3Smrgstatic void
4867ec681f3Smrgbi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
4877ec681f3Smrg{
4887ec681f3Smrg        ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
4897ec681f3Smrg
4907ec681f3Smrg        /* Source color is passed through r0-r3, or r4-r7 for the second
4917ec681f3Smrg         * source when dual-source blending.  TODO: Precolour instead */
4927ec681f3Smrg        bi_index srcs[] = {
4937ec681f3Smrg                bi_register(0), bi_register(1), bi_register(2), bi_register(3)
4947ec681f3Smrg        };
4957ec681f3Smrg        bi_index srcs2[] = {
4967ec681f3Smrg                bi_register(4), bi_register(5), bi_register(6), bi_register(7)
4977ec681f3Smrg        };
4987ec681f3Smrg
4997ec681f3Smrg        bool second_source = (sem.location == VARYING_SLOT_VAR0);
5007ec681f3Smrg
5017ec681f3Smrg        bi_make_vec_to(b, bi_dest_index(&instr->dest),
5027ec681f3Smrg                       second_source ? srcs2 : srcs,
5037ec681f3Smrg                       NULL, 4, 32);
5047ec681f3Smrg}
5057ec681f3Smrg
/* Emit the blend operation for an rgba value of NIR type T targeting render
 * target rt, choosing among ST_TILE (multisampled blend shader), BLEND with
 * an inline descriptor (blend shader), or BLEND with a FAU descriptor
 * (fragment shader). Also records the blend type for the render target. */
static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
{
        /* Reads 2 or 4 staging registers to cover the input */
        unsigned size = nir_alu_type_get_type_size(T);
        unsigned sr_count = (size <= 16) ? 2 : 4;
        const struct panfrost_compile_inputs *inputs = b->shader->inputs;
        uint64_t blend_desc = inputs->blend.bifrost_blend_desc;

        if (inputs->is_blend && inputs->blend.nr_samples > 1) {
                /* Conversion descriptor comes from the compile inputs, pixel
                 * indices derived at run time based on sample ID */
                bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60),
                                bi_imm_u32(blend_desc >> 32), BI_VECSIZE_V4);
        } else if (b->shader->inputs->is_blend) {
                /* Blend descriptor comes from the compile inputs */
                /* Put the result in r0 */
                bi_blend_to(b, bi_register(0), rgba,
                                bi_register(60),
                                bi_imm_u32(blend_desc & 0xffffffff),
                                bi_imm_u32(blend_desc >> 32), sr_count);
        } else {
                /* Blend descriptor comes from the FAU RAM. By convention, the
                 * return address is stored in r48 and will be used by the
                 * blend shader to jump back to the fragment shader after */
                bi_blend_to(b, bi_register(48), rgba,
                                bi_register(60),
                                bi_fau(BIR_FAU_BLEND_0 + rt, false),
                                bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);
        }

        /* Record the blend type so the driver can pair the shader with a
         * compatible blend descriptor */
        assert(rt < 8);
        b->shader->info->bifrost.blend[rt].type = T;
}
5407ec681f3Smrg
5417ec681f3Smrg/* Blend shaders do not need to run ATEST since they are dependent on a
5427ec681f3Smrg * fragment shader that runs it. Blit shaders may not need to run ATEST, since
5437ec681f3Smrg * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
5447ec681f3Smrg * there are no writes to the coverage mask. The latter two are satisfied for
5457ec681f3Smrg * all blit shaders, so we just care about early-z, which blit shaders force
5467ec681f3Smrg * iff they do not write depth or stencil */
5477ec681f3Smrg
5487ec681f3Smrgstatic bool
5497ec681f3Smrgbi_skip_atest(bi_context *ctx, bool emit_zs)
5507ec681f3Smrg{
5517ec681f3Smrg        return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
5527ec681f3Smrg}
5537ec681f3Smrg
5547ec681f3Smrgstatic void
5557ec681f3Smrgbi_emit_atest(bi_builder *b, bi_index alpha)
5567ec681f3Smrg{
5577ec681f3Smrg        bi_index coverage = bi_register(60);
5587ec681f3Smrg        bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
5597ec681f3Smrg        b->shader->emitted_atest = true;
5607ec681f3Smrg
5617ec681f3Smrg        /* Pseudo-source to encode in the tuple */
5627ec681f3Smrg        atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
5637ec681f3Smrg}
5647ec681f3Smrg
5657ec681f3Smrgstatic void
5667ec681f3Smrgbi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
5677ec681f3Smrg{
5687ec681f3Smrg        bool combined = instr->intrinsic ==
5697ec681f3Smrg                nir_intrinsic_store_combined_output_pan;
5707ec681f3Smrg
5717ec681f3Smrg        unsigned writeout = combined ? nir_intrinsic_component(instr) :
5727ec681f3Smrg                PAN_WRITEOUT_C;
5737ec681f3Smrg
5747ec681f3Smrg        bool emit_blend = writeout & (PAN_WRITEOUT_C);
5757ec681f3Smrg        bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
5767ec681f3Smrg
5777ec681f3Smrg        const nir_variable *var =
5787ec681f3Smrg                nir_find_variable_with_driver_location(b->shader->nir,
5797ec681f3Smrg                                nir_var_shader_out, nir_intrinsic_base(instr));
5807ec681f3Smrg        assert(var);
5817ec681f3Smrg
5827ec681f3Smrg        unsigned loc = var->data.location;
5837ec681f3Smrg        bi_index src0 = bi_src_index(&instr->src[0]);
5847ec681f3Smrg
5857ec681f3Smrg        /* By ISA convention, the coverage mask is stored in R60. The store
5867ec681f3Smrg         * itself will be handled by a subsequent ATEST instruction */
5877ec681f3Smrg        if (loc == FRAG_RESULT_SAMPLE_MASK) {
5887ec681f3Smrg                bi_index orig = bi_register(60);
5897ec681f3Smrg                bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
5907ec681f3Smrg                bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
5917ec681f3Smrg                bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
5927ec681f3Smrg                return;
5937ec681f3Smrg        }
5947ec681f3Smrg
5957ec681f3Smrg
5967ec681f3Smrg        /* Dual-source blending is implemented by putting the color in
5977ec681f3Smrg         * registers r4-r7. */
5987ec681f3Smrg        if (var->data.index) {
5997ec681f3Smrg                unsigned count = nir_src_num_components(instr->src[0]);
6007ec681f3Smrg
6017ec681f3Smrg                for (unsigned i = 0; i < count; ++i)
6027ec681f3Smrg                        bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));
6037ec681f3Smrg
6047ec681f3Smrg                b->shader->info->bifrost.blend_src1_type =
6057ec681f3Smrg                        nir_intrinsic_src_type(instr);
6067ec681f3Smrg
6077ec681f3Smrg                return;
6087ec681f3Smrg        }
6097ec681f3Smrg
6107ec681f3Smrg        /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
6117ec681f3Smrg         * value, but render target #0 might not be floating point. However the
6127ec681f3Smrg         * alpha value is only used for alpha-to-coverage, a stage which is
6137ec681f3Smrg         * skipped for pure integer framebuffers, so the issue is moot. */
6147ec681f3Smrg
6157ec681f3Smrg        if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
6167ec681f3Smrg                nir_alu_type T = nir_intrinsic_src_type(instr);
6177ec681f3Smrg
6187ec681f3Smrg                bi_index rgba = bi_src_index(&instr->src[0]);
6197ec681f3Smrg                bi_index alpha =
6207ec681f3Smrg                        (T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :
6217ec681f3Smrg                        (T == nir_type_float32) ? bi_word(rgba, 3) :
6227ec681f3Smrg                        bi_dontcare();
6237ec681f3Smrg
6247ec681f3Smrg                /* Don't read out-of-bounds */
6257ec681f3Smrg                if (nir_src_num_components(instr->src[0]) < 4)
6267ec681f3Smrg                        alpha = bi_imm_f32(1.0);
6277ec681f3Smrg
6287ec681f3Smrg                bi_emit_atest(b, alpha);
6297ec681f3Smrg        }
6307ec681f3Smrg
6317ec681f3Smrg        if (emit_zs) {
6327ec681f3Smrg                bi_index z = { 0 }, s = { 0 };
6337ec681f3Smrg
6347ec681f3Smrg                if (writeout & PAN_WRITEOUT_Z)
6357ec681f3Smrg                        z = bi_src_index(&instr->src[2]);
6367ec681f3Smrg
6377ec681f3Smrg                if (writeout & PAN_WRITEOUT_S)
6387ec681f3Smrg                        s = bi_src_index(&instr->src[3]);
6397ec681f3Smrg
6407ec681f3Smrg                bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
6417ec681f3Smrg                                writeout & PAN_WRITEOUT_S,
6427ec681f3Smrg                                writeout & PAN_WRITEOUT_Z);
6437ec681f3Smrg        }
6447ec681f3Smrg
6457ec681f3Smrg        if (emit_blend) {
6467ec681f3Smrg                assert(loc >= FRAG_RESULT_DATA0);
6477ec681f3Smrg
6487ec681f3Smrg                unsigned rt = (loc - FRAG_RESULT_DATA0);
6497ec681f3Smrg                bi_index color = bi_src_index(&instr->src[0]);
6507ec681f3Smrg
6517ec681f3Smrg                /* Explicit copy since BLEND inputs are precoloured to R0-R3,
6527ec681f3Smrg                 * TODO: maybe schedule around this or implement in RA as a
6537ec681f3Smrg                 * spill */
6547ec681f3Smrg                bool has_mrt = false;
6557ec681f3Smrg
6567ec681f3Smrg                nir_foreach_shader_out_variable(var, b->shader->nir)
6577ec681f3Smrg                        has_mrt |= (var->data.location > FRAG_RESULT_DATA0);
6587ec681f3Smrg
6597ec681f3Smrg                if (has_mrt) {
6607ec681f3Smrg                        bi_index srcs[4] = { color, color, color, color };
6617ec681f3Smrg                        unsigned channels[4] = { 0, 1, 2, 3 };
6627ec681f3Smrg                        color = bi_temp(b->shader);
6637ec681f3Smrg                        bi_make_vec_to(b, color, srcs, channels,
6647ec681f3Smrg                                       nir_src_num_components(instr->src[0]),
6657ec681f3Smrg                                       nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
6667ec681f3Smrg                }
6677ec681f3Smrg
6687ec681f3Smrg                bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);
6697ec681f3Smrg        }
6707ec681f3Smrg
6717ec681f3Smrg        if (b->shader->inputs->is_blend) {
6727ec681f3Smrg                /* Jump back to the fragment shader, return address is stored
6737ec681f3Smrg                 * in r48 (see above).
6747ec681f3Smrg                 */
6757ec681f3Smrg                bi_jump(b, bi_register(48));
6767ec681f3Smrg        }
6777ec681f3Smrg}
6787ec681f3Smrg
6797ec681f3Smrgstatic void
6807ec681f3Smrgbi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
6817ec681f3Smrg{
6827ec681f3Smrg        /* In principle we can do better for 16-bit. At the moment we require
6837ec681f3Smrg         * 32-bit to permit the use of .auto, in order to force .u32 for flat
6847ec681f3Smrg         * varyings, to handle internal TGSI shaders that set flat in the VS
6857ec681f3Smrg         * but smooth in the FS */
6867ec681f3Smrg
6877ec681f3Smrg        ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
6887ec681f3Smrg        assert(nir_alu_type_get_type_size(T) == 32);
6897ec681f3Smrg        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
6907ec681f3Smrg
6917ec681f3Smrg        unsigned imm_index = 0;
6927ec681f3Smrg        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
6937ec681f3Smrg
6947ec681f3Smrg        bi_index address;
6957ec681f3Smrg        if (immediate) {
6967ec681f3Smrg                address = bi_lea_attr_imm(b,
6977ec681f3Smrg                                          bi_register(61), bi_register(62),
6987ec681f3Smrg                                          regfmt, imm_index);
6997ec681f3Smrg        } else {
7007ec681f3Smrg                bi_index idx =
7017ec681f3Smrg                        bi_iadd_u32(b,
7027ec681f3Smrg                                    bi_src_index(nir_get_io_offset_src(instr)),
7037ec681f3Smrg                                    bi_imm_u32(nir_intrinsic_base(instr)),
7047ec681f3Smrg                                    false);
7057ec681f3Smrg                address = bi_lea_attr(b,
7067ec681f3Smrg                                      bi_register(61), bi_register(62),
7077ec681f3Smrg                                      idx, regfmt);
7087ec681f3Smrg        }
7097ec681f3Smrg
7107ec681f3Smrg        /* Only look at the total components needed. In effect, we fill in all
7117ec681f3Smrg         * the intermediate "holes" in the write mask, since we can't mask off
7127ec681f3Smrg         * stores. Since nir_lower_io_to_temporaries ensures each varying is
7137ec681f3Smrg         * written at most once, anything that's masked out is undefined, so it
7147ec681f3Smrg         * doesn't matter what we write there. So we may as well do the
7157ec681f3Smrg         * simplest thing possible. */
7167ec681f3Smrg        unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
7177ec681f3Smrg        assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
7187ec681f3Smrg
7197ec681f3Smrg        bi_st_cvt(b, bi_src_index(&instr->src[0]), address,
7207ec681f3Smrg                        bi_word(address, 1), bi_word(address, 2),
7217ec681f3Smrg                        regfmt, nr - 1);
7227ec681f3Smrg}
7237ec681f3Smrg
7247ec681f3Smrgstatic void
7257ec681f3Smrgbi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
7267ec681f3Smrg{
7277ec681f3Smrg        nir_src *offset = nir_get_io_offset_src(instr);
7287ec681f3Smrg
7297ec681f3Smrg        bool offset_is_const = nir_src_is_const(*offset);
7307ec681f3Smrg        bi_index dyn_offset = bi_src_index(offset);
7317ec681f3Smrg        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
7327ec681f3Smrg        bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
7337ec681f3Smrg
7347ec681f3Smrg        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
7357ec681f3Smrg                        bi_dest_index(&instr->dest), offset_is_const ?
7367ec681f3Smrg                        bi_imm_u32(const_offset) : dyn_offset,
7377ec681f3Smrg                        kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),
7387ec681f3Smrg                        BI_SEG_UBO);
7397ec681f3Smrg}
7407ec681f3Smrg
7417ec681f3Smrgstatic bi_index
7427ec681f3Smrgbi_addr_high(nir_src *src)
7437ec681f3Smrg{
7447ec681f3Smrg	return (nir_src_bit_size(*src) == 64) ?
7457ec681f3Smrg		bi_word(bi_src_index(src), 1) : bi_zero();
7467ec681f3Smrg}
7477ec681f3Smrg
7487ec681f3Smrgstatic void
7497ec681f3Smrgbi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
7507ec681f3Smrg{
7517ec681f3Smrg        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
7527ec681f3Smrg                   bi_dest_index(&instr->dest),
7537ec681f3Smrg                   bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),
7547ec681f3Smrg                   seg);
7557ec681f3Smrg}
7567ec681f3Smrg
7577ec681f3Smrgstatic void
7587ec681f3Smrgbi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
7597ec681f3Smrg{
7607ec681f3Smrg        /* Require contiguous masks, gauranteed by nir_lower_wrmasks */
7617ec681f3Smrg        assert(nir_intrinsic_write_mask(instr) ==
7627ec681f3Smrg                        BITFIELD_MASK(instr->num_components));
7637ec681f3Smrg
7647ec681f3Smrg        bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
7657ec681f3Smrg                    bi_src_index(&instr->src[0]),
7667ec681f3Smrg                    bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),
7677ec681f3Smrg                    seg);
7687ec681f3Smrg}
7697ec681f3Smrg
7707ec681f3Smrg/* Exchanges the staging register with memory */
7717ec681f3Smrg
7727ec681f3Smrgstatic void
7737ec681f3Smrgbi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
7747ec681f3Smrg{
7757ec681f3Smrg        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
7767ec681f3Smrg
7777ec681f3Smrg        unsigned sz = nir_src_bit_size(*arg);
7787ec681f3Smrg        assert(sz == 32 || sz == 64);
7797ec681f3Smrg
7807ec681f3Smrg        bi_index data = bi_src_index(arg);
7817ec681f3Smrg
7827ec681f3Smrg        bi_index data_words[] = {
7837ec681f3Smrg                bi_word(data, 0),
7847ec681f3Smrg                bi_word(data, 1),
7857ec681f3Smrg        };
7867ec681f3Smrg
7877ec681f3Smrg        bi_index inout = bi_temp_reg(b->shader);
7887ec681f3Smrg        bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);
7897ec681f3Smrg
7907ec681f3Smrg        bi_axchg_to(b, sz, inout, inout,
7917ec681f3Smrg                        bi_word(addr, 0),
7927ec681f3Smrg                        (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
7937ec681f3Smrg                        seg);
7947ec681f3Smrg
7957ec681f3Smrg        bi_index inout_words[] = {
7967ec681f3Smrg                bi_word(inout, 0),
7977ec681f3Smrg                bi_word(inout, 1),
7987ec681f3Smrg        };
7997ec681f3Smrg
8007ec681f3Smrg        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
8017ec681f3Smrg}
8027ec681f3Smrg
/* Atomic compare-and-exchange: exchanges the second staging register with
 * memory if the comparison with the first staging register passes. The
 * hardware takes both the compare value and the swap value packed into one
 * contiguous staging register, and returns the old memory value in place. */

static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
{
        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

        /* hardware is swapped from NIR: the swap value comes first in the
         * staging register, then the compare value */
        bi_index src0 = bi_src_index(arg_2);
        bi_index src1 = bi_src_index(arg_1);

        unsigned sz = nir_src_bit_size(*arg_1);
        assert(sz == 32 || sz == 64);

        /* Staging layout depends on the operand size:
         *   32-bit: { swap, compare }               (2 words used)
         *   64-bit: { swap.lo, swap.hi, cmp.lo, cmp.hi } (4 words used)
         * Entry [1] selects between the two layouts; entries [2] and [3] are
         * only consumed in the 64-bit case. */
        bi_index data_words[] = {
                bi_word(src0, 0),
                sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),

                /* 64-bit */
                bi_word(src1, 0),
                bi_word(src1, 1),
        };

        bi_index inout = bi_temp_reg(b->shader);
        bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);

        /* WLS addresses are 32-bit offsets, so the high address word is zero */
        bi_acmpxchg_to(b, sz, inout, inout,
                        bi_word(addr, 0),
                        (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
                        seg);

        /* The old memory value is returned in the low word(s) of the staging
         * register */
        bi_index inout_words[] = {
                bi_word(inout, 0),
                bi_word(inout, 1),
        };

        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}
8427ec681f3Smrg
8437ec681f3Smrg/* Extracts an atomic opcode */
8447ec681f3Smrg
8457ec681f3Smrgstatic enum bi_atom_opc
8467ec681f3Smrgbi_atom_opc_for_nir(nir_intrinsic_op op)
8477ec681f3Smrg{
8487ec681f3Smrg        switch (op) {
8497ec681f3Smrg        case nir_intrinsic_global_atomic_add:
8507ec681f3Smrg        case nir_intrinsic_shared_atomic_add:
8517ec681f3Smrg        case nir_intrinsic_image_atomic_add:
8527ec681f3Smrg                return BI_ATOM_OPC_AADD;
8537ec681f3Smrg
8547ec681f3Smrg        case nir_intrinsic_global_atomic_imin:
8557ec681f3Smrg        case nir_intrinsic_shared_atomic_imin:
8567ec681f3Smrg        case nir_intrinsic_image_atomic_imin:
8577ec681f3Smrg                return BI_ATOM_OPC_ASMIN;
8587ec681f3Smrg
8597ec681f3Smrg        case nir_intrinsic_global_atomic_umin:
8607ec681f3Smrg        case nir_intrinsic_shared_atomic_umin:
8617ec681f3Smrg        case nir_intrinsic_image_atomic_umin:
8627ec681f3Smrg                return BI_ATOM_OPC_AUMIN;
8637ec681f3Smrg
8647ec681f3Smrg        case nir_intrinsic_global_atomic_imax:
8657ec681f3Smrg        case nir_intrinsic_shared_atomic_imax:
8667ec681f3Smrg        case nir_intrinsic_image_atomic_imax:
8677ec681f3Smrg                return BI_ATOM_OPC_ASMAX;
8687ec681f3Smrg
8697ec681f3Smrg        case nir_intrinsic_global_atomic_umax:
8707ec681f3Smrg        case nir_intrinsic_shared_atomic_umax:
8717ec681f3Smrg        case nir_intrinsic_image_atomic_umax:
8727ec681f3Smrg                return BI_ATOM_OPC_AUMAX;
8737ec681f3Smrg
8747ec681f3Smrg        case nir_intrinsic_global_atomic_and:
8757ec681f3Smrg        case nir_intrinsic_shared_atomic_and:
8767ec681f3Smrg        case nir_intrinsic_image_atomic_and:
8777ec681f3Smrg                return BI_ATOM_OPC_AAND;
8787ec681f3Smrg
8797ec681f3Smrg        case nir_intrinsic_global_atomic_or:
8807ec681f3Smrg        case nir_intrinsic_shared_atomic_or:
8817ec681f3Smrg        case nir_intrinsic_image_atomic_or:
8827ec681f3Smrg                return BI_ATOM_OPC_AOR;
8837ec681f3Smrg
8847ec681f3Smrg        case nir_intrinsic_global_atomic_xor:
8857ec681f3Smrg        case nir_intrinsic_shared_atomic_xor:
8867ec681f3Smrg        case nir_intrinsic_image_atomic_xor:
8877ec681f3Smrg                return BI_ATOM_OPC_AXOR;
8887ec681f3Smrg
8897ec681f3Smrg        default:
8907ec681f3Smrg                unreachable("Unexpected computational atomic");
8917ec681f3Smrg        }
8927ec681f3Smrg}
8937ec681f3Smrg
8947ec681f3Smrg/* Optimized unary atomics are available with an implied #1 argument */
8957ec681f3Smrg
8967ec681f3Smrgstatic bool
8977ec681f3Smrgbi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
8987ec681f3Smrg{
8997ec681f3Smrg        /* Check we have a compatible constant */
9007ec681f3Smrg        if (arg.type != BI_INDEX_CONSTANT)
9017ec681f3Smrg                return false;
9027ec681f3Smrg
9037ec681f3Smrg        if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
9047ec681f3Smrg                return false;
9057ec681f3Smrg
9067ec681f3Smrg        /* Check for a compatible operation */
9077ec681f3Smrg        switch (op) {
9087ec681f3Smrg        case BI_ATOM_OPC_AADD:
9097ec681f3Smrg                *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
9107ec681f3Smrg                return true;
9117ec681f3Smrg        case BI_ATOM_OPC_ASMAX:
9127ec681f3Smrg                *out = BI_ATOM_OPC_ASMAX1;
9137ec681f3Smrg                return true;
9147ec681f3Smrg        case BI_ATOM_OPC_AUMAX:
9157ec681f3Smrg                *out = BI_ATOM_OPC_AUMAX1;
9167ec681f3Smrg                return true;
9177ec681f3Smrg        case BI_ATOM_OPC_AOR:
9187ec681f3Smrg                *out = BI_ATOM_OPC_AOR1;
9197ec681f3Smrg                return true;
9207ec681f3Smrg        default:
9217ec681f3Smrg                return false;
9227ec681f3Smrg        }
9237ec681f3Smrg}
9247ec681f3Smrg
9257ec681f3Smrg/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */
9267ec681f3Smrg
9277ec681f3Smrgstatic bi_index
9287ec681f3Smrgbi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
9297ec681f3Smrg                    unsigned coord_comps, bool is_array)
9307ec681f3Smrg{
9317ec681f3Smrg        assert(coord_comps > 0 && coord_comps <= 3);
9327ec681f3Smrg
9337ec681f3Smrg        if (src_idx == 0) {
9347ec681f3Smrg                if (coord_comps == 1 || (coord_comps == 2 && is_array))
9357ec681f3Smrg                        return bi_word(coord, 0);
9367ec681f3Smrg                else
9377ec681f3Smrg                        return bi_mkvec_v2i16(b,
9387ec681f3Smrg                                              bi_half(bi_word(coord, 0), false),
9397ec681f3Smrg                                              bi_half(bi_word(coord, 1), false));
9407ec681f3Smrg        } else {
9417ec681f3Smrg                if (coord_comps == 3)
9427ec681f3Smrg                        return bi_word(coord, 2);
9437ec681f3Smrg                else if (coord_comps == 2 && is_array)
9447ec681f3Smrg                        return bi_word(coord, 1);
9457ec681f3Smrg                else
9467ec681f3Smrg                        return bi_zero();
9477ec681f3Smrg        }
9487ec681f3Smrg}
9497ec681f3Smrg
9507ec681f3Smrgstatic bi_index
9517ec681f3Smrgbi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
9527ec681f3Smrg{
9537ec681f3Smrg        nir_src src = instr->src[0];
9547ec681f3Smrg        bi_index index = bi_src_index(&src);
9557ec681f3Smrg        bi_context *ctx = b->shader;
9567ec681f3Smrg
9577ec681f3Smrg        /* Images come after vertex attributes, so handle an explicit offset */
9587ec681f3Smrg        unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
9597ec681f3Smrg                util_bitcount64(ctx->nir->info.inputs_read) : 0;
9607ec681f3Smrg
9617ec681f3Smrg        if (offset == 0)
9627ec681f3Smrg                return index;
9637ec681f3Smrg        else if (nir_src_is_const(src))
9647ec681f3Smrg                return bi_imm_u32(nir_src_as_uint(src) + offset);
9657ec681f3Smrg        else
9667ec681f3Smrg                return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
9677ec681f3Smrg}
9687ec681f3Smrg
9697ec681f3Smrgstatic void
9707ec681f3Smrgbi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
9717ec681f3Smrg{
9727ec681f3Smrg        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
9737ec681f3Smrg        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
9747ec681f3Smrg        bool array = nir_intrinsic_image_array(instr);
9757ec681f3Smrg        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
9767ec681f3Smrg
9777ec681f3Smrg        bi_index coords = bi_src_index(&instr->src[1]);
9787ec681f3Smrg        /* TODO: MSAA */
9797ec681f3Smrg        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
9807ec681f3Smrg
9817ec681f3Smrg        bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),
9827ec681f3Smrg                          bi_emit_image_coord(b, coords, 0, coord_comps, array),
9837ec681f3Smrg                          bi_emit_image_coord(b, coords, 1, coord_comps, array),
9847ec681f3Smrg                          bi_emit_image_index(b, instr),
9857ec681f3Smrg                          bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),
9867ec681f3Smrg                          instr->num_components - 1);
9877ec681f3Smrg}
9887ec681f3Smrg
9897ec681f3Smrgstatic bi_index
9907ec681f3Smrgbi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
9917ec681f3Smrg{
9927ec681f3Smrg        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
9937ec681f3Smrg        bool array = nir_intrinsic_image_array(instr);
9947ec681f3Smrg        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
9957ec681f3Smrg        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
9967ec681f3Smrg
9977ec681f3Smrg        /* TODO: MSAA */
9987ec681f3Smrg        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
9997ec681f3Smrg
10007ec681f3Smrg        enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
10017ec681f3Smrg                bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
10027ec681f3Smrg                BI_REGISTER_FORMAT_AUTO;
10037ec681f3Smrg
10047ec681f3Smrg        bi_index coords = bi_src_index(&instr->src[1]);
10057ec681f3Smrg        bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
10067ec681f3Smrg        bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
10077ec681f3Smrg
10087ec681f3Smrg        bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,
10097ec681f3Smrg                        bi_emit_image_index(b, instr), type);
10107ec681f3Smrg
10117ec681f3Smrg        /* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI
10127ec681f3Smrg         * has all images in the primary attribute table */
10137ec681f3Smrg        I->table = BI_TABLE_ATTRIBUTE_1;
10147ec681f3Smrg
10157ec681f3Smrg        return I->dest[0];
10167ec681f3Smrg}
10177ec681f3Smrg
10187ec681f3Smrgstatic void
10197ec681f3Smrgbi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
10207ec681f3Smrg{
10217ec681f3Smrg        bi_index addr = bi_emit_lea_image(b, instr);
10227ec681f3Smrg
10237ec681f3Smrg        bi_st_cvt(b, bi_src_index(&instr->src[3]),
10247ec681f3Smrg                     addr, bi_word(addr, 1), bi_word(addr, 2),
10257ec681f3Smrg                     bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
10267ec681f3Smrg                     instr->num_components - 1);
10277ec681f3Smrg}
10287ec681f3Smrg
10297ec681f3Smrgstatic void
10307ec681f3Smrgbi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
10317ec681f3Smrg                bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
10327ec681f3Smrg{
10337ec681f3Smrg        /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
10347ec681f3Smrg         * take any vector but can still output in RETURN mode */
10357ec681f3Smrg        bi_index sr = bi_temp_reg(b->shader);
10367ec681f3Smrg
10377ec681f3Smrg        enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
10387ec681f3Smrg        enum bi_atom_opc post_opc = opc;
10397ec681f3Smrg
10407ec681f3Smrg        /* Generate either ATOM_C or ATOM_C1 as required */
10417ec681f3Smrg        if (bi_promote_atom_c1(opc, arg, &opc)) {
10427ec681f3Smrg                bi_patom_c1_i32_to(b, sr, bi_word(addr, 0),
10437ec681f3Smrg                                bi_word(addr, 1), opc, 2);
10447ec681f3Smrg        } else {
10457ec681f3Smrg                bi_mov_i32_to(b, sr, arg);
10467ec681f3Smrg                bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),
10477ec681f3Smrg                                bi_word(addr, 1), opc, 2);
10487ec681f3Smrg        }
10497ec681f3Smrg
10507ec681f3Smrg        /* Post-process it */
10517ec681f3Smrg        bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);
10527ec681f3Smrg}
10537ec681f3Smrg
10547ec681f3Smrg/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
10557ec681f3Smrg * gl_FragCoord.z = ld_vary(fragz)
10567ec681f3Smrg * gl_FragCoord.w = ld_vary(fragw)
10577ec681f3Smrg */
10587ec681f3Smrg
10597ec681f3Smrgstatic void
10607ec681f3Smrgbi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
10617ec681f3Smrg{
10627ec681f3Smrg        bi_index src[4] = {};
10637ec681f3Smrg
10647ec681f3Smrg        for (unsigned i = 0; i < 2; ++i) {
10657ec681f3Smrg                src[i] = bi_fadd_f32(b,
10667ec681f3Smrg                                bi_u16_to_f32(b, bi_half(bi_register(59), i)),
10677ec681f3Smrg                                bi_imm_f32(0.5f), BI_ROUND_NONE);
10687ec681f3Smrg        }
10697ec681f3Smrg
10707ec681f3Smrg        for (unsigned i = 0; i < 2; ++i) {
10717ec681f3Smrg                src[2 + i] = bi_ld_var_special(b, bi_zero(),
10727ec681f3Smrg                                BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
10737ec681f3Smrg                                BI_UPDATE_CLOBBER,
10747ec681f3Smrg                                (i == 0) ? BI_VARYING_NAME_FRAG_Z :
10757ec681f3Smrg                                        BI_VARYING_NAME_FRAG_W,
10767ec681f3Smrg                                BI_VECSIZE_NONE);
10777ec681f3Smrg        }
10787ec681f3Smrg
10797ec681f3Smrg        bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
10807ec681f3Smrg}
10817ec681f3Smrg
10827ec681f3Smrgstatic void
10837ec681f3Smrgbi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
10847ec681f3Smrg{
10857ec681f3Smrg        unsigned rt = b->shader->inputs->blend.rt;
10867ec681f3Smrg        unsigned size = nir_dest_bit_size(instr->dest);
10877ec681f3Smrg
10887ec681f3Smrg        /* Get the render target */
10897ec681f3Smrg        if (!b->shader->inputs->is_blend) {
10907ec681f3Smrg                const nir_variable *var =
10917ec681f3Smrg                        nir_find_variable_with_driver_location(b->shader->nir,
10927ec681f3Smrg                                        nir_var_shader_out, nir_intrinsic_base(instr));
10937ec681f3Smrg                unsigned loc = var->data.location;
10947ec681f3Smrg                assert(loc >= FRAG_RESULT_DATA0);
10957ec681f3Smrg                rt = (loc - FRAG_RESULT_DATA0);
10967ec681f3Smrg        }
10977ec681f3Smrg
10987ec681f3Smrg        bi_index desc = b->shader->inputs->is_blend ?
10997ec681f3Smrg                bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
11007ec681f3Smrg                b->shader->inputs->bifrost.static_rt_conv ?
11017ec681f3Smrg                bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
11027ec681f3Smrg                bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
11037ec681f3Smrg
11047ec681f3Smrg        bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_pixel_indices(b, rt),
11057ec681f3Smrg                        bi_register(60), desc, (instr->num_components - 1));
11067ec681f3Smrg}
11077ec681f3Smrg
11087ec681f3Smrgstatic void
11097ec681f3Smrgbi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
11107ec681f3Smrg{
11117ec681f3Smrg        bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
11127ec681f3Smrg                bi_dest_index(&instr->dest) : bi_null();
11137ec681f3Smrg        gl_shader_stage stage = b->shader->stage;
11147ec681f3Smrg
11157ec681f3Smrg        switch (instr->intrinsic) {
11167ec681f3Smrg        case nir_intrinsic_load_barycentric_pixel:
11177ec681f3Smrg        case nir_intrinsic_load_barycentric_centroid:
11187ec681f3Smrg        case nir_intrinsic_load_barycentric_sample:
11197ec681f3Smrg        case nir_intrinsic_load_barycentric_at_sample:
11207ec681f3Smrg        case nir_intrinsic_load_barycentric_at_offset:
11217ec681f3Smrg                /* handled later via load_vary */
11227ec681f3Smrg                break;
11237ec681f3Smrg        case nir_intrinsic_load_interpolated_input:
11247ec681f3Smrg        case nir_intrinsic_load_input:
11257ec681f3Smrg                if (b->shader->inputs->is_blend)
11267ec681f3Smrg                        bi_emit_load_blend_input(b, instr);
11277ec681f3Smrg                else if (stage == MESA_SHADER_FRAGMENT)
11287ec681f3Smrg                        bi_emit_load_vary(b, instr);
11297ec681f3Smrg                else if (stage == MESA_SHADER_VERTEX)
11307ec681f3Smrg                        bi_emit_load_attr(b, instr);
11317ec681f3Smrg                else
11327ec681f3Smrg                        unreachable("Unsupported shader stage");
11337ec681f3Smrg                break;
11347ec681f3Smrg
11357ec681f3Smrg        case nir_intrinsic_store_output:
11367ec681f3Smrg                if (stage == MESA_SHADER_FRAGMENT)
11377ec681f3Smrg                        bi_emit_fragment_out(b, instr);
11387ec681f3Smrg                else if (stage == MESA_SHADER_VERTEX)
11397ec681f3Smrg                        bi_emit_store_vary(b, instr);
11407ec681f3Smrg                else
11417ec681f3Smrg                        unreachable("Unsupported shader stage");
11427ec681f3Smrg                break;
11437ec681f3Smrg
11447ec681f3Smrg        case nir_intrinsic_store_combined_output_pan:
11457ec681f3Smrg                assert(stage == MESA_SHADER_FRAGMENT);
11467ec681f3Smrg                bi_emit_fragment_out(b, instr);
11477ec681f3Smrg                break;
11487ec681f3Smrg
11497ec681f3Smrg        case nir_intrinsic_load_ubo:
11507ec681f3Smrg        case nir_intrinsic_load_kernel_input:
11517ec681f3Smrg                bi_emit_load_ubo(b, instr);
11527ec681f3Smrg                break;
11537ec681f3Smrg
11547ec681f3Smrg        case nir_intrinsic_load_global:
11557ec681f3Smrg        case nir_intrinsic_load_global_constant:
11567ec681f3Smrg                bi_emit_load(b, instr, BI_SEG_NONE);
11577ec681f3Smrg                break;
11587ec681f3Smrg
11597ec681f3Smrg        case nir_intrinsic_store_global:
11607ec681f3Smrg                bi_emit_store(b, instr, BI_SEG_NONE);
11617ec681f3Smrg                break;
11627ec681f3Smrg
11637ec681f3Smrg        case nir_intrinsic_load_scratch:
11647ec681f3Smrg                bi_emit_load(b, instr, BI_SEG_TL);
11657ec681f3Smrg                break;
11667ec681f3Smrg
11677ec681f3Smrg        case nir_intrinsic_store_scratch:
11687ec681f3Smrg                bi_emit_store(b, instr, BI_SEG_TL);
11697ec681f3Smrg                break;
11707ec681f3Smrg
11717ec681f3Smrg        case nir_intrinsic_load_shared:
11727ec681f3Smrg                bi_emit_load(b, instr, BI_SEG_WLS);
11737ec681f3Smrg                break;
11747ec681f3Smrg
11757ec681f3Smrg        case nir_intrinsic_store_shared:
11767ec681f3Smrg                bi_emit_store(b, instr, BI_SEG_WLS);
11777ec681f3Smrg                break;
11787ec681f3Smrg
11797ec681f3Smrg        /* Blob doesn't seem to do anything for memory barriers, note +BARRIER
11807ec681f3Smrg         * is illegal in fragment shaders */
11817ec681f3Smrg        case nir_intrinsic_memory_barrier:
11827ec681f3Smrg        case nir_intrinsic_memory_barrier_buffer:
11837ec681f3Smrg        case nir_intrinsic_memory_barrier_image:
11847ec681f3Smrg        case nir_intrinsic_memory_barrier_shared:
11857ec681f3Smrg        case nir_intrinsic_group_memory_barrier:
11867ec681f3Smrg                break;
11877ec681f3Smrg
11887ec681f3Smrg        case nir_intrinsic_control_barrier:
11897ec681f3Smrg                assert(b->shader->stage != MESA_SHADER_FRAGMENT);
11907ec681f3Smrg                bi_barrier(b);
11917ec681f3Smrg                break;
11927ec681f3Smrg
11937ec681f3Smrg        case nir_intrinsic_shared_atomic_add:
11947ec681f3Smrg        case nir_intrinsic_shared_atomic_imin:
11957ec681f3Smrg        case nir_intrinsic_shared_atomic_umin:
11967ec681f3Smrg        case nir_intrinsic_shared_atomic_imax:
11977ec681f3Smrg        case nir_intrinsic_shared_atomic_umax:
11987ec681f3Smrg        case nir_intrinsic_shared_atomic_and:
11997ec681f3Smrg        case nir_intrinsic_shared_atomic_or:
12007ec681f3Smrg        case nir_intrinsic_shared_atomic_xor: {
12017ec681f3Smrg                assert(nir_src_bit_size(instr->src[1]) == 32);
12027ec681f3Smrg
12037ec681f3Smrg                bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),
12047ec681f3Smrg                                bi_zero(), false, BI_SEG_WLS);
12057ec681f3Smrg
12067ec681f3Smrg                bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
12077ec681f3Smrg                                instr->intrinsic);
12087ec681f3Smrg                break;
12097ec681f3Smrg        }
12107ec681f3Smrg
12117ec681f3Smrg        case nir_intrinsic_image_atomic_add:
12127ec681f3Smrg        case nir_intrinsic_image_atomic_imin:
12137ec681f3Smrg        case nir_intrinsic_image_atomic_umin:
12147ec681f3Smrg        case nir_intrinsic_image_atomic_imax:
12157ec681f3Smrg        case nir_intrinsic_image_atomic_umax:
12167ec681f3Smrg        case nir_intrinsic_image_atomic_and:
12177ec681f3Smrg        case nir_intrinsic_image_atomic_or:
12187ec681f3Smrg        case nir_intrinsic_image_atomic_xor:
12197ec681f3Smrg                assert(nir_src_bit_size(instr->src[3]) == 32);
12207ec681f3Smrg
12217ec681f3Smrg                bi_emit_atomic_i32_to(b, dst,
12227ec681f3Smrg                                bi_emit_lea_image(b, instr),
12237ec681f3Smrg                                bi_src_index(&instr->src[3]),
12247ec681f3Smrg                                instr->intrinsic);
12257ec681f3Smrg                break;
12267ec681f3Smrg
12277ec681f3Smrg        case nir_intrinsic_global_atomic_add:
12287ec681f3Smrg        case nir_intrinsic_global_atomic_imin:
12297ec681f3Smrg        case nir_intrinsic_global_atomic_umin:
12307ec681f3Smrg        case nir_intrinsic_global_atomic_imax:
12317ec681f3Smrg        case nir_intrinsic_global_atomic_umax:
12327ec681f3Smrg        case nir_intrinsic_global_atomic_and:
12337ec681f3Smrg        case nir_intrinsic_global_atomic_or:
12347ec681f3Smrg        case nir_intrinsic_global_atomic_xor:
12357ec681f3Smrg                assert(nir_src_bit_size(instr->src[1]) == 32);
12367ec681f3Smrg
12377ec681f3Smrg                bi_emit_atomic_i32_to(b, dst,
12387ec681f3Smrg                                bi_src_index(&instr->src[0]),
12397ec681f3Smrg                                bi_src_index(&instr->src[1]),
12407ec681f3Smrg                                instr->intrinsic);
12417ec681f3Smrg                break;
12427ec681f3Smrg
12437ec681f3Smrg        case nir_intrinsic_image_load:
12447ec681f3Smrg                bi_emit_image_load(b, instr);
12457ec681f3Smrg                break;
12467ec681f3Smrg
12477ec681f3Smrg        case nir_intrinsic_image_store:
12487ec681f3Smrg                bi_emit_image_store(b, instr);
12497ec681f3Smrg                break;
12507ec681f3Smrg
12517ec681f3Smrg        case nir_intrinsic_global_atomic_exchange:
12527ec681f3Smrg                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
12537ec681f3Smrg                                &instr->src[1], BI_SEG_NONE);
12547ec681f3Smrg                break;
12557ec681f3Smrg
12567ec681f3Smrg        case nir_intrinsic_image_atomic_exchange:
12577ec681f3Smrg                bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
12587ec681f3Smrg                                &instr->src[3], BI_SEG_NONE);
12597ec681f3Smrg                break;
12607ec681f3Smrg
12617ec681f3Smrg        case nir_intrinsic_shared_atomic_exchange:
12627ec681f3Smrg                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
12637ec681f3Smrg                                &instr->src[1], BI_SEG_WLS);
12647ec681f3Smrg                break;
12657ec681f3Smrg
12667ec681f3Smrg        case nir_intrinsic_global_atomic_comp_swap:
12677ec681f3Smrg                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
12687ec681f3Smrg                                &instr->src[1], &instr->src[2], BI_SEG_NONE);
12697ec681f3Smrg                break;
12707ec681f3Smrg
12717ec681f3Smrg        case nir_intrinsic_image_atomic_comp_swap:
12727ec681f3Smrg                bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
12737ec681f3Smrg                                &instr->src[3], &instr->src[4], BI_SEG_NONE);
12747ec681f3Smrg                break;
12757ec681f3Smrg
12767ec681f3Smrg        case nir_intrinsic_shared_atomic_comp_swap:
12777ec681f3Smrg                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
12787ec681f3Smrg                                &instr->src[1], &instr->src[2], BI_SEG_WLS);
12797ec681f3Smrg                break;
12807ec681f3Smrg
12817ec681f3Smrg        case nir_intrinsic_load_frag_coord:
12827ec681f3Smrg                bi_emit_load_frag_coord(b, instr);
12837ec681f3Smrg                break;
12847ec681f3Smrg
12857ec681f3Smrg        case nir_intrinsic_load_output:
12867ec681f3Smrg                bi_emit_ld_tile(b, instr);
12877ec681f3Smrg                break;
12887ec681f3Smrg
12897ec681f3Smrg        case nir_intrinsic_discard_if: {
12907ec681f3Smrg                bi_index src = bi_src_index(&instr->src[0]);
12917ec681f3Smrg                assert(nir_src_bit_size(instr->src[0]) == 1);
12927ec681f3Smrg                bi_discard_b32(b, bi_half(src, false));
12937ec681f3Smrg                break;
12947ec681f3Smrg        }
12957ec681f3Smrg
12967ec681f3Smrg        case nir_intrinsic_discard:
12977ec681f3Smrg                bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
12987ec681f3Smrg                break;
12997ec681f3Smrg
13007ec681f3Smrg        case nir_intrinsic_load_ssbo_address:
13017ec681f3Smrg                bi_load_sysval_nir(b, instr, 2, 0);
13027ec681f3Smrg                break;
13037ec681f3Smrg
13047ec681f3Smrg        case nir_intrinsic_load_work_dim:
13057ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 0);
13067ec681f3Smrg                break;
13077ec681f3Smrg
13087ec681f3Smrg        case nir_intrinsic_load_first_vertex:
13097ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 0);
13107ec681f3Smrg                break;
13117ec681f3Smrg
13127ec681f3Smrg        case nir_intrinsic_load_base_vertex:
13137ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 4);
13147ec681f3Smrg                break;
13157ec681f3Smrg
13167ec681f3Smrg        case nir_intrinsic_load_base_instance:
13177ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 8);
13187ec681f3Smrg                break;
13197ec681f3Smrg
13207ec681f3Smrg        case nir_intrinsic_load_draw_id:
13217ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 0);
13227ec681f3Smrg                break;
13237ec681f3Smrg
13247ec681f3Smrg        case nir_intrinsic_get_ssbo_size:
13257ec681f3Smrg                bi_load_sysval_nir(b, instr, 1, 8);
13267ec681f3Smrg                break;
13277ec681f3Smrg
13287ec681f3Smrg        case nir_intrinsic_load_viewport_scale:
13297ec681f3Smrg        case nir_intrinsic_load_viewport_offset:
13307ec681f3Smrg        case nir_intrinsic_load_num_workgroups:
13317ec681f3Smrg        case nir_intrinsic_load_workgroup_size:
13327ec681f3Smrg                bi_load_sysval_nir(b, instr, 3, 0);
13337ec681f3Smrg                break;
13347ec681f3Smrg
13357ec681f3Smrg        case nir_intrinsic_image_size:
13367ec681f3Smrg                bi_load_sysval_nir(b, instr,
13377ec681f3Smrg                                nir_dest_num_components(instr->dest), 0);
13387ec681f3Smrg                break;
13397ec681f3Smrg
13407ec681f3Smrg        case nir_intrinsic_load_blend_const_color_rgba:
13417ec681f3Smrg                bi_load_sysval_nir(b, instr,
13427ec681f3Smrg                                   nir_dest_num_components(instr->dest), 0);
13437ec681f3Smrg                break;
13447ec681f3Smrg
13457ec681f3Smrg	case nir_intrinsic_load_sample_positions_pan:
13467ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 0),
13477ec681f3Smrg                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));
13487ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 1),
13497ec681f3Smrg                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
13507ec681f3Smrg                break;
13517ec681f3Smrg
13527ec681f3Smrg	case nir_intrinsic_load_sample_mask_in:
13537ec681f3Smrg                /* r61[0:15] contains the coverage bitmap */
13547ec681f3Smrg                bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
13557ec681f3Smrg                break;
13567ec681f3Smrg
13577ec681f3Smrg        case nir_intrinsic_load_sample_id:
13587ec681f3Smrg                bi_load_sample_id_to(b, dst);
13597ec681f3Smrg                break;
13607ec681f3Smrg
13617ec681f3Smrg	case nir_intrinsic_load_front_face:
13627ec681f3Smrg                /* r58 == 0 means primitive is front facing */
13637ec681f3Smrg                bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
13647ec681f3Smrg                                BI_RESULT_TYPE_M1);
13657ec681f3Smrg                break;
13667ec681f3Smrg
13677ec681f3Smrg        case nir_intrinsic_load_point_coord:
13687ec681f3Smrg                bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
13697ec681f3Smrg                                BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
13707ec681f3Smrg                                BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
13717ec681f3Smrg                break;
13727ec681f3Smrg
13737ec681f3Smrg        case nir_intrinsic_load_vertex_id_zero_base:
13747ec681f3Smrg                bi_mov_i32_to(b, dst, bi_register(61));
13757ec681f3Smrg                break;
13767ec681f3Smrg
13777ec681f3Smrg        case nir_intrinsic_load_instance_id:
13787ec681f3Smrg                bi_mov_i32_to(b, dst, bi_register(62));
13797ec681f3Smrg                break;
13807ec681f3Smrg
13817ec681f3Smrg        case nir_intrinsic_load_subgroup_invocation:
13827ec681f3Smrg                bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
13837ec681f3Smrg                break;
13847ec681f3Smrg
13857ec681f3Smrg        case nir_intrinsic_load_local_invocation_id:
13867ec681f3Smrg                for (unsigned i = 0; i < 3; ++i)
13877ec681f3Smrg                        bi_u16_to_u32_to(b, bi_word(dst, i),
13887ec681f3Smrg                                         bi_half(bi_register(55 + i / 2), i % 2));
13897ec681f3Smrg                break;
13907ec681f3Smrg
13917ec681f3Smrg        case nir_intrinsic_load_workgroup_id:
13927ec681f3Smrg                for (unsigned i = 0; i < 3; ++i)
13937ec681f3Smrg                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));
13947ec681f3Smrg                break;
13957ec681f3Smrg
13967ec681f3Smrg        case nir_intrinsic_load_global_invocation_id:
13977ec681f3Smrg        case nir_intrinsic_load_global_invocation_id_zero_base:
13987ec681f3Smrg                for (unsigned i = 0; i < 3; ++i)
13997ec681f3Smrg                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));
14007ec681f3Smrg                break;
14017ec681f3Smrg
14027ec681f3Smrg        case nir_intrinsic_shader_clock:
14037ec681f3Smrg                bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
14047ec681f3Smrg                break;
14057ec681f3Smrg
14067ec681f3Smrg        default:
14077ec681f3Smrg                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
14087ec681f3Smrg                assert(0);
14097ec681f3Smrg        }
14107ec681f3Smrg}
14117ec681f3Smrg
14127ec681f3Smrgstatic void
14137ec681f3Smrgbi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
14147ec681f3Smrg{
14157ec681f3Smrg        /* Make sure we've been lowered */
14167ec681f3Smrg        assert(instr->def.num_components <= (32 / instr->def.bit_size));
14177ec681f3Smrg
14187ec681f3Smrg        /* Accumulate all the channels of the constant, as if we did an
14197ec681f3Smrg         * implicit SEL over them */
14207ec681f3Smrg        uint32_t acc = 0;
14217ec681f3Smrg
14227ec681f3Smrg        for (unsigned i = 0; i < instr->def.num_components; ++i) {
14237ec681f3Smrg                uint32_t v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
14247ec681f3Smrg
14257ec681f3Smrg                v = bi_extend_constant(v, instr->def.bit_size);
14267ec681f3Smrg                acc |= (v << (i * instr->def.bit_size));
14277ec681f3Smrg        }
14287ec681f3Smrg
14297ec681f3Smrg        bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
14307ec681f3Smrg}
14317ec681f3Smrg
14327ec681f3Smrgstatic bi_index
14337ec681f3Smrgbi_alu_src_index(nir_alu_src src, unsigned comps)
14347ec681f3Smrg{
14357ec681f3Smrg        /* we don't lower modifiers until the backend */
14367ec681f3Smrg        assert(!(src.negate || src.abs));
14377ec681f3Smrg
14387ec681f3Smrg        unsigned bitsize = nir_src_bit_size(src.src);
14397ec681f3Smrg
14407ec681f3Smrg        /* TODO: Do we need to do something more clever with 1-bit bools? */
14417ec681f3Smrg        if (bitsize == 1)
14427ec681f3Smrg                bitsize = 16;
14437ec681f3Smrg
14447ec681f3Smrg        /* the bi_index carries the 32-bit (word) offset separate from the
14457ec681f3Smrg         * subword swizzle, first handle the offset */
14467ec681f3Smrg
14477ec681f3Smrg        unsigned offset = 0;
14487ec681f3Smrg
14497ec681f3Smrg        assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
14507ec681f3Smrg        unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
14517ec681f3Smrg
14527ec681f3Smrg        for (unsigned i = 0; i < comps; ++i) {
14537ec681f3Smrg                unsigned new_offset = (src.swizzle[i] >> subword_shift);
14547ec681f3Smrg
14557ec681f3Smrg                if (i > 0)
14567ec681f3Smrg                        assert(offset == new_offset && "wrong vectorization");
14577ec681f3Smrg
14587ec681f3Smrg                offset = new_offset;
14597ec681f3Smrg        }
14607ec681f3Smrg
14617ec681f3Smrg        bi_index idx = bi_word(bi_src_index(&src.src), offset);
14627ec681f3Smrg
14637ec681f3Smrg        /* Compose the subword swizzle with existing (identity) swizzle */
14647ec681f3Smrg        assert(idx.swizzle == BI_SWIZZLE_H01);
14657ec681f3Smrg
14667ec681f3Smrg        /* Bigger vectors should have been lowered */
14677ec681f3Smrg        assert(comps <= (1 << subword_shift));
14687ec681f3Smrg
14697ec681f3Smrg        if (bitsize == 16) {
14707ec681f3Smrg                unsigned c0 = src.swizzle[0] & 1;
14717ec681f3Smrg                unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
14727ec681f3Smrg                idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
14737ec681f3Smrg        } else if (bitsize == 8) {
14747ec681f3Smrg                /* 8-bit vectors not yet supported */
14757ec681f3Smrg                assert(comps == 1 && "8-bit vectors not supported");
14767ec681f3Smrg                assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
14777ec681f3Smrg                idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
14787ec681f3Smrg        }
14797ec681f3Smrg
14807ec681f3Smrg        return idx;
14817ec681f3Smrg}
14827ec681f3Smrg
14837ec681f3Smrgstatic enum bi_round
14847ec681f3Smrgbi_nir_round(nir_op op)
14857ec681f3Smrg{
14867ec681f3Smrg        switch (op) {
14877ec681f3Smrg        case nir_op_fround_even: return BI_ROUND_NONE;
14887ec681f3Smrg        case nir_op_ftrunc: return BI_ROUND_RTZ;
14897ec681f3Smrg        case nir_op_fceil: return BI_ROUND_RTP;
14907ec681f3Smrg        case nir_op_ffloor: return BI_ROUND_RTN;
14917ec681f3Smrg        default: unreachable("invalid nir round op");
14927ec681f3Smrg        }
14937ec681f3Smrg}
14947ec681f3Smrg
14957ec681f3Smrg/* Convenience for lowered transcendentals */
14967ec681f3Smrg
14977ec681f3Smrgstatic bi_index
14987ec681f3Smrgbi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
14997ec681f3Smrg{
15007ec681f3Smrg        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
15017ec681f3Smrg}
15027ec681f3Smrg
15037ec681f3Smrg/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
15047ec681f3Smrg * Newton-Raphson to improve precision */
15057ec681f3Smrg
15067ec681f3Smrgstatic void
15077ec681f3Smrgbi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
15087ec681f3Smrg{
15097ec681f3Smrg        bi_index x1 = bi_frcp_approx_f32(b, s0);
15107ec681f3Smrg        bi_index m  = bi_frexpm_f32(b, s0, false, false);
15117ec681f3Smrg        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
15127ec681f3Smrg        bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
15137ec681f3Smrg                        bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
15147ec681f3Smrg        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
15157ec681f3Smrg                        BI_ROUND_NONE, BI_SPECIAL_NONE);
15167ec681f3Smrg}
15177ec681f3Smrg
15187ec681f3Smrgstatic void
15197ec681f3Smrgbi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
15207ec681f3Smrg{
15217ec681f3Smrg        bi_index x1 = bi_frsq_approx_f32(b, s0);
15227ec681f3Smrg        bi_index m  = bi_frexpm_f32(b, s0, false, true);
15237ec681f3Smrg        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
15247ec681f3Smrg        bi_index t1 = bi_fmul_f32(b, x1, x1);
15257ec681f3Smrg        bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
15267ec681f3Smrg                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
15277ec681f3Smrg        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
15287ec681f3Smrg                        BI_ROUND_NONE, BI_SPECIAL_N);
15297ec681f3Smrg}
15307ec681f3Smrg
15317ec681f3Smrg/* More complex transcendentals, see
15327ec681f3Smrg * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
15337ec681f3Smrg * for documentation */
15347ec681f3Smrg
15357ec681f3Smrgstatic void
15367ec681f3Smrgbi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
15377ec681f3Smrg{
15387ec681f3Smrg        bi_index t1 = bi_temp(b->shader);
15397ec681f3Smrg        bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
15407ec681f3Smrg                        s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
15417ec681f3Smrg        t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
15427ec681f3Smrg
15437ec681f3Smrg        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);
15447ec681f3Smrg
15457ec681f3Smrg        bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
15467ec681f3Smrg                        s0, bi_neg(t2), BI_ROUND_NONE);
15477ec681f3Smrg        a2->clamp = BI_CLAMP_CLAMP_M1_1;
15487ec681f3Smrg
15497ec681f3Smrg        bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
15507ec681f3Smrg        bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
15517ec681f3Smrg        bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
15527ec681f3Smrg        bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
15537ec681f3Smrg                        bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
15547ec681f3Smrg        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
15557ec681f3Smrg                        bi_imm_u32(0x3f317218), BI_ROUND_NONE);
15567ec681f3Smrg        bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
15577ec681f3Smrg        bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
15587ec681f3Smrg                        p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
15597ec681f3Smrg        x->clamp = BI_CLAMP_CLAMP_0_INF;
15607ec681f3Smrg
15617ec681f3Smrg        bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
15627ec681f3Smrg        max->sem = BI_SEM_NAN_PROPAGATE;
15637ec681f3Smrg}
15647ec681f3Smrg
15657ec681f3Smrgstatic void
15667ec681f3Smrgbi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
15677ec681f3Smrg{
15687ec681f3Smrg        /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24
15697ec681f3Smrg         * fixed-point input */
15707ec681f3Smrg        bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
15717ec681f3Smrg                        bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
15727ec681f3Smrg        bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
15737ec681f3Smrg
15747ec681f3Smrg        /* Compute the result for the fixed-point input, but pass along
15757ec681f3Smrg         * the floating-point scale for correct NaN propagation */
15767ec681f3Smrg        bi_fexp_f32_to(b, dst, fixed_pt, scale);
15777ec681f3Smrg}
15787ec681f3Smrg
15797ec681f3Smrgstatic void
15807ec681f3Smrgbi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
15817ec681f3Smrg{
15827ec681f3Smrg        /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
15837ec681f3Smrg        bi_index a1 = bi_frexpm_f32(b, s0, true, false);
15847ec681f3Smrg        bi_index ei = bi_frexpe_f32(b, s0, true, false);
15857ec681f3Smrg        bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);
15867ec681f3Smrg
15877ec681f3Smrg        /* xt estimates -log(r1), a coarse approximation of log(a1) */
15887ec681f3Smrg        bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
15897ec681f3Smrg        bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
15907ec681f3Smrg
15917ec681f3Smrg        /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
15927ec681f3Smrg         * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
15937ec681f3Smrg         * and then log(s0) = x1 + x2 */
15947ec681f3Smrg        bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);
15957ec681f3Smrg
15967ec681f3Smrg        /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
15977ec681f3Smrg         * polynomial approximation around 1. The series is expressed around
15987ec681f3Smrg         * 1, so set y = (a1 * r1) - 1.0 */
15997ec681f3Smrg        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);
16007ec681f3Smrg
16017ec681f3Smrg        /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
16027ec681f3Smrg         * log_e(1 + y) by the Taylor series (lower precision than the blob):
16037ec681f3Smrg         * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
16047ec681f3Smrg        bi_index loge = bi_fmul_f32(b, y,
16057ec681f3Smrg                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));
16067ec681f3Smrg
16077ec681f3Smrg        bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
16087ec681f3Smrg
16097ec681f3Smrg        /* log(s0) = x1 + x2 */
16107ec681f3Smrg        bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
16117ec681f3Smrg}
16127ec681f3Smrg
16137ec681f3Smrgstatic void
16147ec681f3Smrgbi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
16157ec681f3Smrg{
16167ec681f3Smrg        bi_index frexp = bi_frexpe_f32(b, s0, true, false);
16177ec681f3Smrg        bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
16187ec681f3Smrg        bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
16197ec681f3Smrg        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
16207ec681f3Smrg                        BI_ROUND_NONE);
16217ec681f3Smrg}
16227ec681f3Smrg
16237ec681f3Smrgstatic void
16247ec681f3Smrgbi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
16257ec681f3Smrg{
16267ec681f3Smrg        bi_index log2_base = bi_null();
16277ec681f3Smrg
16287ec681f3Smrg        if (base.type == BI_INDEX_CONSTANT) {
16297ec681f3Smrg                log2_base = bi_imm_f32(log2f(uif(base.value)));
16307ec681f3Smrg        } else {
16317ec681f3Smrg                log2_base = bi_temp(b->shader);
16327ec681f3Smrg                bi_lower_flog2_32(b, log2_base, base);
16337ec681f3Smrg        }
16347ec681f3Smrg
16357ec681f3Smrg        return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
16367ec681f3Smrg}
16377ec681f3Smrg
16387ec681f3Smrgstatic void
16397ec681f3Smrgbi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
16407ec681f3Smrg{
16417ec681f3Smrg        bi_index log2_base = bi_null();
16427ec681f3Smrg
16437ec681f3Smrg        if (base.type == BI_INDEX_CONSTANT) {
16447ec681f3Smrg                log2_base = bi_imm_f32(log2f(uif(base.value)));
16457ec681f3Smrg        } else {
16467ec681f3Smrg                log2_base = bi_temp(b->shader);
16477ec681f3Smrg                bi_flog2_32(b, log2_base, base);
16487ec681f3Smrg        }
16497ec681f3Smrg
16507ec681f3Smrg        return bi_fexp_32(b, dst, exp, log2_base);
16517ec681f3Smrg}
16527ec681f3Smrg
16537ec681f3Smrg/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
16547ec681f3Smrg * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
16557ec681f3Smrg * calculates the results. We use them to calculate sin/cos via a Taylor
16567ec681f3Smrg * approximation:
16577ec681f3Smrg *
16587ec681f3Smrg * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
16597ec681f3Smrg * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
16607ec681f3Smrg * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
16617ec681f3Smrg */
16627ec681f3Smrg
16637ec681f3Smrg#define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
16647ec681f3Smrg#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
16657ec681f3Smrg#define SINCOS_BIAS  bi_imm_u32(0x49400000)
16667ec681f3Smrg
16677ec681f3Smrgstatic void
16687ec681f3Smrgbi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
16697ec681f3Smrg{
16707ec681f3Smrg        /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
16717ec681f3Smrg        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
16727ec681f3Smrg
16737ec681f3Smrg        /* Approximate domain error (small) */
16747ec681f3Smrg        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
16757ec681f3Smrg                                BI_ROUND_NONE),
16767ec681f3Smrg                        MPI_OVER_TWO, s0, BI_ROUND_NONE);
16777ec681f3Smrg
16787ec681f3Smrg        /* Lookup sin(x), cos(x) */
16797ec681f3Smrg        bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
16807ec681f3Smrg        bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
16817ec681f3Smrg
16827ec681f3Smrg        /* e^2 / 2 */
16837ec681f3Smrg        bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
16847ec681f3Smrg                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
16857ec681f3Smrg
16867ec681f3Smrg        /* (-e^2)/2 f''(x) */
16877ec681f3Smrg        bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
16887ec681f3Smrg                        cos ? cosx : sinx,
16897ec681f3Smrg                        bi_negzero(),  BI_ROUND_NONE);
16907ec681f3Smrg
16917ec681f3Smrg        /* e f'(x) - (e^2/2) f''(x) */
16927ec681f3Smrg        bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
16937ec681f3Smrg                        cos ? bi_neg(sinx) : cosx,
16947ec681f3Smrg                        quadratic, BI_ROUND_NONE);
16957ec681f3Smrg        I->clamp = BI_CLAMP_CLAMP_M1_1;
16967ec681f3Smrg
16977ec681f3Smrg        /* f(x) + e f'(x) - (e^2/2) f''(x) */
16987ec681f3Smrg        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
16997ec681f3Smrg}
17007ec681f3Smrg
17017ec681f3Smrg/* The XOR lane op is useful for derivative calculation, but was added in v7.
17027ec681f3Smrg * Add a safe helper that will do the appropriate lowering on v6 */
17037ec681f3Smrg
17047ec681f3Smrgstatic bi_index
17057ec681f3Smrgbi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
17067ec681f3Smrg{
17077ec681f3Smrg        if (b->shader->arch >= 7) {
17087ec681f3Smrg                return bi_clper_i32(b, s0, s1,
17097ec681f3Smrg                                BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
17107ec681f3Smrg                                BI_SUBGROUP_SUBGROUP4);
17117ec681f3Smrg        }
17127ec681f3Smrg
17137ec681f3Smrg        bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
17147ec681f3Smrg        bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
17157ec681f3Smrg        return bi_clper_v6_i32(b, s0, lane);
17167ec681f3Smrg}
17177ec681f3Smrg
17187ec681f3Smrgstatic bi_instr *
17197ec681f3Smrgbi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
17207ec681f3Smrg      bi_index dst, bi_index s0, bi_index s1, bi_index s2)
17217ec681f3Smrg{
17227ec681f3Smrg        /* Handle 1-bit bools as 0/~0 by default and let the optimizer deal
17237ec681f3Smrg         * with the bit patterns later. 0/~0 has the nice property of being
17247ec681f3Smrg         * independent of replicated vectorization. */
17257ec681f3Smrg        if (sz == 1) sz = 16;
17267ec681f3Smrg        bi_index f = bi_zero();
17277ec681f3Smrg        bi_index t = bi_imm_u16(0xFFFF);
17287ec681f3Smrg
17297ec681f3Smrg        switch (op) {
17307ec681f3Smrg        case nir_op_feq:
17317ec681f3Smrg                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
17327ec681f3Smrg        case nir_op_flt:
17337ec681f3Smrg                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
17347ec681f3Smrg        case nir_op_fge:
17357ec681f3Smrg                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
17367ec681f3Smrg        case nir_op_fneu:
17377ec681f3Smrg                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
17387ec681f3Smrg
17397ec681f3Smrg        case nir_op_ieq:
17407ec681f3Smrg                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
17417ec681f3Smrg        case nir_op_ine:
17427ec681f3Smrg                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
17437ec681f3Smrg        case nir_op_ilt:
17447ec681f3Smrg                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
17457ec681f3Smrg        case nir_op_ige:
17467ec681f3Smrg                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
17477ec681f3Smrg        case nir_op_ult:
17487ec681f3Smrg                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
17497ec681f3Smrg        case nir_op_uge:
17507ec681f3Smrg                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
17517ec681f3Smrg
17527ec681f3Smrg        case nir_op_iand:
17537ec681f3Smrg                return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
17547ec681f3Smrg        case nir_op_ior:
17557ec681f3Smrg                return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
17567ec681f3Smrg        case nir_op_ixor:
17577ec681f3Smrg                return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
17587ec681f3Smrg        case nir_op_inot:
17597ec681f3Smrg                return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
17607ec681f3Smrg
17617ec681f3Smrg        case nir_op_f2b1:
17627ec681f3Smrg                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
17637ec681f3Smrg        case nir_op_i2b1:
17647ec681f3Smrg                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
17657ec681f3Smrg        case nir_op_b2b1:
17667ec681f3Smrg                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
17677ec681f3Smrg
17687ec681f3Smrg        case nir_op_bcsel:
17697ec681f3Smrg                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);
17707ec681f3Smrg
17717ec681f3Smrg        default:
17727ec681f3Smrg                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
17737ec681f3Smrg                unreachable("Unhandled boolean ALU instruction");
17747ec681f3Smrg        }
17757ec681f3Smrg}
17767ec681f3Smrg
17777ec681f3Smrgstatic void
17787ec681f3Smrgbi_emit_alu(bi_builder *b, nir_alu_instr *instr)
17797ec681f3Smrg{
17807ec681f3Smrg        bi_index dst = bi_dest_index(&instr->dest.dest);
17817ec681f3Smrg        unsigned srcs = nir_op_infos[instr->op].num_inputs;
17827ec681f3Smrg        unsigned sz = nir_dest_bit_size(instr->dest.dest);
17837ec681f3Smrg        unsigned comps = nir_dest_num_components(instr->dest.dest);
17847ec681f3Smrg        unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
17857ec681f3Smrg        unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0;
17867ec681f3Smrg        bool is_bool = (sz == 1);
17877ec681f3Smrg
17887ec681f3Smrg        /* TODO: Anything else? */
17897ec681f3Smrg        if (sz == 1)
17907ec681f3Smrg                sz = 16;
17917ec681f3Smrg
17927ec681f3Smrg        /* Indicate scalarness */
17937ec681f3Smrg        if (sz == 16 && comps == 1)
17947ec681f3Smrg                dst.swizzle = BI_SWIZZLE_H00;
17957ec681f3Smrg
17967ec681f3Smrg        if (!instr->dest.dest.is_ssa) {
17977ec681f3Smrg                for (unsigned i = 0; i < comps; ++i)
17987ec681f3Smrg                        assert(instr->dest.write_mask);
17997ec681f3Smrg        }
18007ec681f3Smrg
18017ec681f3Smrg        /* First, match against the various moves in NIR. These are
18027ec681f3Smrg         * special-cased because they can operate on vectors even after
18037ec681f3Smrg         * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
18047ec681f3Smrg         * instruction is no "bigger" than SIMD-within-a-register. These moves
18057ec681f3Smrg         * are the exceptions that need to handle swizzles specially. */
18067ec681f3Smrg
18077ec681f3Smrg        switch (instr->op) {
18087ec681f3Smrg        case nir_op_pack_32_2x16:
18097ec681f3Smrg        case nir_op_vec2:
18107ec681f3Smrg        case nir_op_vec3:
18117ec681f3Smrg        case nir_op_vec4: {
18127ec681f3Smrg                bi_index unoffset_srcs[4] = {
18137ec681f3Smrg                        srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
18147ec681f3Smrg                        srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
18157ec681f3Smrg                        srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
18167ec681f3Smrg                        srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
18177ec681f3Smrg                };
18187ec681f3Smrg
18197ec681f3Smrg                unsigned channels[4] = {
18207ec681f3Smrg                        instr->src[0].swizzle[0],
18217ec681f3Smrg                        instr->src[1].swizzle[0],
18227ec681f3Smrg                        srcs > 2 ? instr->src[2].swizzle[0] : 0,
18237ec681f3Smrg                        srcs > 3 ? instr->src[3].swizzle[0] : 0,
18247ec681f3Smrg                };
18257ec681f3Smrg
18267ec681f3Smrg                bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
18277ec681f3Smrg                return;
18287ec681f3Smrg        }
18297ec681f3Smrg
18307ec681f3Smrg        case nir_op_vec8:
18317ec681f3Smrg        case nir_op_vec16:
18327ec681f3Smrg                unreachable("should've been lowered");
18337ec681f3Smrg
18347ec681f3Smrg        case nir_op_unpack_32_2x16:
18357ec681f3Smrg        case nir_op_unpack_64_2x32_split_x:
18367ec681f3Smrg                bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src));
18377ec681f3Smrg                return;
18387ec681f3Smrg
18397ec681f3Smrg        case nir_op_unpack_64_2x32_split_y:
18407ec681f3Smrg                bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1));
18417ec681f3Smrg                return;
18427ec681f3Smrg
18437ec681f3Smrg        case nir_op_pack_64_2x32_split:
18447ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src));
18457ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src));
18467ec681f3Smrg                return;
18477ec681f3Smrg
18487ec681f3Smrg        case nir_op_pack_64_2x32:
18497ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0));
18507ec681f3Smrg                bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1));
18517ec681f3Smrg                return;
18527ec681f3Smrg
18537ec681f3Smrg        case nir_op_mov: {
18547ec681f3Smrg                bi_index idx = bi_src_index(&instr->src[0].src);
18557ec681f3Smrg                bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
18567ec681f3Smrg
18577ec681f3Smrg                unsigned channels[4] = {
18587ec681f3Smrg                        comps > 0 ? instr->src[0].swizzle[0] : 0,
18597ec681f3Smrg                        comps > 1 ? instr->src[0].swizzle[1] : 0,
18607ec681f3Smrg                        comps > 2 ? instr->src[0].swizzle[2] : 0,
18617ec681f3Smrg                        comps > 3 ? instr->src[0].swizzle[3] : 0,
18627ec681f3Smrg                };
18637ec681f3Smrg
18647ec681f3Smrg                if (sz == 1) sz = 16;
18657ec681f3Smrg                bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);
18667ec681f3Smrg                return;
18677ec681f3Smrg        }
18687ec681f3Smrg
18697ec681f3Smrg        case nir_op_f2f16:
18707ec681f3Smrg                assert(src_sz == 32);
18717ec681f3Smrg                bi_index idx = bi_src_index(&instr->src[0].src);
18727ec681f3Smrg                bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
18737ec681f3Smrg                bi_index s1 = comps > 1 ?
18747ec681f3Smrg                        bi_word(idx, instr->src[0].swizzle[1]) : s0;
18757ec681f3Smrg
18767ec681f3Smrg                bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
18777ec681f3Smrg                return;
18787ec681f3Smrg
18797ec681f3Smrg        /* Vectorized downcasts */
18807ec681f3Smrg        case nir_op_u2u16:
18817ec681f3Smrg        case nir_op_i2i16: {
18827ec681f3Smrg                if (!(src_sz == 32 && comps == 2))
18837ec681f3Smrg                        break;
18847ec681f3Smrg
18857ec681f3Smrg                bi_index idx = bi_src_index(&instr->src[0].src);
18867ec681f3Smrg                bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
18877ec681f3Smrg                bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]);
18887ec681f3Smrg
18897ec681f3Smrg                bi_mkvec_v2i16_to(b, dst,
18907ec681f3Smrg                                bi_half(s0, false), bi_half(s1, false));
18917ec681f3Smrg                return;
18927ec681f3Smrg        }
18937ec681f3Smrg
18947ec681f3Smrg        case nir_op_i2i8:
18957ec681f3Smrg        case nir_op_u2u8:
18967ec681f3Smrg        {
18977ec681f3Smrg                /* Acts like an 8-bit swizzle */
18987ec681f3Smrg                bi_index idx = bi_src_index(&instr->src[0].src);
18997ec681f3Smrg                unsigned factor = src_sz / 8;
19007ec681f3Smrg                unsigned chan[4] = { 0 };
19017ec681f3Smrg
19027ec681f3Smrg                for (unsigned i = 0; i < comps; ++i)
19037ec681f3Smrg                        chan[i] = instr->src[0].swizzle[i] * factor;
19047ec681f3Smrg
19057ec681f3Smrg                bi_make_vec_to(b, dst, &idx, chan, comps, 8);
19067ec681f3Smrg                return;
19077ec681f3Smrg        }
19087ec681f3Smrg
19097ec681f3Smrg        default:
19107ec681f3Smrg                break;
19117ec681f3Smrg        }
19127ec681f3Smrg
19137ec681f3Smrg        bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();
19147ec681f3Smrg        bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();
19157ec681f3Smrg        bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null();
19167ec681f3Smrg
19177ec681f3Smrg        if (is_bool) {
19187ec681f3Smrg                bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2);
19197ec681f3Smrg                return;
19207ec681f3Smrg        }
19217ec681f3Smrg
19227ec681f3Smrg        switch (instr->op) {
19237ec681f3Smrg        case nir_op_ffma:
19247ec681f3Smrg                bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
19257ec681f3Smrg                break;
19267ec681f3Smrg
19277ec681f3Smrg        case nir_op_fmul:
19287ec681f3Smrg                bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
19297ec681f3Smrg                break;
19307ec681f3Smrg
19317ec681f3Smrg        case nir_op_fsub:
19327ec681f3Smrg                s1 = bi_neg(s1);
19337ec681f3Smrg                FALLTHROUGH;
19347ec681f3Smrg        case nir_op_fadd:
19357ec681f3Smrg                bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
19367ec681f3Smrg                break;
19377ec681f3Smrg
19387ec681f3Smrg        case nir_op_fsat: {
19397ec681f3Smrg                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
19407ec681f3Smrg                I->clamp = BI_CLAMP_CLAMP_0_1;
19417ec681f3Smrg                break;
19427ec681f3Smrg        }
19437ec681f3Smrg
19447ec681f3Smrg        case nir_op_fsat_signed_mali: {
19457ec681f3Smrg                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
19467ec681f3Smrg                I->clamp = BI_CLAMP_CLAMP_M1_1;
19477ec681f3Smrg                break;
19487ec681f3Smrg        }
19497ec681f3Smrg
19507ec681f3Smrg        case nir_op_fclamp_pos_mali: {
19517ec681f3Smrg                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
19527ec681f3Smrg                I->clamp = BI_CLAMP_CLAMP_0_INF;
19537ec681f3Smrg                break;
19547ec681f3Smrg        }
19557ec681f3Smrg
19567ec681f3Smrg        case nir_op_fneg:
19577ec681f3Smrg                bi_fabsneg_to(b, sz, dst, bi_neg(s0));
19587ec681f3Smrg                break;
19597ec681f3Smrg
19607ec681f3Smrg        case nir_op_fabs:
19617ec681f3Smrg                bi_fabsneg_to(b, sz, dst, bi_abs(s0));
19627ec681f3Smrg                break;
19637ec681f3Smrg
19647ec681f3Smrg        case nir_op_fsin:
19657ec681f3Smrg                bi_lower_fsincos_32(b, dst, s0, false);
19667ec681f3Smrg                break;
19677ec681f3Smrg
19687ec681f3Smrg        case nir_op_fcos:
19697ec681f3Smrg                bi_lower_fsincos_32(b, dst, s0, true);
19707ec681f3Smrg                break;
19717ec681f3Smrg
19727ec681f3Smrg        case nir_op_fexp2:
19737ec681f3Smrg                assert(sz == 32); /* should've been lowered */
19747ec681f3Smrg
19757ec681f3Smrg                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
19767ec681f3Smrg                        bi_lower_fexp2_32(b, dst, s0);
19777ec681f3Smrg                else
19787ec681f3Smrg                        bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
19797ec681f3Smrg
19807ec681f3Smrg                break;
19817ec681f3Smrg
19827ec681f3Smrg        case nir_op_flog2:
19837ec681f3Smrg                assert(sz == 32); /* should've been lowered */
19847ec681f3Smrg
19857ec681f3Smrg                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
19867ec681f3Smrg                        bi_lower_flog2_32(b, dst, s0);
19877ec681f3Smrg                else
19887ec681f3Smrg                        bi_flog2_32(b, dst, s0);
19897ec681f3Smrg
19907ec681f3Smrg                break;
19917ec681f3Smrg
19927ec681f3Smrg        case nir_op_fpow:
19937ec681f3Smrg                assert(sz == 32); /* should've been lowered */
19947ec681f3Smrg
19957ec681f3Smrg                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
19967ec681f3Smrg                        bi_lower_fpow_32(b, dst, s0, s1);
19977ec681f3Smrg                else
19987ec681f3Smrg                        bi_fpow_32(b, dst, s0, s1);
19997ec681f3Smrg
20007ec681f3Smrg                break;
20017ec681f3Smrg
20027ec681f3Smrg        case nir_op_bcsel:
20037ec681f3Smrg                if (src1_sz == 8)
20047ec681f3Smrg                        bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
20057ec681f3Smrg                else
20067ec681f3Smrg                        bi_csel_to(b, nir_type_int, src1_sz,
20077ec681f3Smrg                                        dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);
20087ec681f3Smrg                break;
20097ec681f3Smrg
20107ec681f3Smrg        case nir_op_ishl:
20117ec681f3Smrg                bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
20127ec681f3Smrg                break;
20137ec681f3Smrg        case nir_op_ushr:
20147ec681f3Smrg                bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
20157ec681f3Smrg                break;
20167ec681f3Smrg
20177ec681f3Smrg        case nir_op_ishr:
20187ec681f3Smrg                bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
20197ec681f3Smrg                break;
20207ec681f3Smrg
20217ec681f3Smrg        case nir_op_imin:
20227ec681f3Smrg        case nir_op_umin:
20237ec681f3Smrg                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
20247ec681f3Smrg                                s0, s1, s0, s1, BI_CMPF_LT);
20257ec681f3Smrg                break;
20267ec681f3Smrg
20277ec681f3Smrg        case nir_op_imax:
20287ec681f3Smrg        case nir_op_umax:
20297ec681f3Smrg                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
20307ec681f3Smrg                                s0, s1, s0, s1, BI_CMPF_GT);
20317ec681f3Smrg                break;
20327ec681f3Smrg
20337ec681f3Smrg        case nir_op_fddx_must_abs_mali:
20347ec681f3Smrg        case nir_op_fddy_must_abs_mali: {
20357ec681f3Smrg                bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
20367ec681f3Smrg                bi_index adjacent = bi_clper_xor(b, s0, bit);
20377ec681f3Smrg                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
20387ec681f3Smrg                break;
20397ec681f3Smrg        }
20407ec681f3Smrg
20417ec681f3Smrg        case nir_op_fddx:
20427ec681f3Smrg        case nir_op_fddy: {
20437ec681f3Smrg                bi_index lane1 = bi_lshift_and_i32(b,
20447ec681f3Smrg                                bi_fau(BIR_FAU_LANE_ID, false),
20457ec681f3Smrg                                bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),
20467ec681f3Smrg                                bi_imm_u8(0));
20477ec681f3Smrg
20487ec681f3Smrg                bi_index lane2 = bi_iadd_u32(b, lane1,
20497ec681f3Smrg                                bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2),
20507ec681f3Smrg                                false);
20517ec681f3Smrg
20527ec681f3Smrg                bi_index left, right;
20537ec681f3Smrg
20547ec681f3Smrg                if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
20557ec681f3Smrg                        left = bi_clper_v6_i32(b, s0, lane1);
20567ec681f3Smrg                        right = bi_clper_v6_i32(b, s0, lane2);
20577ec681f3Smrg                } else {
20587ec681f3Smrg                        left = bi_clper_i32(b, s0, lane1,
20597ec681f3Smrg                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
20607ec681f3Smrg                                        BI_SUBGROUP_SUBGROUP4);
20617ec681f3Smrg
20627ec681f3Smrg                        right = bi_clper_i32(b, s0, lane2,
20637ec681f3Smrg                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
20647ec681f3Smrg                                        BI_SUBGROUP_SUBGROUP4);
20657ec681f3Smrg                }
20667ec681f3Smrg
20677ec681f3Smrg                bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
20687ec681f3Smrg                break;
20697ec681f3Smrg        }
20707ec681f3Smrg
20717ec681f3Smrg        case nir_op_f2f32:
20727ec681f3Smrg                bi_f16_to_f32_to(b, dst, s0);
20737ec681f3Smrg                break;
20747ec681f3Smrg
20757ec681f3Smrg        case nir_op_f2i32:
20767ec681f3Smrg                if (src_sz == 32)
20777ec681f3Smrg                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
20787ec681f3Smrg                else
20797ec681f3Smrg                        bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
20807ec681f3Smrg                break;
20817ec681f3Smrg
20827ec681f3Smrg        /* Note 32-bit sources => no vectorization, so 32-bit works */
20837ec681f3Smrg        case nir_op_f2u16:
20847ec681f3Smrg                if (src_sz == 32)
20857ec681f3Smrg                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
20867ec681f3Smrg                else
20877ec681f3Smrg                        bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
20887ec681f3Smrg                break;
20897ec681f3Smrg
20907ec681f3Smrg        case nir_op_f2i16:
20917ec681f3Smrg                if (src_sz == 32)
20927ec681f3Smrg                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
20937ec681f3Smrg                else
20947ec681f3Smrg                        bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
20957ec681f3Smrg                break;
20967ec681f3Smrg
20977ec681f3Smrg        case nir_op_f2u32:
20987ec681f3Smrg                if (src_sz == 32)
20997ec681f3Smrg                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
21007ec681f3Smrg                else
21017ec681f3Smrg                        bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
21027ec681f3Smrg                break;
21037ec681f3Smrg
21047ec681f3Smrg        case nir_op_u2f16:
21057ec681f3Smrg                if (src_sz == 32)
21067ec681f3Smrg                        bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
21077ec681f3Smrg                else if (src_sz == 16)
21087ec681f3Smrg                        bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
21097ec681f3Smrg                else if (src_sz == 8)
21107ec681f3Smrg                        bi_v2u8_to_v2f16_to(b, dst, s0);
21117ec681f3Smrg                break;
21127ec681f3Smrg
21137ec681f3Smrg        case nir_op_u2f32:
21147ec681f3Smrg                if (src_sz == 32)
21157ec681f3Smrg                        bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
21167ec681f3Smrg                else if (src_sz == 16)
21177ec681f3Smrg                        bi_u16_to_f32_to(b, dst, s0);
21187ec681f3Smrg                else
21197ec681f3Smrg                        bi_u8_to_f32_to(b, dst, s0);
21207ec681f3Smrg                break;
21217ec681f3Smrg
21227ec681f3Smrg        case nir_op_i2f16:
21237ec681f3Smrg                if (src_sz == 32)
21247ec681f3Smrg                        bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
21257ec681f3Smrg                else if (src_sz == 16)
21267ec681f3Smrg                        bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
21277ec681f3Smrg                else if (src_sz == 8)
21287ec681f3Smrg                        bi_v2s8_to_v2f16_to(b, dst, s0);
21297ec681f3Smrg                break;
21307ec681f3Smrg
21317ec681f3Smrg        case nir_op_i2f32:
21327ec681f3Smrg                if (src_sz == 32)
21337ec681f3Smrg                        bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
21347ec681f3Smrg                else if (src_sz == 16)
21357ec681f3Smrg                        bi_s16_to_f32_to(b, dst, s0);
21367ec681f3Smrg                else if (src_sz == 8)
21377ec681f3Smrg                        bi_s8_to_f32_to(b, dst, s0);
21387ec681f3Smrg                break;
21397ec681f3Smrg
21407ec681f3Smrg        case nir_op_i2i32:
21417ec681f3Smrg                if (src_sz == 16)
21427ec681f3Smrg                        bi_s16_to_s32_to(b, dst, s0);
21437ec681f3Smrg                else
21447ec681f3Smrg                        bi_s8_to_s32_to(b, dst, s0);
21457ec681f3Smrg                break;
21467ec681f3Smrg
21477ec681f3Smrg        case nir_op_u2u32:
21487ec681f3Smrg                if (src_sz == 16)
21497ec681f3Smrg                        bi_u16_to_u32_to(b, dst, s0);
21507ec681f3Smrg                else
21517ec681f3Smrg                        bi_u8_to_u32_to(b, dst, s0);
21527ec681f3Smrg                break;
21537ec681f3Smrg
21547ec681f3Smrg        case nir_op_i2i16:
21557ec681f3Smrg                assert(src_sz == 8 || src_sz == 32);
21567ec681f3Smrg
21577ec681f3Smrg                if (src_sz == 8)
21587ec681f3Smrg                        bi_v2s8_to_v2s16_to(b, dst, s0);
21597ec681f3Smrg                else
21607ec681f3Smrg                        bi_mov_i32_to(b, dst, s0);
21617ec681f3Smrg                break;
21627ec681f3Smrg
21637ec681f3Smrg        case nir_op_u2u16:
21647ec681f3Smrg                assert(src_sz == 8 || src_sz == 32);
21657ec681f3Smrg
21667ec681f3Smrg                if (src_sz == 8)
21677ec681f3Smrg                        bi_v2u8_to_v2u16_to(b, dst, s0);
21687ec681f3Smrg                else
21697ec681f3Smrg                        bi_mov_i32_to(b, dst, s0);
21707ec681f3Smrg                break;
21717ec681f3Smrg
21727ec681f3Smrg        case nir_op_b2f16:
21737ec681f3Smrg        case nir_op_b2f32:
21747ec681f3Smrg                bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
21757ec681f3Smrg                                (sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0),
21767ec681f3Smrg                                (sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0),
21777ec681f3Smrg                                BI_CMPF_NE);
21787ec681f3Smrg                break;
21797ec681f3Smrg
21807ec681f3Smrg        case nir_op_b2b32:
21817ec681f3Smrg                bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
21827ec681f3Smrg                                bi_imm_u32(~0), bi_zero(), BI_CMPF_NE);
21837ec681f3Smrg                break;
21847ec681f3Smrg
21857ec681f3Smrg        case nir_op_b2i8:
21867ec681f3Smrg        case nir_op_b2i16:
21877ec681f3Smrg        case nir_op_b2i32:
21887ec681f3Smrg                bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0));
21897ec681f3Smrg                break;
21907ec681f3Smrg
21917ec681f3Smrg        case nir_op_fround_even:
21927ec681f3Smrg        case nir_op_fceil:
21937ec681f3Smrg        case nir_op_ffloor:
21947ec681f3Smrg        case nir_op_ftrunc:
21957ec681f3Smrg                bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
21967ec681f3Smrg                break;
21977ec681f3Smrg
21987ec681f3Smrg        case nir_op_fmin:
21997ec681f3Smrg                bi_fmin_to(b, sz, dst, s0, s1);
22007ec681f3Smrg                break;
22017ec681f3Smrg
22027ec681f3Smrg        case nir_op_fmax:
22037ec681f3Smrg                bi_fmax_to(b, sz, dst, s0, s1);
22047ec681f3Smrg                break;
22057ec681f3Smrg
22067ec681f3Smrg        case nir_op_iadd:
22077ec681f3Smrg                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
22087ec681f3Smrg                break;
22097ec681f3Smrg
22107ec681f3Smrg        case nir_op_iadd_sat:
22117ec681f3Smrg                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
22127ec681f3Smrg                break;
22137ec681f3Smrg
22147ec681f3Smrg        case nir_op_uadd_sat:
22157ec681f3Smrg                bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
22167ec681f3Smrg                break;
22177ec681f3Smrg
22187ec681f3Smrg        case nir_op_ihadd:
22197ec681f3Smrg                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
22207ec681f3Smrg                break;
22217ec681f3Smrg
22227ec681f3Smrg        case nir_op_irhadd:
22237ec681f3Smrg                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
22247ec681f3Smrg                break;
22257ec681f3Smrg
22267ec681f3Smrg        case nir_op_ineg:
22277ec681f3Smrg                bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
22287ec681f3Smrg                break;
22297ec681f3Smrg
22307ec681f3Smrg        case nir_op_isub:
22317ec681f3Smrg                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
22327ec681f3Smrg                break;
22337ec681f3Smrg
22347ec681f3Smrg        case nir_op_isub_sat:
22357ec681f3Smrg                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
22367ec681f3Smrg                break;
22377ec681f3Smrg
22387ec681f3Smrg        case nir_op_usub_sat:
22397ec681f3Smrg                bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
22407ec681f3Smrg                break;
22417ec681f3Smrg
22427ec681f3Smrg        case nir_op_imul:
22437ec681f3Smrg                bi_imul_to(b, sz, dst, s0, s1);
22447ec681f3Smrg                break;
22457ec681f3Smrg
22467ec681f3Smrg        case nir_op_iabs:
22477ec681f3Smrg                bi_iabs_to(b, sz, dst, s0);
22487ec681f3Smrg                break;
22497ec681f3Smrg
22507ec681f3Smrg        case nir_op_iand:
22517ec681f3Smrg                bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
22527ec681f3Smrg                break;
22537ec681f3Smrg
22547ec681f3Smrg        case nir_op_ior:
22557ec681f3Smrg                bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
22567ec681f3Smrg                break;
22577ec681f3Smrg
22587ec681f3Smrg        case nir_op_ixor:
22597ec681f3Smrg                bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
22607ec681f3Smrg                break;
22617ec681f3Smrg
22627ec681f3Smrg        case nir_op_inot:
22637ec681f3Smrg                bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
22647ec681f3Smrg                break;
22657ec681f3Smrg
22667ec681f3Smrg        case nir_op_frsq:
22677ec681f3Smrg                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
22687ec681f3Smrg                        bi_lower_frsq_32(b, dst, s0);
22697ec681f3Smrg                else
22707ec681f3Smrg                        bi_frsq_to(b, sz, dst, s0);
22717ec681f3Smrg                break;
22727ec681f3Smrg
22737ec681f3Smrg        case nir_op_frcp:
22747ec681f3Smrg                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
22757ec681f3Smrg                        bi_lower_frcp_32(b, dst, s0);
22767ec681f3Smrg                else
22777ec681f3Smrg                        bi_frcp_to(b, sz, dst, s0);
22787ec681f3Smrg                break;
22797ec681f3Smrg
22807ec681f3Smrg        case nir_op_uclz:
22817ec681f3Smrg                bi_clz_to(b, sz, dst, s0, false);
22827ec681f3Smrg                break;
22837ec681f3Smrg
22847ec681f3Smrg        case nir_op_bit_count:
22857ec681f3Smrg                bi_popcount_i32_to(b, dst, s0);
22867ec681f3Smrg                break;
22877ec681f3Smrg
22887ec681f3Smrg        case nir_op_bitfield_reverse:
22897ec681f3Smrg                bi_bitrev_i32_to(b, dst, s0);
22907ec681f3Smrg                break;
22917ec681f3Smrg
22927ec681f3Smrg        case nir_op_ufind_msb: {
22937ec681f3Smrg                bi_index clz = bi_clz(b, src_sz, s0, false);
22947ec681f3Smrg
22957ec681f3Smrg                if (sz == 8)
22967ec681f3Smrg                        clz = bi_byte(clz, 0);
22977ec681f3Smrg                else if (sz == 16)
22987ec681f3Smrg                        clz = bi_half(clz, false);
22997ec681f3Smrg
23007ec681f3Smrg                bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
23017ec681f3Smrg                break;
23027ec681f3Smrg        }
23037ec681f3Smrg
23047ec681f3Smrg        default:
23057ec681f3Smrg                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
23067ec681f3Smrg                unreachable("Unknown ALU op");
23077ec681f3Smrg        }
23087ec681f3Smrg}
23097ec681f3Smrg
23107ec681f3Smrg/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */
23117ec681f3Smrgstatic unsigned
23127ec681f3Smrgbifrost_tex_format(enum glsl_sampler_dim dim)
23137ec681f3Smrg{
23147ec681f3Smrg        switch (dim) {
23157ec681f3Smrg        case GLSL_SAMPLER_DIM_1D:
23167ec681f3Smrg        case GLSL_SAMPLER_DIM_BUF:
23177ec681f3Smrg                return 1;
23187ec681f3Smrg
23197ec681f3Smrg        case GLSL_SAMPLER_DIM_2D:
23207ec681f3Smrg        case GLSL_SAMPLER_DIM_MS:
23217ec681f3Smrg        case GLSL_SAMPLER_DIM_EXTERNAL:
23227ec681f3Smrg        case GLSL_SAMPLER_DIM_RECT:
23237ec681f3Smrg                return 2;
23247ec681f3Smrg
23257ec681f3Smrg        case GLSL_SAMPLER_DIM_3D:
23267ec681f3Smrg                return 3;
23277ec681f3Smrg
23287ec681f3Smrg        case GLSL_SAMPLER_DIM_CUBE:
23297ec681f3Smrg                return 0;
23307ec681f3Smrg
23317ec681f3Smrg        default:
23327ec681f3Smrg                DBG("Unknown sampler dim type\n");
23337ec681f3Smrg                assert(0);
23347ec681f3Smrg                return 0;
23357ec681f3Smrg        }
23367ec681f3Smrg}
23377ec681f3Smrg
23387ec681f3Smrgstatic enum bifrost_texture_format_full
23397ec681f3Smrgbi_texture_format(nir_alu_type T, enum bi_clamp clamp)
23407ec681f3Smrg{
23417ec681f3Smrg        switch (T) {
23427ec681f3Smrg        case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
23437ec681f3Smrg        case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
23447ec681f3Smrg        case nir_type_uint16:  return BIFROST_TEXTURE_FORMAT_U16;
23457ec681f3Smrg        case nir_type_int16:   return BIFROST_TEXTURE_FORMAT_S16;
23467ec681f3Smrg        case nir_type_uint32:  return BIFROST_TEXTURE_FORMAT_U32;
23477ec681f3Smrg        case nir_type_int32:   return BIFROST_TEXTURE_FORMAT_S32;
23487ec681f3Smrg        default:              unreachable("Invalid type for texturing");
23497ec681f3Smrg        }
23507ec681f3Smrg}
23517ec681f3Smrg
23527ec681f3Smrg/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */
23537ec681f3Smrgstatic bi_index
23547ec681f3Smrgbi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
23557ec681f3Smrg{
23567ec681f3Smrg        /* For (u)int we can just passthrough */
23577ec681f3Smrg        nir_alu_type base = nir_alu_type_get_base_type(T);
23587ec681f3Smrg        if (base == nir_type_int || base == nir_type_uint)
23597ec681f3Smrg                return idx;
23607ec681f3Smrg
23617ec681f3Smrg        /* Otherwise we convert */
23627ec681f3Smrg        assert(T == nir_type_float32);
23637ec681f3Smrg
23647ec681f3Smrg        /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
23657ec681f3Smrg         * Texel Selection") defines the layer to be taken from clamp(RNE(r),
23667ec681f3Smrg         * 0, dt - 1). So we use round RTE, clamping is handled at the data
23677ec681f3Smrg         * structure level */
23687ec681f3Smrg
23697ec681f3Smrg        return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
23707ec681f3Smrg}
23717ec681f3Smrg
/* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
 * 16-bit 8:8 fixed-point format. We lower as:
 *
 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
 */
23787ec681f3Smrg
23797ec681f3Smrgstatic bi_index
23807ec681f3Smrgbi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
23817ec681f3Smrg{
23827ec681f3Smrg        /* Precompute for constant LODs to avoid general constant folding */
23837ec681f3Smrg        if (lod.type == BI_INDEX_CONSTANT) {
23847ec681f3Smrg                uint32_t raw = lod.value;
23857ec681f3Smrg                float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
23867ec681f3Smrg                int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
23877ec681f3Smrg                return bi_imm_u32(s32 & 0xFFFF);
23887ec681f3Smrg        }
23897ec681f3Smrg
23907ec681f3Smrg        /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
23917ec681f3Smrg         * the max LOD (16 since we cap at 2^16 texture dimensions), and
23927ec681f3Smrg         * preferably small to minimize precision loss */
23937ec681f3Smrg        const float max_lod = 16.0;
23947ec681f3Smrg
23957ec681f3Smrg        bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
23967ec681f3Smrg                        fp16 ? bi_half(lod, false) : lod,
23977ec681f3Smrg                        bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);
23987ec681f3Smrg
23997ec681f3Smrg        fsat->clamp = BI_CLAMP_CLAMP_M1_1;
24007ec681f3Smrg
24017ec681f3Smrg        bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
24027ec681f3Smrg                        bi_negzero(), BI_ROUND_NONE);
24037ec681f3Smrg
24047ec681f3Smrg        return bi_mkvec_v2i16(b,
24057ec681f3Smrg                        bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
24067ec681f3Smrg                        bi_imm_u16(0));
24077ec681f3Smrg}
24087ec681f3Smrg
24097ec681f3Smrg/* FETCH takes a 32-bit staging register containing the LOD as an integer in
24107ec681f3Smrg * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
24117ec681f3Smrg * TODO: Cube face.
24127ec681f3Smrg */
24137ec681f3Smrg
static bi_index
bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
{
        /* Computes (lod << 8) | 0 via the fused shift-or instruction.
         * NOTE(review): presumably this positions the integer LOD for
         * FETCH's 32-bit staging register (see the comment above this
         * function) -- confirm the exact bit layout against the ISA docs. */
        return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
}
24197ec681f3Smrg
/* The hardware specifies texel offsets and multisample indices together as a
 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
 * the bits we need and return that to be passed as a staging register. Else we
 * return 0 to avoid allocating a data register when everything is zero. */
24257ec681f3Smrg
24267ec681f3Smrgstatic bi_index
24277ec681f3Smrgbi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
24287ec681f3Smrg{
24297ec681f3Smrg        bi_index dest = bi_zero();
24307ec681f3Smrg
24317ec681f3Smrg        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
24327ec681f3Smrg        if (offs_idx >= 0 &&
24337ec681f3Smrg            (!nir_src_is_const(instr->src[offs_idx].src) ||
24347ec681f3Smrg             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
24357ec681f3Smrg                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
24367ec681f3Smrg                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
24377ec681f3Smrg                dest = bi_mkvec_v4i8(b,
24387ec681f3Smrg                                (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),
24397ec681f3Smrg                                (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),
24407ec681f3Smrg                                (nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),
24417ec681f3Smrg                                bi_imm_u8(0));
24427ec681f3Smrg        }
24437ec681f3Smrg
24447ec681f3Smrg        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
24457ec681f3Smrg        if (ms_idx >= 0 &&
24467ec681f3Smrg            (!nir_src_is_const(instr->src[ms_idx].src) ||
24477ec681f3Smrg             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
24487ec681f3Smrg                dest = bi_lshift_or_i32(b,
24497ec681f3Smrg                                bi_src_index(&instr->src[ms_idx].src), dest,
24507ec681f3Smrg                                bi_imm_u8(24));
24517ec681f3Smrg        }
24527ec681f3Smrg
24537ec681f3Smrg        return dest;
24547ec681f3Smrg}
24557ec681f3Smrg
24567ec681f3Smrgstatic void
24577ec681f3Smrgbi_emit_cube_coord(bi_builder *b, bi_index coord,
24587ec681f3Smrg                    bi_index *face, bi_index *s, bi_index *t)
24597ec681f3Smrg{
24607ec681f3Smrg        /* Compute max { |x|, |y|, |z| } */
24617ec681f3Smrg        bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader),
24627ec681f3Smrg                        bi_temp(b->shader), coord,
24637ec681f3Smrg                        bi_word(coord, 1), bi_word(coord, 2));
24647ec681f3Smrg
24657ec681f3Smrg        /* Select coordinates */
24667ec681f3Smrg
24677ec681f3Smrg        bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
24687ec681f3Smrg                        cubeface->dest[1]);
24697ec681f3Smrg
24707ec681f3Smrg        bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
24717ec681f3Smrg                        cubeface->dest[1]);
24727ec681f3Smrg
24737ec681f3Smrg        /* The OpenGL ES specification requires us to transform an input vector
24747ec681f3Smrg         * (x, y, z) to the coordinate, given the selected S/T:
24757ec681f3Smrg         *
24767ec681f3Smrg         * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
24777ec681f3Smrg         *
24787ec681f3Smrg         * We implement (s shown, t similar) in a form friendlier to FMA
24797ec681f3Smrg         * instructions, and clamp coordinates at the end for correct
24807ec681f3Smrg         * NaN/infinity handling:
24817ec681f3Smrg         *
24827ec681f3Smrg         * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
24837ec681f3Smrg         *
24847ec681f3Smrg         * Take the reciprocal of max{x, y, z}
24857ec681f3Smrg         */
24867ec681f3Smrg
24877ec681f3Smrg        bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);
24887ec681f3Smrg
24897ec681f3Smrg        /* Calculate 0.5 * (1.0 / max{x, y, z}) */
24907ec681f3Smrg        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
24917ec681f3Smrg                        BI_ROUND_NONE);
24927ec681f3Smrg
24937ec681f3Smrg        /* Transform the coordinates */
24947ec681f3Smrg        *s = bi_temp(b->shader);
24957ec681f3Smrg        *t = bi_temp(b->shader);
24967ec681f3Smrg
24977ec681f3Smrg        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
24987ec681f3Smrg                        BI_ROUND_NONE);
24997ec681f3Smrg        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
25007ec681f3Smrg                        BI_ROUND_NONE);
25017ec681f3Smrg
25027ec681f3Smrg        S->clamp = BI_CLAMP_CLAMP_0_1;
25037ec681f3Smrg        T->clamp = BI_CLAMP_CLAMP_0_1;
25047ec681f3Smrg
25057ec681f3Smrg        /* Face index at bit[29:31], matching the cube map descriptor */
25067ec681f3Smrg        *face = cubeface->dest[1];
25077ec681f3Smrg}
25087ec681f3Smrg
25097ec681f3Smrg/* Emits a cube map descriptor, returning lower 32-bits and putting upper
25107ec681f3Smrg * 32-bits in passed pointer t. The packing of the face with the S coordinate
25117ec681f3Smrg * exploits the redundancy of floating points with the range restriction of
25127ec681f3Smrg * CUBEFACE output.
25137ec681f3Smrg *
25147ec681f3Smrg *     struct cube_map_descriptor {
25157ec681f3Smrg *         float s : 29;
25167ec681f3Smrg *         unsigned face : 3;
25177ec681f3Smrg *         float t : 32;
25187ec681f3Smrg *     }
25197ec681f3Smrg *
25207ec681f3Smrg * Since the cube face index is preshifted, this is easy to pack with a bitwise
25217ec681f3Smrg * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3
25227ec681f3Smrg * bits from face.
25237ec681f3Smrg */
25247ec681f3Smrg
25257ec681f3Smrgstatic bi_index
25267ec681f3Smrgbi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
25277ec681f3Smrg{
25287ec681f3Smrg        bi_index face, s;
25297ec681f3Smrg        bi_emit_cube_coord(b, coord, &face, &s, t);
25307ec681f3Smrg        bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
25317ec681f3Smrg        return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
25327ec681f3Smrg}
25337ec681f3Smrg
25347ec681f3Smrg/* Map to the main texture op used. Some of these (txd in particular) will
25357ec681f3Smrg * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
25367ec681f3Smrg * sequence). We assume that lowering is handled elsewhere.
25377ec681f3Smrg */
25387ec681f3Smrg
25397ec681f3Smrgstatic enum bifrost_tex_op
25407ec681f3Smrgbi_tex_op(nir_texop op)
25417ec681f3Smrg{
25427ec681f3Smrg        switch (op) {
25437ec681f3Smrg        case nir_texop_tex:
25447ec681f3Smrg        case nir_texop_txb:
25457ec681f3Smrg        case nir_texop_txl:
25467ec681f3Smrg        case nir_texop_txd:
25477ec681f3Smrg        case nir_texop_tex_prefetch:
25487ec681f3Smrg                return BIFROST_TEX_OP_TEX;
25497ec681f3Smrg        case nir_texop_txf:
25507ec681f3Smrg        case nir_texop_txf_ms:
25517ec681f3Smrg        case nir_texop_txf_ms_fb:
25527ec681f3Smrg        case nir_texop_tg4:
25537ec681f3Smrg                return BIFROST_TEX_OP_FETCH;
25547ec681f3Smrg        case nir_texop_txs:
25557ec681f3Smrg        case nir_texop_lod:
25567ec681f3Smrg        case nir_texop_query_levels:
25577ec681f3Smrg        case nir_texop_texture_samples:
25587ec681f3Smrg        case nir_texop_samples_identical:
25597ec681f3Smrg                unreachable("should've been lowered");
25607ec681f3Smrg        default:
25617ec681f3Smrg                unreachable("unsupported tex op");
25627ec681f3Smrg        }
25637ec681f3Smrg}
25647ec681f3Smrg
25657ec681f3Smrg/* Data registers required by texturing in the order they appear. All are
25667ec681f3Smrg * optional, the texture operation descriptor determines which are present.
25677ec681f3Smrg * Note since 3D arrays are not permitted at an API level, Z_COORD and
25687ec681f3Smrg * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */
25697ec681f3Smrg
25707ec681f3Smrgenum bifrost_tex_dreg {
25717ec681f3Smrg        BIFROST_TEX_DREG_Z_COORD = 0,
25727ec681f3Smrg        BIFROST_TEX_DREG_Y_DELTAS = 1,
25737ec681f3Smrg        BIFROST_TEX_DREG_LOD = 2,
25747ec681f3Smrg        BIFROST_TEX_DREG_GRDESC_HI = 3,
25757ec681f3Smrg        BIFROST_TEX_DREG_SHADOW = 4,
25767ec681f3Smrg        BIFROST_TEX_DREG_ARRAY = 5,
25777ec681f3Smrg        BIFROST_TEX_DREG_OFFSETMS = 6,
25787ec681f3Smrg        BIFROST_TEX_DREG_SAMPLER = 7,
25797ec681f3Smrg        BIFROST_TEX_DREG_TEXTURE = 8,
25807ec681f3Smrg        BIFROST_TEX_DREG_COUNT,
25817ec681f3Smrg};
25827ec681f3Smrg
25837ec681f3Smrgstatic void
25847ec681f3Smrgbi_emit_texc(bi_builder *b, nir_tex_instr *instr)
25857ec681f3Smrg{
25867ec681f3Smrg        bool computed_lod = false;
25877ec681f3Smrg
25887ec681f3Smrg        struct bifrost_texture_operation desc = {
25897ec681f3Smrg                .op = bi_tex_op(instr->op),
25907ec681f3Smrg                .offset_or_bias_disable = false, /* TODO */
25917ec681f3Smrg                .shadow_or_clamp_disable = instr->is_shadow,
25927ec681f3Smrg                .array = instr->is_array,
25937ec681f3Smrg                .dimension = bifrost_tex_format(instr->sampler_dim),
25947ec681f3Smrg                .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
25957ec681f3Smrg                .mask = 0xF,
25967ec681f3Smrg        };
25977ec681f3Smrg
25987ec681f3Smrg        switch (desc.op) {
25997ec681f3Smrg        case BIFROST_TEX_OP_TEX:
26007ec681f3Smrg                desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
26017ec681f3Smrg                computed_lod = true;
26027ec681f3Smrg                break;
26037ec681f3Smrg        case BIFROST_TEX_OP_FETCH:
26047ec681f3Smrg                desc.lod_or_fetch = (enum bifrost_lod_mode)
26057ec681f3Smrg                   (instr->op == nir_texop_tg4 ?
26067ec681f3Smrg                        BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
26077ec681f3Smrg                        BIFROST_TEXTURE_FETCH_TEXEL);
26087ec681f3Smrg                break;
26097ec681f3Smrg        default:
26107ec681f3Smrg                unreachable("texture op unsupported");
26117ec681f3Smrg        }
26127ec681f3Smrg
26137ec681f3Smrg        /* 32-bit indices to be allocated as consecutive staging registers */
26147ec681f3Smrg        bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
26157ec681f3Smrg        bi_index cx = bi_null(), cy = bi_null();
26167ec681f3Smrg
26177ec681f3Smrg        for (unsigned i = 0; i < instr->num_srcs; ++i) {
26187ec681f3Smrg                bi_index index = bi_src_index(&instr->src[i].src);
26197ec681f3Smrg                unsigned sz = nir_src_bit_size(instr->src[i].src);
26207ec681f3Smrg                ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
26217ec681f3Smrg                nir_alu_type T = base | sz;
26227ec681f3Smrg
26237ec681f3Smrg                switch (instr->src[i].src_type) {
26247ec681f3Smrg                case nir_tex_src_coord:
26257ec681f3Smrg                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
26267ec681f3Smrg                                cx = bi_emit_texc_cube_coord(b, index, &cy);
26277ec681f3Smrg			} else {
26287ec681f3Smrg                                unsigned components = nir_src_num_components(instr->src[i].src);
26297ec681f3Smrg
26307ec681f3Smrg                                /* Copy XY (for 2D+) or XX (for 1D) */
26317ec681f3Smrg                                cx = index;
26327ec681f3Smrg                                cy = bi_word(index, MIN2(1, components - 1));
26337ec681f3Smrg
26347ec681f3Smrg                                assert(components >= 1 && components <= 3);
26357ec681f3Smrg
26367ec681f3Smrg                                if (components < 3) {
26377ec681f3Smrg                                        /* nothing to do */
26387ec681f3Smrg                                } else if (desc.array) {
26397ec681f3Smrg                                        /* 2D array */
26407ec681f3Smrg                                        dregs[BIFROST_TEX_DREG_ARRAY] =
26417ec681f3Smrg                                                bi_emit_texc_array_index(b,
26427ec681f3Smrg                                                                bi_word(index, 2), T);
26437ec681f3Smrg                                } else {
26447ec681f3Smrg                                        /* 3D */
26457ec681f3Smrg                                        dregs[BIFROST_TEX_DREG_Z_COORD] =
26467ec681f3Smrg                                                bi_word(index, 2);
26477ec681f3Smrg                                }
26487ec681f3Smrg                        }
26497ec681f3Smrg                        break;
26507ec681f3Smrg
26517ec681f3Smrg                case nir_tex_src_lod:
26527ec681f3Smrg                        if (desc.op == BIFROST_TEX_OP_TEX &&
26537ec681f3Smrg                            nir_src_is_const(instr->src[i].src) &&
26547ec681f3Smrg                            nir_src_as_uint(instr->src[i].src) == 0) {
26557ec681f3Smrg                                desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
26567ec681f3Smrg                        } else if (desc.op == BIFROST_TEX_OP_TEX) {
26577ec681f3Smrg                                assert(base == nir_type_float);
26587ec681f3Smrg
26597ec681f3Smrg                                assert(sz == 16 || sz == 32);
26607ec681f3Smrg                                dregs[BIFROST_TEX_DREG_LOD] =
26617ec681f3Smrg                                        bi_emit_texc_lod_88(b, index, sz == 16);
26627ec681f3Smrg                                desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
26637ec681f3Smrg                        } else {
26647ec681f3Smrg                                assert(desc.op == BIFROST_TEX_OP_FETCH);
26657ec681f3Smrg                                assert(base == nir_type_uint || base == nir_type_int);
26667ec681f3Smrg                                assert(sz == 16 || sz == 32);
26677ec681f3Smrg
26687ec681f3Smrg                                dregs[BIFROST_TEX_DREG_LOD] =
26697ec681f3Smrg                                        bi_emit_texc_lod_cube(b, index);
26707ec681f3Smrg                        }
26717ec681f3Smrg
26727ec681f3Smrg                        break;
26737ec681f3Smrg
26747ec681f3Smrg                case nir_tex_src_bias:
26757ec681f3Smrg                        /* Upper 16-bits interpreted as a clamp, leave zero */
26767ec681f3Smrg                        assert(desc.op == BIFROST_TEX_OP_TEX);
26777ec681f3Smrg                        assert(base == nir_type_float);
26787ec681f3Smrg                        assert(sz == 16 || sz == 32);
26797ec681f3Smrg                        dregs[BIFROST_TEX_DREG_LOD] =
26807ec681f3Smrg                                bi_emit_texc_lod_88(b, index, sz == 16);
26817ec681f3Smrg                        desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
26827ec681f3Smrg                        computed_lod = true;
26837ec681f3Smrg                        break;
26847ec681f3Smrg
26857ec681f3Smrg                case nir_tex_src_ms_index:
26867ec681f3Smrg                case nir_tex_src_offset:
26877ec681f3Smrg                        if (desc.offset_or_bias_disable)
26887ec681f3Smrg                                break;
26897ec681f3Smrg
26907ec681f3Smrg                        dregs[BIFROST_TEX_DREG_OFFSETMS] =
26917ec681f3Smrg	                        bi_emit_texc_offset_ms_index(b, instr);
26927ec681f3Smrg                        if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
26937ec681f3Smrg                                desc.offset_or_bias_disable = true;
26947ec681f3Smrg                        break;
26957ec681f3Smrg
26967ec681f3Smrg                case nir_tex_src_comparator:
26977ec681f3Smrg                        dregs[BIFROST_TEX_DREG_SHADOW] = index;
26987ec681f3Smrg                        break;
26997ec681f3Smrg
27007ec681f3Smrg                case nir_tex_src_texture_offset:
27017ec681f3Smrg                        assert(instr->texture_index == 0);
27027ec681f3Smrg                        dregs[BIFROST_TEX_DREG_TEXTURE] = index;
27037ec681f3Smrg                        break;
27047ec681f3Smrg
27057ec681f3Smrg                case nir_tex_src_sampler_offset:
27067ec681f3Smrg                        assert(instr->sampler_index == 0);
27077ec681f3Smrg                        dregs[BIFROST_TEX_DREG_SAMPLER] = index;
27087ec681f3Smrg                        break;
27097ec681f3Smrg
27107ec681f3Smrg                default:
27117ec681f3Smrg                        unreachable("Unhandled src type in texc emit");
27127ec681f3Smrg                }
27137ec681f3Smrg        }
27147ec681f3Smrg
27157ec681f3Smrg        if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
27167ec681f3Smrg                dregs[BIFROST_TEX_DREG_LOD] =
27177ec681f3Smrg                        bi_emit_texc_lod_cube(b, bi_zero());
27187ec681f3Smrg        }
27197ec681f3Smrg
27207ec681f3Smrg        /* Choose an index mode */
27217ec681f3Smrg
27227ec681f3Smrg        bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
27237ec681f3Smrg        bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
27247ec681f3Smrg        bool direct = direct_tex && direct_samp;
27257ec681f3Smrg
27267ec681f3Smrg        desc.immediate_indices = direct && (instr->sampler_index < 16);
27277ec681f3Smrg
27287ec681f3Smrg        if (desc.immediate_indices) {
27297ec681f3Smrg                desc.sampler_index_or_mode = instr->sampler_index;
27307ec681f3Smrg                desc.index = instr->texture_index;
27317ec681f3Smrg        } else {
27327ec681f3Smrg                enum bifrost_index mode = 0;
27337ec681f3Smrg
27347ec681f3Smrg                if (direct && instr->sampler_index == instr->texture_index) {
27357ec681f3Smrg                        mode = BIFROST_INDEX_IMMEDIATE_SHARED;
27367ec681f3Smrg                        desc.index = instr->texture_index;
27377ec681f3Smrg                } else if (direct) {
27387ec681f3Smrg                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
27397ec681f3Smrg                        desc.index = instr->sampler_index;
27407ec681f3Smrg                        dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
27417ec681f3Smrg                                        bi_imm_u32(instr->texture_index));
27427ec681f3Smrg                } else if (direct_tex) {
27437ec681f3Smrg                        assert(!direct_samp);
27447ec681f3Smrg                        mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
27457ec681f3Smrg                        desc.index = instr->texture_index;
27467ec681f3Smrg                } else if (direct_samp) {
27477ec681f3Smrg                        assert(!direct_tex);
27487ec681f3Smrg                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
27497ec681f3Smrg                        desc.index = instr->sampler_index;
27507ec681f3Smrg                } else {
27517ec681f3Smrg                        mode = BIFROST_INDEX_REGISTER;
27527ec681f3Smrg                }
27537ec681f3Smrg
27547ec681f3Smrg                desc.sampler_index_or_mode = mode | (0x3 << 2);
27557ec681f3Smrg        }
27567ec681f3Smrg
27577ec681f3Smrg        /* Allocate staging registers contiguously by compacting the array.
27587ec681f3Smrg         * Index is not SSA (tied operands) */
27597ec681f3Smrg
27607ec681f3Smrg        unsigned sr_count = 0;
27617ec681f3Smrg
27627ec681f3Smrg        for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
27637ec681f3Smrg                if (!bi_is_null(dregs[i]))
27647ec681f3Smrg                        dregs[sr_count++] = dregs[i];
27657ec681f3Smrg        }
27667ec681f3Smrg
27677ec681f3Smrg        bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null();
27687ec681f3Smrg
27697ec681f3Smrg        if (sr_count)
27707ec681f3Smrg                bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32);
27717ec681f3Smrg
27727ec681f3Smrg        uint32_t desc_u = 0;
27737ec681f3Smrg        memcpy(&desc_u, &desc, sizeof(desc_u));
27747ec681f3Smrg        bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest),
27757ec681f3Smrg                        idx, cx, cy, bi_imm_u32(desc_u), !computed_lod,
27767ec681f3Smrg                        sr_count);
27777ec681f3Smrg
27787ec681f3Smrg        /* Explicit copy to facilitate tied operands */
27797ec681f3Smrg        if (sr_count) {
27807ec681f3Smrg                bi_index srcs[4] = { idx, idx, idx, idx };
27817ec681f3Smrg                unsigned channels[4] = { 0, 1, 2, 3 };
27827ec681f3Smrg                bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32);
27837ec681f3Smrg        }
27847ec681f3Smrg}
27857ec681f3Smrg
27867ec681f3Smrg/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
27877ec681f3Smrg * textures with sufficiently small immediate indices. Anything else
27887ec681f3Smrg * needs a complete texture op. */
27897ec681f3Smrg
27907ec681f3Smrgstatic void
27917ec681f3Smrgbi_emit_texs(bi_builder *b, nir_tex_instr *instr)
27927ec681f3Smrg{
27937ec681f3Smrg        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
27947ec681f3Smrg        assert(coord_idx >= 0);
27957ec681f3Smrg        bi_index coords = bi_src_index(&instr->src[coord_idx].src);
27967ec681f3Smrg
27977ec681f3Smrg        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
27987ec681f3Smrg                bi_index face, s, t;
27997ec681f3Smrg                bi_emit_cube_coord(b, coords, &face, &s, &t);
28007ec681f3Smrg
28017ec681f3Smrg                bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
28027ec681f3Smrg                                bi_dest_index(&instr->dest),
28037ec681f3Smrg                                s, t, face,
28047ec681f3Smrg                                instr->sampler_index, instr->texture_index);
28057ec681f3Smrg        } else {
28067ec681f3Smrg                bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
28077ec681f3Smrg                                bi_dest_index(&instr->dest),
28087ec681f3Smrg                                coords, bi_word(coords, 1),
28097ec681f3Smrg                                instr->op != nir_texop_tex, /* zero LOD */
28107ec681f3Smrg                                instr->sampler_index, instr->texture_index);
28117ec681f3Smrg        }
28127ec681f3Smrg}
28137ec681f3Smrg
28147ec681f3Smrgstatic bool
28157ec681f3Smrgbi_is_simple_tex(nir_tex_instr *instr)
28167ec681f3Smrg{
28177ec681f3Smrg        if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
28187ec681f3Smrg                return false;
28197ec681f3Smrg
28207ec681f3Smrg        if (instr->dest_type != nir_type_float32 &&
28217ec681f3Smrg            instr->dest_type != nir_type_float16)
28227ec681f3Smrg                return false;
28237ec681f3Smrg
28247ec681f3Smrg        if (instr->is_shadow || instr->is_array)
28257ec681f3Smrg                return false;
28267ec681f3Smrg
28277ec681f3Smrg        switch (instr->sampler_dim) {
28287ec681f3Smrg        case GLSL_SAMPLER_DIM_2D:
28297ec681f3Smrg        case GLSL_SAMPLER_DIM_EXTERNAL:
28307ec681f3Smrg        case GLSL_SAMPLER_DIM_RECT:
28317ec681f3Smrg                break;
28327ec681f3Smrg
28337ec681f3Smrg        case GLSL_SAMPLER_DIM_CUBE:
28347ec681f3Smrg                /* LOD can't be specified with TEXS_CUBE */
28357ec681f3Smrg                if (instr->op == nir_texop_txl)
28367ec681f3Smrg                        return false;
28377ec681f3Smrg                break;
28387ec681f3Smrg
28397ec681f3Smrg        default:
28407ec681f3Smrg                return false;
28417ec681f3Smrg        }
28427ec681f3Smrg
28437ec681f3Smrg        for (unsigned i = 0; i < instr->num_srcs; ++i) {
28447ec681f3Smrg                if (instr->src[i].src_type != nir_tex_src_lod &&
28457ec681f3Smrg                    instr->src[i].src_type != nir_tex_src_coord)
28467ec681f3Smrg                        return false;
28477ec681f3Smrg        }
28487ec681f3Smrg
28497ec681f3Smrg        /* Indices need to fit in provided bits */
28507ec681f3Smrg        unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
28517ec681f3Smrg        if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
28527ec681f3Smrg                return false;
28537ec681f3Smrg
28547ec681f3Smrg        int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
28557ec681f3Smrg        if (lod_idx < 0)
28567ec681f3Smrg                return true;
28577ec681f3Smrg
28587ec681f3Smrg        nir_src lod = instr->src[lod_idx].src;
28597ec681f3Smrg        return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
28607ec681f3Smrg}
28617ec681f3Smrg
28627ec681f3Smrgstatic void
28637ec681f3Smrgbi_emit_tex(bi_builder *b, nir_tex_instr *instr)
28647ec681f3Smrg{
28657ec681f3Smrg        switch (instr->op) {
28667ec681f3Smrg        case nir_texop_txs:
28677ec681f3Smrg                bi_load_sysval_to(b, bi_dest_index(&instr->dest),
28687ec681f3Smrg                                panfrost_sysval_for_instr(&instr->instr, NULL),
28697ec681f3Smrg                                4, 0);
28707ec681f3Smrg                return;
28717ec681f3Smrg        case nir_texop_tex:
28727ec681f3Smrg        case nir_texop_txl:
28737ec681f3Smrg        case nir_texop_txb:
28747ec681f3Smrg        case nir_texop_txf:
28757ec681f3Smrg        case nir_texop_txf_ms:
28767ec681f3Smrg        case nir_texop_tg4:
28777ec681f3Smrg                break;
28787ec681f3Smrg        default:
28797ec681f3Smrg                unreachable("Invalid texture operation");
28807ec681f3Smrg        }
28817ec681f3Smrg
28827ec681f3Smrg        if (bi_is_simple_tex(instr))
28837ec681f3Smrg                bi_emit_texs(b, instr);
28847ec681f3Smrg        else
28857ec681f3Smrg                bi_emit_texc(b, instr);
28867ec681f3Smrg}
28877ec681f3Smrg
28887ec681f3Smrgstatic void
28897ec681f3Smrgbi_emit_instr(bi_builder *b, struct nir_instr *instr)
28907ec681f3Smrg{
28917ec681f3Smrg        switch (instr->type) {
28927ec681f3Smrg        case nir_instr_type_load_const:
28937ec681f3Smrg                bi_emit_load_const(b, nir_instr_as_load_const(instr));
28947ec681f3Smrg                break;
28957ec681f3Smrg
28967ec681f3Smrg        case nir_instr_type_intrinsic:
28977ec681f3Smrg                bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
28987ec681f3Smrg                break;
28997ec681f3Smrg
29007ec681f3Smrg        case nir_instr_type_alu:
29017ec681f3Smrg                bi_emit_alu(b, nir_instr_as_alu(instr));
29027ec681f3Smrg                break;
29037ec681f3Smrg
29047ec681f3Smrg        case nir_instr_type_tex:
29057ec681f3Smrg                bi_emit_tex(b, nir_instr_as_tex(instr));
29067ec681f3Smrg                break;
29077ec681f3Smrg
29087ec681f3Smrg        case nir_instr_type_jump:
29097ec681f3Smrg                bi_emit_jump(b, nir_instr_as_jump(instr));
29107ec681f3Smrg                break;
29117ec681f3Smrg
29127ec681f3Smrg        default:
29137ec681f3Smrg                unreachable("should've been lowered");
29147ec681f3Smrg        }
29157ec681f3Smrg}
29167ec681f3Smrg
29177ec681f3Smrgstatic bi_block *
29187ec681f3Smrgcreate_empty_block(bi_context *ctx)
29197ec681f3Smrg{
29207ec681f3Smrg        bi_block *blk = rzalloc(ctx, bi_block);
29217ec681f3Smrg
29227ec681f3Smrg        blk->predecessors = _mesa_set_create(blk,
29237ec681f3Smrg                        _mesa_hash_pointer,
29247ec681f3Smrg                        _mesa_key_pointer_equal);
29257ec681f3Smrg
29267ec681f3Smrg        return blk;
29277ec681f3Smrg}
29287ec681f3Smrg
29297ec681f3Smrgstatic bi_block *
29307ec681f3Smrgemit_block(bi_context *ctx, nir_block *block)
29317ec681f3Smrg{
29327ec681f3Smrg        if (ctx->after_block) {
29337ec681f3Smrg                ctx->current_block = ctx->after_block;
29347ec681f3Smrg                ctx->after_block = NULL;
29357ec681f3Smrg        } else {
29367ec681f3Smrg                ctx->current_block = create_empty_block(ctx);
29377ec681f3Smrg        }
29387ec681f3Smrg
29397ec681f3Smrg        list_addtail(&ctx->current_block->link, &ctx->blocks);
29407ec681f3Smrg        list_inithead(&ctx->current_block->instructions);
29417ec681f3Smrg
29427ec681f3Smrg        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
29437ec681f3Smrg
29447ec681f3Smrg        nir_foreach_instr(instr, block) {
29457ec681f3Smrg                bi_emit_instr(&_b, instr);
29467ec681f3Smrg                ++ctx->instruction_count;
29477ec681f3Smrg        }
29487ec681f3Smrg
29497ec681f3Smrg        return ctx->current_block;
29507ec681f3Smrg}
29517ec681f3Smrg
29527ec681f3Smrgstatic void
29537ec681f3Smrgemit_if(bi_context *ctx, nir_if *nif)
29547ec681f3Smrg{
29557ec681f3Smrg        bi_block *before_block = ctx->current_block;
29567ec681f3Smrg
29577ec681f3Smrg        /* Speculatively emit the branch, but we can't fill it in until later */
29587ec681f3Smrg        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
29597ec681f3Smrg        bi_instr *then_branch = bi_branchz_i16(&_b,
29607ec681f3Smrg                        bi_half(bi_src_index(&nif->condition), false),
29617ec681f3Smrg                        bi_zero(), BI_CMPF_EQ);
29627ec681f3Smrg
29637ec681f3Smrg        /* Emit the two subblocks. */
29647ec681f3Smrg        bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
29657ec681f3Smrg        bi_block *end_then_block = ctx->current_block;
29667ec681f3Smrg
29677ec681f3Smrg        /* Emit second block, and check if it's empty */
29687ec681f3Smrg
29697ec681f3Smrg        int count_in = ctx->instruction_count;
29707ec681f3Smrg        bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
29717ec681f3Smrg        bi_block *end_else_block = ctx->current_block;
29727ec681f3Smrg        ctx->after_block = create_empty_block(ctx);
29737ec681f3Smrg
29747ec681f3Smrg        /* Now that we have the subblocks emitted, fix up the branches */
29757ec681f3Smrg
29767ec681f3Smrg        assert(then_block);
29777ec681f3Smrg        assert(else_block);
29787ec681f3Smrg
29797ec681f3Smrg        if (ctx->instruction_count == count_in) {
29807ec681f3Smrg                then_branch->branch_target = ctx->after_block;
29817ec681f3Smrg                bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */
29827ec681f3Smrg        } else {
29837ec681f3Smrg                then_branch->branch_target = else_block;
29847ec681f3Smrg
29857ec681f3Smrg                /* Emit a jump from the end of the then block to the end of the else */
29867ec681f3Smrg                _b.cursor = bi_after_block(end_then_block);
29877ec681f3Smrg                bi_instr *then_exit = bi_jump(&_b, bi_zero());
29887ec681f3Smrg                then_exit->branch_target = ctx->after_block;
29897ec681f3Smrg
29907ec681f3Smrg                bi_block_add_successor(end_then_block, then_exit->branch_target);
29917ec681f3Smrg                bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
29927ec681f3Smrg        }
29937ec681f3Smrg
29947ec681f3Smrg        bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
29957ec681f3Smrg        bi_block_add_successor(before_block, then_block); /* fallthrough */
29967ec681f3Smrg}
29977ec681f3Smrg
29987ec681f3Smrgstatic void
29997ec681f3Smrgemit_loop(bi_context *ctx, nir_loop *nloop)
30007ec681f3Smrg{
30017ec681f3Smrg        /* Remember where we are */
30027ec681f3Smrg        bi_block *start_block = ctx->current_block;
30037ec681f3Smrg
30047ec681f3Smrg        bi_block *saved_break = ctx->break_block;
30057ec681f3Smrg        bi_block *saved_continue = ctx->continue_block;
30067ec681f3Smrg
30077ec681f3Smrg        ctx->continue_block = create_empty_block(ctx);
30087ec681f3Smrg        ctx->break_block = create_empty_block(ctx);
30097ec681f3Smrg        ctx->after_block = ctx->continue_block;
30107ec681f3Smrg
30117ec681f3Smrg        /* Emit the body itself */
30127ec681f3Smrg        emit_cf_list(ctx, &nloop->body);
30137ec681f3Smrg
30147ec681f3Smrg        /* Branch back to loop back */
30157ec681f3Smrg        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
30167ec681f3Smrg        bi_instr *I = bi_jump(&_b, bi_zero());
30177ec681f3Smrg        I->branch_target = ctx->continue_block;
30187ec681f3Smrg        bi_block_add_successor(start_block, ctx->continue_block);
30197ec681f3Smrg        bi_block_add_successor(ctx->current_block, ctx->continue_block);
30207ec681f3Smrg
30217ec681f3Smrg        ctx->after_block = ctx->break_block;
30227ec681f3Smrg
30237ec681f3Smrg        /* Pop off */
30247ec681f3Smrg        ctx->break_block = saved_break;
30257ec681f3Smrg        ctx->continue_block = saved_continue;
30267ec681f3Smrg        ++ctx->loop_count;
30277ec681f3Smrg}
30287ec681f3Smrg
30297ec681f3Smrgstatic bi_block *
30307ec681f3Smrgemit_cf_list(bi_context *ctx, struct exec_list *list)
30317ec681f3Smrg{
30327ec681f3Smrg        bi_block *start_block = NULL;
30337ec681f3Smrg
30347ec681f3Smrg        foreach_list_typed(nir_cf_node, node, node, list) {
30357ec681f3Smrg                switch (node->type) {
30367ec681f3Smrg                case nir_cf_node_block: {
30377ec681f3Smrg                        bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
30387ec681f3Smrg
30397ec681f3Smrg                        if (!start_block)
30407ec681f3Smrg                                start_block = block;
30417ec681f3Smrg
30427ec681f3Smrg                        break;
30437ec681f3Smrg                }
30447ec681f3Smrg
30457ec681f3Smrg                case nir_cf_node_if:
30467ec681f3Smrg                        emit_if(ctx, nir_cf_node_as_if(node));
30477ec681f3Smrg                        break;
30487ec681f3Smrg
30497ec681f3Smrg                case nir_cf_node_loop:
30507ec681f3Smrg                        emit_loop(ctx, nir_cf_node_as_loop(node));
30517ec681f3Smrg                        break;
30527ec681f3Smrg
30537ec681f3Smrg                default:
30547ec681f3Smrg                        unreachable("Unknown control flow");
30557ec681f3Smrg                }
30567ec681f3Smrg        }
30577ec681f3Smrg
30587ec681f3Smrg        return start_block;
30597ec681f3Smrg}
30607ec681f3Smrg
30617ec681f3Smrg/* shader-db stuff */
30627ec681f3Smrg
/* Aggregate statistics gathered over a compiled shader for shader-db */
struct bi_stats {
        unsigned nr_clauses, nr_tuples, nr_ins;             /* raw counts */
        unsigned nr_arith, nr_texture, nr_varying, nr_ldst; /* per-engine work */
};
30677ec681f3Smrg
30687ec681f3Smrgstatic void
30697ec681f3Smrgbi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
30707ec681f3Smrg{
30717ec681f3Smrg        /* Count instructions */
30727ec681f3Smrg        stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
30737ec681f3Smrg
30747ec681f3Smrg        /* Non-message passing tuples are always arithmetic */
30757ec681f3Smrg        if (tuple->add != clause->message) {
30767ec681f3Smrg                stats->nr_arith++;
30777ec681f3Smrg                return;
30787ec681f3Smrg        }
30797ec681f3Smrg
30807ec681f3Smrg        /* Message + FMA we'll count as arithmetic _and_ message */
30817ec681f3Smrg        if (tuple->fma)
30827ec681f3Smrg                stats->nr_arith++;
30837ec681f3Smrg
30847ec681f3Smrg        switch (clause->message_type) {
30857ec681f3Smrg        case BIFROST_MESSAGE_VARYING:
30867ec681f3Smrg                /* Check components interpolated */
30877ec681f3Smrg                stats->nr_varying += (clause->message->vecsize + 1) *
30887ec681f3Smrg                        (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
30897ec681f3Smrg                break;
30907ec681f3Smrg
30917ec681f3Smrg        case BIFROST_MESSAGE_VARTEX:
30927ec681f3Smrg                /* 2 coordinates, fp32 each */
30937ec681f3Smrg                stats->nr_varying += (2 * 2);
30947ec681f3Smrg                FALLTHROUGH;
30957ec681f3Smrg        case BIFROST_MESSAGE_TEX:
30967ec681f3Smrg                stats->nr_texture++;
30977ec681f3Smrg                break;
30987ec681f3Smrg
30997ec681f3Smrg        case BIFROST_MESSAGE_ATTRIBUTE:
31007ec681f3Smrg        case BIFROST_MESSAGE_LOAD:
31017ec681f3Smrg        case BIFROST_MESSAGE_STORE:
31027ec681f3Smrg        case BIFROST_MESSAGE_ATOMIC:
31037ec681f3Smrg                stats->nr_ldst++;
31047ec681f3Smrg                break;
31057ec681f3Smrg
31067ec681f3Smrg        case BIFROST_MESSAGE_NONE:
31077ec681f3Smrg        case BIFROST_MESSAGE_BARRIER:
31087ec681f3Smrg        case BIFROST_MESSAGE_BLEND:
31097ec681f3Smrg        case BIFROST_MESSAGE_TILE:
31107ec681f3Smrg        case BIFROST_MESSAGE_Z_STENCIL:
31117ec681f3Smrg        case BIFROST_MESSAGE_ATEST:
31127ec681f3Smrg        case BIFROST_MESSAGE_JOB:
31137ec681f3Smrg        case BIFROST_MESSAGE_64BIT:
31147ec681f3Smrg                /* Nothing to do */
31157ec681f3Smrg                break;
31167ec681f3Smrg        };
31177ec681f3Smrg
31187ec681f3Smrg}
31197ec681f3Smrg
31207ec681f3Smrgstatic void
31217ec681f3Smrgbi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
31227ec681f3Smrg{
31237ec681f3Smrg        struct bi_stats stats = { 0 };
31247ec681f3Smrg
31257ec681f3Smrg        /* Count instructions, clauses, and tuples. Also attempt to construct
31267ec681f3Smrg         * normalized execution engine cycle counts, using the following ratio:
31277ec681f3Smrg         *
31287ec681f3Smrg         * 24 arith tuples/cycle
31297ec681f3Smrg         * 2 texture messages/cycle
31307ec681f3Smrg         * 16 x 16-bit varying channels interpolated/cycle
31317ec681f3Smrg         * 1 load store message/cycle
31327ec681f3Smrg         *
31337ec681f3Smrg         * These numbers seem to match Arm Mobile Studio's heuristic. The real
31347ec681f3Smrg         * cycle counts are surely more complicated.
31357ec681f3Smrg         */
31367ec681f3Smrg
31377ec681f3Smrg        bi_foreach_block(ctx, block) {
31387ec681f3Smrg                bi_foreach_clause_in_block(block, clause) {
31397ec681f3Smrg                        stats.nr_clauses++;
31407ec681f3Smrg                        stats.nr_tuples += clause->tuple_count;
31417ec681f3Smrg
31427ec681f3Smrg                        for (unsigned i = 0; i < clause->tuple_count; ++i)
31437ec681f3Smrg                                bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
31447ec681f3Smrg                }
31457ec681f3Smrg        }
31467ec681f3Smrg
31477ec681f3Smrg        float cycles_arith = ((float) stats.nr_arith) / 24.0;
31487ec681f3Smrg        float cycles_texture = ((float) stats.nr_texture) / 2.0;
31497ec681f3Smrg        float cycles_varying = ((float) stats.nr_varying) / 16.0;
31507ec681f3Smrg        float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
31517ec681f3Smrg
31527ec681f3Smrg        float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
31537ec681f3Smrg        float cycles_bound = MAX2(cycles_arith, cycles_message);
31547ec681f3Smrg
31557ec681f3Smrg        /* Thread count and register pressure are traded off only on v7 */
31567ec681f3Smrg        bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);
31577ec681f3Smrg        unsigned nr_threads = full_threads ? 2 : 1;
31587ec681f3Smrg
31597ec681f3Smrg        /* Dump stats */
31607ec681f3Smrg
31617ec681f3Smrg        fprintf(stderr, "%s - %s shader: "
31627ec681f3Smrg                        "%u inst, %u tuples, %u clauses, "
31637ec681f3Smrg                        "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
31647ec681f3Smrg                        "%u quadwords, %u threads, %u loops, "
31657ec681f3Smrg                        "%u:%u spills:fills\n",
31667ec681f3Smrg                        ctx->nir->info.label ?: "",
31677ec681f3Smrg                        ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
31687ec681f3Smrg                        gl_shader_stage_name(ctx->stage),
31697ec681f3Smrg                        stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
31707ec681f3Smrg                        cycles_bound, cycles_arith, cycles_texture,
31717ec681f3Smrg                        cycles_varying, cycles_ldst,
31727ec681f3Smrg                        size / 16, nr_threads,
31737ec681f3Smrg                        ctx->loop_count,
31747ec681f3Smrg                        ctx->spills, ctx->fills);
31757ec681f3Smrg}
31767ec681f3Smrg
/* nir_lower_io type-size callback: number of attribute slots occupied by a
 * GLSL type. The bindless flag is ignored here (false is always passed to
 * glsl_count_attribute_slots). */
static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
        return glsl_count_attribute_slots(type, false);
}
31827ec681f3Smrg
31837ec681f3Smrg/* Split stores to memory. We don't split stores to vertex outputs, since
31847ec681f3Smrg * nir_lower_io_to_temporaries will ensure there's only a single write.
31857ec681f3Smrg */
31867ec681f3Smrg
31877ec681f3Smrgstatic bool
31887ec681f3Smrgshould_split_wrmask(const nir_instr *instr, UNUSED const void *data)
31897ec681f3Smrg{
31907ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
31917ec681f3Smrg
31927ec681f3Smrg        switch (intr->intrinsic) {
31937ec681f3Smrg        case nir_intrinsic_store_ssbo:
31947ec681f3Smrg        case nir_intrinsic_store_shared:
31957ec681f3Smrg        case nir_intrinsic_store_global:
31967ec681f3Smrg        case nir_intrinsic_store_scratch:
31977ec681f3Smrg                return true;
31987ec681f3Smrg        default:
31997ec681f3Smrg                return false;
32007ec681f3Smrg        }
32017ec681f3Smrg}
32027ec681f3Smrg
32037ec681f3Smrg/* Bifrost wants transcendentals as FP32 */
32047ec681f3Smrg
32057ec681f3Smrgstatic unsigned
32067ec681f3Smrgbi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
32077ec681f3Smrg{
32087ec681f3Smrg        if (instr->type != nir_instr_type_alu)
32097ec681f3Smrg                return 0;
32107ec681f3Smrg
32117ec681f3Smrg        nir_alu_instr *alu = nir_instr_as_alu(instr);
32127ec681f3Smrg
32137ec681f3Smrg        switch (alu->op) {
32147ec681f3Smrg        case nir_op_fexp2:
32157ec681f3Smrg        case nir_op_flog2:
32167ec681f3Smrg        case nir_op_fpow:
32177ec681f3Smrg        case nir_op_fsin:
32187ec681f3Smrg        case nir_op_fcos:
32197ec681f3Smrg                return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
32207ec681f3Smrg        default:
32217ec681f3Smrg                return 0;
32227ec681f3Smrg        }
32237ec681f3Smrg}
32247ec681f3Smrg
32257ec681f3Smrg/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
32267ec681f3Smrg * transcendentals are an exception. Also shifts because of lane size mismatch
32277ec681f3Smrg * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need
32287ec681f3Smrg * to be scalarized due to type size. */
32297ec681f3Smrg
32307ec681f3Smrgstatic bool
32317ec681f3Smrgbi_vectorize_filter(const nir_instr *instr, void *data)
32327ec681f3Smrg{
32337ec681f3Smrg        /* Defaults work for everything else */
32347ec681f3Smrg        if (instr->type != nir_instr_type_alu)
32357ec681f3Smrg                return true;
32367ec681f3Smrg
32377ec681f3Smrg        const nir_alu_instr *alu = nir_instr_as_alu(instr);
32387ec681f3Smrg
32397ec681f3Smrg        switch (alu->op) {
32407ec681f3Smrg        case nir_op_frcp:
32417ec681f3Smrg        case nir_op_frsq:
32427ec681f3Smrg        case nir_op_ishl:
32437ec681f3Smrg        case nir_op_ishr:
32447ec681f3Smrg        case nir_op_ushr:
32457ec681f3Smrg        case nir_op_f2i16:
32467ec681f3Smrg        case nir_op_f2u16:
32477ec681f3Smrg        case nir_op_i2f16:
32487ec681f3Smrg        case nir_op_u2f16:
32497ec681f3Smrg                return false;
32507ec681f3Smrg        default:
32517ec681f3Smrg                return true;
32527ec681f3Smrg        }
32537ec681f3Smrg}
32547ec681f3Smrg
32557ec681f3Smrg/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we
32567ec681f3Smrg * keep divergence info around after we consume it for indirect lowering,
32577ec681f3Smrg * nir_convert_from_ssa will regress code quality since it will avoid
32587ec681f3Smrg * coalescing divergent with non-divergent nodes. */
32597ec681f3Smrg
/* Per-SSA-def callback: clear the divergent flag (returns true to continue) */
static bool
nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
{
        ssa->divergent = false;
        return true;
}
32667ec681f3Smrg
/* Instruction-level wrapper: clear divergence on every SSA def the
 * instruction produces (used via nir_shader_instructions_pass) */
static bool
nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
                UNUSED void *data)
{
        return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
}
32737ec681f3Smrg
32747ec681f3Smrg/* Ensure we write exactly 4 components */
32757ec681f3Smrgstatic nir_ssa_def *
32767ec681f3Smrgbifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
32777ec681f3Smrg                          unsigned channel, unsigned first, unsigned mask)
32787ec681f3Smrg{
32797ec681f3Smrg        if (!(mask & BITFIELD_BIT(channel)))
32807ec681f3Smrg                channel = first;
32817ec681f3Smrg
32827ec681f3Smrg        return nir_channel(b, in, channel);
32837ec681f3Smrg}
32847ec681f3Smrg
32857ec681f3Smrg/* Lower fragment store_output instructions to always write 4 components,
32867ec681f3Smrg * matching the hardware semantic. This may require additional moves. Skipping
32877ec681f3Smrg * these moves is possible in theory, but invokes undefined behaviour in the
32887ec681f3Smrg * compiler. The DDK inserts these moves, so we will as well. */
32897ec681f3Smrg
/* Instruction callback: pad a fragment store_output so it always writes all
 * four components, replicating the first written channel into unwritten
 * slots. Returns true if the store was rewritten. */
static bool
bifrost_nir_lower_blend_components(struct nir_builder *b,
                                   nir_instr *instr, void *data)
{
        if (instr->type != nir_instr_type_intrinsic)
                return false;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        if (intr->intrinsic != nir_intrinsic_store_output)
                return false;

        nir_ssa_def *in = intr->src[0].ssa;
        unsigned first = nir_intrinsic_component(intr);
        unsigned mask = nir_intrinsic_write_mask(intr);

        assert(first == 0 && "shouldn't get nonzero components");

        /* Nothing to do: all four components already written */
        if (mask == BITFIELD_MASK(4))
                return false;

        /* Build the replacement vector just before the store */
        b->cursor = nir_before_instr(&intr->instr);

        /* Replicate the first valid component instead */
        nir_ssa_def *replicated =
                nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
                            bifrost_nir_valid_channel(b, in, 1, first, mask),
                            bifrost_nir_valid_channel(b, in, 2, first, mask),
                            bifrost_nir_valid_channel(b, in, 3, first, mask));

        /* Rewrite to use our replicated version */
        nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
        nir_intrinsic_set_component(intr, 0);
        nir_intrinsic_set_write_mask(intr, 0xF);
        intr->num_components = 4;

        return true;
}
33297ec681f3Smrg
/* Run the NIR optimization/lowering pipeline ahead of instruction selection.
 * Pass order is significant. NOTE(review): the is_blend parameter is unused
 * in this body — confirm whether it is vestigial or reserved. */
static void
bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
{
        bool progress;
        /* Lower flrp at every bit size on the first iteration only */
        unsigned lower_flrp = 16 | 32 | 64;

        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);

        nir_lower_tex_options lower_tex_options = {
                .lower_txs_lod = true,
                .lower_txp = ~0,
                .lower_tg4_broadcom_swizzle = true,
                .lower_txd = true,
        };

        NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
        NIR_PASS(progress, nir, pan_lower_helper_invocation);

        NIR_PASS(progress, nir, nir_lower_int64);

        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);

        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);

        /* Main optimization loop: iterate to a fixed point */
        do {
                progress = false;

                NIR_PASS(progress, nir, nir_lower_var_copies);
                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
                NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);

                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_remove_phis);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_dead_cf);
                NIR_PASS(progress, nir, nir_opt_cse);
                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);

                NIR_PASS(progress, nir, nir_lower_alu);

                if (lower_flrp != 0) {
                        bool lower_flrp_progress = false;
                        NIR_PASS(lower_flrp_progress,
                                 nir,
                                 nir_lower_flrp,
                                 lower_flrp,
                                 false /* always_precise */);
                        if (lower_flrp_progress) {
                                NIR_PASS(progress, nir,
                                         nir_opt_constant_folding);
                                progress = true;
                        }

                        /* Nothing should rematerialize any flrps, so we only
                         * need to do this lowering once.
                         */
                        lower_flrp = 0;
                }

                NIR_PASS(progress, nir, nir_opt_undef);
                NIR_PASS(progress, nir, nir_lower_undef_to_zero);

                NIR_PASS(progress, nir, nir_opt_loop_unroll);
        } while (progress);

        /* TODO: Why is 64-bit getting rematerialized?
         * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
        NIR_PASS(progress, nir, nir_lower_int64);

        /* We need to cleanup after each iteration of late algebraic
         * optimizations, since otherwise NIR can produce weird edge cases
         * (like fneg of a constant) which we don't handle */
        bool late_algebraic = true;
        while (late_algebraic) {
                late_algebraic = false;
                NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
                NIR_PASS(progress, nir, nir_opt_constant_folding);
                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_cse);
        }

        /* Re-scalarize then re-vectorize under the backend's constraints */
        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
        NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
        NIR_PASS(progress, nir, nir_opt_dce);

        /* Prepass to simplify instruction selection */
        NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
        NIR_PASS(progress, nir, nir_opt_dce);

        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                           bifrost_nir_lower_blend_components,
                           nir_metadata_block_index | nir_metadata_dominance,
                           NULL);
        }

        /* Backend scheduler is purely local, so do some global optimizations
         * to reduce register pressure. */
        nir_move_options move_all =
                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

        NIR_PASS_V(nir, nir_opt_sink, move_all);
        NIR_PASS_V(nir, nir_opt_move, move_all);

        /* We might lower attribute, varying, and image indirects. Use the
         * gathered info to skip the extra analysis in the happy path. */
        bool any_indirects =
                nir->info.inputs_read_indirectly ||
                nir->info.outputs_accessed_indirectly ||
                nir->info.patch_inputs_read_indirectly ||
                nir->info.patch_outputs_accessed_indirectly ||
                nir->info.images_used;

        if (any_indirects) {
                nir_convert_to_lcssa(nir, true, true);
                NIR_PASS_V(nir, nir_divergence_analysis);
                NIR_PASS_V(nir, bi_lower_divergent_indirects,
                                bifrost_lanes_per_warp(gpu_id));
                /* Drop stale divergence metadata so nir_convert_from_ssa
                 * does not avoid coalescing divergent/uniform nodes (see the
                 * XXX comment on nir_invalidate_divergence above) */
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                        nir_invalidate_divergence, nir_metadata_all, NULL);
        }

        /* Take us out of SSA */
        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
}
34687ec681f3Smrg
34697ec681f3Smrg/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
34707ec681f3Smrg * same lowering here to zero-extend correctly */
34717ec681f3Smrg
/* Rewrite an 8-bit integer store_output to convert the value to 16 bits
 * first and retag the intrinsic's source type. Returns true on change. */
static bool
bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
                nir_intrinsic_instr *intr, UNUSED void *data)
{
        if (nir_src_bit_size(intr->src[0]) != 8)
                return false;

        nir_alu_type type =
                nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));

        /* Only integer outputs reach this lowering */
        assert(type == nir_type_int || type == nir_type_uint);

        b->cursor = nir_before_instr(&intr->instr);
        nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);

        /* type | 16 forms the 16-bit variant of the same base type */
        nir_intrinsic_set_src_type(intr, type | 16);
        nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
        return true;
}
34917ec681f3Smrg
/* Rewrite an 8-bit integer load_output as a 16-bit load followed by a
 * narrowing conversion back to 8 bits, rewriting all uses of the original
 * destination. Returns true on change. */
static bool
bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
                nir_intrinsic_instr *intr, UNUSED void *data)
{
        if (nir_dest_bit_size(intr->dest) != 8)
                return false;

        nir_alu_type type =
                nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));

        /* Only integer outputs reach this lowering */
        assert(type == nir_type_int || type == nir_type_uint);

        b->cursor = nir_before_instr(&intr->instr);
        /* Re-emit the load at 16 bits, preserving the original's metadata */
        nir_ssa_def *out =
                nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
                        .base = nir_intrinsic_base(intr),
                        .component = nir_intrinsic_component(intr),
                        .dest_type = type | 16,
                        .io_semantics = nir_intrinsic_io_semantics(intr));

        /* Narrow back so existing uses keep their 8-bit type */
        nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
        return true;
}
35167ec681f3Smrg
35177ec681f3Smrgstatic bool
35187ec681f3Smrgbifrost_nir_lower_i8_frag(struct nir_builder *b,
35197ec681f3Smrg                nir_instr *instr, UNUSED void *data)
35207ec681f3Smrg{
35217ec681f3Smrg        if (instr->type != nir_instr_type_intrinsic)
35227ec681f3Smrg                return false;
35237ec681f3Smrg
35247ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
35257ec681f3Smrg        if (intr->intrinsic == nir_intrinsic_load_output)
35267ec681f3Smrg                return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
35277ec681f3Smrg        else if (intr->intrinsic == nir_intrinsic_store_output)
35287ec681f3Smrg                return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
35297ec681f3Smrg        else
35307ec681f3Smrg                return false;
35317ec681f3Smrg}
35327ec681f3Smrg
/* Post-register-allocation cleanup: delete self-moves (MOV.i32 with
 * equivalent destination and source) left behind by RA */
static void
bi_opt_post_ra(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
                        bi_remove_instruction(ins);
        }
}
35417ec681f3Smrg
/* Merge per-component store_output intrinsics targeting the same slot into
 * a single masked store. `data` is a hash_table_u64 mapping slot -> the
 * last store seen for that slot; earlier partial stores are removed once
 * their channels are folded into the current one. */
static bool
bifrost_nir_lower_store_component(struct nir_builder *b,
                nir_instr *instr, void *data)
{
        if (instr->type != nir_instr_type_intrinsic)
                return false;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        if (intr->intrinsic != nir_intrinsic_store_output)
                return false;

        struct hash_table_u64 *slots = data;
        unsigned component = nir_intrinsic_component(intr);
        nir_src *slot_src = nir_get_io_offset_src(intr);
        /* Slot key combines the constant offset with the base */
        uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);

        nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
        unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);

        nir_ssa_def *value = intr->src[0].ssa;
        b->cursor = nir_before_instr(&intr->instr);

        /* Channels written by neither store stay undefined */
        nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
        nir_ssa_def *channels[4] = { undef, undef, undef, undef };

        /* Copy old */
        u_foreach_bit(i, mask) {
                assert(prev != NULL);
                nir_ssa_def *prev_ssa = prev->src[0].ssa;
                channels[i] = nir_channel(b, prev_ssa, i);
        }

        /* Copy new */
        unsigned new_mask = nir_intrinsic_write_mask(intr);
        mask |= (new_mask << component);

        u_foreach_bit(i, new_mask) {
                assert(component + i < 4);
                channels[component + i] = nir_channel(b, value, i);
        }

        /* Rebuild the source vector covering every written channel */
        intr->num_components = util_last_bit(mask);
        nir_instr_rewrite_src_ssa(instr, &intr->src[0],
                        nir_vec(b, channels, intr->num_components));

        nir_intrinsic_set_component(intr, 0);
        nir_intrinsic_set_write_mask(intr, mask);

        if (prev) {
                /* The merged store supersedes the earlier partial store */
                _mesa_hash_table_u64_remove(slots, slot);
                nir_instr_remove(&prev->instr);
        }

        _mesa_hash_table_u64_insert(slots, slot, intr);
        /* Always reports no progress despite mutating in place —
         * presumably intentional for the calling pass; verify if reused */
        return false;
}
35997ec681f3Smrg
36007ec681f3Smrg/* Dead code elimination for branches at the end of a block - only one branch
36017ec681f3Smrg * per block is legal semantically, but unreachable jumps can be generated.
36027ec681f3Smrg * Likewise we can generate jumps to the terminal block which need to be
36037ec681f3Smrg * lowered away to a jump to #0x0, which induces successful termination. */
36047ec681f3Smrg
/* See the comment above: remove unreachable branches after the first branch
 * in a block, and lower branches to the terminal block to a NULL target
 * (branch to #0x0, which terminates the shader). */
static void
bi_lower_branch(bi_block *block)
{
        bool branched = false;
        ASSERTED bool was_jump = false;

        bi_foreach_instr_in_block_safe(block, ins) {
                if (!ins->branch_target) continue;

                if (branched) {
                        /* Anything after a taken branch must be an
                         * unconditional jump, which is dead code */
                        assert(was_jump && (ins->op == BI_OPCODE_JUMP));
                        bi_remove_instruction(ins);
                        continue;
                }

                branched = true;
                was_jump = ins->op == BI_OPCODE_JUMP;

                /* NULL target encodes successful termination */
                if (bi_is_terminal_block(ins->branch_target))
                        ins->branch_target = NULL;
        }
}
36277ec681f3Smrg
/* Pack the IR into its binary encoding, record first-clause dependency
 * state for the renderer state descriptor, and pad for the prefetcher. */
static void
bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary)
{
        unsigned final_clause = bi_pack(ctx, binary);

        /* If we need to wait for ATEST or BLEND in the first clause, pass the
         * corresponding bits through to the renderer state descriptor */
        bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);

        unsigned first_deps = first_clause ? first_clause->dependencies : 0;
        ctx->info->bifrost.wait_6 = (first_deps & (1 << 6));
        ctx->info->bifrost.wait_7 = (first_deps & (1 << 7));

        /* Pad the shader with enough zero bytes to trick the prefetcher,
         * unless we're compiling an empty shader (in which case we don't pad
         * so the size remains 0) */
        unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;

        if (binary->size) {
                memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
                       0, prefetch_size);
        }
}
36527ec681f3Smrg
/* Top-level entry point: compile a NIR shader to a Bifrost/Valhall binary.
 * Runs the NIR lowering/optimization pipeline, translates to BIR, optimizes,
 * register-allocates, schedules (Bifrost only), and packs into `binary`,
 * filling out `info` along the way. The pass order below is load-bearing;
 * see the comments on individual stages. */
void
bifrost_compile_shader_nir(nir_shader *nir,
                           const struct panfrost_compile_inputs *inputs,
                           struct util_dynarray *binary,
                           struct pan_shader_info *info)
{
        bifrost_debug = debug_get_option_bifrost_debug();

        /* ctx is the ralloc root for all compiler-internal allocations;
         * freed in one shot at the end */
        bi_context *ctx = rzalloc(NULL, bi_context);
        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);

        ctx->inputs = inputs;
        ctx->nir = nir;
        ctx->info = info;
        ctx->stage = nir->info.stage;
        ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
        /* Architecture major version lives in the top nibble of the GPU ID
         * (e.g. 0x7212 -> v7) -- presumably; confirm against GPU ID layout */
        ctx->arch = inputs->gpu_id >> 12;

        /* If nothing is pushed, all UBOs need to be uploaded */
        ctx->ubo_mask = ~0;

        list_inithead(&ctx->blocks);

        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
         * (so we don't accidentally duplicate the epilogue since mesa/st has
         * messed with our I/O quite a bit already) */

        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        if (ctx->stage == MESA_SHADER_VERTEX) {
                NIR_PASS_V(nir, nir_lower_viewport_transform);
                /* Clamp point size to the [1, 1024] range */
                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
        }

        /* Lower large arrays to scratch and small arrays to bcsel (TODO: tune
         * threshold, but not until addresses / csel is optimized better) */
        NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
                        glsl_get_natural_size_align_bytes);
        NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

        NIR_PASS_V(nir, nir_split_var_copies);
        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                        glsl_type_size, 0);

        if (ctx->stage == MESA_SHADER_FRAGMENT) {
                /* Fragment outputs may be demoted to mediump */
                NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
                                ~0, false);
        } else {
                /* Non-fragment stages: merge per-component output stores.
                 * The `stores` table is keyed per output -- NOTE(review):
                 * exact key semantics live in the pass itself */
                struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                                bifrost_nir_lower_store_component,
                                nir_metadata_block_index |
                                nir_metadata_dominance, stores);
                _mesa_hash_table_u64_destroy(stores);
        }

        NIR_PASS_V(nir, nir_lower_ssbo);
        NIR_PASS_V(nir, pan_nir_lower_zs_store);
        NIR_PASS_V(nir, pan_lower_sample_pos);
        NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);

        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                        bifrost_nir_lower_i8_frag,
                        nir_metadata_block_index | nir_metadata_dominance,
                        NULL);
        }

        bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);

        NIR_PASS_V(nir, pan_nir_reorder_writeout);

        /* Internal (driver-generated) shaders are normally excluded from
         * debug dumps unless BIFROST_DBG_INTERNAL is set */
        bool skip_internal = nir->info.internal;
        skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                nir_print_shader(nir, stdout);
        }

        /* Scratch size is only final after nir_lower_vars_to_scratch above */
        info->tls_size = nir->scratch_size;

        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;

                ctx->ssa_alloc += func->impl->ssa_alloc;
                ctx->reg_alloc += func->impl->reg_alloc;

                /* NIR -> BIR translation */
                emit_cf_list(ctx, &func->impl->body);
                break; /* TODO: Multi-function shaders */
        }

        unsigned block_source_count = 0;

        bi_foreach_block(ctx, block) {
                /* Name blocks now that we're done emitting so the order is
                 * consistent */
                block->name = block_source_count++;
        }

        bi_validate(ctx, "NIR -> BIR");

        /* If the shader doesn't write any colour or depth outputs, it may
         * still need an ATEST at the very end! */
        bool need_dummy_atest =
                (ctx->stage == MESA_SHADER_FRAGMENT) &&
                !ctx->emitted_atest &&
                !bi_skip_atest(ctx, false);

        if (need_dummy_atest) {
                bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
                bi_builder b = bi_init_builder(ctx, bi_after_block(end));
                bi_emit_atest(&b, bi_zero());
        }

        bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);

        /* Runs before constant folding */
        bi_lower_swizzle(ctx);
        bi_validate(ctx, "Early lowering");

        /* Runs before copy prop */
        if (optimize && !ctx->inputs->no_ubo_to_push) {
                bi_opt_push_ubo(ctx);
        }

        if (likely(optimize)) {
                /* Second copy-prop round picks up opportunities exposed by
                 * constant folding; second DCE cleans up after CSE */
                bi_opt_copy_prop(ctx);
                bi_opt_constant_fold(ctx);
                bi_opt_copy_prop(ctx);
                bi_opt_mod_prop_forward(ctx);
                bi_opt_mod_prop_backward(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_opt_cse(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_validate(ctx, "Optimization passes");
        }

        bi_foreach_instr_global(ctx, I) {
                bi_lower_opt_instruction(I);
        }

        bi_foreach_block(ctx, block) {
                bi_lower_branch(block);
        }

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);
        bi_lower_fau(ctx);

        /* Analyze before register allocation to avoid false dependencies. The
         * skip bit is a function of only the data flow graph and is invariant
         * under valid scheduling. */
        bi_analyze_helper_requirements(ctx);
        bi_validate(ctx, "Late lowering");

        bi_register_allocate(ctx);

        if (likely(optimize))
                bi_opt_post_ra(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        /* Clause scheduling and scoreboarding only apply to Bifrost (v6-v7);
         * Valhall (v9+) takes the else-branch below */
        if (ctx->arch <= 8) {
                bi_schedule(ctx);
                bi_assign_scoreboard(ctx);
        }

        /* Analyze after scheduling since we depend on instruction order. */
        bi_analyze_helper_terminate(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        if (ctx->arch <= 8) {
                bi_pack_clauses(ctx, binary);
        } else {
                /* TODO: pack flat */
        }

        /* Report only UBOs that exist and were not fully pushed */
        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                disassemble_bifrost(stdout, binary->data, binary->size,
                                    bifrost_debug & BIFROST_DBG_VERBOSE);
                fflush(stdout);
        }

        if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
            !skip_internal) {
                bi_print_stats(ctx, binary->size, stderr);
        }

        /* sysval_to_id is not ralloc'd under ctx, so destroy it explicitly */
        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
        ralloc_free(ctx);
}
3853