17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
37ec681f3Smrg * Copyright (C) 2019-2020 Collabora, Ltd.
47ec681f3Smrg *
57ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
67ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
77ec681f3Smrg * to deal in the Software without restriction, including without limitation
87ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
97ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
107ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
117ec681f3Smrg *
127ec681f3Smrg * The above copyright notice and this permission notice (including the next
137ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
147ec681f3Smrg * Software.
157ec681f3Smrg *
167ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
177ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
187ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
197ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
207ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
217ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
227ec681f3Smrg * SOFTWARE.
237ec681f3Smrg */
247ec681f3Smrg
257ec681f3Smrg#include <sys/types.h>
267ec681f3Smrg#include <sys/stat.h>
277ec681f3Smrg#include <sys/mman.h>
287ec681f3Smrg#include <fcntl.h>
297ec681f3Smrg#include <stdint.h>
307ec681f3Smrg#include <stdlib.h>
317ec681f3Smrg#include <stdio.h>
327ec681f3Smrg#include <err.h>
337ec681f3Smrg
347ec681f3Smrg#include "main/mtypes.h"
357ec681f3Smrg#include "compiler/glsl/glsl_to_nir.h"
367ec681f3Smrg#include "compiler/nir_types.h"
377ec681f3Smrg#include "compiler/nir/nir_builder.h"
387ec681f3Smrg#include "util/half_float.h"
397ec681f3Smrg#include "util/u_math.h"
407ec681f3Smrg#include "util/u_debug.h"
417ec681f3Smrg#include "util/u_dynarray.h"
427ec681f3Smrg#include "util/list.h"
437ec681f3Smrg#include "main/mtypes.h"
447ec681f3Smrg
457ec681f3Smrg#include "midgard.h"
467ec681f3Smrg#include "midgard_nir.h"
477ec681f3Smrg#include "midgard_compile.h"
487ec681f3Smrg#include "midgard_ops.h"
497ec681f3Smrg#include "helpers.h"
507ec681f3Smrg#include "compiler.h"
517ec681f3Smrg#include "midgard_quirks.h"
527ec681f3Smrg#include "panfrost-quirks.h"
537ec681f3Smrg#include "panfrost/util/pan_lower_framebuffer.h"
547ec681f3Smrg
557ec681f3Smrg#include "disassemble.h"
567ec681f3Smrg
577ec681f3Smrgstatic const struct debug_named_value midgard_debug_options[] = {
587ec681f3Smrg        {"msgs",      MIDGARD_DBG_MSGS,		"Print debug messages"},
597ec681f3Smrg        {"shaders",   MIDGARD_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
607ec681f3Smrg        {"shaderdb",  MIDGARD_DBG_SHADERDB,     "Prints shader-db statistics"},
617ec681f3Smrg        {"inorder",   MIDGARD_DBG_INORDER,      "Disables out-of-order scheduling"},
627ec681f3Smrg        {"verbose",   MIDGARD_DBG_VERBOSE,      "Dump shaders verbosely"},
637ec681f3Smrg        {"internal",  MIDGARD_DBG_INTERNAL,     "Dump internal shaders"},
647ec681f3Smrg        DEBUG_NAMED_VALUE_END
657ec681f3Smrg};
667ec681f3Smrg
677ec681f3SmrgDEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)
687ec681f3Smrg
697ec681f3Smrgint midgard_debug = 0;
707ec681f3Smrg
717ec681f3Smrg#define DBG(fmt, ...) \
727ec681f3Smrg		do { if (midgard_debug & MIDGARD_DBG_MSGS) \
737ec681f3Smrg			fprintf(stderr, "%s:%d: "fmt, \
747ec681f3Smrg				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
757ec681f3Smrgstatic midgard_block *
767ec681f3Smrgcreate_empty_block(compiler_context *ctx)
777ec681f3Smrg{
787ec681f3Smrg        midgard_block *blk = rzalloc(ctx, midgard_block);
797ec681f3Smrg
807ec681f3Smrg        blk->base.predecessors = _mesa_set_create(blk,
817ec681f3Smrg                        _mesa_hash_pointer,
827ec681f3Smrg                        _mesa_key_pointer_equal);
837ec681f3Smrg
847ec681f3Smrg        blk->base.name = ctx->block_source_count++;
857ec681f3Smrg
867ec681f3Smrg        return blk;
877ec681f3Smrg}
887ec681f3Smrg
897ec681f3Smrgstatic void
907ec681f3Smrgschedule_barrier(compiler_context *ctx)
917ec681f3Smrg{
927ec681f3Smrg        midgard_block *temp = ctx->after_block;
937ec681f3Smrg        ctx->after_block = create_empty_block(ctx);
947ec681f3Smrg        ctx->block_count++;
957ec681f3Smrg        list_addtail(&ctx->after_block->base.link, &ctx->blocks);
967ec681f3Smrg        list_inithead(&ctx->after_block->base.instructions);
977ec681f3Smrg        pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base);
987ec681f3Smrg        ctx->current_block = ctx->after_block;
997ec681f3Smrg        ctx->after_block = temp;
1007ec681f3Smrg}
1017ec681f3Smrg
1027ec681f3Smrg/* Helpers to generate midgard_instruction's using macro magic, since every
1037ec681f3Smrg * driver seems to do it that way */
1047ec681f3Smrg
1057ec681f3Smrg#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
1067ec681f3Smrg
1077ec681f3Smrg#define M_LOAD_STORE(name, store, T) \
1087ec681f3Smrg	static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
1097ec681f3Smrg		midgard_instruction i = { \
1107ec681f3Smrg			.type = TAG_LOAD_STORE_4, \
1117ec681f3Smrg                        .mask = 0xF, \
1127ec681f3Smrg                        .dest = ~0, \
1137ec681f3Smrg                        .src = { ~0, ~0, ~0, ~0 }, \
1147ec681f3Smrg                        .swizzle = SWIZZLE_IDENTITY_4, \
1157ec681f3Smrg                        .op = midgard_op_##name, \
1167ec681f3Smrg			.load_store = { \
1177ec681f3Smrg				.signed_offset = address \
1187ec681f3Smrg			} \
1197ec681f3Smrg		}; \
1207ec681f3Smrg                \
1217ec681f3Smrg                if (store) { \
1227ec681f3Smrg                        i.src[0] = ssa; \
1237ec681f3Smrg                        i.src_types[0] = T; \
1247ec681f3Smrg                        i.dest_type = T; \
1257ec681f3Smrg                } else { \
1267ec681f3Smrg                        i.dest = ssa; \
1277ec681f3Smrg                        i.dest_type = T; \
1287ec681f3Smrg                } \
1297ec681f3Smrg		return i; \
1307ec681f3Smrg	}
1317ec681f3Smrg
1327ec681f3Smrg#define M_LOAD(name, T) M_LOAD_STORE(name, false, T)
1337ec681f3Smrg#define M_STORE(name, T) M_LOAD_STORE(name, true, T)
1347ec681f3Smrg
1357ec681f3SmrgM_LOAD(ld_attr_32, nir_type_uint32);
1367ec681f3SmrgM_LOAD(ld_vary_32, nir_type_uint32);
1377ec681f3SmrgM_LOAD(ld_ubo_32, nir_type_uint32);
1387ec681f3SmrgM_LOAD(ld_ubo_64, nir_type_uint32);
1397ec681f3SmrgM_LOAD(ld_ubo_128, nir_type_uint32);
1407ec681f3SmrgM_LOAD(ld_32, nir_type_uint32);
1417ec681f3SmrgM_LOAD(ld_64, nir_type_uint32);
1427ec681f3SmrgM_LOAD(ld_128, nir_type_uint32);
1437ec681f3SmrgM_STORE(st_32, nir_type_uint32);
1447ec681f3SmrgM_STORE(st_64, nir_type_uint32);
1457ec681f3SmrgM_STORE(st_128, nir_type_uint32);
1467ec681f3SmrgM_LOAD(ld_tilebuffer_raw, nir_type_uint32);
1477ec681f3SmrgM_LOAD(ld_tilebuffer_16f, nir_type_float16);
1487ec681f3SmrgM_LOAD(ld_tilebuffer_32f, nir_type_float32);
1497ec681f3SmrgM_STORE(st_vary_32, nir_type_uint32);
1507ec681f3SmrgM_LOAD(ld_cubemap_coords, nir_type_uint32);
1517ec681f3SmrgM_LOAD(ldst_mov, nir_type_uint32);
1527ec681f3SmrgM_LOAD(ld_image_32f, nir_type_float32);
1537ec681f3SmrgM_LOAD(ld_image_16f, nir_type_float16);
1547ec681f3SmrgM_LOAD(ld_image_32u, nir_type_uint32);
1557ec681f3SmrgM_LOAD(ld_image_32i, nir_type_int32);
1567ec681f3SmrgM_STORE(st_image_32f, nir_type_float32);
1577ec681f3SmrgM_STORE(st_image_16f, nir_type_float16);
1587ec681f3SmrgM_STORE(st_image_32u, nir_type_uint32);
1597ec681f3SmrgM_STORE(st_image_32i, nir_type_int32);
1607ec681f3SmrgM_LOAD(lea_image, nir_type_uint64);
1617ec681f3Smrg
1627ec681f3Smrg#define M_IMAGE(op) \
1637ec681f3Smrgstatic midgard_instruction \
1647ec681f3Smrgop ## _image(nir_alu_type type, unsigned val, unsigned address) \
1657ec681f3Smrg{ \
1667ec681f3Smrg        switch (type) { \
1677ec681f3Smrg        case nir_type_float32: \
1687ec681f3Smrg                 return m_ ## op ## _image_32f(val, address); \
1697ec681f3Smrg        case nir_type_float16: \
1707ec681f3Smrg                 return m_ ## op ## _image_16f(val, address); \
1717ec681f3Smrg        case nir_type_uint32: \
1727ec681f3Smrg                 return m_ ## op ## _image_32u(val, address); \
1737ec681f3Smrg        case nir_type_int32: \
1747ec681f3Smrg                 return m_ ## op ## _image_32i(val, address); \
1757ec681f3Smrg        default: \
1767ec681f3Smrg                 unreachable("Invalid image type"); \
1777ec681f3Smrg        } \
1787ec681f3Smrg}
1797ec681f3Smrg
1807ec681f3SmrgM_IMAGE(ld);
1817ec681f3SmrgM_IMAGE(st);
1827ec681f3Smrg
1837ec681f3Smrgstatic midgard_instruction
1847ec681f3Smrgv_branch(bool conditional, bool invert)
1857ec681f3Smrg{
1867ec681f3Smrg        midgard_instruction ins = {
1877ec681f3Smrg                .type = TAG_ALU_4,
1887ec681f3Smrg                .unit = ALU_ENAB_BRANCH,
1897ec681f3Smrg                .compact_branch = true,
1907ec681f3Smrg                .branch = {
1917ec681f3Smrg                        .conditional = conditional,
1927ec681f3Smrg                        .invert_conditional = invert
1937ec681f3Smrg                },
1947ec681f3Smrg                .dest = ~0,
1957ec681f3Smrg                .src = { ~0, ~0, ~0, ~0 },
1967ec681f3Smrg        };
1977ec681f3Smrg
1987ec681f3Smrg        return ins;
1997ec681f3Smrg}
2007ec681f3Smrg
2017ec681f3Smrgstatic void
2027ec681f3Smrgattach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
2037ec681f3Smrg{
2047ec681f3Smrg        ins->has_constants = true;
2057ec681f3Smrg        memcpy(&ins->constants, constants, 16);
2067ec681f3Smrg}
2077ec681f3Smrg
2087ec681f3Smrgstatic int
2097ec681f3Smrgglsl_type_size(const struct glsl_type *type, bool bindless)
2107ec681f3Smrg{
2117ec681f3Smrg        return glsl_count_attribute_slots(type, false);
2127ec681f3Smrg}
2137ec681f3Smrg
2147ec681f3Smrg/* Lower fdot2 to a vector multiplication followed by channel addition  */
2157ec681f3Smrgstatic bool
2167ec681f3Smrgmidgard_nir_lower_fdot2_instr(nir_builder *b, nir_instr *instr, void *data)
2177ec681f3Smrg{
2187ec681f3Smrg        if (instr->type != nir_instr_type_alu)
2197ec681f3Smrg                return false;
2207ec681f3Smrg
2217ec681f3Smrg        nir_alu_instr *alu = nir_instr_as_alu(instr);
2227ec681f3Smrg        if (alu->op != nir_op_fdot2)
2237ec681f3Smrg                return false;
2247ec681f3Smrg
2257ec681f3Smrg        b->cursor = nir_before_instr(&alu->instr);
2267ec681f3Smrg
2277ec681f3Smrg        nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
2287ec681f3Smrg        nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
2297ec681f3Smrg
2307ec681f3Smrg        nir_ssa_def *product = nir_fmul(b, src0, src1);
2317ec681f3Smrg
2327ec681f3Smrg        nir_ssa_def *sum = nir_fadd(b,
2337ec681f3Smrg                                    nir_channel(b, product, 0),
2347ec681f3Smrg                                    nir_channel(b, product, 1));
2357ec681f3Smrg
2367ec681f3Smrg        /* Replace the fdot2 with this sum */
2377ec681f3Smrg        nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
2387ec681f3Smrg
2397ec681f3Smrg        return true;
2407ec681f3Smrg}
2417ec681f3Smrg
2427ec681f3Smrgstatic bool
2437ec681f3Smrgmidgard_nir_lower_fdot2(nir_shader *shader)
2447ec681f3Smrg{
2457ec681f3Smrg        return nir_shader_instructions_pass(shader,
2467ec681f3Smrg                                            midgard_nir_lower_fdot2_instr,
2477ec681f3Smrg                                            nir_metadata_block_index | nir_metadata_dominance,
2487ec681f3Smrg                                            NULL);
2497ec681f3Smrg}
2507ec681f3Smrg
2517ec681f3Smrgstatic bool
2527ec681f3Smrgmdg_is_64(const nir_instr *instr, const void *_unused)
2537ec681f3Smrg{
2547ec681f3Smrg        const nir_alu_instr *alu = nir_instr_as_alu(instr);
2557ec681f3Smrg
2567ec681f3Smrg        if (nir_dest_bit_size(alu->dest.dest) == 64)
2577ec681f3Smrg                return true;
2587ec681f3Smrg
2597ec681f3Smrg        switch (alu->op) {
2607ec681f3Smrg        case nir_op_umul_high:
2617ec681f3Smrg        case nir_op_imul_high:
2627ec681f3Smrg                return true;
2637ec681f3Smrg        default:
2647ec681f3Smrg                return false;
2657ec681f3Smrg        }
2667ec681f3Smrg}
2677ec681f3Smrg
2687ec681f3Smrg/* Only vectorize int64 up to vec2 */
2697ec681f3Smrgstatic bool
2707ec681f3Smrgmidgard_vectorize_filter(const nir_instr *instr, void *data)
2717ec681f3Smrg{
2727ec681f3Smrg        if (instr->type != nir_instr_type_alu)
2737ec681f3Smrg                return true;
2747ec681f3Smrg
2757ec681f3Smrg        const nir_alu_instr *alu = nir_instr_as_alu(instr);
2767ec681f3Smrg
2777ec681f3Smrg        unsigned num_components = alu->dest.dest.ssa.num_components;
2787ec681f3Smrg
2797ec681f3Smrg        int src_bit_size = nir_src_bit_size(alu->src[0].src);
2807ec681f3Smrg        int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
2817ec681f3Smrg
2827ec681f3Smrg        if (src_bit_size == 64 || dst_bit_size == 64) {
2837ec681f3Smrg                if (num_components > 1)
2847ec681f3Smrg                        return false;
2857ec681f3Smrg        }
2867ec681f3Smrg
2877ec681f3Smrg        return true;
2887ec681f3Smrg}
2897ec681f3Smrg
2907ec681f3Smrg
2917ec681f3Smrg/* Flushes undefined values to zero */
2927ec681f3Smrg
2937ec681f3Smrgstatic void
2947ec681f3Smrgoptimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
2957ec681f3Smrg{
2967ec681f3Smrg        bool progress;
2977ec681f3Smrg        unsigned lower_flrp =
2987ec681f3Smrg                (nir->options->lower_flrp16 ? 16 : 0) |
2997ec681f3Smrg                (nir->options->lower_flrp32 ? 32 : 0) |
3007ec681f3Smrg                (nir->options->lower_flrp64 ? 64 : 0);
3017ec681f3Smrg
3027ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
3037ec681f3Smrg        nir_lower_idiv_options idiv_options = {
3047ec681f3Smrg                .imprecise_32bit_lowering = true,
3057ec681f3Smrg                .allow_fp16 = true,
3067ec681f3Smrg        };
3077ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
3087ec681f3Smrg
3097ec681f3Smrg        nir_lower_tex_options lower_tex_options = {
3107ec681f3Smrg                .lower_txs_lod = true,
3117ec681f3Smrg                .lower_txp = ~0,
3127ec681f3Smrg                .lower_tg4_broadcom_swizzle = true,
3137ec681f3Smrg                /* TODO: we have native gradient.. */
3147ec681f3Smrg                .lower_txd = true,
3157ec681f3Smrg        };
3167ec681f3Smrg
3177ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
3187ec681f3Smrg
3197ec681f3Smrg        /* Must lower fdot2 after tex is lowered */
3207ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
3217ec681f3Smrg
3227ec681f3Smrg        /* T720 is broken. */
3237ec681f3Smrg
3247ec681f3Smrg        if (quirks & MIDGARD_BROKEN_LOD)
3257ec681f3Smrg                NIR_PASS_V(nir, midgard_nir_lod_errata);
3267ec681f3Smrg
3277ec681f3Smrg        /* Midgard image ops coordinates are 16-bit instead of 32-bit */
3287ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize);
3297ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_lower_helper_writes);
3307ec681f3Smrg        NIR_PASS(progress, nir, pan_lower_helper_invocation);
3317ec681f3Smrg        NIR_PASS(progress, nir, pan_lower_sample_pos);
3327ec681f3Smrg
3337ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
3347ec681f3Smrg
3357ec681f3Smrg        do {
3367ec681f3Smrg                progress = false;
3377ec681f3Smrg
3387ec681f3Smrg                NIR_PASS(progress, nir, nir_lower_var_copies);
3397ec681f3Smrg                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
3407ec681f3Smrg
3417ec681f3Smrg                NIR_PASS(progress, nir, nir_copy_prop);
3427ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_remove_phis);
3437ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_dce);
3447ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_dead_cf);
3457ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_cse);
3467ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
3477ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_algebraic);
3487ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_constant_folding);
3497ec681f3Smrg
3507ec681f3Smrg                if (lower_flrp != 0) {
3517ec681f3Smrg                        bool lower_flrp_progress = false;
3527ec681f3Smrg                        NIR_PASS(lower_flrp_progress,
3537ec681f3Smrg                                 nir,
3547ec681f3Smrg                                 nir_lower_flrp,
3557ec681f3Smrg                                 lower_flrp,
3567ec681f3Smrg                                 false /* always_precise */);
3577ec681f3Smrg                        if (lower_flrp_progress) {
3587ec681f3Smrg                                NIR_PASS(progress, nir,
3597ec681f3Smrg                                         nir_opt_constant_folding);
3607ec681f3Smrg                                progress = true;
3617ec681f3Smrg                        }
3627ec681f3Smrg
3637ec681f3Smrg                        /* Nothing should rematerialize any flrps, so we only
3647ec681f3Smrg                         * need to do this lowering once.
3657ec681f3Smrg                         */
3667ec681f3Smrg                        lower_flrp = 0;
3677ec681f3Smrg                }
3687ec681f3Smrg
3697ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_undef);
3707ec681f3Smrg                NIR_PASS(progress, nir, nir_lower_undef_to_zero);
3717ec681f3Smrg
3727ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_loop_unroll);
3737ec681f3Smrg
3747ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_vectorize,
3757ec681f3Smrg                         midgard_vectorize_filter, NULL);
3767ec681f3Smrg        } while (progress);
3777ec681f3Smrg
3787ec681f3Smrg        NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL);
3797ec681f3Smrg
3807ec681f3Smrg        /* Run after opts so it can hit more */
3817ec681f3Smrg        if (!is_blend)
3827ec681f3Smrg                NIR_PASS(progress, nir, nir_fuse_io_16);
3837ec681f3Smrg
3847ec681f3Smrg        /* Must be run at the end to prevent creation of fsin/fcos ops */
3857ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_scale_trig);
3867ec681f3Smrg
3877ec681f3Smrg        do {
3887ec681f3Smrg                progress = false;
3897ec681f3Smrg
3907ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_dce);
3917ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_algebraic);
3927ec681f3Smrg                NIR_PASS(progress, nir, nir_opt_constant_folding);
3937ec681f3Smrg                NIR_PASS(progress, nir, nir_copy_prop);
3947ec681f3Smrg        } while (progress);
3957ec681f3Smrg
3967ec681f3Smrg        NIR_PASS(progress, nir, nir_opt_algebraic_late);
3977ec681f3Smrg        NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods);
3987ec681f3Smrg
3997ec681f3Smrg        /* We implement booleans as 32-bit 0/~0 */
4007ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_bool_to_int32);
4017ec681f3Smrg
4027ec681f3Smrg        /* Now that booleans are lowered, we can run out late opts */
4037ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
4047ec681f3Smrg        NIR_PASS(progress, nir, midgard_nir_cancel_inot);
4057ec681f3Smrg
4067ec681f3Smrg        NIR_PASS(progress, nir, nir_copy_prop);
4077ec681f3Smrg        NIR_PASS(progress, nir, nir_opt_dce);
4087ec681f3Smrg
4097ec681f3Smrg        /* Backend scheduler is purely local, so do some global optimizations
4107ec681f3Smrg         * to reduce register pressure. */
4117ec681f3Smrg        nir_move_options move_all =
4127ec681f3Smrg                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
4137ec681f3Smrg                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
4147ec681f3Smrg
4157ec681f3Smrg        NIR_PASS_V(nir, nir_opt_sink, move_all);
4167ec681f3Smrg        NIR_PASS_V(nir, nir_opt_move, move_all);
4177ec681f3Smrg
4187ec681f3Smrg        /* Take us out of SSA */
4197ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
4207ec681f3Smrg        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
4217ec681f3Smrg
4227ec681f3Smrg        /* We are a vector architecture; write combine where possible */
4237ec681f3Smrg        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
4247ec681f3Smrg        NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
4257ec681f3Smrg
4267ec681f3Smrg        NIR_PASS(progress, nir, nir_opt_dce);
4277ec681f3Smrg}
4287ec681f3Smrg
4297ec681f3Smrg/* Do not actually emit a load; instead, cache the constant for inlining */
4307ec681f3Smrg
4317ec681f3Smrgstatic void
4327ec681f3Smrgemit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
4337ec681f3Smrg{
4347ec681f3Smrg        nir_ssa_def def = instr->def;
4357ec681f3Smrg
4367ec681f3Smrg        midgard_constants *consts = rzalloc(ctx, midgard_constants);
4377ec681f3Smrg
4387ec681f3Smrg        assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8);
4397ec681f3Smrg
4407ec681f3Smrg#define RAW_CONST_COPY(bits)                                         \
4417ec681f3Smrg        nir_const_value_to_array(consts->u##bits, instr->value,      \
4427ec681f3Smrg                                 instr->def.num_components, u##bits)
4437ec681f3Smrg
4447ec681f3Smrg        switch (instr->def.bit_size) {
4457ec681f3Smrg        case 64:
4467ec681f3Smrg                RAW_CONST_COPY(64);
4477ec681f3Smrg                break;
4487ec681f3Smrg        case 32:
4497ec681f3Smrg                RAW_CONST_COPY(32);
4507ec681f3Smrg                break;
4517ec681f3Smrg        case 16:
4527ec681f3Smrg                RAW_CONST_COPY(16);
4537ec681f3Smrg                break;
4547ec681f3Smrg        case 8:
4557ec681f3Smrg                RAW_CONST_COPY(8);
4567ec681f3Smrg                break;
4577ec681f3Smrg        default:
4587ec681f3Smrg                unreachable("Invalid bit_size for load_const instruction\n");
4597ec681f3Smrg        }
4607ec681f3Smrg
4617ec681f3Smrg        /* Shifted for SSA, +1 for off-by-one */
4627ec681f3Smrg        _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts);
4637ec681f3Smrg}
4647ec681f3Smrg
4657ec681f3Smrg/* Normally constants are embedded implicitly, but for I/O and such we have to
4667ec681f3Smrg * explicitly emit a move with the constant source */
4677ec681f3Smrg
4687ec681f3Smrgstatic void
4697ec681f3Smrgemit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to)
4707ec681f3Smrg{
4717ec681f3Smrg        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1);
4727ec681f3Smrg
4737ec681f3Smrg        if (constant_value) {
4747ec681f3Smrg                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to);
4757ec681f3Smrg                attach_constants(ctx, &ins, constant_value, node + 1);
4767ec681f3Smrg                emit_mir_instruction(ctx, ins);
4777ec681f3Smrg        }
4787ec681f3Smrg}
4797ec681f3Smrg
4807ec681f3Smrgstatic bool
4817ec681f3Smrgnir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
4827ec681f3Smrg{
4837ec681f3Smrg        unsigned comp = src->swizzle[0];
4847ec681f3Smrg
4857ec681f3Smrg        for (unsigned c = 1; c < nr_components; ++c) {
4867ec681f3Smrg                if (src->swizzle[c] != comp)
4877ec681f3Smrg                        return true;
4887ec681f3Smrg        }
4897ec681f3Smrg
4907ec681f3Smrg        return false;
4917ec681f3Smrg}
4927ec681f3Smrg
4937ec681f3Smrg#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \
4947ec681f3Smrg        case nir_intrinsic_##nir: \
4957ec681f3Smrg                emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \
4967ec681f3Smrg                break;
4977ec681f3Smrg
4987ec681f3Smrg#define ATOMIC_CASE(ctx, instr, nir, op) \
4997ec681f3Smrg        ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \
5007ec681f3Smrg        ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false);
5017ec681f3Smrg
5027ec681f3Smrg#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \
5037ec681f3Smrg        case nir_intrinsic_image_atomic_##nir: { \
5047ec681f3Smrg                midgard_instruction ins = emit_image_op(ctx, instr, true); \
5057ec681f3Smrg                emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \
5067ec681f3Smrg                break; \
5077ec681f3Smrg        }
5087ec681f3Smrg
5097ec681f3Smrg#define ALU_CASE(nir, _op) \
5107ec681f3Smrg	case nir_op_##nir: \
5117ec681f3Smrg		op = midgard_alu_op_##_op; \
5127ec681f3Smrg                assert(src_bitsize == dst_bitsize); \
5137ec681f3Smrg		break;
5147ec681f3Smrg
5157ec681f3Smrg#define ALU_CASE_RTZ(nir, _op) \
5167ec681f3Smrg	case nir_op_##nir: \
5177ec681f3Smrg		op = midgard_alu_op_##_op; \
5187ec681f3Smrg                roundmode = MIDGARD_RTZ; \
5197ec681f3Smrg		break;
5207ec681f3Smrg
5217ec681f3Smrg#define ALU_CHECK_CMP() \
5227ec681f3Smrg                assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \
5237ec681f3Smrg                assert(dst_bitsize == 16 || dst_bitsize == 32); \
5247ec681f3Smrg
5257ec681f3Smrg#define ALU_CASE_BCAST(nir, _op, count) \
5267ec681f3Smrg        case nir_op_##nir: \
5277ec681f3Smrg                op = midgard_alu_op_##_op; \
5287ec681f3Smrg                broadcast_swizzle = count; \
5297ec681f3Smrg                ALU_CHECK_CMP(); \
5307ec681f3Smrg                break;
5317ec681f3Smrg
5327ec681f3Smrg#define ALU_CASE_CMP(nir, _op) \
5337ec681f3Smrg	case nir_op_##nir: \
5347ec681f3Smrg		op = midgard_alu_op_##_op; \
5357ec681f3Smrg                ALU_CHECK_CMP(); \
5367ec681f3Smrg                break;
5377ec681f3Smrg
5387ec681f3Smrg/* Compare mir_lower_invert */
5397ec681f3Smrgstatic bool
5407ec681f3Smrgnir_accepts_inot(nir_op op, unsigned src)
5417ec681f3Smrg{
5427ec681f3Smrg        switch (op) {
5437ec681f3Smrg        case nir_op_ior:
5447ec681f3Smrg        case nir_op_iand: /* TODO: b2f16 */
5457ec681f3Smrg        case nir_op_ixor:
5467ec681f3Smrg                return true;
5477ec681f3Smrg        case nir_op_b32csel:
5487ec681f3Smrg                /* Only the condition */
5497ec681f3Smrg                return (src == 0);
5507ec681f3Smrg        default:
5517ec681f3Smrg                return false;
5527ec681f3Smrg        }
5537ec681f3Smrg}
5547ec681f3Smrg
5557ec681f3Smrgstatic bool
5567ec681f3Smrgmir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
5577ec681f3Smrg{
5587ec681f3Smrg        if (pan_has_dest_mod(dest, op)) {
5597ec681f3Smrg                assert((*dest)->is_ssa);
5607ec681f3Smrg                BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
5617ec681f3Smrg                return true;
5627ec681f3Smrg        }
5637ec681f3Smrg
5647ec681f3Smrg        return false;
5657ec681f3Smrg}
5667ec681f3Smrg
5677ec681f3Smrg/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,
5687ec681f3Smrg * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):
5697ec681f3Smrg *
5707ec681f3Smrg * clamp_0_1(clamp_0_inf(x))  = clamp_m1_1(x)
5717ec681f3Smrg * clamp_0_1(clamp_m1_1(x))   = clamp_m1_1(x)
5727ec681f3Smrg * clamp_0_inf(clamp_0_1(x))  = clamp_m1_1(x)
5737ec681f3Smrg * clamp_0_inf(clamp_m1_1(x)) = clamp_m1_1(x)
5747ec681f3Smrg * clamp_m1_1(clamp_0_1(x))   = clamp_m1_1(x)
5757ec681f3Smrg * clamp_m1_1(clamp_0_inf(x)) = clamp_m1_1(x)
5767ec681f3Smrg *
5777ec681f3Smrg * So by cases any composition of output modifiers is equivalent to
5787ec681f3Smrg * clamp_m1_1 alone.
5797ec681f3Smrg */
5807ec681f3Smrgstatic unsigned
5817ec681f3Smrgmir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)
5827ec681f3Smrg{
5837ec681f3Smrg        bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);
5847ec681f3Smrg        bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);
5857ec681f3Smrg        bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);
5867ec681f3Smrg        bool prior = (prior_outmod != midgard_outmod_none);
5877ec681f3Smrg        int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;
5887ec681f3Smrg
5897ec681f3Smrg        return ((count > 1) || clamp_0_1) ?  midgard_outmod_clamp_0_1 :
5907ec681f3Smrg                                clamp_0_inf ? midgard_outmod_clamp_0_inf :
5917ec681f3Smrg                                clamp_m1_1 ?   midgard_outmod_clamp_m1_1 :
5927ec681f3Smrg                                prior_outmod;
5937ec681f3Smrg}
5947ec681f3Smrg
5957ec681f3Smrgstatic void
5967ec681f3Smrgmir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)
5977ec681f3Smrg{
5987ec681f3Smrg        nir_alu_src src = instr->src[i];
5997ec681f3Smrg
6007ec681f3Smrg        if (!is_int) {
6017ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_fneg))
6027ec681f3Smrg                        *neg = !(*neg);
6037ec681f3Smrg
6047ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_fabs))
6057ec681f3Smrg                        *abs = true;
6067ec681f3Smrg        }
6077ec681f3Smrg
6087ec681f3Smrg        if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
6097ec681f3Smrg                *not = true;
6107ec681f3Smrg
6117ec681f3Smrg        if (roundmode) {
6127ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_fround_even))
6137ec681f3Smrg                        *roundmode = MIDGARD_RTE;
6147ec681f3Smrg
6157ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_ftrunc))
6167ec681f3Smrg                        *roundmode = MIDGARD_RTZ;
6177ec681f3Smrg
6187ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_ffloor))
6197ec681f3Smrg                        *roundmode = MIDGARD_RTN;
6207ec681f3Smrg
6217ec681f3Smrg                if (pan_has_source_mod(&src, nir_op_fceil))
6227ec681f3Smrg                        *roundmode = MIDGARD_RTP;
6237ec681f3Smrg        }
6247ec681f3Smrg
6257ec681f3Smrg        unsigned bits = nir_src_bit_size(src.src);
6267ec681f3Smrg
6277ec681f3Smrg        ins->src[to] = nir_src_index(NULL, &src.src);
6287ec681f3Smrg        ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;
6297ec681f3Smrg
6307ec681f3Smrg        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
6317ec681f3Smrg                ins->swizzle[to][c] = src.swizzle[
6327ec681f3Smrg                        (!bcast_count || c < bcast_count) ? c :
6337ec681f3Smrg                                (bcast_count - 1)];
6347ec681f3Smrg        }
6357ec681f3Smrg}
6367ec681f3Smrg
6377ec681f3Smrg/* Midgard features both fcsel and icsel, depending on whether you want int or
6387ec681f3Smrg * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if
6397ec681f3Smrg * we should emit an int or float csel depending on what modifiers could be
6407ec681f3Smrg * placed. In the absense of modifiers, this is probably arbitrary. */
6417ec681f3Smrg
6427ec681f3Smrgstatic bool
6437ec681f3Smrgmir_is_bcsel_float(nir_alu_instr *instr)
6447ec681f3Smrg{
6457ec681f3Smrg        nir_op intmods[] = {
6467ec681f3Smrg                nir_op_i2i8, nir_op_i2i16,
6477ec681f3Smrg                nir_op_i2i32, nir_op_i2i64
6487ec681f3Smrg        };
6497ec681f3Smrg
6507ec681f3Smrg        nir_op floatmods[] = {
6517ec681f3Smrg                nir_op_fabs, nir_op_fneg,
6527ec681f3Smrg                nir_op_f2f16, nir_op_f2f32,
6537ec681f3Smrg                nir_op_f2f64
6547ec681f3Smrg        };
6557ec681f3Smrg
6567ec681f3Smrg        nir_op floatdestmods[] = {
6577ec681f3Smrg                nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,
6587ec681f3Smrg                nir_op_f2f16, nir_op_f2f32
6597ec681f3Smrg        };
6607ec681f3Smrg
6617ec681f3Smrg        signed score = 0;
6627ec681f3Smrg
6637ec681f3Smrg        for (unsigned i = 1; i < 3; ++i) {
6647ec681f3Smrg                nir_alu_src s = instr->src[i];
6657ec681f3Smrg                for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {
6667ec681f3Smrg                        if (pan_has_source_mod(&s, intmods[q]))
6677ec681f3Smrg                                score--;
6687ec681f3Smrg                }
6697ec681f3Smrg        }
6707ec681f3Smrg
6717ec681f3Smrg        for (unsigned i = 1; i < 3; ++i) {
6727ec681f3Smrg                nir_alu_src s = instr->src[i];
6737ec681f3Smrg                for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {
6747ec681f3Smrg                        if (pan_has_source_mod(&s, floatmods[q]))
6757ec681f3Smrg                                score++;
6767ec681f3Smrg                }
6777ec681f3Smrg        }
6787ec681f3Smrg
6797ec681f3Smrg        for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {
6807ec681f3Smrg                nir_dest *dest = &instr->dest.dest;
6817ec681f3Smrg                if (pan_has_dest_mod(&dest, floatdestmods[q]))
6827ec681f3Smrg                        score++;
6837ec681f3Smrg        }
6847ec681f3Smrg
6857ec681f3Smrg        return (score > 0);
6867ec681f3Smrg}
6877ec681f3Smrg
6887ec681f3Smrgstatic void
6897ec681f3Smrgemit_alu(compiler_context *ctx, nir_alu_instr *instr)
6907ec681f3Smrg{
6917ec681f3Smrg        nir_dest *dest = &instr->dest.dest;
6927ec681f3Smrg
6937ec681f3Smrg        if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))
6947ec681f3Smrg                return;
6957ec681f3Smrg
6967ec681f3Smrg        /* Derivatives end up emitted on the texture pipe, not the ALUs. This
6977ec681f3Smrg         * is handled elsewhere */
6987ec681f3Smrg
6997ec681f3Smrg        if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {
7007ec681f3Smrg                midgard_emit_derivatives(ctx, instr);
7017ec681f3Smrg                return;
7027ec681f3Smrg        }
7037ec681f3Smrg
7047ec681f3Smrg        bool is_ssa = dest->is_ssa;
7057ec681f3Smrg
7067ec681f3Smrg        unsigned nr_components = nir_dest_num_components(*dest);
7077ec681f3Smrg        unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
7087ec681f3Smrg        unsigned op = 0;
7097ec681f3Smrg
7107ec681f3Smrg        /* Number of components valid to check for the instruction (the rest
7117ec681f3Smrg         * will be forced to the last), or 0 to use as-is. Relevant as
7127ec681f3Smrg         * ball-type instructions have a channel count in NIR but are all vec4
7137ec681f3Smrg         * in Midgard */
7147ec681f3Smrg
7157ec681f3Smrg        unsigned broadcast_swizzle = 0;
7167ec681f3Smrg
7177ec681f3Smrg        /* Should we swap arguments? */
7187ec681f3Smrg        bool flip_src12 = false;
7197ec681f3Smrg
7207ec681f3Smrg        ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
7217ec681f3Smrg        ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest);
7227ec681f3Smrg
7237ec681f3Smrg        enum midgard_roundmode roundmode = MIDGARD_RTE;
7247ec681f3Smrg
7257ec681f3Smrg        switch (instr->op) {
7267ec681f3Smrg                ALU_CASE(fadd, fadd);
7277ec681f3Smrg                ALU_CASE(fmul, fmul);
7287ec681f3Smrg                ALU_CASE(fmin, fmin);
7297ec681f3Smrg                ALU_CASE(fmax, fmax);
7307ec681f3Smrg                ALU_CASE(imin, imin);
7317ec681f3Smrg                ALU_CASE(imax, imax);
7327ec681f3Smrg                ALU_CASE(umin, umin);
7337ec681f3Smrg                ALU_CASE(umax, umax);
7347ec681f3Smrg                ALU_CASE(ffloor, ffloor);
7357ec681f3Smrg                ALU_CASE(fround_even, froundeven);
7367ec681f3Smrg                ALU_CASE(ftrunc, ftrunc);
7377ec681f3Smrg                ALU_CASE(fceil, fceil);
7387ec681f3Smrg                ALU_CASE(fdot3, fdot3);
7397ec681f3Smrg                ALU_CASE(fdot4, fdot4);
7407ec681f3Smrg                ALU_CASE(iadd, iadd);
7417ec681f3Smrg                ALU_CASE(isub, isub);
7427ec681f3Smrg                ALU_CASE(iadd_sat, iaddsat);
7437ec681f3Smrg                ALU_CASE(isub_sat, isubsat);
7447ec681f3Smrg                ALU_CASE(uadd_sat, uaddsat);
7457ec681f3Smrg                ALU_CASE(usub_sat, usubsat);
7467ec681f3Smrg                ALU_CASE(imul, imul);
7477ec681f3Smrg                ALU_CASE(imul_high, imul);
7487ec681f3Smrg                ALU_CASE(umul_high, imul);
7497ec681f3Smrg                ALU_CASE(uclz, iclz);
7507ec681f3Smrg
7517ec681f3Smrg                /* Zero shoved as second-arg */
7527ec681f3Smrg                ALU_CASE(iabs, iabsdiff);
7537ec681f3Smrg
7547ec681f3Smrg                ALU_CASE(uabs_isub, iabsdiff);
7557ec681f3Smrg                ALU_CASE(uabs_usub, uabsdiff);
7567ec681f3Smrg
7577ec681f3Smrg                ALU_CASE(mov, imov);
7587ec681f3Smrg
7597ec681f3Smrg                ALU_CASE_CMP(feq32, feq);
7607ec681f3Smrg                ALU_CASE_CMP(fneu32, fne);
7617ec681f3Smrg                ALU_CASE_CMP(flt32, flt);
7627ec681f3Smrg                ALU_CASE_CMP(ieq32, ieq);
7637ec681f3Smrg                ALU_CASE_CMP(ine32, ine);
7647ec681f3Smrg                ALU_CASE_CMP(ilt32, ilt);
7657ec681f3Smrg                ALU_CASE_CMP(ult32, ult);
7667ec681f3Smrg
7677ec681f3Smrg                /* We don't have a native b2f32 instruction. Instead, like many
7687ec681f3Smrg                 * GPUs, we exploit booleans as 0/~0 for false/true, and
7697ec681f3Smrg                 * correspondingly AND
7707ec681f3Smrg                 * by 1.0 to do the type conversion. For the moment, prime us
7717ec681f3Smrg                 * to emit:
7727ec681f3Smrg                 *
7737ec681f3Smrg                 * iand [whatever], #0
7747ec681f3Smrg                 *
7757ec681f3Smrg                 * At the end of emit_alu (as MIR), we'll fix-up the constant
7767ec681f3Smrg                 */
7777ec681f3Smrg
7787ec681f3Smrg                ALU_CASE_CMP(b2f32, iand);
7797ec681f3Smrg                ALU_CASE_CMP(b2f16, iand);
7807ec681f3Smrg                ALU_CASE_CMP(b2i32, iand);
7817ec681f3Smrg
7827ec681f3Smrg                /* Likewise, we don't have a dedicated f2b32 instruction, but
7837ec681f3Smrg                 * we can do a "not equal to 0.0" test. */
7847ec681f3Smrg
7857ec681f3Smrg                ALU_CASE_CMP(f2b32, fne);
7867ec681f3Smrg                ALU_CASE_CMP(i2b32, ine);
7877ec681f3Smrg
7887ec681f3Smrg                ALU_CASE(frcp, frcp);
7897ec681f3Smrg                ALU_CASE(frsq, frsqrt);
7907ec681f3Smrg                ALU_CASE(fsqrt, fsqrt);
7917ec681f3Smrg                ALU_CASE(fexp2, fexp2);
7927ec681f3Smrg                ALU_CASE(flog2, flog2);
7937ec681f3Smrg
7947ec681f3Smrg                ALU_CASE_RTZ(f2i64, f2i_rte);
7957ec681f3Smrg                ALU_CASE_RTZ(f2u64, f2u_rte);
7967ec681f3Smrg                ALU_CASE_RTZ(i2f64, i2f_rte);
7977ec681f3Smrg                ALU_CASE_RTZ(u2f64, u2f_rte);
7987ec681f3Smrg
7997ec681f3Smrg                ALU_CASE_RTZ(f2i32, f2i_rte);
8007ec681f3Smrg                ALU_CASE_RTZ(f2u32, f2u_rte);
8017ec681f3Smrg                ALU_CASE_RTZ(i2f32, i2f_rte);
8027ec681f3Smrg                ALU_CASE_RTZ(u2f32, u2f_rte);
8037ec681f3Smrg
8047ec681f3Smrg                ALU_CASE_RTZ(f2i8, f2i_rte);
8057ec681f3Smrg                ALU_CASE_RTZ(f2u8, f2u_rte);
8067ec681f3Smrg
8077ec681f3Smrg                ALU_CASE_RTZ(f2i16, f2i_rte);
8087ec681f3Smrg                ALU_CASE_RTZ(f2u16, f2u_rte);
8097ec681f3Smrg                ALU_CASE_RTZ(i2f16, i2f_rte);
8107ec681f3Smrg                ALU_CASE_RTZ(u2f16, u2f_rte);
8117ec681f3Smrg
8127ec681f3Smrg                ALU_CASE(fsin, fsinpi);
8137ec681f3Smrg                ALU_CASE(fcos, fcospi);
8147ec681f3Smrg
8157ec681f3Smrg                /* We'll get 0 in the second arg, so:
8167ec681f3Smrg                 * ~a = ~(a | 0) = nor(a, 0) */
8177ec681f3Smrg                ALU_CASE(inot, inor);
8187ec681f3Smrg                ALU_CASE(iand, iand);
8197ec681f3Smrg                ALU_CASE(ior, ior);
8207ec681f3Smrg                ALU_CASE(ixor, ixor);
8217ec681f3Smrg                ALU_CASE(ishl, ishl);
8227ec681f3Smrg                ALU_CASE(ishr, iasr);
8237ec681f3Smrg                ALU_CASE(ushr, ilsr);
8247ec681f3Smrg
8257ec681f3Smrg                ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
8267ec681f3Smrg                ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
8277ec681f3Smrg                ALU_CASE_CMP(b32all_fequal4, fball_eq);
8287ec681f3Smrg
8297ec681f3Smrg                ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
8307ec681f3Smrg                ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
8317ec681f3Smrg                ALU_CASE_CMP(b32any_fnequal4, fbany_neq);
8327ec681f3Smrg
8337ec681f3Smrg                ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
8347ec681f3Smrg                ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
8357ec681f3Smrg                ALU_CASE_CMP(b32all_iequal4, iball_eq);
8367ec681f3Smrg
8377ec681f3Smrg                ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
8387ec681f3Smrg                ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
8397ec681f3Smrg                ALU_CASE_CMP(b32any_inequal4, ibany_neq);
8407ec681f3Smrg
8417ec681f3Smrg                /* Source mods will be shoved in later */
8427ec681f3Smrg                ALU_CASE(fabs, fmov);
8437ec681f3Smrg                ALU_CASE(fneg, fmov);
8447ec681f3Smrg                ALU_CASE(fsat, fmov);
8457ec681f3Smrg                ALU_CASE(fsat_signed_mali, fmov);
8467ec681f3Smrg                ALU_CASE(fclamp_pos_mali, fmov);
8477ec681f3Smrg
8487ec681f3Smrg        /* For size conversion, we use a move. Ideally though we would squash
8497ec681f3Smrg         * these ops together; maybe that has to happen after in NIR as part of
8507ec681f3Smrg         * propagation...? An earlier algebraic pass ensured we step down by
8517ec681f3Smrg         * only / exactly one size. If stepping down, we use a dest override to
8527ec681f3Smrg         * reduce the size; if stepping up, we use a larger-sized move with a
8537ec681f3Smrg         * half source and a sign/zero-extension modifier */
8547ec681f3Smrg
8557ec681f3Smrg        case nir_op_i2i8:
8567ec681f3Smrg        case nir_op_i2i16:
8577ec681f3Smrg        case nir_op_i2i32:
8587ec681f3Smrg        case nir_op_i2i64:
8597ec681f3Smrg        case nir_op_u2u8:
8607ec681f3Smrg        case nir_op_u2u16:
8617ec681f3Smrg        case nir_op_u2u32:
8627ec681f3Smrg        case nir_op_u2u64:
8637ec681f3Smrg        case nir_op_f2f16:
8647ec681f3Smrg        case nir_op_f2f32:
8657ec681f3Smrg        case nir_op_f2f64: {
8667ec681f3Smrg                if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 ||
8677ec681f3Smrg                    instr->op == nir_op_f2f64)
8687ec681f3Smrg                        op = midgard_alu_op_fmov;
8697ec681f3Smrg                else
8707ec681f3Smrg                        op = midgard_alu_op_imov;
8717ec681f3Smrg
8727ec681f3Smrg                break;
8737ec681f3Smrg        }
8747ec681f3Smrg
8757ec681f3Smrg        /* For greater-or-equal, we lower to less-or-equal and flip the
8767ec681f3Smrg         * arguments */
8777ec681f3Smrg
8787ec681f3Smrg        case nir_op_fge:
8797ec681f3Smrg        case nir_op_fge32:
8807ec681f3Smrg        case nir_op_ige32:
8817ec681f3Smrg        case nir_op_uge32: {
8827ec681f3Smrg                op =
8837ec681f3Smrg                        instr->op == nir_op_fge   ? midgard_alu_op_fle :
8847ec681f3Smrg                        instr->op == nir_op_fge32 ? midgard_alu_op_fle :
8857ec681f3Smrg                        instr->op == nir_op_ige32 ? midgard_alu_op_ile :
8867ec681f3Smrg                        instr->op == nir_op_uge32 ? midgard_alu_op_ule :
8877ec681f3Smrg                        0;
8887ec681f3Smrg
8897ec681f3Smrg                flip_src12 = true;
8907ec681f3Smrg                ALU_CHECK_CMP();
8917ec681f3Smrg                break;
8927ec681f3Smrg        }
8937ec681f3Smrg
8947ec681f3Smrg        case nir_op_b32csel: {
8957ec681f3Smrg                bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
8967ec681f3Smrg                bool is_float = mir_is_bcsel_float(instr);
8977ec681f3Smrg                op = is_float ?
8987ec681f3Smrg                        (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) :
8997ec681f3Smrg                        (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel);
9007ec681f3Smrg
9017ec681f3Smrg                break;
9027ec681f3Smrg        }
9037ec681f3Smrg
9047ec681f3Smrg        case nir_op_unpack_32_2x16:
9057ec681f3Smrg        case nir_op_unpack_32_4x8:
9067ec681f3Smrg        case nir_op_pack_32_2x16:
9077ec681f3Smrg        case nir_op_pack_32_4x8: {
9087ec681f3Smrg                op = midgard_alu_op_imov;
9097ec681f3Smrg                break;
9107ec681f3Smrg        }
9117ec681f3Smrg
9127ec681f3Smrg        default:
9137ec681f3Smrg                DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
9147ec681f3Smrg                assert(0);
9157ec681f3Smrg                return;
9167ec681f3Smrg        }
9177ec681f3Smrg
9187ec681f3Smrg        /* Promote imov to fmov if it might help inline a constant */
9197ec681f3Smrg        if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src)
9207ec681f3Smrg                        && nir_src_bit_size(instr->src[0].src) == 32
9217ec681f3Smrg                        && nir_is_same_comp_swizzle(instr->src[0].swizzle,
9227ec681f3Smrg                                nir_src_num_components(instr->src[0].src))) {
9237ec681f3Smrg                op = midgard_alu_op_fmov;
9247ec681f3Smrg        }
9257ec681f3Smrg
9267ec681f3Smrg        /* Midgard can perform certain modifiers on output of an ALU op */
9277ec681f3Smrg
9287ec681f3Smrg        unsigned outmod = 0;
9297ec681f3Smrg        bool is_int = midgard_is_integer_op(op);
9307ec681f3Smrg
9317ec681f3Smrg        if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) {
9327ec681f3Smrg                outmod = midgard_outmod_keephi;
9337ec681f3Smrg        } else if (midgard_is_integer_out_op(op)) {
9347ec681f3Smrg                outmod = midgard_outmod_keeplo;
9357ec681f3Smrg        } else if (instr->op == nir_op_fsat) {
9367ec681f3Smrg                outmod = midgard_outmod_clamp_0_1;
9377ec681f3Smrg        } else if (instr->op == nir_op_fsat_signed_mali) {
9387ec681f3Smrg                outmod = midgard_outmod_clamp_m1_1;
9397ec681f3Smrg        } else if (instr->op == nir_op_fclamp_pos_mali) {
9407ec681f3Smrg                outmod = midgard_outmod_clamp_0_inf;
9417ec681f3Smrg        }
9427ec681f3Smrg
9437ec681f3Smrg        /* Fetch unit, quirks, etc information */
9447ec681f3Smrg        unsigned opcode_props = alu_opcode_props[op].props;
9457ec681f3Smrg        bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
9467ec681f3Smrg
9477ec681f3Smrg        if (!midgard_is_integer_out_op(op)) {
9487ec681f3Smrg                outmod = mir_determine_float_outmod(ctx, &dest, outmod);
9497ec681f3Smrg        }
9507ec681f3Smrg
9517ec681f3Smrg        midgard_instruction ins = {
9527ec681f3Smrg                .type = TAG_ALU_4,
9537ec681f3Smrg                .dest = nir_dest_index(dest),
9547ec681f3Smrg                .dest_type = nir_op_infos[instr->op].output_type
9557ec681f3Smrg                        | nir_dest_bit_size(*dest),
9567ec681f3Smrg                .roundmode = roundmode,
9577ec681f3Smrg        };
9587ec681f3Smrg
9597ec681f3Smrg        enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ?
9607ec681f3Smrg                &ins.roundmode : NULL;
9617ec681f3Smrg
9627ec681f3Smrg        for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)
9637ec681f3Smrg                ins.src[i] = ~0;
9647ec681f3Smrg
9657ec681f3Smrg        if (quirk_flipped_r24) {
9667ec681f3Smrg                ins.src[0] = ~0;
9677ec681f3Smrg                mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle);
9687ec681f3Smrg        } else {
9697ec681f3Smrg                for (unsigned i = 0; i < nr_inputs; ++i) {
9707ec681f3Smrg                        unsigned to = i;
9717ec681f3Smrg
9727ec681f3Smrg                        if (instr->op == nir_op_b32csel) {
9737ec681f3Smrg                                /* The condition is the first argument; move
9747ec681f3Smrg                                 * the other arguments up one to be a binary
9757ec681f3Smrg                                 * instruction for Midgard with the condition
9767ec681f3Smrg                                 * last */
9777ec681f3Smrg
9787ec681f3Smrg                                if (i == 0)
9797ec681f3Smrg                                        to = 2;
9807ec681f3Smrg                                else if (flip_src12)
9817ec681f3Smrg                                        to = 2 - i;
9827ec681f3Smrg                                else
9837ec681f3Smrg                                        to = i - 1;
9847ec681f3Smrg                        } else if (flip_src12) {
9857ec681f3Smrg                                to = 1 - to;
9867ec681f3Smrg                        }
9877ec681f3Smrg
9887ec681f3Smrg                        mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle);
9897ec681f3Smrg
9907ec681f3Smrg                        /* (!c) ? a : b = c ? b : a */
9917ec681f3Smrg                        if (instr->op == nir_op_b32csel && ins.src_invert[2]) {
9927ec681f3Smrg                                ins.src_invert[2] = false;
9937ec681f3Smrg                                flip_src12 ^= true;
9947ec681f3Smrg                        }
9957ec681f3Smrg                }
9967ec681f3Smrg        }
9977ec681f3Smrg
9987ec681f3Smrg        if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
9997ec681f3Smrg                /* Lowered to move */
10007ec681f3Smrg                if (instr->op == nir_op_fneg)
10017ec681f3Smrg                        ins.src_neg[1] ^= true;
10027ec681f3Smrg
10037ec681f3Smrg                if (instr->op == nir_op_fabs)
10047ec681f3Smrg                        ins.src_abs[1] = true;
10057ec681f3Smrg        }
10067ec681f3Smrg
10077ec681f3Smrg        ins.mask = mask_of(nr_components);
10087ec681f3Smrg
10097ec681f3Smrg        /* Apply writemask if non-SSA, keeping in mind that we can't write to
10107ec681f3Smrg         * components that don't exist. Note modifier => SSA => !reg => no
10117ec681f3Smrg         * writemask, so we don't have to worry about writemasks here.*/
10127ec681f3Smrg
10137ec681f3Smrg        if (!is_ssa)
10147ec681f3Smrg                ins.mask &= instr->dest.write_mask;
10157ec681f3Smrg
10167ec681f3Smrg        ins.op = op;
10177ec681f3Smrg        ins.outmod = outmod;
10187ec681f3Smrg
10197ec681f3Smrg        /* Late fixup for emulated instructions */
10207ec681f3Smrg
10217ec681f3Smrg        if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
10227ec681f3Smrg                /* Presently, our second argument is an inline #0 constant.
10237ec681f3Smrg                 * Switch over to an embedded 1.0 constant (that can't fit
10247ec681f3Smrg                 * inline, since we're 32-bit, not 16-bit like the inline
10257ec681f3Smrg                 * constants) */
10267ec681f3Smrg
10277ec681f3Smrg                ins.has_inline_constant = false;
10287ec681f3Smrg                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
10297ec681f3Smrg                ins.src_types[1] = nir_type_float32;
10307ec681f3Smrg                ins.has_constants = true;
10317ec681f3Smrg
10327ec681f3Smrg                if (instr->op == nir_op_b2f32)
10337ec681f3Smrg                        ins.constants.f32[0] = 1.0f;
10347ec681f3Smrg                else
10357ec681f3Smrg                        ins.constants.i32[0] = 1;
10367ec681f3Smrg
10377ec681f3Smrg                for (unsigned c = 0; c < 16; ++c)
10387ec681f3Smrg                        ins.swizzle[1][c] = 0;
10397ec681f3Smrg        } else if (instr->op == nir_op_b2f16) {
10407ec681f3Smrg                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
10417ec681f3Smrg                ins.src_types[1] = nir_type_float16;
10427ec681f3Smrg                ins.has_constants = true;
10437ec681f3Smrg                ins.constants.i16[0] = _mesa_float_to_half(1.0);
10447ec681f3Smrg
10457ec681f3Smrg                for (unsigned c = 0; c < 16; ++c)
10467ec681f3Smrg                        ins.swizzle[1][c] = 0;
10477ec681f3Smrg        } else if (nr_inputs == 1 && !quirk_flipped_r24) {
10487ec681f3Smrg                /* Lots of instructions need a 0 plonked in */
10497ec681f3Smrg                ins.has_inline_constant = false;
10507ec681f3Smrg                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
10517ec681f3Smrg                ins.src_types[1] = ins.src_types[0];
10527ec681f3Smrg                ins.has_constants = true;
10537ec681f3Smrg                ins.constants.u32[0] = 0;
10547ec681f3Smrg
10557ec681f3Smrg                for (unsigned c = 0; c < 16; ++c)
10567ec681f3Smrg                        ins.swizzle[1][c] = 0;
10577ec681f3Smrg        } else if (instr->op == nir_op_pack_32_2x16) {
10587ec681f3Smrg                ins.dest_type = nir_type_uint16;
10597ec681f3Smrg                ins.mask = mask_of(nr_components * 2);
10607ec681f3Smrg                ins.is_pack = true;
10617ec681f3Smrg        } else if (instr->op == nir_op_pack_32_4x8) {
10627ec681f3Smrg                ins.dest_type = nir_type_uint8;
10637ec681f3Smrg                ins.mask = mask_of(nr_components * 4);
10647ec681f3Smrg                ins.is_pack = true;
10657ec681f3Smrg        } else if (instr->op == nir_op_unpack_32_2x16) {
10667ec681f3Smrg                ins.dest_type = nir_type_uint32;
10677ec681f3Smrg                ins.mask = mask_of(nr_components >> 1);
10687ec681f3Smrg                ins.is_pack = true;
10697ec681f3Smrg        } else if (instr->op == nir_op_unpack_32_4x8) {
10707ec681f3Smrg                ins.dest_type = nir_type_uint32;
10717ec681f3Smrg                ins.mask = mask_of(nr_components >> 2);
10727ec681f3Smrg                ins.is_pack = true;
10737ec681f3Smrg        }
10747ec681f3Smrg
10757ec681f3Smrg        if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
10767ec681f3Smrg                /* To avoid duplicating the lookup tables (probably), true LUT
10777ec681f3Smrg                 * instructions can only operate as if they were scalars. Lower
10787ec681f3Smrg                 * them here by changing the component. */
10797ec681f3Smrg
10807ec681f3Smrg                unsigned orig_mask = ins.mask;
10817ec681f3Smrg
10827ec681f3Smrg                unsigned swizzle_back[MIR_VEC_COMPONENTS];
10837ec681f3Smrg                memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back));
10847ec681f3Smrg
10857ec681f3Smrg                midgard_instruction ins_split[MIR_VEC_COMPONENTS];
10867ec681f3Smrg                unsigned ins_count = 0;
10877ec681f3Smrg
10887ec681f3Smrg                for (int i = 0; i < nr_components; ++i) {
10897ec681f3Smrg                        /* Mask the associated component, dropping the
10907ec681f3Smrg                         * instruction if needed */
10917ec681f3Smrg
10927ec681f3Smrg                        ins.mask = 1 << i;
10937ec681f3Smrg                        ins.mask &= orig_mask;
10947ec681f3Smrg
10957ec681f3Smrg                        for (unsigned j = 0; j < ins_count; ++j) {
10967ec681f3Smrg                                if (swizzle_back[i] == ins_split[j].swizzle[0][0]) {
10977ec681f3Smrg                                        ins_split[j].mask |= ins.mask;
10987ec681f3Smrg                                        ins.mask = 0;
10997ec681f3Smrg                                        break;
11007ec681f3Smrg                                }
11017ec681f3Smrg                        }
11027ec681f3Smrg
11037ec681f3Smrg                        if (!ins.mask)
11047ec681f3Smrg                                continue;
11057ec681f3Smrg
11067ec681f3Smrg                        for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j)
11077ec681f3Smrg                                ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */
11087ec681f3Smrg
11097ec681f3Smrg                        ins_split[ins_count] = ins;
11107ec681f3Smrg
11117ec681f3Smrg                        ++ins_count;
11127ec681f3Smrg                }
11137ec681f3Smrg
11147ec681f3Smrg                for (unsigned i = 0; i < ins_count; ++i) {
11157ec681f3Smrg                        emit_mir_instruction(ctx, ins_split[i]);
11167ec681f3Smrg                }
11177ec681f3Smrg        } else {
11187ec681f3Smrg                emit_mir_instruction(ctx, ins);
11197ec681f3Smrg        }
11207ec681f3Smrg}
11217ec681f3Smrg
11227ec681f3Smrg#undef ALU_CASE
11237ec681f3Smrg
11247ec681f3Smrgstatic void
11257ec681f3Smrgmir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
11267ec681f3Smrg{
11277ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
11287ec681f3Smrg        unsigned nir_mask = 0;
11297ec681f3Smrg        unsigned dsize = 0;
11307ec681f3Smrg
11317ec681f3Smrg        if (is_read) {
11327ec681f3Smrg                nir_mask = mask_of(nir_intrinsic_dest_components(intr));
11337ec681f3Smrg                dsize = nir_dest_bit_size(intr->dest);
11347ec681f3Smrg        } else {
11357ec681f3Smrg                nir_mask = nir_intrinsic_write_mask(intr);
11367ec681f3Smrg                dsize = 32;
11377ec681f3Smrg        }
11387ec681f3Smrg
11397ec681f3Smrg        /* Once we have the NIR mask, we need to normalize to work in 32-bit space */
11407ec681f3Smrg        unsigned bytemask = pan_to_bytemask(dsize, nir_mask);
11417ec681f3Smrg        ins->dest_type = nir_type_uint | dsize;
11427ec681f3Smrg        mir_set_bytemask(ins, bytemask);
11437ec681f3Smrg}
11447ec681f3Smrg
11457ec681f3Smrg/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
11467ec681f3Smrg * optimized) versions of UBO #0 */
11477ec681f3Smrg
11487ec681f3Smrgstatic midgard_instruction *
11497ec681f3Smrgemit_ubo_read(
11507ec681f3Smrg        compiler_context *ctx,
11517ec681f3Smrg        nir_instr *instr,
11527ec681f3Smrg        unsigned dest,
11537ec681f3Smrg        unsigned offset,
11547ec681f3Smrg        nir_src *indirect_offset,
11557ec681f3Smrg        unsigned indirect_shift,
11567ec681f3Smrg        unsigned index,
11577ec681f3Smrg        unsigned nr_comps)
11587ec681f3Smrg{
11597ec681f3Smrg        midgard_instruction ins;
11607ec681f3Smrg
11617ec681f3Smrg        unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ?
11627ec681f3Smrg                nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32;
11637ec681f3Smrg
11647ec681f3Smrg        unsigned bitsize = dest_size * nr_comps;
11657ec681f3Smrg
11667ec681f3Smrg        /* Pick the smallest intrinsic to avoid out-of-bounds reads */
11677ec681f3Smrg        if (bitsize <= 32)
11687ec681f3Smrg                ins = m_ld_ubo_32(dest, 0);
11697ec681f3Smrg        else if (bitsize <= 64)
11707ec681f3Smrg                ins = m_ld_ubo_64(dest, 0);
11717ec681f3Smrg        else if (bitsize <= 128)
11727ec681f3Smrg                ins = m_ld_ubo_128(dest, 0);
11737ec681f3Smrg        else
11747ec681f3Smrg                unreachable("Invalid UBO read size");
11757ec681f3Smrg
11767ec681f3Smrg        ins.constants.u32[0] = offset;
11777ec681f3Smrg
11787ec681f3Smrg        if (instr->type == nir_instr_type_intrinsic)
11797ec681f3Smrg                mir_set_intr_mask(instr, &ins, true);
11807ec681f3Smrg
11817ec681f3Smrg        if (indirect_offset) {
11827ec681f3Smrg                ins.src[2] = nir_src_index(ctx, indirect_offset);
11837ec681f3Smrg                ins.src_types[2] = nir_type_uint32;
11847ec681f3Smrg                ins.load_store.index_shift = indirect_shift;
11857ec681f3Smrg
11867ec681f3Smrg                /* X component for the whole swizzle to prevent register
11877ec681f3Smrg                 * pressure from ballooning from the extra components */
11887ec681f3Smrg                for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i)
11897ec681f3Smrg                        ins.swizzle[2][i] = 0;
11907ec681f3Smrg        } else {
11917ec681f3Smrg                ins.load_store.index_reg = REGISTER_LDST_ZERO;
11927ec681f3Smrg        }
11937ec681f3Smrg
11947ec681f3Smrg        if (indirect_offset && indirect_offset->is_ssa && !indirect_shift)
11957ec681f3Smrg                mir_set_ubo_offset(&ins, indirect_offset, offset);
11967ec681f3Smrg
11977ec681f3Smrg        midgard_pack_ubo_index_imm(&ins.load_store, index);
11987ec681f3Smrg
11997ec681f3Smrg        return emit_mir_instruction(ctx, ins);
12007ec681f3Smrg}
12017ec681f3Smrg
12027ec681f3Smrg/* Globals are like UBOs if you squint. And shared memory is like globals if
12037ec681f3Smrg * you squint even harder */
12047ec681f3Smrg
12057ec681f3Smrgstatic void
12067ec681f3Smrgemit_global(
12077ec681f3Smrg        compiler_context *ctx,
12087ec681f3Smrg        nir_instr *instr,
12097ec681f3Smrg        bool is_read,
12107ec681f3Smrg        unsigned srcdest,
12117ec681f3Smrg        nir_src *offset,
12127ec681f3Smrg        unsigned seg)
12137ec681f3Smrg{
12147ec681f3Smrg        midgard_instruction ins;
12157ec681f3Smrg
12167ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
12177ec681f3Smrg        if (is_read) {
12187ec681f3Smrg                unsigned bitsize = nir_dest_bit_size(intr->dest) *
12197ec681f3Smrg                        nir_dest_num_components(intr->dest);
12207ec681f3Smrg
12217ec681f3Smrg                if (bitsize <= 32)
12227ec681f3Smrg                        ins = m_ld_32(srcdest, 0);
12237ec681f3Smrg                else if (bitsize <= 64)
12247ec681f3Smrg                        ins = m_ld_64(srcdest, 0);
12257ec681f3Smrg                else if (bitsize <= 128)
12267ec681f3Smrg                        ins = m_ld_128(srcdest, 0);
12277ec681f3Smrg                else
12287ec681f3Smrg                        unreachable("Invalid global read size");
12297ec681f3Smrg        } else {
12307ec681f3Smrg                unsigned bitsize = nir_src_bit_size(intr->src[0]) *
12317ec681f3Smrg                        nir_src_num_components(intr->src[0]);
12327ec681f3Smrg
12337ec681f3Smrg                if (bitsize <= 32)
12347ec681f3Smrg                        ins = m_st_32(srcdest, 0);
12357ec681f3Smrg                else if (bitsize <= 64)
12367ec681f3Smrg                        ins = m_st_64(srcdest, 0);
12377ec681f3Smrg                else if (bitsize <= 128)
12387ec681f3Smrg                        ins = m_st_128(srcdest, 0);
12397ec681f3Smrg                else
12407ec681f3Smrg                        unreachable("Invalid global store size");
12417ec681f3Smrg        }
12427ec681f3Smrg
12437ec681f3Smrg        mir_set_offset(ctx, &ins, offset, seg);
12447ec681f3Smrg        mir_set_intr_mask(instr, &ins, is_read);
12457ec681f3Smrg
12467ec681f3Smrg        /* Set a valid swizzle for masked out components */
12477ec681f3Smrg        assert(ins.mask);
12487ec681f3Smrg        unsigned first_component = __builtin_ffs(ins.mask) - 1;
12497ec681f3Smrg
12507ec681f3Smrg        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) {
12517ec681f3Smrg                if (!(ins.mask & (1 << i)))
12527ec681f3Smrg                        ins.swizzle[0][i] = first_component;
12537ec681f3Smrg        }
12547ec681f3Smrg
12557ec681f3Smrg        emit_mir_instruction(ctx, ins);
12567ec681f3Smrg}
12577ec681f3Smrg
/* Emits an atomic op through the load/store unit.
 *
 * If is_shared is off, the only other possible values are globals, since
 * SSBO's are being lowered to globals through a NIR pass.
 * `image_direct_address` should be ~0 when instr is not an image_atomic
 * and the destination register of a lea_image op when it is an image_atomic. */
static void
emit_atomic(
        compiler_context *ctx,
        nir_intrinsic_instr *instr,
        bool is_shared,
        midgard_load_store_op op,
        unsigned image_direct_address)
{
        /* Only signed min/max care about signedness; every other atomic
         * operates on raw bits and is typed unsigned. */
        nir_alu_type type =
                (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ?
                nir_type_int : nir_type_uint;

        bool is_image = image_direct_address != ~0;

        unsigned dest = nir_dest_index(&instr->dest);
        /* Image atomics carry (handle, coord, sample, value, ...), so the
         * data operand lives at src[3]; otherwise it is src[1] */
        unsigned val_src = is_image ? 3 : 1;
        unsigned val = nir_src_index(ctx, &instr->src[val_src]);
        unsigned bitsize = nir_src_bit_size(instr->src[val_src]);
        emit_explicit_constant(ctx, val, val);

        /* Data operand goes in src[3] by default; cmpxchg rearranges below */
        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,
                .mask = 0xF,
                .dest = dest,
                .src = { ~0, ~0, ~0, val },
                .src_types = { 0, 0, 0, type | bitsize },
                .op = op
        };

        nir_src *src_offset = nir_get_io_offset_src(instr);

        if (op == midgard_op_atomic_cmpxchg) {
                /* cmpxchg has a second data operand: the exchange value */
                unsigned xchg_val_src = is_image ? 4 : 2;
                unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]);
                emit_explicit_constant(ctx, xchg_val, xchg_val);

                /* Compare value moves to src[2]; exchange value takes src[3] */
                ins.src[2] = val;
                ins.src_types[2] = type | bitsize;
                ins.src[3] = xchg_val;

                if (is_shared) {
                        ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR;
                        ins.load_store.arg_comp = COMPONENT_Z;
                        ins.load_store.bitsize_toggle = true;
                } else {
                        /* 64-bit address split across two 32-bit components */
                        for(unsigned i = 0; i < 2; ++i)
                                ins.swizzle[1][i] = i;

                        ins.src[1] = is_image ? image_direct_address :
                                                nir_src_index(ctx, src_offset);
                        ins.src_types[1] = nir_type_uint64;
                }
        } else if (is_image) {
                /* Use the 64-bit address previously computed by lea_image */
                for(unsigned i = 0; i < 2; ++i)
                        ins.swizzle[2][i] = i;

                ins.src[2] = image_direct_address;
                ins.src_types[2] = nir_type_uint64;

                ins.load_store.arg_reg = REGISTER_LDST_ZERO;
                ins.load_store.bitsize_toggle = true;
                ins.load_store.index_format = midgard_index_address_u64;
        } else
                mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL);

        /* Atomics return the pre-op value, so mask like a read */
        mir_set_intr_mask(&instr->instr, &ins, true);

        emit_mir_instruction(ctx, ins);
}
13317ec681f3Smrg
13327ec681f3Smrgstatic void
13337ec681f3Smrgemit_varying_read(
13347ec681f3Smrg        compiler_context *ctx,
13357ec681f3Smrg        unsigned dest, unsigned offset,
13367ec681f3Smrg        unsigned nr_comp, unsigned component,
13377ec681f3Smrg        nir_src *indirect_offset, nir_alu_type type, bool flat)
13387ec681f3Smrg{
13397ec681f3Smrg        /* XXX: Half-floats? */
13407ec681f3Smrg        /* TODO: swizzle, mask */
13417ec681f3Smrg
13427ec681f3Smrg        midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));
13437ec681f3Smrg        ins.mask = mask_of(nr_comp);
13447ec681f3Smrg        ins.dest_type = type;
13457ec681f3Smrg
13467ec681f3Smrg        if (type == nir_type_float16) {
13477ec681f3Smrg                /* Ensure we are aligned so we can pack it later */
13487ec681f3Smrg                ins.mask = mask_of(ALIGN_POT(nr_comp, 2));
13497ec681f3Smrg        }
13507ec681f3Smrg
13517ec681f3Smrg        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)
13527ec681f3Smrg                ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);
13537ec681f3Smrg
13547ec681f3Smrg
13557ec681f3Smrg        midgard_varying_params p = {
13567ec681f3Smrg                .flat_shading = flat,
13577ec681f3Smrg                .perspective_correction = 1,
13587ec681f3Smrg                .interpolate_sample = true,
13597ec681f3Smrg        };
13607ec681f3Smrg        midgard_pack_varying_params(&ins.load_store, p);
13617ec681f3Smrg
13627ec681f3Smrg        if (indirect_offset) {
13637ec681f3Smrg                ins.src[2] = nir_src_index(ctx, indirect_offset);
13647ec681f3Smrg                ins.src_types[2] = nir_type_uint32;
13657ec681f3Smrg        } else
13667ec681f3Smrg                ins.load_store.index_reg = REGISTER_LDST_ZERO;
13677ec681f3Smrg
13687ec681f3Smrg        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
13697ec681f3Smrg        ins.load_store.index_format = midgard_index_address_u32;
13707ec681f3Smrg
13717ec681f3Smrg        /* Use the type appropriate load */
13727ec681f3Smrg        switch (type) {
13737ec681f3Smrg        case nir_type_uint32:
13747ec681f3Smrg        case nir_type_bool32:
13757ec681f3Smrg                ins.op = midgard_op_ld_vary_32u;
13767ec681f3Smrg                break;
13777ec681f3Smrg        case nir_type_int32:
13787ec681f3Smrg                ins.op = midgard_op_ld_vary_32i;
13797ec681f3Smrg                break;
13807ec681f3Smrg        case nir_type_float32:
13817ec681f3Smrg                ins.op = midgard_op_ld_vary_32;
13827ec681f3Smrg                break;
13837ec681f3Smrg        case nir_type_float16:
13847ec681f3Smrg                ins.op = midgard_op_ld_vary_16;
13857ec681f3Smrg                break;
13867ec681f3Smrg        default:
13877ec681f3Smrg                unreachable("Attempted to load unknown type");
13887ec681f3Smrg                break;
13897ec681f3Smrg        }
13907ec681f3Smrg
13917ec681f3Smrg        emit_mir_instruction(ctx, ins);
13927ec681f3Smrg}
13937ec681f3Smrg
13947ec681f3Smrg
/* If `is_atomic` is true, we emit a `lea_image` since midgard doesn't have
 * special image_atomic opcodes. The caller can then use that address to emit a
 * normal atomic opcode.
 *
 * Returns the emitted instruction; for the lea_image case the caller needs
 * its destination register to chain the subsequent atomic. */
static midgard_instruction
emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)
{
        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        /* In vertex shaders, image descriptors are placed after the regular
         * vertex attributes in the attribute table */
        unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?
                util_bitcount64(ctx->nir->info.inputs_read) : 0;
        unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
        bool is_array = nir_intrinsic_image_array(instr);
        bool is_store = instr->intrinsic == nir_intrinsic_image_store;

        /* TODO: MSAA */
        assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");

        unsigned coord_reg = nir_src_index(ctx, &instr->src[1]);
        emit_explicit_constant(ctx, coord_reg, coord_reg);

        nir_src *index = &instr->src[0];
        bool is_direct = nir_src_is_const(*index);

        /* For image opcodes, address is used as an index into the attribute descriptor */
        unsigned address = nr_attr;
        if (is_direct)
                address += nir_src_as_uint(*index);

        midgard_instruction ins;
        if (is_store) { /* emit st_image_* */
                unsigned val = nir_src_index(ctx, &instr->src[3]);
                emit_explicit_constant(ctx, val, val);

                nir_alu_type type = nir_intrinsic_src_type(instr);
                ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address));
                nir_alu_type base_type = nir_alu_type_get_base_type(type);
                ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]);
        } else if (is_atomic) { /* emit lea_image */
                unsigned dest = make_compiler_temp_reg(ctx);
                ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address));
                ins.mask = mask_of(2); /* 64-bit memory address */
        } else { /* emit ld_image_* */
                nir_alu_type type = nir_intrinsic_dest_type(instr);
                ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address));
                ins.mask = mask_of(nir_intrinsic_dest_components(instr));
                ins.dest_type = type;
        }

        /* Coord reg */
        ins.src[1] = coord_reg;
        ins.src_types[1] = nir_type_uint16;
        /* NOTE(review): the toggle appears to select the wider coordinate
         * encoding used for 3D/array accesses — confirm against the ISA docs */
        if (nr_dim == 3 || is_array) {
                ins.load_store.bitsize_toggle = true;
        }

        /* Image index reg */
        if (!is_direct) {
                ins.src[2] = nir_src_index(ctx, index);
                ins.src_types[2] = nir_type_uint32;
        } else
                ins.load_store.index_reg = REGISTER_LDST_ZERO;

        emit_mir_instruction(ctx, ins);

        return ins;
}
14597ec681f3Smrg
14607ec681f3Smrgstatic void
14617ec681f3Smrgemit_attr_read(
14627ec681f3Smrg        compiler_context *ctx,
14637ec681f3Smrg        unsigned dest, unsigned offset,
14647ec681f3Smrg        unsigned nr_comp, nir_alu_type t)
14657ec681f3Smrg{
14667ec681f3Smrg        midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset));
14677ec681f3Smrg        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
14687ec681f3Smrg        ins.load_store.index_reg = REGISTER_LDST_ZERO;
14697ec681f3Smrg        ins.mask = mask_of(nr_comp);
14707ec681f3Smrg
14717ec681f3Smrg        /* Use the type appropriate load */
14727ec681f3Smrg        switch (t) {
14737ec681f3Smrg        case nir_type_uint:
14747ec681f3Smrg        case nir_type_bool:
14757ec681f3Smrg                ins.op = midgard_op_ld_attr_32u;
14767ec681f3Smrg                break;
14777ec681f3Smrg        case nir_type_int:
14787ec681f3Smrg                ins.op = midgard_op_ld_attr_32i;
14797ec681f3Smrg                break;
14807ec681f3Smrg        case nir_type_float:
14817ec681f3Smrg                ins.op = midgard_op_ld_attr_32;
14827ec681f3Smrg                break;
14837ec681f3Smrg        default:
14847ec681f3Smrg                unreachable("Attempted to load unknown type");
14857ec681f3Smrg                break;
14867ec681f3Smrg        }
14877ec681f3Smrg
14887ec681f3Smrg        emit_mir_instruction(ctx, ins);
14897ec681f3Smrg}
14907ec681f3Smrg
14917ec681f3Smrgstatic void
14927ec681f3Smrgemit_sysval_read(compiler_context *ctx, nir_instr *instr,
14937ec681f3Smrg                unsigned nr_components, unsigned offset)
14947ec681f3Smrg{
14957ec681f3Smrg        nir_dest nir_dest;
14967ec681f3Smrg
14977ec681f3Smrg        /* Figure out which uniform this is */
14987ec681f3Smrg        unsigned sysval_ubo =
14997ec681f3Smrg                MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos);
15007ec681f3Smrg        int sysval = panfrost_sysval_for_instr(instr, &nir_dest);
15017ec681f3Smrg        unsigned dest = nir_dest_index(&nir_dest);
15027ec681f3Smrg        unsigned uniform =
15037ec681f3Smrg                pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval);
15047ec681f3Smrg
15057ec681f3Smrg        /* Emit the read itself -- this is never indirect */
15067ec681f3Smrg        midgard_instruction *ins =
15077ec681f3Smrg                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
15087ec681f3Smrg                              sysval_ubo, nr_components);
15097ec681f3Smrg
15107ec681f3Smrg        ins->mask = mask_of(nr_components);
15117ec681f3Smrg}
15127ec681f3Smrg
15137ec681f3Smrgstatic unsigned
15147ec681f3Smrgcompute_builtin_arg(nir_intrinsic_op op)
15157ec681f3Smrg{
15167ec681f3Smrg        switch (op) {
15177ec681f3Smrg        case nir_intrinsic_load_workgroup_id:
15187ec681f3Smrg                return REGISTER_LDST_GROUP_ID;
15197ec681f3Smrg        case nir_intrinsic_load_local_invocation_id:
15207ec681f3Smrg                return REGISTER_LDST_LOCAL_THREAD_ID;
15217ec681f3Smrg        case nir_intrinsic_load_global_invocation_id:
15227ec681f3Smrg        case nir_intrinsic_load_global_invocation_id_zero_base:
15237ec681f3Smrg                return REGISTER_LDST_GLOBAL_THREAD_ID;
15247ec681f3Smrg        default:
15257ec681f3Smrg                unreachable("Invalid compute paramater loaded");
15267ec681f3Smrg        }
15277ec681f3Smrg}
15287ec681f3Smrg
/* Emits the writeout branch that stores fragment outputs: colour `src` to
 * render target `rt` (or depth/stencil only for MIDGARD_ZS_RT), plus optional
 * depth `src_z` and stencil `src_s` (~0 means "not written"). `sample_iter`
 * selects which sample iteration this writeout covers. */
static void
emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,
                    enum midgard_rt_id rt, unsigned sample_iter)
{
        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
        assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));

        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];

        /* Only one writeout per (RT, sample) pair may be emitted */
        assert(!br);

        emit_explicit_constant(ctx, src, src);

        struct midgard_instruction ins =
                v_branch(false, false);

        bool depth_only = (rt == MIDGARD_ZS_RT);

        ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;

        /* Add dependencies */
        ins.src[0] = src;
        ins.src_types[0] = nir_type_uint32;

        /* Inline constant encodes the writeout target: RT index and sample,
         * or the 0xFF marker for a depth/stencil-only writeout */
        if (depth_only)
                ins.constants.u32[0] = 0xFF;
        else
                ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;

        for (int i = 0; i < 4; ++i)
                ins.swizzle[0][i] = i;

        /* ~0 marks an absent source; wire up only the requested Z/S writes */
        if (~src_z) {
                emit_explicit_constant(ctx, src_z, src_z);
                ins.src[2] = src_z;
                ins.src_types[2] = nir_type_uint32;
                ins.writeout |= PAN_WRITEOUT_Z;
        }
        if (~src_s) {
                emit_explicit_constant(ctx, src_s, src_s);
                ins.src[3] = src_s;
                ins.src_types[3] = nir_type_uint32;
                ins.writeout |= PAN_WRITEOUT_S;
        }

        /* Emit the branch */
        br = emit_mir_instruction(ctx, ins);
        schedule_barrier(ctx);
        ctx->writeout_branch[rt][sample_iter] = br;

        /* Push our current location = current block count - 1 = where we'll
         * jump to. Maybe a bit too clever for my own good */

        br->branch.target_block = ctx->block_count - 1;
}
15847ec681f3Smrg
15857ec681f3Smrgstatic void
15867ec681f3Smrgemit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
15877ec681f3Smrg{
15887ec681f3Smrg        unsigned reg = nir_dest_index(&instr->dest);
15897ec681f3Smrg        midgard_instruction ins = m_ldst_mov(reg, 0);
15907ec681f3Smrg        ins.mask = mask_of(3);
15917ec681f3Smrg        ins.swizzle[0][3] = COMPONENT_X; /* xyzx */
15927ec681f3Smrg        ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic);
15937ec681f3Smrg        emit_mir_instruction(ctx, ins);
15947ec681f3Smrg}
15957ec681f3Smrg
15967ec681f3Smrgstatic unsigned
15977ec681f3Smrgvertex_builtin_arg(nir_intrinsic_op op)
15987ec681f3Smrg{
15997ec681f3Smrg        switch (op) {
16007ec681f3Smrg        case nir_intrinsic_load_vertex_id_zero_base:
16017ec681f3Smrg                return PAN_VERTEX_ID;
16027ec681f3Smrg        case nir_intrinsic_load_instance_id:
16037ec681f3Smrg                return PAN_INSTANCE_ID;
16047ec681f3Smrg        default:
16057ec681f3Smrg                unreachable("Invalid vertex builtin");
16067ec681f3Smrg        }
16077ec681f3Smrg}
16087ec681f3Smrg
16097ec681f3Smrgstatic void
16107ec681f3Smrgemit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
16117ec681f3Smrg{
16127ec681f3Smrg        unsigned reg = nir_dest_index(&instr->dest);
16137ec681f3Smrg        emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int);
16147ec681f3Smrg}
16157ec681f3Smrg
16167ec681f3Smrgstatic void
16177ec681f3Smrgemit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx)
16187ec681f3Smrg{
16197ec681f3Smrg        unsigned reg = nir_dest_index(&instr->dest);
16207ec681f3Smrg
16217ec681f3Smrg        midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
16227ec681f3Smrg        ld.op = midgard_op_ld_special_32u;
16237ec681f3Smrg        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx);
16247ec681f3Smrg        ld.load_store.index_reg = REGISTER_LDST_ZERO;
16257ec681f3Smrg
16267ec681f3Smrg        for (int i = 0; i < 4; ++i)
16277ec681f3Smrg                ld.swizzle[0][i] = COMPONENT_X;
16287ec681f3Smrg
16297ec681f3Smrg        emit_mir_instruction(ctx, ld);
16307ec681f3Smrg}
16317ec681f3Smrg
16327ec681f3Smrgstatic void
16337ec681f3Smrgemit_control_barrier(compiler_context *ctx)
16347ec681f3Smrg{
16357ec681f3Smrg        midgard_instruction ins = {
16367ec681f3Smrg                .type = TAG_TEXTURE_4,
16377ec681f3Smrg                .dest = ~0,
16387ec681f3Smrg                .src = { ~0, ~0, ~0, ~0 },
16397ec681f3Smrg                .op = midgard_tex_op_barrier,
16407ec681f3Smrg        };
16417ec681f3Smrg
16427ec681f3Smrg        emit_mir_instruction(ctx, ins);
16437ec681f3Smrg}
16447ec681f3Smrg
16457ec681f3Smrgstatic unsigned
16467ec681f3Smrgmir_get_branch_cond(nir_src *src, bool *invert)
16477ec681f3Smrg{
16487ec681f3Smrg        /* Wrap it. No swizzle since it's a scalar */
16497ec681f3Smrg
16507ec681f3Smrg        nir_alu_src alu = {
16517ec681f3Smrg                .src = *src
16527ec681f3Smrg        };
16537ec681f3Smrg
16547ec681f3Smrg        *invert = pan_has_source_mod(&alu, nir_op_inot);
16557ec681f3Smrg        return nir_src_index(NULL, &alu.src);
16567ec681f3Smrg}
16577ec681f3Smrg
16587ec681f3Smrgstatic uint8_t
16597ec681f3Smrgoutput_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr)
16607ec681f3Smrg{
16617ec681f3Smrg        if (ctx->inputs->is_blend)
16627ec681f3Smrg                return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt;
16637ec681f3Smrg
16647ec681f3Smrg        const nir_variable *var;
16657ec681f3Smrg        var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr));
16667ec681f3Smrg        assert(var);
16677ec681f3Smrg
16687ec681f3Smrg        unsigned loc = var->data.location;
16697ec681f3Smrg
16707ec681f3Smrg        if (loc >= FRAG_RESULT_DATA0)
16717ec681f3Smrg                return loc - FRAG_RESULT_DATA0;
16727ec681f3Smrg
16737ec681f3Smrg        if (loc == FRAG_RESULT_DEPTH)
16747ec681f3Smrg                return 0x1F;
16757ec681f3Smrg        if (loc == FRAG_RESULT_STENCIL)
16767ec681f3Smrg                return 0x1E;
16777ec681f3Smrg
16787ec681f3Smrg        unreachable("Invalid RT to load from");
16797ec681f3Smrg}
16807ec681f3Smrg
16817ec681f3Smrgstatic void
16827ec681f3Smrgemit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
16837ec681f3Smrg{
16847ec681f3Smrg        unsigned offset = 0, reg;
16857ec681f3Smrg
16867ec681f3Smrg        switch (instr->intrinsic) {
16877ec681f3Smrg        case nir_intrinsic_discard_if:
16887ec681f3Smrg        case nir_intrinsic_discard: {
16897ec681f3Smrg                bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
16907ec681f3Smrg                struct midgard_instruction discard = v_branch(conditional, false);
16917ec681f3Smrg                discard.branch.target_type = TARGET_DISCARD;
16927ec681f3Smrg
16937ec681f3Smrg                if (conditional) {
16947ec681f3Smrg                        discard.src[0] = mir_get_branch_cond(&instr->src[0],
16957ec681f3Smrg                                        &discard.branch.invert_conditional);
16967ec681f3Smrg                        discard.src_types[0] = nir_type_uint32;
16977ec681f3Smrg                }
16987ec681f3Smrg
16997ec681f3Smrg                emit_mir_instruction(ctx, discard);
17007ec681f3Smrg                schedule_barrier(ctx);
17017ec681f3Smrg
17027ec681f3Smrg                break;
17037ec681f3Smrg        }
17047ec681f3Smrg
17057ec681f3Smrg        case nir_intrinsic_image_load:
17067ec681f3Smrg        case nir_intrinsic_image_store:
17077ec681f3Smrg                emit_image_op(ctx, instr, false);
17087ec681f3Smrg                break;
17097ec681f3Smrg
17107ec681f3Smrg        case nir_intrinsic_image_size: {
17117ec681f3Smrg                unsigned nr_comp = nir_intrinsic_dest_components(instr);
17127ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, nr_comp, 0);
17137ec681f3Smrg                break;
17147ec681f3Smrg        }
17157ec681f3Smrg
17167ec681f3Smrg        case nir_intrinsic_load_ubo:
17177ec681f3Smrg        case nir_intrinsic_load_global:
17187ec681f3Smrg        case nir_intrinsic_load_global_constant:
17197ec681f3Smrg        case nir_intrinsic_load_shared:
17207ec681f3Smrg        case nir_intrinsic_load_scratch:
17217ec681f3Smrg        case nir_intrinsic_load_input:
17227ec681f3Smrg        case nir_intrinsic_load_kernel_input:
17237ec681f3Smrg        case nir_intrinsic_load_interpolated_input: {
17247ec681f3Smrg                bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
17257ec681f3Smrg                bool is_global = instr->intrinsic == nir_intrinsic_load_global ||
17267ec681f3Smrg                        instr->intrinsic == nir_intrinsic_load_global_constant;
17277ec681f3Smrg                bool is_shared = instr->intrinsic == nir_intrinsic_load_shared;
17287ec681f3Smrg                bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch;
17297ec681f3Smrg                bool is_flat = instr->intrinsic == nir_intrinsic_load_input;
17307ec681f3Smrg                bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input;
17317ec681f3Smrg                bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input;
17327ec681f3Smrg
17337ec681f3Smrg                /* Get the base type of the intrinsic */
17347ec681f3Smrg                /* TODO: Infer type? Does it matter? */
17357ec681f3Smrg                nir_alu_type t =
17367ec681f3Smrg                        (is_interp) ? nir_type_float :
17377ec681f3Smrg                        (is_flat) ? nir_intrinsic_dest_type(instr) :
17387ec681f3Smrg                        nir_type_uint;
17397ec681f3Smrg
17407ec681f3Smrg                t = nir_alu_type_get_base_type(t);
17417ec681f3Smrg
17427ec681f3Smrg                if (!(is_ubo || is_global || is_scratch)) {
17437ec681f3Smrg                        offset = nir_intrinsic_base(instr);
17447ec681f3Smrg                }
17457ec681f3Smrg
17467ec681f3Smrg                unsigned nr_comp = nir_intrinsic_dest_components(instr);
17477ec681f3Smrg
17487ec681f3Smrg                nir_src *src_offset = nir_get_io_offset_src(instr);
17497ec681f3Smrg
17507ec681f3Smrg                bool direct = nir_src_is_const(*src_offset);
17517ec681f3Smrg                nir_src *indirect_offset = direct ? NULL : src_offset;
17527ec681f3Smrg
17537ec681f3Smrg                if (direct)
17547ec681f3Smrg                        offset += nir_src_as_uint(*src_offset);
17557ec681f3Smrg
17567ec681f3Smrg                /* We may need to apply a fractional offset */
17577ec681f3Smrg                int component = (is_flat || is_interp) ?
17587ec681f3Smrg                                nir_intrinsic_component(instr) : 0;
17597ec681f3Smrg                reg = nir_dest_index(&instr->dest);
17607ec681f3Smrg
17617ec681f3Smrg                if (is_kernel) {
17627ec681f3Smrg                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp);
17637ec681f3Smrg                } else if (is_ubo) {
17647ec681f3Smrg                        nir_src index = instr->src[0];
17657ec681f3Smrg
17667ec681f3Smrg                        /* TODO: Is indirect block number possible? */
17677ec681f3Smrg                        assert(nir_src_is_const(index));
17687ec681f3Smrg
17697ec681f3Smrg                        uint32_t uindex = nir_src_as_uint(index);
17707ec681f3Smrg                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp);
17717ec681f3Smrg                } else if (is_global || is_shared || is_scratch) {
17727ec681f3Smrg                        unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
17737ec681f3Smrg                        emit_global(ctx, &instr->instr, true, reg, src_offset, seg);
17747ec681f3Smrg                } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) {
17757ec681f3Smrg                        emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat);
17767ec681f3Smrg                } else if (ctx->inputs->is_blend) {
17777ec681f3Smrg                        /* ctx->blend_input will be precoloured to r0/r2, where
17787ec681f3Smrg                         * the input is preloaded */
17797ec681f3Smrg
17807ec681f3Smrg                        unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input;
17817ec681f3Smrg
17827ec681f3Smrg                        if (*input == ~0)
17837ec681f3Smrg                                *input = reg;
17847ec681f3Smrg                        else
17857ec681f3Smrg                                emit_mir_instruction(ctx, v_mov(*input, reg));
17867ec681f3Smrg                } else if (ctx->stage == MESA_SHADER_VERTEX) {
17877ec681f3Smrg                        emit_attr_read(ctx, reg, offset, nr_comp, t);
17887ec681f3Smrg                } else {
17897ec681f3Smrg                        DBG("Unknown load\n");
17907ec681f3Smrg                        assert(0);
17917ec681f3Smrg                }
17927ec681f3Smrg
17937ec681f3Smrg                break;
17947ec681f3Smrg        }
17957ec681f3Smrg
17967ec681f3Smrg        /* Handled together with load_interpolated_input */
17977ec681f3Smrg        case nir_intrinsic_load_barycentric_pixel:
17987ec681f3Smrg        case nir_intrinsic_load_barycentric_centroid:
17997ec681f3Smrg        case nir_intrinsic_load_barycentric_sample:
18007ec681f3Smrg                break;
18017ec681f3Smrg
18027ec681f3Smrg        /* Reads 128-bit value raw off the tilebuffer during blending, tasty */
18037ec681f3Smrg
18047ec681f3Smrg        case nir_intrinsic_load_raw_output_pan: {
18057ec681f3Smrg                reg = nir_dest_index(&instr->dest);
18067ec681f3Smrg
18077ec681f3Smrg                /* T720 and below use different blend opcodes with slightly
18087ec681f3Smrg                 * different semantics than T760 and up */
18097ec681f3Smrg
18107ec681f3Smrg                midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
18117ec681f3Smrg
18127ec681f3Smrg                unsigned target = output_load_rt_addr(ctx, instr);
18137ec681f3Smrg                ld.load_store.index_comp = target & 0x3;
18147ec681f3Smrg                ld.load_store.index_reg = target >> 2;
18157ec681f3Smrg
18167ec681f3Smrg                if (nir_src_is_const(instr->src[0])) {
18177ec681f3Smrg                        unsigned sample = nir_src_as_uint(instr->src[0]);
18187ec681f3Smrg                        ld.load_store.arg_comp = sample & 0x3;
18197ec681f3Smrg                        ld.load_store.arg_reg = sample >> 2;
18207ec681f3Smrg                } else {
18217ec681f3Smrg                        /* Enable sample index via register. */
18227ec681f3Smrg                        ld.load_store.signed_offset |= 1;
18237ec681f3Smrg                        ld.src[1] = nir_src_index(ctx, &instr->src[0]);
18247ec681f3Smrg                        ld.src_types[1] = nir_type_int32;
18257ec681f3Smrg                }
18267ec681f3Smrg
18277ec681f3Smrg                if (ctx->quirks & MIDGARD_OLD_BLEND) {
18287ec681f3Smrg                        ld.op = midgard_op_ld_special_32u;
18297ec681f3Smrg                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16);
18307ec681f3Smrg                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
18317ec681f3Smrg                }
18327ec681f3Smrg
18337ec681f3Smrg                emit_mir_instruction(ctx, ld);
18347ec681f3Smrg                break;
18357ec681f3Smrg        }
18367ec681f3Smrg
18377ec681f3Smrg        case nir_intrinsic_load_output: {
18387ec681f3Smrg                reg = nir_dest_index(&instr->dest);
18397ec681f3Smrg
18407ec681f3Smrg                unsigned bits = nir_dest_bit_size(instr->dest);
18417ec681f3Smrg
18427ec681f3Smrg                midgard_instruction ld;
18437ec681f3Smrg                if (bits == 16)
18447ec681f3Smrg                        ld = m_ld_tilebuffer_16f(reg, 0);
18457ec681f3Smrg                else
18467ec681f3Smrg                        ld = m_ld_tilebuffer_32f(reg, 0);
18477ec681f3Smrg
18487ec681f3Smrg                unsigned index = output_load_rt_addr(ctx, instr);
18497ec681f3Smrg                ld.load_store.index_comp = index & 0x3;
18507ec681f3Smrg                ld.load_store.index_reg = index >> 2;
18517ec681f3Smrg
18527ec681f3Smrg                for (unsigned c = 4; c < 16; ++c)
18537ec681f3Smrg                        ld.swizzle[0][c] = 0;
18547ec681f3Smrg
18557ec681f3Smrg                if (ctx->quirks & MIDGARD_OLD_BLEND) {
18567ec681f3Smrg                        if (bits == 16)
18577ec681f3Smrg                                ld.op = midgard_op_ld_special_16f;
18587ec681f3Smrg                        else
18597ec681f3Smrg                                ld.op = midgard_op_ld_special_32f;
18607ec681f3Smrg                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1);
18617ec681f3Smrg                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
18627ec681f3Smrg                }
18637ec681f3Smrg
18647ec681f3Smrg                emit_mir_instruction(ctx, ld);
18657ec681f3Smrg                break;
18667ec681f3Smrg        }
18677ec681f3Smrg
18687ec681f3Smrg        case nir_intrinsic_store_output:
18697ec681f3Smrg        case nir_intrinsic_store_combined_output_pan:
18707ec681f3Smrg                assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
18717ec681f3Smrg
18727ec681f3Smrg                offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
18737ec681f3Smrg
18747ec681f3Smrg                reg = nir_src_index(ctx, &instr->src[0]);
18757ec681f3Smrg
18767ec681f3Smrg                if (ctx->stage == MESA_SHADER_FRAGMENT) {
18777ec681f3Smrg                        bool combined = instr->intrinsic ==
18787ec681f3Smrg                                nir_intrinsic_store_combined_output_pan;
18797ec681f3Smrg
18807ec681f3Smrg                        const nir_variable *var;
18817ec681f3Smrg                        var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,
18827ec681f3Smrg                                         nir_intrinsic_base(instr));
18837ec681f3Smrg                        assert(var);
18847ec681f3Smrg
18857ec681f3Smrg                        /* Dual-source blend writeout is done by leaving the
18867ec681f3Smrg                         * value in r2 for the blend shader to use. */
18877ec681f3Smrg                        if (var->data.index) {
18887ec681f3Smrg                                if (instr->src[0].is_ssa) {
18897ec681f3Smrg                                        emit_explicit_constant(ctx, reg, reg);
18907ec681f3Smrg
18917ec681f3Smrg                                        unsigned out = make_compiler_temp(ctx);
18927ec681f3Smrg
18937ec681f3Smrg                                        midgard_instruction ins = v_mov(reg, out);
18947ec681f3Smrg                                        emit_mir_instruction(ctx, ins);
18957ec681f3Smrg
18967ec681f3Smrg                                        ctx->blend_src1 = out;
18977ec681f3Smrg                                } else {
18987ec681f3Smrg                                        ctx->blend_src1 = reg;
18997ec681f3Smrg                                }
19007ec681f3Smrg
19017ec681f3Smrg                                break;
19027ec681f3Smrg                        }
19037ec681f3Smrg
19047ec681f3Smrg                        enum midgard_rt_id rt;
19057ec681f3Smrg                        if (var->data.location >= FRAG_RESULT_DATA0)
19067ec681f3Smrg                                rt = MIDGARD_COLOR_RT0 + var->data.location -
19077ec681f3Smrg                                     FRAG_RESULT_DATA0;
19087ec681f3Smrg                        else if (combined)
19097ec681f3Smrg                                rt = MIDGARD_ZS_RT;
19107ec681f3Smrg                        else
19117ec681f3Smrg                                unreachable("bad rt");
19127ec681f3Smrg
19137ec681f3Smrg                        unsigned reg_z = ~0, reg_s = ~0;
19147ec681f3Smrg                        if (combined) {
19157ec681f3Smrg                                unsigned writeout = nir_intrinsic_component(instr);
19167ec681f3Smrg                                if (writeout & PAN_WRITEOUT_Z)
19177ec681f3Smrg                                        reg_z = nir_src_index(ctx, &instr->src[2]);
19187ec681f3Smrg                                if (writeout & PAN_WRITEOUT_S)
19197ec681f3Smrg                                        reg_s = nir_src_index(ctx, &instr->src[3]);
19207ec681f3Smrg                        }
19217ec681f3Smrg
19227ec681f3Smrg                        emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0);
19237ec681f3Smrg                } else if (ctx->stage == MESA_SHADER_VERTEX) {
19247ec681f3Smrg                        assert(instr->intrinsic == nir_intrinsic_store_output);
19257ec681f3Smrg
19267ec681f3Smrg                        /* We should have been vectorized, though we don't
19277ec681f3Smrg                         * currently check that st_vary is emitted only once
19287ec681f3Smrg                         * per slot (this is relevant, since there's not a mask
19297ec681f3Smrg                         * parameter available on the store [set to 0 by the
19307ec681f3Smrg                         * blob]). We do respect the component by adjusting the
19317ec681f3Smrg                         * swizzle. If this is a constant source, we'll need to
19327ec681f3Smrg                         * emit that explicitly. */
19337ec681f3Smrg
19347ec681f3Smrg                        emit_explicit_constant(ctx, reg, reg);
19357ec681f3Smrg
19367ec681f3Smrg                        unsigned dst_component = nir_intrinsic_component(instr);
19377ec681f3Smrg                        unsigned nr_comp = nir_src_num_components(instr->src[0]);
19387ec681f3Smrg
19397ec681f3Smrg                        midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset));
19407ec681f3Smrg                        st.load_store.arg_reg = REGISTER_LDST_ZERO;
19417ec681f3Smrg                        st.load_store.index_format = midgard_index_address_u32;
19427ec681f3Smrg                        st.load_store.index_reg = REGISTER_LDST_ZERO;
19437ec681f3Smrg
19447ec681f3Smrg                        switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) {
19457ec681f3Smrg                        case nir_type_uint:
19467ec681f3Smrg                        case nir_type_bool:
19477ec681f3Smrg                                st.op = midgard_op_st_vary_32u;
19487ec681f3Smrg                                break;
19497ec681f3Smrg                        case nir_type_int:
19507ec681f3Smrg                                st.op = midgard_op_st_vary_32i;
19517ec681f3Smrg                                break;
19527ec681f3Smrg                        case nir_type_float:
19537ec681f3Smrg                                st.op = midgard_op_st_vary_32;
19547ec681f3Smrg                                break;
19557ec681f3Smrg                        default:
19567ec681f3Smrg                                unreachable("Attempted to store unknown type");
19577ec681f3Smrg                                break;
19587ec681f3Smrg                        }
19597ec681f3Smrg
19607ec681f3Smrg                        /* nir_intrinsic_component(store_intr) encodes the
19617ec681f3Smrg                         * destination component start. Source component offset
19627ec681f3Smrg                         * adjustment is taken care of in
19637ec681f3Smrg                         * install_registers_instr(), when offset_swizzle() is
19647ec681f3Smrg                         * called.
19657ec681f3Smrg                         */
19667ec681f3Smrg                        unsigned src_component = COMPONENT_X;
19677ec681f3Smrg
19687ec681f3Smrg                        assert(nr_comp > 0);
19697ec681f3Smrg                        for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {
19707ec681f3Smrg                                st.swizzle[0][i] = src_component;
19717ec681f3Smrg                                if (i >= dst_component && i < dst_component + nr_comp - 1)
19727ec681f3Smrg                                        src_component++;
19737ec681f3Smrg                        }
19747ec681f3Smrg
19757ec681f3Smrg                        emit_mir_instruction(ctx, st);
19767ec681f3Smrg                } else {
19777ec681f3Smrg                        DBG("Unknown store\n");
19787ec681f3Smrg                        assert(0);
19797ec681f3Smrg                }
19807ec681f3Smrg
19817ec681f3Smrg                break;
19827ec681f3Smrg
19837ec681f3Smrg        /* Special case of store_output for lowered blend shaders */
19847ec681f3Smrg        case nir_intrinsic_store_raw_output_pan:
19857ec681f3Smrg                assert (ctx->stage == MESA_SHADER_FRAGMENT);
19867ec681f3Smrg                reg = nir_src_index(ctx, &instr->src[0]);
19877ec681f3Smrg                for (unsigned s = 0; s < ctx->blend_sample_iterations; s++)
19887ec681f3Smrg                        emit_fragment_store(ctx, reg, ~0, ~0,
19897ec681f3Smrg                                            ctx->inputs->blend.rt + MIDGARD_COLOR_RT0,
19907ec681f3Smrg                                            s);
19917ec681f3Smrg                break;
19927ec681f3Smrg
19937ec681f3Smrg        case nir_intrinsic_store_global:
19947ec681f3Smrg        case nir_intrinsic_store_shared:
19957ec681f3Smrg        case nir_intrinsic_store_scratch:
19967ec681f3Smrg                reg = nir_src_index(ctx, &instr->src[0]);
19977ec681f3Smrg                emit_explicit_constant(ctx, reg, reg);
19987ec681f3Smrg
19997ec681f3Smrg                unsigned seg;
20007ec681f3Smrg                if (instr->intrinsic == nir_intrinsic_store_global)
20017ec681f3Smrg                        seg = LDST_GLOBAL;
20027ec681f3Smrg                else if (instr->intrinsic == nir_intrinsic_store_shared)
20037ec681f3Smrg                        seg = LDST_SHARED;
20047ec681f3Smrg                else
20057ec681f3Smrg                        seg = LDST_SCRATCH;
20067ec681f3Smrg
20077ec681f3Smrg                emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
20087ec681f3Smrg                break;
20097ec681f3Smrg
20107ec681f3Smrg        case nir_intrinsic_load_first_vertex:
20117ec681f3Smrg        case nir_intrinsic_load_ssbo_address:
20127ec681f3Smrg        case nir_intrinsic_load_work_dim:
20137ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 1, 0);
20147ec681f3Smrg                break;
20157ec681f3Smrg
20167ec681f3Smrg        case nir_intrinsic_load_base_vertex:
20177ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 1, 4);
20187ec681f3Smrg                break;
20197ec681f3Smrg
20207ec681f3Smrg        case nir_intrinsic_load_base_instance:
20217ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 1, 8);
20227ec681f3Smrg                break;
20237ec681f3Smrg
20247ec681f3Smrg        case nir_intrinsic_load_sample_positions_pan:
20257ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 2, 0);
20267ec681f3Smrg                break;
20277ec681f3Smrg
20287ec681f3Smrg        case nir_intrinsic_get_ssbo_size:
20297ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 1, 8);
20307ec681f3Smrg                break;
20317ec681f3Smrg
20327ec681f3Smrg        case nir_intrinsic_load_viewport_scale:
20337ec681f3Smrg        case nir_intrinsic_load_viewport_offset:
20347ec681f3Smrg        case nir_intrinsic_load_num_workgroups:
20357ec681f3Smrg        case nir_intrinsic_load_sampler_lod_parameters_pan:
20367ec681f3Smrg        case nir_intrinsic_load_workgroup_size:
20377ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 3, 0);
20387ec681f3Smrg                break;
20397ec681f3Smrg
20407ec681f3Smrg        case nir_intrinsic_load_blend_const_color_rgba:
20417ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 4, 0);
20427ec681f3Smrg                break;
20437ec681f3Smrg
20447ec681f3Smrg        case nir_intrinsic_load_workgroup_id:
20457ec681f3Smrg        case nir_intrinsic_load_local_invocation_id:
20467ec681f3Smrg        case nir_intrinsic_load_global_invocation_id:
20477ec681f3Smrg        case nir_intrinsic_load_global_invocation_id_zero_base:
20487ec681f3Smrg                emit_compute_builtin(ctx, instr);
20497ec681f3Smrg                break;
20507ec681f3Smrg
20517ec681f3Smrg        case nir_intrinsic_load_vertex_id_zero_base:
20527ec681f3Smrg        case nir_intrinsic_load_instance_id:
20537ec681f3Smrg                emit_vertex_builtin(ctx, instr);
20547ec681f3Smrg                break;
20557ec681f3Smrg
20567ec681f3Smrg        case nir_intrinsic_load_sample_mask_in:
20577ec681f3Smrg                emit_special(ctx, instr, 96);
20587ec681f3Smrg                break;
20597ec681f3Smrg
20607ec681f3Smrg        case nir_intrinsic_load_sample_id:
20617ec681f3Smrg                emit_special(ctx, instr, 97);
20627ec681f3Smrg                break;
20637ec681f3Smrg
20647ec681f3Smrg        /* Midgard doesn't seem to want special handling */
20657ec681f3Smrg        case nir_intrinsic_memory_barrier:
20667ec681f3Smrg        case nir_intrinsic_memory_barrier_buffer:
20677ec681f3Smrg        case nir_intrinsic_memory_barrier_image:
20687ec681f3Smrg        case nir_intrinsic_memory_barrier_shared:
20697ec681f3Smrg        case nir_intrinsic_group_memory_barrier:
20707ec681f3Smrg                break;
20717ec681f3Smrg
20727ec681f3Smrg        case nir_intrinsic_control_barrier:
20737ec681f3Smrg                schedule_barrier(ctx);
20747ec681f3Smrg                emit_control_barrier(ctx);
20757ec681f3Smrg                schedule_barrier(ctx);
20767ec681f3Smrg                break;
20777ec681f3Smrg
20787ec681f3Smrg        ATOMIC_CASE(ctx, instr, add, add);
20797ec681f3Smrg        ATOMIC_CASE(ctx, instr, and, and);
20807ec681f3Smrg        ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
20817ec681f3Smrg        ATOMIC_CASE(ctx, instr, exchange, xchg);
20827ec681f3Smrg        ATOMIC_CASE(ctx, instr, imax, imax);
20837ec681f3Smrg        ATOMIC_CASE(ctx, instr, imin, imin);
20847ec681f3Smrg        ATOMIC_CASE(ctx, instr, or, or);
20857ec681f3Smrg        ATOMIC_CASE(ctx, instr, umax, umax);
20867ec681f3Smrg        ATOMIC_CASE(ctx, instr, umin, umin);
20877ec681f3Smrg        ATOMIC_CASE(ctx, instr, xor, xor);
20887ec681f3Smrg
20897ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, add, add);
20907ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, and, and);
20917ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
20927ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg);
20937ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, imax, imax);
20947ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, imin, imin);
20957ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, or, or);
20967ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, umax, umax);
20977ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, umin, umin);
20987ec681f3Smrg        IMAGE_ATOMIC_CASE(ctx, instr, xor, xor);
20997ec681f3Smrg
21007ec681f3Smrg        default:
21017ec681f3Smrg                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
21027ec681f3Smrg                assert(0);
21037ec681f3Smrg                break;
21047ec681f3Smrg        }
21057ec681f3Smrg}
21067ec681f3Smrg
21077ec681f3Smrg/* Returns dimension with 0 special casing cubemaps */
21087ec681f3Smrgstatic unsigned
21097ec681f3Smrgmidgard_tex_format(enum glsl_sampler_dim dim)
21107ec681f3Smrg{
21117ec681f3Smrg        switch (dim) {
21127ec681f3Smrg        case GLSL_SAMPLER_DIM_1D:
21137ec681f3Smrg        case GLSL_SAMPLER_DIM_BUF:
21147ec681f3Smrg                return 1;
21157ec681f3Smrg
21167ec681f3Smrg        case GLSL_SAMPLER_DIM_2D:
21177ec681f3Smrg        case GLSL_SAMPLER_DIM_MS:
21187ec681f3Smrg        case GLSL_SAMPLER_DIM_EXTERNAL:
21197ec681f3Smrg        case GLSL_SAMPLER_DIM_RECT:
21207ec681f3Smrg                return 2;
21217ec681f3Smrg
21227ec681f3Smrg        case GLSL_SAMPLER_DIM_3D:
21237ec681f3Smrg                return 3;
21247ec681f3Smrg
21257ec681f3Smrg        case GLSL_SAMPLER_DIM_CUBE:
21267ec681f3Smrg                return 0;
21277ec681f3Smrg
21287ec681f3Smrg        default:
21297ec681f3Smrg                DBG("Unknown sampler dim type\n");
21307ec681f3Smrg                assert(0);
21317ec681f3Smrg                return 0;
21327ec681f3Smrg        }
21337ec681f3Smrg}
21347ec681f3Smrg
21357ec681f3Smrg/* Tries to attach an explicit LOD or bias as a constant. Returns whether this
21367ec681f3Smrg * was successful */
21377ec681f3Smrg
21387ec681f3Smrgstatic bool
21397ec681f3Smrgpan_attach_constant_bias(
21407ec681f3Smrg        compiler_context *ctx,
21417ec681f3Smrg        nir_src lod,
21427ec681f3Smrg        midgard_texture_word *word)
21437ec681f3Smrg{
21447ec681f3Smrg        /* To attach as constant, it has to *be* constant */
21457ec681f3Smrg
21467ec681f3Smrg        if (!nir_src_is_const(lod))
21477ec681f3Smrg                return false;
21487ec681f3Smrg
21497ec681f3Smrg        float f = nir_src_as_float(lod);
21507ec681f3Smrg
21517ec681f3Smrg        /* Break into fixed-point */
21527ec681f3Smrg        signed lod_int = f;
21537ec681f3Smrg        float lod_frac = f - lod_int;
21547ec681f3Smrg
21557ec681f3Smrg        /* Carry over negative fractions */
21567ec681f3Smrg        if (lod_frac < 0.0) {
21577ec681f3Smrg                lod_int--;
21587ec681f3Smrg                lod_frac += 1.0;
21597ec681f3Smrg        }
21607ec681f3Smrg
21617ec681f3Smrg        /* Encode */
21627ec681f3Smrg        word->bias = float_to_ubyte(lod_frac);
21637ec681f3Smrg        word->bias_int = lod_int;
21647ec681f3Smrg
21657ec681f3Smrg        return true;
21667ec681f3Smrg}
21677ec681f3Smrg
21687ec681f3Smrgstatic enum mali_texture_mode
21697ec681f3Smrgmdg_texture_mode(nir_tex_instr *instr)
21707ec681f3Smrg{
21717ec681f3Smrg        if (instr->op == nir_texop_tg4 && instr->is_shadow)
21727ec681f3Smrg                return TEXTURE_GATHER_SHADOW;
21737ec681f3Smrg        else if (instr->op == nir_texop_tg4)
21747ec681f3Smrg                return TEXTURE_GATHER_X + instr->component;
21757ec681f3Smrg        else if (instr->is_shadow)
21767ec681f3Smrg                return TEXTURE_SHADOW;
21777ec681f3Smrg        else
21787ec681f3Smrg                return TEXTURE_NORMAL;
21797ec681f3Smrg}
21807ec681f3Smrg
21817ec681f3Smrgstatic void
21827ec681f3Smrgset_tex_coord(compiler_context *ctx, nir_tex_instr *instr,
21837ec681f3Smrg              midgard_instruction *ins)
21847ec681f3Smrg{
21857ec681f3Smrg        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
21867ec681f3Smrg
21877ec681f3Smrg        assert(coord_idx >= 0);
21887ec681f3Smrg
21897ec681f3Smrg        int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator);
21907ec681f3Smrg        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
21917ec681f3Smrg        assert(comparator_idx < 0 || ms_idx < 0);
21927ec681f3Smrg        int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx;
21937ec681f3Smrg
21947ec681f3Smrg        unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src);
21957ec681f3Smrg
21967ec681f3Smrg        emit_explicit_constant(ctx, coords, coords);
21977ec681f3Smrg
21987ec681f3Smrg        ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) |
21997ec681f3Smrg                            nir_src_bit_size(instr->src[coord_idx].src);
22007ec681f3Smrg
22017ec681f3Smrg        unsigned nr_comps = instr->coord_components;
22027ec681f3Smrg        unsigned written_mask = 0, write_mask = 0;
22037ec681f3Smrg
22047ec681f3Smrg        /* Initialize all components to coord.x which is expected to always be
22057ec681f3Smrg         * present. Swizzle is updated below based on the texture dimension
22067ec681f3Smrg         * and extra attributes that are packed in the coordinate argument.
22077ec681f3Smrg         */
22087ec681f3Smrg        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
22097ec681f3Smrg                ins->swizzle[1][c] = COMPONENT_X;
22107ec681f3Smrg
22117ec681f3Smrg        /* Shadow ref value is part of the coordinates if there's no comparator
22127ec681f3Smrg         * source, in that case it's always placed in the last component.
22137ec681f3Smrg         * Midgard wants the ref value in coord.z.
22147ec681f3Smrg         */
22157ec681f3Smrg        if (instr->is_shadow && comparator_idx < 0) {
22167ec681f3Smrg                ins->swizzle[1][COMPONENT_Z] = --nr_comps;
22177ec681f3Smrg                write_mask |= 1 << COMPONENT_Z;
22187ec681f3Smrg        }
22197ec681f3Smrg
22207ec681f3Smrg        /* The array index is the last component if there's no shadow ref value
22217ec681f3Smrg         * or second last if there's one. We already decremented the number of
22227ec681f3Smrg         * components to account for the shadow ref value above.
22237ec681f3Smrg         * Midgard wants the array index in coord.w.
22247ec681f3Smrg         */
22257ec681f3Smrg        if (instr->is_array) {
22267ec681f3Smrg                ins->swizzle[1][COMPONENT_W] = --nr_comps;
22277ec681f3Smrg                write_mask |= 1 << COMPONENT_W;
22287ec681f3Smrg        }
22297ec681f3Smrg
22307ec681f3Smrg        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
22317ec681f3Smrg                /* texelFetch is undefined on samplerCube */
22327ec681f3Smrg                assert(ins->op != midgard_tex_op_fetch);
22337ec681f3Smrg
22347ec681f3Smrg                ins->src[1] = make_compiler_temp_reg(ctx);
22357ec681f3Smrg
22367ec681f3Smrg                /* For cubemaps, we use a special ld/st op to select the face
22377ec681f3Smrg                 * and copy the xy into the texture register
22387ec681f3Smrg                 */
22397ec681f3Smrg                midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0);
22407ec681f3Smrg                ld.src[1] = coords;
22417ec681f3Smrg                ld.src_types[1] = ins->src_types[1];
22427ec681f3Smrg                ld.mask = 0x3; /* xy */
22437ec681f3Smrg                ld.load_store.bitsize_toggle = true;
22447ec681f3Smrg                ld.swizzle[1][3] = COMPONENT_X;
22457ec681f3Smrg                emit_mir_instruction(ctx, ld);
22467ec681f3Smrg
22477ec681f3Smrg                /* We packed cube coordiates (X,Y,Z) into (X,Y), update the
22487ec681f3Smrg                 * written mask accordingly and decrement the number of
22497ec681f3Smrg                 * components
22507ec681f3Smrg                 */
22517ec681f3Smrg                nr_comps--;
22527ec681f3Smrg                written_mask |= 3;
22537ec681f3Smrg        }
22547ec681f3Smrg
22557ec681f3Smrg        /* Now flag tex coord components that have not been written yet */
22567ec681f3Smrg        write_mask |= mask_of(nr_comps) & ~written_mask;
22577ec681f3Smrg        for (unsigned c = 0; c < nr_comps; c++)
22587ec681f3Smrg                ins->swizzle[1][c] = c;
22597ec681f3Smrg
22607ec681f3Smrg        /* Sample index and shadow ref are expected in coord.z */
22617ec681f3Smrg        if (ms_or_comparator_idx >= 0) {
22627ec681f3Smrg                assert(!((write_mask | written_mask) & (1 << COMPONENT_Z)));
22637ec681f3Smrg
22647ec681f3Smrg                unsigned sample_or_ref =
22657ec681f3Smrg                        nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src);
22667ec681f3Smrg
22677ec681f3Smrg                emit_explicit_constant(ctx, sample_or_ref, sample_or_ref);
22687ec681f3Smrg
22697ec681f3Smrg                if (ins->src[1] == ~0)
22707ec681f3Smrg                        ins->src[1] = make_compiler_temp_reg(ctx);
22717ec681f3Smrg
22727ec681f3Smrg                midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]);
22737ec681f3Smrg
22747ec681f3Smrg                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
22757ec681f3Smrg                        mov.swizzle[1][c] = COMPONENT_X;
22767ec681f3Smrg
22777ec681f3Smrg                mov.mask = 1 << COMPONENT_Z;
22787ec681f3Smrg                written_mask |= 1 << COMPONENT_Z;
22797ec681f3Smrg                ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z;
22807ec681f3Smrg                emit_mir_instruction(ctx, mov);
22817ec681f3Smrg        }
22827ec681f3Smrg
22837ec681f3Smrg        /* Texelfetch coordinates uses all four elements (xyz/index) regardless
22847ec681f3Smrg         * of texture dimensionality, which means it's necessary to zero the
22857ec681f3Smrg         * unused components to keep everything happy.
22867ec681f3Smrg         */
22877ec681f3Smrg        if (ins->op == midgard_tex_op_fetch &&
22887ec681f3Smrg            (written_mask | write_mask) != 0xF) {
22897ec681f3Smrg                if (ins->src[1] == ~0)
22907ec681f3Smrg                        ins->src[1] = make_compiler_temp_reg(ctx);
22917ec681f3Smrg
22927ec681f3Smrg                /* mov index.zw, #0, or generalized */
22937ec681f3Smrg                midgard_instruction mov =
22947ec681f3Smrg                        v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]);
22957ec681f3Smrg                mov.has_constants = true;
22967ec681f3Smrg                mov.mask = (written_mask | write_mask) ^ 0xF;
22977ec681f3Smrg                emit_mir_instruction(ctx, mov);
22987ec681f3Smrg                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
22997ec681f3Smrg                        if (mov.mask & (1 << c))
23007ec681f3Smrg                                ins->swizzle[1][c] = c;
23017ec681f3Smrg                }
23027ec681f3Smrg        }
23037ec681f3Smrg
23047ec681f3Smrg        if (ins->src[1] == ~0) {
23057ec681f3Smrg                /* No temporary reg created, use the src coords directly */
23067ec681f3Smrg                ins->src[1] = coords;
23077ec681f3Smrg	} else if (write_mask) {
23087ec681f3Smrg                /* Move the remaining coordinates to the temporary reg */
23097ec681f3Smrg                midgard_instruction mov = v_mov(coords, ins->src[1]);
23107ec681f3Smrg
23117ec681f3Smrg                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
23127ec681f3Smrg                        if ((1 << c) & write_mask) {
23137ec681f3Smrg                                mov.swizzle[1][c] = ins->swizzle[1][c];
23147ec681f3Smrg                                ins->swizzle[1][c] = c;
23157ec681f3Smrg                        } else {
23167ec681f3Smrg                                mov.swizzle[1][c] = COMPONENT_X;
23177ec681f3Smrg                        }
23187ec681f3Smrg                }
23197ec681f3Smrg
23207ec681f3Smrg                mov.mask = write_mask;
23217ec681f3Smrg                emit_mir_instruction(ctx, mov);
23227ec681f3Smrg        }
23237ec681f3Smrg}
23247ec681f3Smrg
23257ec681f3Smrgstatic void
23267ec681f3Smrgemit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
23277ec681f3Smrg                  unsigned midgard_texop)
23287ec681f3Smrg{
23297ec681f3Smrg        /* TODO */
23307ec681f3Smrg        //assert (!instr->sampler);
23317ec681f3Smrg
23327ec681f3Smrg        nir_dest *dest = &instr->dest;
23337ec681f3Smrg
23347ec681f3Smrg        int texture_index = instr->texture_index;
23357ec681f3Smrg        int sampler_index = instr->sampler_index;
23367ec681f3Smrg
23377ec681f3Smrg        nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type);
23387ec681f3Smrg
23397ec681f3Smrg        /* texture instructions support float outmods */
23407ec681f3Smrg        unsigned outmod = midgard_outmod_none;
23417ec681f3Smrg        if (dest_base == nir_type_float) {
23427ec681f3Smrg                outmod = mir_determine_float_outmod(ctx, &dest, 0);
23437ec681f3Smrg        }
23447ec681f3Smrg
23457ec681f3Smrg        midgard_instruction ins = {
23467ec681f3Smrg                .type = TAG_TEXTURE_4,
23477ec681f3Smrg                .mask = 0xF,
23487ec681f3Smrg                .dest = nir_dest_index(dest),
23497ec681f3Smrg                .src = { ~0, ~0, ~0, ~0 },
23507ec681f3Smrg                .dest_type = instr->dest_type,
23517ec681f3Smrg                .swizzle = SWIZZLE_IDENTITY_4,
23527ec681f3Smrg                .outmod = outmod,
23537ec681f3Smrg                .op = midgard_texop,
23547ec681f3Smrg                .texture = {
23557ec681f3Smrg                        .format = midgard_tex_format(instr->sampler_dim),
23567ec681f3Smrg                        .texture_handle = texture_index,
23577ec681f3Smrg                        .sampler_handle = sampler_index,
23587ec681f3Smrg                        .mode = mdg_texture_mode(instr)
23597ec681f3Smrg                }
23607ec681f3Smrg        };
23617ec681f3Smrg
23627ec681f3Smrg        if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4)
23637ec681f3Smrg           for (int i = 0; i < 4; ++i)
23647ec681f3Smrg              ins.swizzle[0][i] = COMPONENT_X;
23657ec681f3Smrg
23667ec681f3Smrg        for (unsigned i = 0; i < instr->num_srcs; ++i) {
23677ec681f3Smrg                int index = nir_src_index(ctx, &instr->src[i].src);
23687ec681f3Smrg                unsigned sz = nir_src_bit_size(instr->src[i].src);
23697ec681f3Smrg                nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz;
23707ec681f3Smrg
23717ec681f3Smrg                switch (instr->src[i].src_type) {
23727ec681f3Smrg                case nir_tex_src_coord:
23737ec681f3Smrg                        set_tex_coord(ctx, instr, &ins);
23747ec681f3Smrg                        break;
23757ec681f3Smrg
23767ec681f3Smrg                case nir_tex_src_bias:
23777ec681f3Smrg                case nir_tex_src_lod: {
23787ec681f3Smrg                        /* Try as a constant if we can */
23797ec681f3Smrg
23807ec681f3Smrg                        bool is_txf = midgard_texop == midgard_tex_op_fetch;
23817ec681f3Smrg                        if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
23827ec681f3Smrg                                break;
23837ec681f3Smrg
23847ec681f3Smrg                        ins.texture.lod_register = true;
23857ec681f3Smrg                        ins.src[2] = index;
23867ec681f3Smrg                        ins.src_types[2] = T;
23877ec681f3Smrg
23887ec681f3Smrg                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
23897ec681f3Smrg                                ins.swizzle[2][c] = COMPONENT_X;
23907ec681f3Smrg
23917ec681f3Smrg                        emit_explicit_constant(ctx, index, index);
23927ec681f3Smrg
23937ec681f3Smrg                        break;
23947ec681f3Smrg                };
23957ec681f3Smrg
23967ec681f3Smrg                case nir_tex_src_offset: {
23977ec681f3Smrg                        ins.texture.offset_register = true;
23987ec681f3Smrg                        ins.src[3] = index;
23997ec681f3Smrg                        ins.src_types[3] = T;
24007ec681f3Smrg
24017ec681f3Smrg                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
24027ec681f3Smrg                                ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c;
24037ec681f3Smrg
24047ec681f3Smrg                        emit_explicit_constant(ctx, index, index);
24057ec681f3Smrg                        break;
24067ec681f3Smrg                };
24077ec681f3Smrg
24087ec681f3Smrg                case nir_tex_src_comparator:
24097ec681f3Smrg                case nir_tex_src_ms_index:
24107ec681f3Smrg                        /* Nothing to do, handled in set_tex_coord() */
24117ec681f3Smrg                        break;
24127ec681f3Smrg
24137ec681f3Smrg                default: {
24147ec681f3Smrg                        fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type);
24157ec681f3Smrg                        assert(0);
24167ec681f3Smrg                }
24177ec681f3Smrg                }
24187ec681f3Smrg        }
24197ec681f3Smrg
24207ec681f3Smrg        emit_mir_instruction(ctx, ins);
24217ec681f3Smrg}
24227ec681f3Smrg
24237ec681f3Smrgstatic void
24247ec681f3Smrgemit_tex(compiler_context *ctx, nir_tex_instr *instr)
24257ec681f3Smrg{
24267ec681f3Smrg        switch (instr->op) {
24277ec681f3Smrg        case nir_texop_tex:
24287ec681f3Smrg        case nir_texop_txb:
24297ec681f3Smrg                emit_texop_native(ctx, instr, midgard_tex_op_normal);
24307ec681f3Smrg                break;
24317ec681f3Smrg        case nir_texop_txl:
24327ec681f3Smrg        case nir_texop_tg4:
24337ec681f3Smrg                emit_texop_native(ctx, instr, midgard_tex_op_gradient);
24347ec681f3Smrg                break;
24357ec681f3Smrg        case nir_texop_txf:
24367ec681f3Smrg        case nir_texop_txf_ms:
24377ec681f3Smrg                emit_texop_native(ctx, instr, midgard_tex_op_fetch);
24387ec681f3Smrg                break;
24397ec681f3Smrg        case nir_texop_txs:
24407ec681f3Smrg                emit_sysval_read(ctx, &instr->instr, 4, 0);
24417ec681f3Smrg                break;
24427ec681f3Smrg        default: {
24437ec681f3Smrg                fprintf(stderr, "Unhandled texture op: %d\n", instr->op);
24447ec681f3Smrg                assert(0);
24457ec681f3Smrg        }
24467ec681f3Smrg        }
24477ec681f3Smrg}
24487ec681f3Smrg
24497ec681f3Smrgstatic void
24507ec681f3Smrgemit_jump(compiler_context *ctx, nir_jump_instr *instr)
24517ec681f3Smrg{
24527ec681f3Smrg        switch (instr->type) {
24537ec681f3Smrg        case nir_jump_break: {
24547ec681f3Smrg                /* Emit a branch out of the loop */
24557ec681f3Smrg                struct midgard_instruction br = v_branch(false, false);
24567ec681f3Smrg                br.branch.target_type = TARGET_BREAK;
24577ec681f3Smrg                br.branch.target_break = ctx->current_loop_depth;
24587ec681f3Smrg                emit_mir_instruction(ctx, br);
24597ec681f3Smrg                break;
24607ec681f3Smrg        }
24617ec681f3Smrg
24627ec681f3Smrg        default:
24637ec681f3Smrg                DBG("Unknown jump type %d\n", instr->type);
24647ec681f3Smrg                break;
24657ec681f3Smrg        }
24667ec681f3Smrg}
24677ec681f3Smrg
24687ec681f3Smrgstatic void
24697ec681f3Smrgemit_instr(compiler_context *ctx, struct nir_instr *instr)
24707ec681f3Smrg{
24717ec681f3Smrg        switch (instr->type) {
24727ec681f3Smrg        case nir_instr_type_load_const:
24737ec681f3Smrg                emit_load_const(ctx, nir_instr_as_load_const(instr));
24747ec681f3Smrg                break;
24757ec681f3Smrg
24767ec681f3Smrg        case nir_instr_type_intrinsic:
24777ec681f3Smrg                emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
24787ec681f3Smrg                break;
24797ec681f3Smrg
24807ec681f3Smrg        case nir_instr_type_alu:
24817ec681f3Smrg                emit_alu(ctx, nir_instr_as_alu(instr));
24827ec681f3Smrg                break;
24837ec681f3Smrg
24847ec681f3Smrg        case nir_instr_type_tex:
24857ec681f3Smrg                emit_tex(ctx, nir_instr_as_tex(instr));
24867ec681f3Smrg                break;
24877ec681f3Smrg
24887ec681f3Smrg        case nir_instr_type_jump:
24897ec681f3Smrg                emit_jump(ctx, nir_instr_as_jump(instr));
24907ec681f3Smrg                break;
24917ec681f3Smrg
24927ec681f3Smrg        case nir_instr_type_ssa_undef:
24937ec681f3Smrg                /* Spurious */
24947ec681f3Smrg                break;
24957ec681f3Smrg
24967ec681f3Smrg        default:
24977ec681f3Smrg                DBG("Unhandled instruction type\n");
24987ec681f3Smrg                break;
24997ec681f3Smrg        }
25007ec681f3Smrg}
25017ec681f3Smrg
25027ec681f3Smrg
/* ALU instructions can inline or embed constants, which decreases register
 * pressure and saves space. */

/* If source `idx` of `alu` refers to a known SSA constant (looked up in
 * ctx->ssa_constants, keyed off-by-one so 0 means "absent"), attach that
 * constant to the instruction and rewrite the source to read the dedicated
 * constant register. Evaluates in the scope of the caller: expects `ctx`
 * and `alu` to be in scope, and sets alu->has_constants as a side effect
 * (via attach_constants). */
#define CONDITIONAL_ATTACH(idx) { \
	void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \
\
	if (entry) { \
		attach_constants(ctx, alu, entry, alu->src[idx] + 1); \
		alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
	} \
}
25147ec681f3Smrg
/* Attach SSA constants to the ALU instructions consuming them, so they ride
 * in the instruction's embedded-constant slot instead of needing a register.
 * Only one embedded constant fits per instruction; if both sources are
 * constants, the second is materialized through an explicit move into a
 * scratch temporary. */

static void
inline_alu_constants(compiler_context *ctx, midgard_block *block)
{
        mir_foreach_instr_in_block(block, alu) {
                /* Other instructions cannot inline constants */
                if (alu->type != TAG_ALU_4) continue;
                if (alu->compact_branch) continue;

                /* If there is already a constant here, we can do nothing */
                if (alu->has_constants) continue;

                /* Try source 0 first; attach_constants sets has_constants
                 * on success, which the branch below keys off */
                CONDITIONAL_ATTACH(0);

                if (!alu->has_constants) {
                        CONDITIONAL_ATTACH(1)
                } else if (!alu->inline_constant) {
                        /* Corner case: _two_ vec4 constants, for instance with a
                         * csel. For this case, we can only use a constant
                         * register for one, we'll have to emit a move for the
                         * other. */

                        void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1);
                        unsigned scratch = make_compiler_temp(ctx);

                        if (entry) {
                                /* Move the second constant into the scratch
                                 * temp, then feed the temp to the ALU op */
                                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch);
                                attach_constants(ctx, &ins, entry, alu->src[1] + 1);

                                /* Set the source */
                                alu->src[1] = scratch;

                                /* Inject us -before- the last instruction which set r31 */
                                mir_insert_instruction_before(ctx, mir_prev_op(alu), ins);
                        }
                }
        }
}
25527ec681f3Smrg
25537ec681f3Smrgunsigned
25547ec681f3Smrgmax_bitsize_for_alu(midgard_instruction *ins)
25557ec681f3Smrg{
25567ec681f3Smrg        unsigned max_bitsize = 0;
25577ec681f3Smrg        for (int i = 0; i < MIR_SRC_COUNT; i++) {
25587ec681f3Smrg                if (ins->src[i] == ~0) continue;
25597ec681f3Smrg                unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]);
25607ec681f3Smrg                max_bitsize = MAX2(src_bitsize, max_bitsize);
25617ec681f3Smrg        }
25627ec681f3Smrg        unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type);
25637ec681f3Smrg        max_bitsize = MAX2(dst_bitsize, max_bitsize);
25647ec681f3Smrg
25657ec681f3Smrg        /* We don't have fp16 LUTs, so we'll want to emit code like:
25667ec681f3Smrg         *
25677ec681f3Smrg         *      vlut.fsinr hr0, hr0
25687ec681f3Smrg         *
25697ec681f3Smrg         * where both input and output are 16-bit but the operation is carried
25707ec681f3Smrg         * out in 32-bit
25717ec681f3Smrg         */
25727ec681f3Smrg
25737ec681f3Smrg        switch (ins->op) {
25747ec681f3Smrg        case midgard_alu_op_fsqrt:
25757ec681f3Smrg        case midgard_alu_op_frcp:
25767ec681f3Smrg        case midgard_alu_op_frsqrt:
25777ec681f3Smrg        case midgard_alu_op_fsinpi:
25787ec681f3Smrg        case midgard_alu_op_fcospi:
25797ec681f3Smrg        case midgard_alu_op_fexp2:
25807ec681f3Smrg        case midgard_alu_op_flog2:
25817ec681f3Smrg                max_bitsize = MAX2(max_bitsize, 32);
25827ec681f3Smrg                break;
25837ec681f3Smrg
25847ec681f3Smrg        default:
25857ec681f3Smrg                break;
25867ec681f3Smrg        }
25877ec681f3Smrg
25887ec681f3Smrg        /* High implies computing at a higher bitsize, e.g umul_high of 32-bit
25897ec681f3Smrg         * requires computing at 64-bit */
25907ec681f3Smrg        if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) {
25917ec681f3Smrg                max_bitsize *= 2;
25927ec681f3Smrg                assert(max_bitsize <= 64);
25937ec681f3Smrg        }
25947ec681f3Smrg
25957ec681f3Smrg        return max_bitsize;
25967ec681f3Smrg}
25977ec681f3Smrg
25987ec681f3Smrgmidgard_reg_mode
25997ec681f3Smrgreg_mode_for_bitsize(unsigned bitsize)
26007ec681f3Smrg{
26017ec681f3Smrg        switch (bitsize) {
26027ec681f3Smrg                /* use 16 pipe for 8 since we don't support vec16 yet */
26037ec681f3Smrg        case 8:
26047ec681f3Smrg        case 16:
26057ec681f3Smrg                return midgard_reg_mode_16;
26067ec681f3Smrg        case 32:
26077ec681f3Smrg                return midgard_reg_mode_32;
26087ec681f3Smrg        case 64:
26097ec681f3Smrg                return midgard_reg_mode_64;
26107ec681f3Smrg        default:
26117ec681f3Smrg                unreachable("invalid bit size");
26127ec681f3Smrg        }
26137ec681f3Smrg}
26147ec681f3Smrg
/* Midgard supports two types of constants, embedded constants (128-bit) and
 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
 * constants can be demoted to inline constants, for space savings and
 * sometimes a performance boost */

static void
embedded_to_inline_constant(compiler_context *ctx, midgard_block *block)
{
        mir_foreach_instr_in_block(block, ins) {
                if (!ins->has_constants) continue;
                if (ins->has_inline_constant) continue;

                unsigned max_bitsize = max_bitsize_for_alu(ins);

                /* We can inline 32-bit (sometimes) or 16-bit (usually) */
                bool is_16 = max_bitsize == 16;
                bool is_32 = max_bitsize == 32;

                if (!(is_16 || is_32))
                        continue;

                /* src1 cannot be an inline constant due to encoding
                 * restrictions. So, if possible we try to flip the arguments
                 * in that case */

                int op = ins->op;

                if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) &&
                                alu_opcode_props[op].props & OP_COMMUTES) {
                        mir_flip(ins);
                }

                if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
                        /* Component is from the swizzle. Take a nonzero component */
                        assert(ins->mask);
                        unsigned first_comp = ffs(ins->mask) - 1;
                        unsigned component = ins->swizzle[1][first_comp];

                        /* Scale constant appropriately, if we can legally */
                        int16_t scaled_constant = 0;

                        if (is_16) {
                                /* 16-bit constants fit as-is */
                                scaled_constant = ins->constants.u16[component];
                        } else if (midgard_is_integer_op(op)) {
                                scaled_constant = ins->constants.u32[component];

                                /* Constant overflow after resize */
                                if (scaled_constant != ins->constants.u32[component])
                                        continue;
                        } else {
                                /* Floats must be narrowed to fp16 losslessly */
                                float original = ins->constants.f32[component];
                                scaled_constant = _mesa_float_to_half(original);

                                /* Check for loss of precision. If this is
                                 * mediump, we don't care, but for a highp
                                 * shader, we need to pay attention. NIR
                                 * doesn't yet tell us which mode we're in!
                                 * Practically this prevents most constants
                                 * from being inlined, sadly. */

                                float fp32 = _mesa_half_to_float(scaled_constant);

                                if (fp32 != original)
                                        continue;
                        }

                        /* Should've been const folded */
                        if (ins->src_abs[1] || ins->src_neg[1])
                                continue;

                        /* Make sure that the constant is not itself a vector
                         * by checking if all accessed values are the same. */

                        const midgard_constants *cons = &ins->constants;
                        uint32_t value = is_16 ? cons->u16[component] : cons->u32[component];

                        bool is_vector = false;
                        unsigned mask = effective_writemask(ins->op, ins->mask);

                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) {
                                /* We only care if this component is actually used */
                                if (!(mask & (1 << c)))
                                        continue;

                                uint32_t test = is_16 ?
                                                cons->u16[ins->swizzle[1][c]] :
                                                cons->u32[ins->swizzle[1][c]];

                                if (test != value) {
                                        is_vector = true;
                                        break;
                                }
                        }

                        if (is_vector)
                                continue;

                        /* Get rid of the embedded constant */
                        ins->has_constants = false;
                        ins->src[1] = ~0;
                        ins->has_inline_constant = true;
                        ins->inline_constant = scaled_constant;
                }
        }
}
27207ec681f3Smrg
27217ec681f3Smrg/* Dead code elimination for branches at the end of a block - only one branch
27227ec681f3Smrg * per block is legal semantically */
27237ec681f3Smrg
27247ec681f3Smrgstatic void
27257ec681f3Smrgmidgard_cull_dead_branch(compiler_context *ctx, midgard_block *block)
27267ec681f3Smrg{
27277ec681f3Smrg        bool branched = false;
27287ec681f3Smrg
27297ec681f3Smrg        mir_foreach_instr_in_block_safe(block, ins) {
27307ec681f3Smrg                if (!midgard_is_branch_unit(ins->unit)) continue;
27317ec681f3Smrg
27327ec681f3Smrg                if (branched)
27337ec681f3Smrg                        mir_remove_instruction(ins);
27347ec681f3Smrg
27357ec681f3Smrg                branched = true;
27367ec681f3Smrg        }
27377ec681f3Smrg}
27387ec681f3Smrg
27397ec681f3Smrg/* We want to force the invert on AND/OR to the second slot to legalize into
27407ec681f3Smrg * iandnot/iornot. The relevant patterns are for AND (and OR respectively)
27417ec681f3Smrg *
27427ec681f3Smrg *   ~a & #b = ~a & ~(#~b)
27437ec681f3Smrg *   ~a & b = b & ~a
27447ec681f3Smrg */
27457ec681f3Smrg
27467ec681f3Smrgstatic void
27477ec681f3Smrgmidgard_legalize_invert(compiler_context *ctx, midgard_block *block)
27487ec681f3Smrg{
27497ec681f3Smrg        mir_foreach_instr_in_block(block, ins) {
27507ec681f3Smrg                if (ins->type != TAG_ALU_4) continue;
27517ec681f3Smrg
27527ec681f3Smrg                if (ins->op != midgard_alu_op_iand &&
27537ec681f3Smrg                    ins->op != midgard_alu_op_ior) continue;
27547ec681f3Smrg
27557ec681f3Smrg                if (ins->src_invert[1] || !ins->src_invert[0]) continue;
27567ec681f3Smrg
27577ec681f3Smrg                if (ins->has_inline_constant) {
27587ec681f3Smrg                        /* ~(#~a) = ~(~#a) = a, so valid, and forces both
27597ec681f3Smrg                         * inverts on */
27607ec681f3Smrg                        ins->inline_constant = ~ins->inline_constant;
27617ec681f3Smrg                        ins->src_invert[1] = true;
27627ec681f3Smrg                } else {
27637ec681f3Smrg                        /* Flip to the right invert order. Note
27647ec681f3Smrg                         * has_inline_constant false by assumption on the
27657ec681f3Smrg                         * branch, so flipping makes sense. */
27667ec681f3Smrg                        mir_flip(ins);
27677ec681f3Smrg                }
27687ec681f3Smrg        }
27697ec681f3Smrg}
27707ec681f3Smrg
/* Emit the writeout epilogue block for render target `rt`, sample iteration
 * `sample_iter`: a branch-to-self that carries the writeout, cloned from the
 * corresponding writeout branch recorded earlier. Returns the index of the
 * epilogue block so the caller can retarget the original branch at it. */

static unsigned
emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter)
{
        /* Loop to ourselves */
        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
        struct midgard_instruction ins = v_branch(false, false);
        ins.writeout = br->writeout;
        /* Target the block currently being built (block_count was already
         * bumped when it was created, hence the - 1) */
        ins.branch.target_block = ctx->block_count - 1;
        /* Carry over the writeout constant (e.g. depth/stencil payload) */
        ins.constants.u32[0] = br->constants.u32[0];
        memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types));
        emit_mir_instruction(ctx, ins);

        ctx->current_block->epilogue = true;
        schedule_barrier(ctx);
        return ins.branch.target_block;
}
27877ec681f3Smrg
27887ec681f3Smrgstatic midgard_block *
27897ec681f3Smrgemit_block_init(compiler_context *ctx)
27907ec681f3Smrg{
27917ec681f3Smrg        midgard_block *this_block = ctx->after_block;
27927ec681f3Smrg        ctx->after_block = NULL;
27937ec681f3Smrg
27947ec681f3Smrg        if (!this_block)
27957ec681f3Smrg                this_block = create_empty_block(ctx);
27967ec681f3Smrg
27977ec681f3Smrg        list_addtail(&this_block->base.link, &ctx->blocks);
27987ec681f3Smrg
27997ec681f3Smrg        this_block->scheduled = false;
28007ec681f3Smrg        ++ctx->block_count;
28017ec681f3Smrg
28027ec681f3Smrg        /* Set up current block */
28037ec681f3Smrg        list_inithead(&this_block->base.instructions);
28047ec681f3Smrg        ctx->current_block = this_block;
28057ec681f3Smrg
28067ec681f3Smrg        return this_block;
28077ec681f3Smrg}
28087ec681f3Smrg
28097ec681f3Smrgstatic midgard_block *
28107ec681f3Smrgemit_block(compiler_context *ctx, nir_block *block)
28117ec681f3Smrg{
28127ec681f3Smrg        midgard_block *this_block = emit_block_init(ctx);
28137ec681f3Smrg
28147ec681f3Smrg        nir_foreach_instr(instr, block) {
28157ec681f3Smrg                emit_instr(ctx, instr);
28167ec681f3Smrg                ++ctx->instruction_count;
28177ec681f3Smrg        }
28187ec681f3Smrg
28197ec681f3Smrg        return this_block;
28207ec681f3Smrg}
28217ec681f3Smrg
28227ec681f3Smrgstatic midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
28237ec681f3Smrg
/* Lower a NIR if: emit a speculative conditional branch, both sub-lists, an
 * unconditional jump over the else, then patch branch targets once block
 * indices are known, and wire up CFG successors. */

static void
emit_if(struct compiler_context *ctx, nir_if *nif)
{
        midgard_block *before_block = ctx->current_block;

        /* Speculatively emit the branch, but we can't fill it in until later */
        bool inv = false;
        EMIT(branch, true, true);
        midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
        then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv);
        then_branch->src_types[0] = nir_type_uint32;
        /* The branch skips the then-block, so it fires on the *inverse* of
         * the NIR condition (modulo any inversion folded by
         * mir_get_branch_cond) */
        then_branch->branch.invert_conditional = !inv;

        /* Emit the two subblocks. */
        midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
        midgard_block *end_then_block = ctx->current_block;

        /* Emit a jump from the end of the then block to the end of the else */
        EMIT(branch, false, false);
        midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);

        /* Emit second block, and check if it's empty */

        int else_idx = ctx->block_count;
        int count_in = ctx->instruction_count;
        midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
        midgard_block *end_else_block = ctx->current_block;
        int after_else_idx = ctx->block_count;

        /* Now that we have the subblocks emitted, fix up the branches */

        assert(then_block);
        assert(else_block);

        if (ctx->instruction_count == count_in) {
                /* The else block is empty, so don't emit an exit jump */
                mir_remove_instruction(then_exit);
                then_branch->branch.target_block = after_else_idx;
        } else {
                then_branch->branch.target_block = else_idx;
                then_exit->branch.target_block = after_else_idx;
        }

        /* Wire up the successors */

        ctx->after_block = create_empty_block(ctx);

        pan_block_add_successor(&before_block->base, &then_block->base);
        pan_block_add_successor(&before_block->base, &else_block->base);

        pan_block_add_successor(&end_then_block->base, &ctx->after_block->base);
        pan_block_add_successor(&end_else_block->base, &ctx->after_block->base);
}
28777ec681f3Smrg
/* Lower a NIR loop: emit the body, add the back-edge branch, then rewrite
 * every TARGET_BREAK emitted at this loop depth into a goto past the loop,
 * wiring the matching CFG successor edges as we go. */

static void
emit_loop(struct compiler_context *ctx, nir_loop *nloop)
{
        /* Remember where we are */
        midgard_block *start_block = ctx->current_block;

        /* Allocate a loop number, growing the current inner loop depth */
        int loop_idx = ++ctx->current_loop_depth;

        /* Get index from before the body so we can loop back later */
        int start_idx = ctx->block_count;

        /* Emit the body itself */
        midgard_block *loop_block = emit_cf_list(ctx, &nloop->body);

        /* Branch back to loop back */
        struct midgard_instruction br_back = v_branch(false, false);
        br_back.branch.target_block = start_idx;
        emit_mir_instruction(ctx, br_back);

        /* Mark down that branch in the graph. */
        pan_block_add_successor(&start_block->base, &loop_block->base);
        pan_block_add_successor(&ctx->current_block->base, &loop_block->base);

        /* Find the index of the block about to follow us (note: we don't add
         * one; blocks are 0-indexed so we get a fencepost problem) */
        int break_block_idx = ctx->block_count;

        /* Fix up the break statements we emitted to point to the right place,
         * now that we can allocate a block number for them */
        ctx->after_block = create_empty_block(ctx);

        mir_foreach_block_from(ctx, start_block, _block) {
                mir_foreach_instr_in_block(((midgard_block *) _block), ins) {
                        if (ins->type != TAG_ALU_4) continue;
                        if (!ins->compact_branch) continue;

                        /* We found a branch -- check the type to see if we need to do anything */
                        if (ins->branch.target_type != TARGET_BREAK) continue;

                        /* It's a break! Check if it's our break */
                        if (ins->branch.target_break != loop_idx) continue;

                        /* Okay, cool, we're breaking out of this loop.
                         * Rewrite from a break to a goto */

                        ins->branch.target_type = TARGET_GOTO;
                        ins->branch.target_block = break_block_idx;

                        pan_block_add_successor(_block, &ctx->after_block->base);
                }
        }

        /* Now that we've finished emitting the loop, free up the depth again
         * so we play nice with recursion amid nested loops */
        --ctx->current_loop_depth;

        /* Dump loop stats */
        ++ctx->loop_count;
}
29387ec681f3Smrg
29397ec681f3Smrgstatic midgard_block *
29407ec681f3Smrgemit_cf_list(struct compiler_context *ctx, struct exec_list *list)
29417ec681f3Smrg{
29427ec681f3Smrg        midgard_block *start_block = NULL;
29437ec681f3Smrg
29447ec681f3Smrg        foreach_list_typed(nir_cf_node, node, node, list) {
29457ec681f3Smrg                switch (node->type) {
29467ec681f3Smrg                case nir_cf_node_block: {
29477ec681f3Smrg                        midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
29487ec681f3Smrg
29497ec681f3Smrg                        if (!start_block)
29507ec681f3Smrg                                start_block = block;
29517ec681f3Smrg
29527ec681f3Smrg                        break;
29537ec681f3Smrg                }
29547ec681f3Smrg
29557ec681f3Smrg                case nir_cf_node_if:
29567ec681f3Smrg                        emit_if(ctx, nir_cf_node_as_if(node));
29577ec681f3Smrg                        break;
29587ec681f3Smrg
29597ec681f3Smrg                case nir_cf_node_loop:
29607ec681f3Smrg                        emit_loop(ctx, nir_cf_node_as_loop(node));
29617ec681f3Smrg                        break;
29627ec681f3Smrg
29637ec681f3Smrg                case nir_cf_node_function:
29647ec681f3Smrg                        assert(0);
29657ec681f3Smrg                        break;
29667ec681f3Smrg                }
29677ec681f3Smrg        }
29687ec681f3Smrg
29697ec681f3Smrg        return start_block;
29707ec681f3Smrg}
29717ec681f3Smrg
29727ec681f3Smrg/* Due to lookahead, we need to report the first tag executed in the command
29737ec681f3Smrg * stream and in branch targets. An initial block might be empty, so iterate
29747ec681f3Smrg * until we find one that 'works' */
29757ec681f3Smrg
29767ec681f3Smrgunsigned
29777ec681f3Smrgmidgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
29787ec681f3Smrg{
29797ec681f3Smrg        midgard_block *initial_block = mir_get_block(ctx, block_idx);
29807ec681f3Smrg
29817ec681f3Smrg        mir_foreach_block_from(ctx, initial_block, _v) {
29827ec681f3Smrg                midgard_block *v = (midgard_block *) _v;
29837ec681f3Smrg                if (v->quadword_count) {
29847ec681f3Smrg                        midgard_bundle *initial_bundle =
29857ec681f3Smrg                                util_dynarray_element(&v->bundles, midgard_bundle, 0);
29867ec681f3Smrg
29877ec681f3Smrg                        return initial_bundle->tag;
29887ec681f3Smrg                }
29897ec681f3Smrg        }
29907ec681f3Smrg
29917ec681f3Smrg        /* Default to a tag 1 which will break from the shader, in case we jump
29927ec681f3Smrg         * to the exit block (i.e. `return` in a compute shader) */
29937ec681f3Smrg
29947ec681f3Smrg        return 1;
29957ec681f3Smrg}
29967ec681f3Smrg
/* For each fragment writeout instruction, generate a writeout loop to
 * associate with it */

static void
mir_add_writeout_loops(compiler_context *ctx)
{
        for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
                for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) {
                        midgard_instruction *br = ctx->writeout_branch[rt][s];
                        if (!br) continue;

                        /* `popped` is the block index the branch originally
                         * fell through to; remember it so we can return there
                         * after this writeout's epilogue */
                        unsigned popped = br->branch.target_block;
                        pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base),
                                                &ctx->current_block->base);
                        /* Retarget the branch at a fresh epilogue block */
                        br->branch.target_block = emit_fragment_epilogue(ctx, rt, s);
                        br->branch.target_type = TARGET_GOTO;

                        /* If we have more RTs, we'll need to restore back after our
                         * loop terminates */
                        midgard_instruction *next_br = NULL;

                        /* Next sample iteration of the same RT takes priority */
                        if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER)
                                next_br = ctx->writeout_branch[rt][s + 1];

                        /* Otherwise the first iteration of the next RT */
                        if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch))
                                next_br = ctx->writeout_branch[rt + 1][0];

                        if (next_br) {
                                /* Jump back to where the branch originally
                                 * fell through, so the next writeout runs */
                                midgard_instruction uncond = v_branch(false, false);
                                uncond.branch.target_block = popped;
                                uncond.branch.target_type = TARGET_GOTO;
                                emit_mir_instruction(ctx, uncond);
                                pan_block_add_successor(&ctx->current_block->base,
                                                        &(mir_get_block(ctx, popped)->base));
                                schedule_barrier(ctx);
                        } else {
                                /* We're last, so we can terminate here */
                                br->last_writeout = true;
                        }
                }
        }
}
30397ec681f3Smrg
/*
 * Compile a NIR shader into a flat Midgard binary.
 *
 * nir:    shader to compile; it is lowered/optimised in place by the NIR
 *         passes invoked below.
 * inputs: compile-time options (GPU id, blend state, RT formats, debug
 *         switches, ...). Read-only.
 * binary: receives the emitted machine code (plus 16 bytes of zero padding
 *         at the end as a hardware workaround, see below).
 * info:   filled in with shader metadata (sysvals, tls_size, first_tag,
 *         ubo_mask, ...).
 *
 * All transient compiler state hangs off a ralloc'd compiler_context which
 * is freed in one shot by ralloc_free() before returning.
 */
30407ec681f3Smrgvoid
30417ec681f3Smrgmidgard_compile_shader_nir(nir_shader *nir,
30427ec681f3Smrg                           const struct panfrost_compile_inputs *inputs,
30437ec681f3Smrg                           struct util_dynarray *binary,
30447ec681f3Smrg                           struct pan_shader_info *info)
30457ec681f3Smrg{
30467ec681f3Smrg        midgard_debug = debug_get_option_midgard_debug();
30477ec681f3Smrg
30487ec681f3Smrg        /* TODO: Bound against what? */
30497ec681f3Smrg        compiler_context *ctx = rzalloc(NULL, compiler_context);
30507ec681f3Smrg        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
30517ec681f3Smrg
30527ec681f3Smrg        ctx->inputs = inputs;
30537ec681f3Smrg        ctx->nir = nir;
30547ec681f3Smrg        ctx->info = info;
30557ec681f3Smrg        ctx->stage = nir->info.stage;
30567ec681f3Smrg
30577ec681f3Smrg        if (inputs->is_blend) {
30587ec681f3Smrg                unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
30597ec681f3Smrg                const struct util_format_description *desc =
30607ec681f3Smrg                        util_format_description(inputs->rt_formats[inputs->blend.rt]);
30617ec681f3Smrg
30627ec681f3Smrg                /* We have to split writeout in 128 bit chunks */
30637ec681f3Smrg                ctx->blend_sample_iterations =
30647ec681f3Smrg                        DIV_ROUND_UP(desc->block.bits * nr_samples, 128);
30657ec681f3Smrg        }
        /* NOTE(review): ~0 appears to act as a "not yet assigned" sentinel
         * for the blend input/src1 registers — confirm against the emit
         * paths that consume these fields. */
30667ec681f3Smrg        ctx->blend_input = ~0;
30677ec681f3Smrg        ctx->blend_src1 = ~0;
30687ec681f3Smrg        ctx->quirks = midgard_get_quirks(inputs->gpu_id);
30697ec681f3Smrg
30707ec681f3Smrg        /* Initialize at a global (not block) level hash tables */
30717ec681f3Smrg
30727ec681f3Smrg        ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
30737ec681f3Smrg
30747ec681f3Smrg        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
30757ec681f3Smrg         * (so we don't accidentally duplicate the epilogue since mesa/st has
30767ec681f3Smrg         * messed with our I/O quite a bit already) */
30777ec681f3Smrg
30787ec681f3Smrg        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
30797ec681f3Smrg
30807ec681f3Smrg        if (ctx->stage == MESA_SHADER_VERTEX) {
30817ec681f3Smrg                NIR_PASS_V(nir, nir_lower_viewport_transform);
30827ec681f3Smrg                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
30837ec681f3Smrg        }
30847ec681f3Smrg
30857ec681f3Smrg        NIR_PASS_V(nir, nir_lower_var_copies);
30867ec681f3Smrg        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
30877ec681f3Smrg        NIR_PASS_V(nir, nir_split_var_copies);
30887ec681f3Smrg        NIR_PASS_V(nir, nir_lower_var_copies);
30897ec681f3Smrg        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
30907ec681f3Smrg        NIR_PASS_V(nir, nir_lower_var_copies);
30917ec681f3Smrg        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
30927ec681f3Smrg
30937ec681f3Smrg        unsigned pan_quirks = panfrost_get_quirks(inputs->gpu_id, 0);
30947ec681f3Smrg        NIR_PASS_V(nir, pan_lower_framebuffer,
30957ec681f3Smrg                   inputs->rt_formats, inputs->raw_fmt_mask,
30967ec681f3Smrg                   inputs->is_blend, pan_quirks);
30977ec681f3Smrg
30987ec681f3Smrg        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
30997ec681f3Smrg                        glsl_type_size, 0);
31007ec681f3Smrg        NIR_PASS_V(nir, nir_lower_ssbo);
31017ec681f3Smrg        NIR_PASS_V(nir, pan_nir_lower_zs_store);
31027ec681f3Smrg
31037ec681f3Smrg        NIR_PASS_V(nir, pan_nir_lower_64bit_intrin);
31047ec681f3Smrg
31057ec681f3Smrg        /* Optimisation passes */
31067ec681f3Smrg
31077ec681f3Smrg        optimise_nir(nir, ctx->quirks, inputs->is_blend);
31087ec681f3Smrg
31097ec681f3Smrg        NIR_PASS_V(nir, pan_nir_reorder_writeout);
31107ec681f3Smrg
        /* Dump the final NIR when shader debugging is on (internal shaders
         * only with MIDGARD_DBG_INTERNAL as well) */
31117ec681f3Smrg        if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
31127ec681f3Smrg            ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
31137ec681f3Smrg                nir_print_shader(nir, stdout);
31147ec681f3Smrg        }
31157ec681f3Smrg
31167ec681f3Smrg        info->tls_size = nir->scratch_size;
31177ec681f3Smrg
        /* Emit MIR for each NIR function. Only the first function is
         * actually compiled (see the break below). */
31187ec681f3Smrg        nir_foreach_function(func, nir) {
31197ec681f3Smrg                if (!func->impl)
31207ec681f3Smrg                        continue;
31217ec681f3Smrg
31227ec681f3Smrg                list_inithead(&ctx->blocks);
31237ec681f3Smrg                ctx->block_count = 0;
31247ec681f3Smrg                ctx->func = func;
31257ec681f3Smrg                ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD));
31267ec681f3Smrg
                /* If the fragment shader reads back its outputs (and is not
                 * itself a blend shader), emit an explicit tilebuffer-wait
                 * branch before the shader body */
31277ec681f3Smrg                if (nir->info.outputs_read && !inputs->is_blend) {
31287ec681f3Smrg                        emit_block_init(ctx);
31297ec681f3Smrg
31307ec681f3Smrg                        struct midgard_instruction wait = v_branch(false, false);
31317ec681f3Smrg                        wait.branch.target_type = TARGET_TILEBUF_WAIT;
31327ec681f3Smrg
31337ec681f3Smrg                        emit_mir_instruction(ctx, wait);
31347ec681f3Smrg
31357ec681f3Smrg                        ++ctx->instruction_count;
31367ec681f3Smrg                }
31377ec681f3Smrg
31387ec681f3Smrg                emit_cf_list(ctx, &func->impl->body);
31397ec681f3Smrg                free(ctx->already_emitted);
31407ec681f3Smrg                break; /* TODO: Multi-function shaders */
31417ec681f3Smrg        }
31427ec681f3Smrg
31437ec681f3Smrg        /* Per-block lowering before opts */
31447ec681f3Smrg
31457ec681f3Smrg        mir_foreach_block(ctx, _block) {
31467ec681f3Smrg                midgard_block *block = (midgard_block *) _block;
31477ec681f3Smrg                inline_alu_constants(ctx, block);
31487ec681f3Smrg                embedded_to_inline_constant(ctx, block);
31497ec681f3Smrg        }
31507ec681f3Smrg        /* MIR-level optimizations */
31517ec681f3Smrg
31527ec681f3Smrg        bool progress = false;
31537ec681f3Smrg
        /* Iterate DCE + copy-prop + projection combining to a fixed point */
31547ec681f3Smrg        do {
31557ec681f3Smrg                progress = false;
31567ec681f3Smrg                progress |= midgard_opt_dead_code_eliminate(ctx);
31577ec681f3Smrg
31587ec681f3Smrg                mir_foreach_block(ctx, _block) {
31597ec681f3Smrg                        midgard_block *block = (midgard_block *) _block;
31607ec681f3Smrg                        progress |= midgard_opt_copy_prop(ctx, block);
31617ec681f3Smrg                        progress |= midgard_opt_combine_projection(ctx, block);
31627ec681f3Smrg                        progress |= midgard_opt_varying_projection(ctx, block);
31637ec681f3Smrg                }
31647ec681f3Smrg        } while (progress);
31657ec681f3Smrg
31667ec681f3Smrg        mir_foreach_block(ctx, _block) {
31677ec681f3Smrg                midgard_block *block = (midgard_block *) _block;
31687ec681f3Smrg                midgard_lower_derivatives(ctx, block);
31697ec681f3Smrg                midgard_legalize_invert(ctx, block);
31707ec681f3Smrg                midgard_cull_dead_branch(ctx, block);
31717ec681f3Smrg        }
31727ec681f3Smrg
31737ec681f3Smrg        if (ctx->stage == MESA_SHADER_FRAGMENT)
31747ec681f3Smrg                mir_add_writeout_loops(ctx);
31757ec681f3Smrg
31767ec681f3Smrg        /* Analyze now that the code is known but before scheduling creates
31777ec681f3Smrg         * pipeline registers which are harder to track */
31787ec681f3Smrg        mir_analyze_helper_requirements(ctx);
31797ec681f3Smrg
31807ec681f3Smrg        /* Schedule! */
31817ec681f3Smrg        midgard_schedule_program(ctx);
31827ec681f3Smrg        mir_ra(ctx);
31837ec681f3Smrg
31847ec681f3Smrg        /* Analyze after scheduling since this is order-dependent */
31857ec681f3Smrg        mir_analyze_helper_terminate(ctx);
31867ec681f3Smrg
31877ec681f3Smrg        /* Emit flat binary from the instruction arrays. Iterate each block in
31887ec681f3Smrg         * sequence. Save instruction boundaries such that lookahead tags can
31897ec681f3Smrg         * be assigned easily */
31907ec681f3Smrg
31917ec681f3Smrg        /* Cache _all_ bundles in source order for lookahead across failed branches */
31927ec681f3Smrg
31937ec681f3Smrg        int bundle_count = 0;
31947ec681f3Smrg        mir_foreach_block(ctx, _block) {
31957ec681f3Smrg                midgard_block *block = (midgard_block *) _block;
31967ec681f3Smrg                bundle_count += block->bundles.size / sizeof(midgard_bundle);
31977ec681f3Smrg        }
        /* Freed below after emission; allocation is not NULL-checked,
         * matching the surrounding code's convention */
31987ec681f3Smrg        midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
31997ec681f3Smrg        int bundle_idx = 0;
32007ec681f3Smrg        mir_foreach_block(ctx, _block) {
32017ec681f3Smrg                midgard_block *block = (midgard_block *) _block;
32027ec681f3Smrg                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
32037ec681f3Smrg                        source_order_bundles[bundle_idx++] = bundle;
32047ec681f3Smrg                }
32057ec681f3Smrg        }
32067ec681f3Smrg
32077ec681f3Smrg        int current_bundle = 0;
32087ec681f3Smrg
32097ec681f3Smrg        /* Midgard prefetches instruction types, so during emission we
32107ec681f3Smrg         * need to lookahead. Unless this is the last instruction, in
32117ec681f3Smrg         * which we return 1. */
32127ec681f3Smrg
32137ec681f3Smrg        mir_foreach_block(ctx, _block) {
32147ec681f3Smrg                midgard_block *block = (midgard_block *) _block;
32157ec681f3Smrg                mir_foreach_bundle_in_block(block, bundle) {
32167ec681f3Smrg                        int lookahead = 1;
32177ec681f3Smrg
32187ec681f3Smrg                        if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
32197ec681f3Smrg                                lookahead = source_order_bundles[current_bundle + 1]->tag;
32207ec681f3Smrg
32217ec681f3Smrg                        emit_binary_bundle(ctx, block, bundle, binary, lookahead);
32227ec681f3Smrg                        ++current_bundle;
32237ec681f3Smrg                }
32247ec681f3Smrg
32257ec681f3Smrg                /* TODO: Free deeper */
32267ec681f3Smrg                //util_dynarray_fini(&block->instructions);
32277ec681f3Smrg        }
32287ec681f3Smrg
32297ec681f3Smrg        free(source_order_bundles);
32307ec681f3Smrg
32317ec681f3Smrg        /* Report the very first tag executed */
32327ec681f3Smrg        info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0);
32337ec681f3Smrg
32347ec681f3Smrg        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
32357ec681f3Smrg
32367ec681f3Smrg        if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
32377ec681f3Smrg            ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
32387ec681f3Smrg                disassemble_midgard(stdout, binary->data,
32397ec681f3Smrg                                    binary->size, inputs->gpu_id,
32407ec681f3Smrg                                    midgard_debug & MIDGARD_DBG_VERBOSE);
32417ec681f3Smrg                fflush(stdout);
32427ec681f3Smrg        }
32437ec681f3Smrg
32447ec681f3Smrg        /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults,
32457ec681f3Smrg         * workaround by adding some padding to the end of the shader. (The
32467ec681f3Smrg         * kernel makes sure shader BOs can't cross 16MB boundaries.) */
32477ec681f3Smrg        if (binary->size)
32487ec681f3Smrg                memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);
32497ec681f3Smrg
        /* Optional shader-db statistics dump (never for internal shaders) */
32507ec681f3Smrg        if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) &&
32517ec681f3Smrg            !nir->info.internal) {
32527ec681f3Smrg                unsigned nr_bundles = 0, nr_ins = 0;
32537ec681f3Smrg
32547ec681f3Smrg                /* Count instructions and bundles */
32557ec681f3Smrg
32567ec681f3Smrg                mir_foreach_block(ctx, _block) {
32577ec681f3Smrg                        midgard_block *block = (midgard_block *) _block;
32587ec681f3Smrg                        nr_bundles += util_dynarray_num_elements(
32597ec681f3Smrg                                              &block->bundles, midgard_bundle);
32607ec681f3Smrg
32617ec681f3Smrg                        mir_foreach_bundle_in_block(block, bun)
32627ec681f3Smrg                                nr_ins += bun->instruction_count;
32637ec681f3Smrg                }
32647ec681f3Smrg
32657ec681f3Smrg                /* Calculate thread count. There are certain cutoffs by
32667ec681f3Smrg                 * register count for thread count */
32677ec681f3Smrg
32687ec681f3Smrg                unsigned nr_registers = info->work_reg_count;
32697ec681f3Smrg
32707ec681f3Smrg                unsigned nr_threads =
32717ec681f3Smrg                        (nr_registers <= 4) ? 4 :
32727ec681f3Smrg                        (nr_registers <= 8) ? 2 :
32737ec681f3Smrg                        1;
32747ec681f3Smrg
32757ec681f3Smrg                /* Dump stats */
32767ec681f3Smrg
32777ec681f3Smrg                fprintf(stderr, "%s - %s shader: "
32787ec681f3Smrg                        "%u inst, %u bundles, %u quadwords, "
32797ec681f3Smrg                        "%u registers, %u threads, %u loops, "
32807ec681f3Smrg                        "%u:%u spills:fills\n",
32817ec681f3Smrg                        ctx->nir->info.label ?: "",
32827ec681f3Smrg                        ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
32837ec681f3Smrg                        gl_shader_stage_name(ctx->stage),
32847ec681f3Smrg                        nr_ins, nr_bundles, ctx->quadword_count,
32857ec681f3Smrg                        nr_registers, nr_threads,
32867ec681f3Smrg                        ctx->loop_count,
32877ec681f3Smrg                        ctx->spills, ctx->fills);
32887ec681f3Smrg        }
32897ec681f3Smrg
        /* Explicitly destroy the u64 hash tables, then release everything
         * else ralloc'd against ctx in one shot */
32907ec681f3Smrg        _mesa_hash_table_u64_destroy(ctx->ssa_constants);
32917ec681f3Smrg        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
32927ec681f3Smrg
32937ec681f3Smrg        ralloc_free(ctx);
32947ec681f3Smrg}
3295