midgard_compile.c revision 7ec681f3
1/*
2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3 * Copyright (C) 2019-2020 Collabora, Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include <sys/types.h>
26#include <sys/stat.h>
27#include <sys/mman.h>
28#include <fcntl.h>
29#include <stdint.h>
30#include <stdlib.h>
31#include <stdio.h>
32#include <err.h>
33
34#include "main/mtypes.h"
35#include "compiler/glsl/glsl_to_nir.h"
36#include "compiler/nir_types.h"
37#include "compiler/nir/nir_builder.h"
38#include "util/half_float.h"
39#include "util/u_math.h"
40#include "util/u_debug.h"
41#include "util/u_dynarray.h"
42#include "util/list.h"
43#include "main/mtypes.h"
44
45#include "midgard.h"
46#include "midgard_nir.h"
47#include "midgard_compile.h"
48#include "midgard_ops.h"
49#include "helpers.h"
50#include "compiler.h"
51#include "midgard_quirks.h"
52#include "panfrost-quirks.h"
53#include "panfrost/util/pan_lower_framebuffer.h"
54
55#include "disassemble.h"
56
57static const struct debug_named_value midgard_debug_options[] = {
58        {"msgs",      MIDGARD_DBG_MSGS,		"Print debug messages"},
59        {"shaders",   MIDGARD_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
60        {"shaderdb",  MIDGARD_DBG_SHADERDB,     "Prints shader-db statistics"},
61        {"inorder",   MIDGARD_DBG_INORDER,      "Disables out-of-order scheduling"},
62        {"verbose",   MIDGARD_DBG_VERBOSE,      "Dump shaders verbosely"},
63        {"internal",  MIDGARD_DBG_INTERNAL,     "Dump internal shaders"},
64        DEBUG_NAMED_VALUE_END
65};
66
67DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)
68
69int midgard_debug = 0;
70
71#define DBG(fmt, ...) \
72		do { if (midgard_debug & MIDGARD_DBG_MSGS) \
73			fprintf(stderr, "%s:%d: "fmt, \
74				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
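
/* Example (mirrors the existing DBG() call in emit_alu() further down): with
 * MIDGARD_MESA_DEBUG=msgs set in the environment, a call such as
 *
 *     DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
 *
 * prints "emit_alu:<line>: Unhandled ALU op ..." to stderr; without the
 * "msgs" flag the fprintf branch is never taken. */
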
75static midgard_block *
76create_empty_block(compiler_context *ctx)
77{
78        midgard_block *blk = rzalloc(ctx, midgard_block);
79
80        blk->base.predecessors = _mesa_set_create(blk,
81                        _mesa_hash_pointer,
82                        _mesa_key_pointer_equal);
83
84        blk->base.name = ctx->block_source_count++;
85
86        return blk;
87}
88
89static void
90schedule_barrier(compiler_context *ctx)
91{
92        midgard_block *temp = ctx->after_block;
93        ctx->after_block = create_empty_block(ctx);
94        ctx->block_count++;
95        list_addtail(&ctx->after_block->base.link, &ctx->blocks);
96        list_inithead(&ctx->after_block->base.instructions);
97        pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base);
98        ctx->current_block = ctx->after_block;
99        ctx->after_block = temp;
100}
101
102/* Helpers to generate midgard_instructions using macro magic, since every
103 * driver seems to do it that way */
104
105#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
106
107#define M_LOAD_STORE(name, store, T) \
108	static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
109		midgard_instruction i = { \
110			.type = TAG_LOAD_STORE_4, \
111                        .mask = 0xF, \
112                        .dest = ~0, \
113                        .src = { ~0, ~0, ~0, ~0 }, \
114                        .swizzle = SWIZZLE_IDENTITY_4, \
115                        .op = midgard_op_##name, \
116			.load_store = { \
117				.signed_offset = address \
118			} \
119		}; \
120                \
121                if (store) { \
122                        i.src[0] = ssa; \
123                        i.src_types[0] = T; \
124                        i.dest_type = T; \
125                } else { \
126                        i.dest = ssa; \
127                        i.dest_type = T; \
128                } \
129		return i; \
130	}
131
132#define M_LOAD(name, T) M_LOAD_STORE(name, false, T)
133#define M_STORE(name, T) M_LOAD_STORE(name, true, T)
134
135M_LOAD(ld_attr_32, nir_type_uint32);
136M_LOAD(ld_vary_32, nir_type_uint32);
137M_LOAD(ld_ubo_32, nir_type_uint32);
138M_LOAD(ld_ubo_64, nir_type_uint32);
139M_LOAD(ld_ubo_128, nir_type_uint32);
140M_LOAD(ld_32, nir_type_uint32);
141M_LOAD(ld_64, nir_type_uint32);
142M_LOAD(ld_128, nir_type_uint32);
143M_STORE(st_32, nir_type_uint32);
144M_STORE(st_64, nir_type_uint32);
145M_STORE(st_128, nir_type_uint32);
146M_LOAD(ld_tilebuffer_raw, nir_type_uint32);
147M_LOAD(ld_tilebuffer_16f, nir_type_float16);
148M_LOAD(ld_tilebuffer_32f, nir_type_float32);
149M_STORE(st_vary_32, nir_type_uint32);
150M_LOAD(ld_cubemap_coords, nir_type_uint32);
151M_LOAD(ldst_mov, nir_type_uint32);
152M_LOAD(ld_image_32f, nir_type_float32);
153M_LOAD(ld_image_16f, nir_type_float16);
154M_LOAD(ld_image_32u, nir_type_uint32);
155M_LOAD(ld_image_32i, nir_type_int32);
156M_STORE(st_image_32f, nir_type_float32);
157M_STORE(st_image_16f, nir_type_float16);
158M_STORE(st_image_32u, nir_type_uint32);
159M_STORE(st_image_32i, nir_type_int32);
160M_LOAD(lea_image, nir_type_uint64);
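
/* Illustrative sketch of what the macros above expand to (not additional
 * driver code): M_LOAD(ld_attr_32, nir_type_uint32) defines a constructor
 *
 *     static midgard_instruction m_ld_attr_32(unsigned ssa, unsigned address);
 *
 * returning a TAG_LOAD_STORE_4 instruction with op midgard_op_ld_attr_32, a
 * full 0xF mask, an identity swizzle, and `ssa` wired to the destination (or
 * to src[0] for the M_STORE variants). Callers such as emit_attr_read() below
 * pass the packed attribute offset as `address`. */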
161
162#define M_IMAGE(op) \
163static midgard_instruction \
164op ## _image(nir_alu_type type, unsigned val, unsigned address) \
165{ \
166        switch (type) { \
167        case nir_type_float32: \
168                 return m_ ## op ## _image_32f(val, address); \
169        case nir_type_float16: \
170                 return m_ ## op ## _image_16f(val, address); \
171        case nir_type_uint32: \
172                 return m_ ## op ## _image_32u(val, address); \
173        case nir_type_int32: \
174                 return m_ ## op ## _image_32i(val, address); \
175        default: \
176                 unreachable("Invalid image type"); \
177        } \
178}
179
180M_IMAGE(ld);
181M_IMAGE(st);
182
183static midgard_instruction
184v_branch(bool conditional, bool invert)
185{
186        midgard_instruction ins = {
187                .type = TAG_ALU_4,
188                .unit = ALU_ENAB_BRANCH,
189                .compact_branch = true,
190                .branch = {
191                        .conditional = conditional,
192                        .invert_conditional = invert
193                },
194                .dest = ~0,
195                .src = { ~0, ~0, ~0, ~0 },
196        };
197
198        return ins;
199}
200
201static void
202attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
203{
204        ins->has_constants = true;
205        memcpy(&ins->constants, constants, 16);
206}
207
208static int
209glsl_type_size(const struct glsl_type *type, bool bindless)
210{
211        return glsl_count_attribute_slots(type, false);
212}
213
214/* Lower fdot2 to a vector multiplication followed by channel addition  */
215static bool
216midgard_nir_lower_fdot2_instr(nir_builder *b, nir_instr *instr, void *data)
217{
218        if (instr->type != nir_instr_type_alu)
219                return false;
220
221        nir_alu_instr *alu = nir_instr_as_alu(instr);
222        if (alu->op != nir_op_fdot2)
223                return false;
224
225        b->cursor = nir_before_instr(&alu->instr);
226
227        nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
228        nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
229
230        nir_ssa_def *product = nir_fmul(b, src0, src1);
231
232        nir_ssa_def *sum = nir_fadd(b,
233                                    nir_channel(b, product, 0),
234                                    nir_channel(b, product, 1));
235
236        /* Replace the fdot2 with this sum */
237        nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
238
239        return true;
240}
241
242static bool
243midgard_nir_lower_fdot2(nir_shader *shader)
244{
245        return nir_shader_instructions_pass(shader,
246                                            midgard_nir_lower_fdot2_instr,
247                                            nir_metadata_block_index | nir_metadata_dominance,
248                                            NULL);
249}
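
/* For example, the pass above rewrites
 *
 *     c = fdot2(a, b)
 *
 * into the equivalent
 *
 *     p = fmul(a, b);  c = fadd(p.x, p.y);
 *
 * so only fdot3/fdot4 reach the backend as real dot products (see the
 * ALU_CASE(fdot3)/ALU_CASE(fdot4) entries in emit_alu() below). */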
250
251static bool
252mdg_is_64(const nir_instr *instr, const void *_unused)
253{
254        const nir_alu_instr *alu = nir_instr_as_alu(instr);
255
256        if (nir_dest_bit_size(alu->dest.dest) == 64)
257                return true;
258
259        switch (alu->op) {
260        case nir_op_umul_high:
261        case nir_op_imul_high:
262                return true;
263        default:
264                return false;
265        }
266}
267
268/* Only vectorize int64 up to vec2 */
269static bool
270midgard_vectorize_filter(const nir_instr *instr, void *data)
271{
272        if (instr->type != nir_instr_type_alu)
273                return true;
274
275        const nir_alu_instr *alu = nir_instr_as_alu(instr);
276
277        unsigned num_components = alu->dest.dest.ssa.num_components;
278
279        int src_bit_size = nir_src_bit_size(alu->src[0].src);
280        int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
281
282        if (src_bit_size == 64 || dst_bit_size == 64) {
283                if (num_components > 1)
284                        return false;
285        }
286
287        return true;
288}
289
290
291/* Flushes undefined values to zero (via nir_lower_undef_to_zero in the loop below) */
292
293static void
294optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
295{
296        bool progress;
297        unsigned lower_flrp =
298                (nir->options->lower_flrp16 ? 16 : 0) |
299                (nir->options->lower_flrp32 ? 32 : 0) |
300                (nir->options->lower_flrp64 ? 64 : 0);
301
302        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
303        nir_lower_idiv_options idiv_options = {
304                .imprecise_32bit_lowering = true,
305                .allow_fp16 = true,
306        };
307        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
308
309        nir_lower_tex_options lower_tex_options = {
310                .lower_txs_lod = true,
311                .lower_txp = ~0,
312                .lower_tg4_broadcom_swizzle = true,
313                /* TODO: we have native gradient.. */
314                .lower_txd = true,
315        };
316
317        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
318
319        /* Must lower fdot2 after tex is lowered */
320        NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
321
322        /* T720 is broken. */
323
324        if (quirks & MIDGARD_BROKEN_LOD)
325                NIR_PASS_V(nir, midgard_nir_lod_errata);
326
327        /* Midgard image ops coordinates are 16-bit instead of 32-bit */
328        NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize);
329        NIR_PASS(progress, nir, midgard_nir_lower_helper_writes);
330        NIR_PASS(progress, nir, pan_lower_helper_invocation);
331        NIR_PASS(progress, nir, pan_lower_sample_pos);
332
333        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early);
334
335        do {
336                progress = false;
337
338                NIR_PASS(progress, nir, nir_lower_var_copies);
339                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
340
341                NIR_PASS(progress, nir, nir_copy_prop);
342                NIR_PASS(progress, nir, nir_opt_remove_phis);
343                NIR_PASS(progress, nir, nir_opt_dce);
344                NIR_PASS(progress, nir, nir_opt_dead_cf);
345                NIR_PASS(progress, nir, nir_opt_cse);
346                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
347                NIR_PASS(progress, nir, nir_opt_algebraic);
348                NIR_PASS(progress, nir, nir_opt_constant_folding);
349
350                if (lower_flrp != 0) {
351                        bool lower_flrp_progress = false;
352                        NIR_PASS(lower_flrp_progress,
353                                 nir,
354                                 nir_lower_flrp,
355                                 lower_flrp,
356                                 false /* always_precise */);
357                        if (lower_flrp_progress) {
358                                NIR_PASS(progress, nir,
359                                         nir_opt_constant_folding);
360                                progress = true;
361                        }
362
363                        /* Nothing should rematerialize any flrps, so we only
364                         * need to do this lowering once.
365                         */
366                        lower_flrp = 0;
367                }
368
369                NIR_PASS(progress, nir, nir_opt_undef);
370                NIR_PASS(progress, nir, nir_lower_undef_to_zero);
371
372                NIR_PASS(progress, nir, nir_opt_loop_unroll);
373
374                NIR_PASS(progress, nir, nir_opt_vectorize,
375                         midgard_vectorize_filter, NULL);
376        } while (progress);
377
378        NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL);
379
380        /* Run after opts so it can hit more */
381        if (!is_blend)
382                NIR_PASS(progress, nir, nir_fuse_io_16);
383
384        /* Must be run at the end to prevent creation of fsin/fcos ops */
385        NIR_PASS(progress, nir, midgard_nir_scale_trig);
386
387        do {
388                progress = false;
389
390                NIR_PASS(progress, nir, nir_opt_dce);
391                NIR_PASS(progress, nir, nir_opt_algebraic);
392                NIR_PASS(progress, nir, nir_opt_constant_folding);
393                NIR_PASS(progress, nir, nir_copy_prop);
394        } while (progress);
395
396        NIR_PASS(progress, nir, nir_opt_algebraic_late);
397        NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods);
398
399        /* We implement booleans as 32-bit 0/~0 */
400        NIR_PASS(progress, nir, nir_lower_bool_to_int32);
401
402        /* Now that booleans are lowered, we can run our late opts */
403        NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
404        NIR_PASS(progress, nir, midgard_nir_cancel_inot);
405
406        NIR_PASS(progress, nir, nir_copy_prop);
407        NIR_PASS(progress, nir, nir_opt_dce);
408
409        /* Backend scheduler is purely local, so do some global optimizations
410         * to reduce register pressure. */
411        nir_move_options move_all =
412                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
413                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
414
415        NIR_PASS_V(nir, nir_opt_sink, move_all);
416        NIR_PASS_V(nir, nir_opt_move, move_all);
417
418        /* Take us out of SSA */
419        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
420        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
421
422        /* We are a vector architecture; write combine where possible */
423        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
424        NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL);
425
426        NIR_PASS(progress, nir, nir_opt_dce);
427}
428
429/* Do not actually emit a load; instead, cache the constant for inlining */
430
431static void
432emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
433{
434        nir_ssa_def def = instr->def;
435
436        midgard_constants *consts = rzalloc(ctx, midgard_constants);
437
438        assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8);
439
440#define RAW_CONST_COPY(bits)                                         \
441        nir_const_value_to_array(consts->u##bits, instr->value,      \
442                                 instr->def.num_components, u##bits)
443
444        switch (instr->def.bit_size) {
445        case 64:
446                RAW_CONST_COPY(64);
447                break;
448        case 32:
449                RAW_CONST_COPY(32);
450                break;
451        case 16:
452                RAW_CONST_COPY(16);
453                break;
454        case 8:
455                RAW_CONST_COPY(8);
456                break;
457        default:
458                unreachable("Invalid bit_size for load_const instruction\n");
459        }
460
461        /* Shifted for SSA, +1 for off-by-one */
462        _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts);
463}
464
465/* Normally constants are embedded implicitly, but for I/O and such we have to
466 * explicitly emit a move with the constant source */
467
468static void
469emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to)
470{
471        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1);
472
473        if (constant_value) {
474                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to);
475                attach_constants(ctx, &ins, constant_value, node + 1);
476                emit_mir_instruction(ctx, ins);
477        }
478}
479
480static bool
481nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
482{
483        unsigned comp = src->swizzle[0];
484
485        for (unsigned c = 1; c < nr_components; ++c) {
486                if (src->swizzle[c] != comp)
487                        return true;
488        }
489
490        return false;
491}
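
/* E.g. a condition swizzled .xxxx counts as scalar (every channel reads the
 * same component, so this returns false), while .xyxx does not. emit_alu()
 * uses this for nir_op_b32csel to pick fcsel_v/icsel_v over fcsel/icsel when
 * the condition really varies per component. */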
492
493#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \
494        case nir_intrinsic_##nir: \
495                emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \
496                break;
497
498#define ATOMIC_CASE(ctx, instr, nir, op) \
499        ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \
500        ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false);
501
502#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \
503        case nir_intrinsic_image_atomic_##nir: { \
504                midgard_instruction ins = emit_image_op(ctx, instr, true); \
505                emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \
506                break; \
507        }
508
509#define ALU_CASE(nir, _op) \
510	case nir_op_##nir: \
511		op = midgard_alu_op_##_op; \
512                assert(src_bitsize == dst_bitsize); \
513		break;
514
515#define ALU_CASE_RTZ(nir, _op) \
516	case nir_op_##nir: \
517		op = midgard_alu_op_##_op; \
518                roundmode = MIDGARD_RTZ; \
519		break;
520
521#define ALU_CHECK_CMP() \
522                assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \
523                assert(dst_bitsize == 16 || dst_bitsize == 32); \
524
525#define ALU_CASE_BCAST(nir, _op, count) \
526        case nir_op_##nir: \
527                op = midgard_alu_op_##_op; \
528                broadcast_swizzle = count; \
529                ALU_CHECK_CMP(); \
530                break;
531
532#define ALU_CASE_CMP(nir, _op) \
533	case nir_op_##nir: \
534		op = midgard_alu_op_##_op; \
535                ALU_CHECK_CMP(); \
536                break;
537
538/* Compare mir_lower_invert */
539static bool
540nir_accepts_inot(nir_op op, unsigned src)
541{
542        switch (op) {
543        case nir_op_ior:
544        case nir_op_iand: /* TODO: b2f16 */
545        case nir_op_ixor:
546                return true;
547        case nir_op_b32csel:
548                /* Only the condition */
549                return (src == 0);
550        default:
551                return false;
552        }
553}
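
/* E.g. for ior(a, inot(b)) the inot on source 1 can be folded into the
 * src_invert[] modifier by mir_copy_src() below, whereas an op like iadd has
 * no such modifier and keeps the explicit inot (itself emitted as inor with
 * zero, per the inot case in emit_alu()). */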
554
555static bool
556mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
557{
558        if (pan_has_dest_mod(dest, op)) {
559                assert((*dest)->is_ssa);
560                BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
561                return true;
562        }
563
564        return false;
565}
566
567/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,
568 * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):
569 *
570 * clamp_0_1(clamp_0_inf(x))  = clamp_0_1(x)
571 * clamp_0_1(clamp_m1_1(x))   = clamp_0_1(x)
572 * clamp_0_inf(clamp_0_1(x))  = clamp_0_1(x)
573 * clamp_0_inf(clamp_m1_1(x)) = clamp_0_1(x)
574 * clamp_m1_1(clamp_0_1(x))   = clamp_0_1(x)
575 * clamp_m1_1(clamp_0_inf(x)) = clamp_0_1(x)
576 *
577 * So by cases any composition of output modifiers is equivalent to
578 * clamp_0_1 alone.
579 */
580static unsigned
581mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)
582{
583        bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);
584        bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);
585        bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);
586        bool prior = (prior_outmod != midgard_outmod_none);
587        int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;
588
589        return ((count > 1) || clamp_0_1) ?  midgard_outmod_clamp_0_1 :
590                                clamp_0_inf ? midgard_outmod_clamp_0_inf :
591                                clamp_m1_1 ?   midgard_outmod_clamp_m1_1 :
592                                prior_outmod;
593}
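
/* Worked example of the case analysis above: for x = -2.5,
 *
 *     clamp_0_1(clamp_0_inf(-2.5))  = clamp_0_1(0.0)   = 0.0
 *     clamp_0_inf(clamp_m1_1(-2.5)) = clamp_0_inf(-1.0) = 0.0
 *
 * both agree with clamp_0_1(-2.5) = 0.0, which is why any stack of two or
 * more accepted modifiers (count > 1) collapses to midgard_outmod_clamp_0_1. */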
594
595static void
596mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)
597{
598        nir_alu_src src = instr->src[i];
599
600        if (!is_int) {
601                if (pan_has_source_mod(&src, nir_op_fneg))
602                        *neg = !(*neg);
603
604                if (pan_has_source_mod(&src, nir_op_fabs))
605                        *abs = true;
606        }
607
608        if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
609                *not = true;
610
611        if (roundmode) {
612                if (pan_has_source_mod(&src, nir_op_fround_even))
613                        *roundmode = MIDGARD_RTE;
614
615                if (pan_has_source_mod(&src, nir_op_ftrunc))
616                        *roundmode = MIDGARD_RTZ;
617
618                if (pan_has_source_mod(&src, nir_op_ffloor))
619                        *roundmode = MIDGARD_RTN;
620
621                if (pan_has_source_mod(&src, nir_op_fceil))
622                        *roundmode = MIDGARD_RTP;
623        }
624
625        unsigned bits = nir_src_bit_size(src.src);
626
627        ins->src[to] = nir_src_index(NULL, &src.src);
628        ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;
629
630        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
631                ins->swizzle[to][c] = src.swizzle[
632                        (!bcast_count || c < bcast_count) ? c :
633                                (bcast_count - 1)];
634        }
635}
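
/* Broadcast example: with bcast_count = 3 (the b32all_fequal3 case in
 * emit_alu()), an identity source swizzle .xyzw is copied as .xyzz, i.e.
 * every component at or past the broadcast count re-reads the last checked
 * component, matching the all-vec4 ball/bany hardware ops. */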
636
637/* Midgard features both fcsel and icsel, depending on whether you want int or
638 * float modifiers. NIR's csel is typeless, so we want a heuristic to guess if
639 * we should emit an int or float csel depending on what modifiers could be
640 * placed. In the absence of modifiers, this is probably arbitrary. */
641
642static bool
643mir_is_bcsel_float(nir_alu_instr *instr)
644{
645        nir_op intmods[] = {
646                nir_op_i2i8, nir_op_i2i16,
647                nir_op_i2i32, nir_op_i2i64
648        };
649
650        nir_op floatmods[] = {
651                nir_op_fabs, nir_op_fneg,
652                nir_op_f2f16, nir_op_f2f32,
653                nir_op_f2f64
654        };
655
656        nir_op floatdestmods[] = {
657                nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,
658                nir_op_f2f16, nir_op_f2f32
659        };
660
661        signed score = 0;
662
663        for (unsigned i = 1; i < 3; ++i) {
664                nir_alu_src s = instr->src[i];
665                for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {
666                        if (pan_has_source_mod(&s, intmods[q]))
667                                score--;
668                }
669        }
670
671        for (unsigned i = 1; i < 3; ++i) {
672                nir_alu_src s = instr->src[i];
673                for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {
674                        if (pan_has_source_mod(&s, floatmods[q]))
675                                score++;
676                }
677        }
678
679        for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {
680                nir_dest *dest = &instr->dest.dest;
681                if (pan_has_dest_mod(&dest, floatdestmods[q]))
682                        score++;
683        }
684
685        return (score > 0);
686}
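
/* For instance, b32csel(c, fneg(a), b) scores +1 (one float source mod, no
 * int mods), so emit_alu() emits fcsel and the fneg folds into a source
 * modifier; b32csel(c, i2i32(a), b) scores -1 and stays on icsel. */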
687
688static void
689emit_alu(compiler_context *ctx, nir_alu_instr *instr)
690{
691        nir_dest *dest = &instr->dest.dest;
692
693        if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))
694                return;
695
696        /* Derivatives end up emitted on the texture pipe, not the ALUs. This
697         * is handled elsewhere */
698
699        if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {
700                midgard_emit_derivatives(ctx, instr);
701                return;
702        }
703
704        bool is_ssa = dest->is_ssa;
705
706        unsigned nr_components = nir_dest_num_components(*dest);
707        unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
708        unsigned op = 0;
709
710        /* Number of components valid to check for the instruction (the rest
711         * will be forced to the last), or 0 to use as-is. Relevant as
712         * ball-type instructions have a channel count in NIR but are all vec4
713         * in Midgard */
714
715        unsigned broadcast_swizzle = 0;
716
717        /* Should we swap arguments? */
718        bool flip_src12 = false;
719
720        ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
721        ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest);
722
723        enum midgard_roundmode roundmode = MIDGARD_RTE;
724
725        switch (instr->op) {
726                ALU_CASE(fadd, fadd);
727                ALU_CASE(fmul, fmul);
728                ALU_CASE(fmin, fmin);
729                ALU_CASE(fmax, fmax);
730                ALU_CASE(imin, imin);
731                ALU_CASE(imax, imax);
732                ALU_CASE(umin, umin);
733                ALU_CASE(umax, umax);
734                ALU_CASE(ffloor, ffloor);
735                ALU_CASE(fround_even, froundeven);
736                ALU_CASE(ftrunc, ftrunc);
737                ALU_CASE(fceil, fceil);
738                ALU_CASE(fdot3, fdot3);
739                ALU_CASE(fdot4, fdot4);
740                ALU_CASE(iadd, iadd);
741                ALU_CASE(isub, isub);
742                ALU_CASE(iadd_sat, iaddsat);
743                ALU_CASE(isub_sat, isubsat);
744                ALU_CASE(uadd_sat, uaddsat);
745                ALU_CASE(usub_sat, usubsat);
746                ALU_CASE(imul, imul);
747                ALU_CASE(imul_high, imul);
748                ALU_CASE(umul_high, imul);
749                ALU_CASE(uclz, iclz);
750
751                /* Zero shoved as second-arg */
752                ALU_CASE(iabs, iabsdiff);
753
754                ALU_CASE(uabs_isub, iabsdiff);
755                ALU_CASE(uabs_usub, uabsdiff);
756
757                ALU_CASE(mov, imov);
758
759                ALU_CASE_CMP(feq32, feq);
760                ALU_CASE_CMP(fneu32, fne);
761                ALU_CASE_CMP(flt32, flt);
762                ALU_CASE_CMP(ieq32, ieq);
763                ALU_CASE_CMP(ine32, ine);
764                ALU_CASE_CMP(ilt32, ilt);
765                ALU_CASE_CMP(ult32, ult);
766
767                /* We don't have a native b2f32 instruction. Instead, like many
768                 * GPUs, we exploit booleans as 0/~0 for false/true, and
769                 * correspondingly AND
770                 * by 1.0 to do the type conversion. For the moment, prime us
771                 * to emit:
772                 *
773                 * iand [whatever], #0
774                 *
775                 * At the end of emit_alu (as MIR), we'll fix-up the constant
776                 */
777
778                ALU_CASE_CMP(b2f32, iand);
779                ALU_CASE_CMP(b2f16, iand);
780                ALU_CASE_CMP(b2i32, iand);
781
782                /* Likewise, we don't have a dedicated f2b32 instruction, but
783                 * we can do a "not equal to 0.0" test. */
784
785                ALU_CASE_CMP(f2b32, fne);
786                ALU_CASE_CMP(i2b32, ine);
787
788                ALU_CASE(frcp, frcp);
789                ALU_CASE(frsq, frsqrt);
790                ALU_CASE(fsqrt, fsqrt);
791                ALU_CASE(fexp2, fexp2);
792                ALU_CASE(flog2, flog2);
793
794                ALU_CASE_RTZ(f2i64, f2i_rte);
795                ALU_CASE_RTZ(f2u64, f2u_rte);
796                ALU_CASE_RTZ(i2f64, i2f_rte);
797                ALU_CASE_RTZ(u2f64, u2f_rte);
798
799                ALU_CASE_RTZ(f2i32, f2i_rte);
800                ALU_CASE_RTZ(f2u32, f2u_rte);
801                ALU_CASE_RTZ(i2f32, i2f_rte);
802                ALU_CASE_RTZ(u2f32, u2f_rte);
803
804                ALU_CASE_RTZ(f2i8, f2i_rte);
805                ALU_CASE_RTZ(f2u8, f2u_rte);
806
807                ALU_CASE_RTZ(f2i16, f2i_rte);
808                ALU_CASE_RTZ(f2u16, f2u_rte);
809                ALU_CASE_RTZ(i2f16, i2f_rte);
810                ALU_CASE_RTZ(u2f16, u2f_rte);
811
812                ALU_CASE(fsin, fsinpi);
813                ALU_CASE(fcos, fcospi);
814
815                /* We'll get 0 in the second arg, so:
816                 * ~a = ~(a | 0) = nor(a, 0) */
817                ALU_CASE(inot, inor);
818                ALU_CASE(iand, iand);
819                ALU_CASE(ior, ior);
820                ALU_CASE(ixor, ixor);
821                ALU_CASE(ishl, ishl);
822                ALU_CASE(ishr, iasr);
823                ALU_CASE(ushr, ilsr);
824
825                ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
826                ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
827                ALU_CASE_CMP(b32all_fequal4, fball_eq);
828
829                ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
830                ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
831                ALU_CASE_CMP(b32any_fnequal4, fbany_neq);
832
833                ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
834                ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
835                ALU_CASE_CMP(b32all_iequal4, iball_eq);
836
837                ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
838                ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
839                ALU_CASE_CMP(b32any_inequal4, ibany_neq);
840
841                /* Source mods will be shoved in later */
842                ALU_CASE(fabs, fmov);
843                ALU_CASE(fneg, fmov);
844                ALU_CASE(fsat, fmov);
845                ALU_CASE(fsat_signed_mali, fmov);
846                ALU_CASE(fclamp_pos_mali, fmov);
847
848        /* For size conversion, we use a move. Ideally though we would squash
849         * these ops together; maybe that has to happen after in NIR as part of
850         * propagation...? An earlier algebraic pass ensured we step down by
851         * only / exactly one size. If stepping down, we use a dest override to
852         * reduce the size; if stepping up, we use a larger-sized move with a
853         * half source and a sign/zero-extension modifier */
854
855        case nir_op_i2i8:
856        case nir_op_i2i16:
857        case nir_op_i2i32:
858        case nir_op_i2i64:
859        case nir_op_u2u8:
860        case nir_op_u2u16:
861        case nir_op_u2u32:
862        case nir_op_u2u64:
863        case nir_op_f2f16:
864        case nir_op_f2f32:
865        case nir_op_f2f64: {
866                if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 ||
867                    instr->op == nir_op_f2f64)
868                        op = midgard_alu_op_fmov;
869                else
870                        op = midgard_alu_op_imov;
871
872                break;
873        }
874
875        /* For greater-or-equal, we lower to less-or-equal and flip the
876         * arguments */
877
878        case nir_op_fge:
879        case nir_op_fge32:
880        case nir_op_ige32:
881        case nir_op_uge32: {
882                op =
883                        instr->op == nir_op_fge   ? midgard_alu_op_fle :
884                        instr->op == nir_op_fge32 ? midgard_alu_op_fle :
885                        instr->op == nir_op_ige32 ? midgard_alu_op_ile :
886                        instr->op == nir_op_uge32 ? midgard_alu_op_ule :
887                        0;
888
889                flip_src12 = true;
890                ALU_CHECK_CMP();
891                break;
892        }
893
894        case nir_op_b32csel: {
895                bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
896                bool is_float = mir_is_bcsel_float(instr);
897                op = is_float ?
898                        (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) :
899                        (mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel);
900
901                break;
902        }
903
904        case nir_op_unpack_32_2x16:
905        case nir_op_unpack_32_4x8:
906        case nir_op_pack_32_2x16:
907        case nir_op_pack_32_4x8: {
908                op = midgard_alu_op_imov;
909                break;
910        }
911
912        default:
913                DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
914                assert(0);
915                return;
916        }
917
918        /* Promote imov to fmov if it might help inline a constant */
919        if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src)
920                        && nir_src_bit_size(instr->src[0].src) == 32
921                        && nir_is_same_comp_swizzle(instr->src[0].swizzle,
922                                nir_src_num_components(instr->src[0].src))) {
923                op = midgard_alu_op_fmov;
924        }
925
926        /* Midgard can perform certain modifiers on output of an ALU op */
927
928        unsigned outmod = 0;
929        bool is_int = midgard_is_integer_op(op);
930
931        if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) {
932                outmod = midgard_outmod_keephi;
933        } else if (midgard_is_integer_out_op(op)) {
934                outmod = midgard_outmod_keeplo;
935        } else if (instr->op == nir_op_fsat) {
936                outmod = midgard_outmod_clamp_0_1;
937        } else if (instr->op == nir_op_fsat_signed_mali) {
938                outmod = midgard_outmod_clamp_m1_1;
939        } else if (instr->op == nir_op_fclamp_pos_mali) {
940                outmod = midgard_outmod_clamp_0_inf;
941        }
942
943        /* Fetch unit, quirks, etc information */
944        unsigned opcode_props = alu_opcode_props[op].props;
945        bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
946
947        if (!midgard_is_integer_out_op(op)) {
948                outmod = mir_determine_float_outmod(ctx, &dest, outmod);
949        }
950
951        midgard_instruction ins = {
952                .type = TAG_ALU_4,
953                .dest = nir_dest_index(dest),
954                .dest_type = nir_op_infos[instr->op].output_type
955                        | nir_dest_bit_size(*dest),
956                .roundmode = roundmode,
957        };
958
959        enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ?
960                &ins.roundmode : NULL;
961
962        for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i)
963                ins.src[i] = ~0;
964
965        if (quirk_flipped_r24) {
966                ins.src[0] = ~0;
967                mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle);
968        } else {
969                for (unsigned i = 0; i < nr_inputs; ++i) {
970                        unsigned to = i;
971
972                        if (instr->op == nir_op_b32csel) {
973                                /* The condition is the first argument; move
974                                 * the other arguments up one to be a binary
975                                 * instruction for Midgard with the condition
976                                 * last */
977
978                                if (i == 0)
979                                        to = 2;
980                                else if (flip_src12)
981                                        to = 2 - i;
982                                else
983                                        to = i - 1;
984                        } else if (flip_src12) {
985                                to = 1 - to;
986                        }
987
988                        mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle);
989
990                        /* (!c) ? a : b = c ? b : a */
991                        if (instr->op == nir_op_b32csel && ins.src_invert[2]) {
992                                ins.src_invert[2] = false;
993                                flip_src12 ^= true;
994                        }
995                }
996        }
997
998        if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
999                /* Lowered to move */
1000                if (instr->op == nir_op_fneg)
1001                        ins.src_neg[1] ^= true;
1002
1003                if (instr->op == nir_op_fabs)
1004                        ins.src_abs[1] = true;
1005        }
1006
1007        ins.mask = mask_of(nr_components);
1008
1009        /* Apply writemask if non-SSA, keeping in mind that we can't write to
1010         * components that don't exist. Note modifier => SSA => !reg => no
1011         * writemask, so we don't have to worry about writemasks here. */
1012
1013        if (!is_ssa)
1014                ins.mask &= instr->dest.write_mask;
1015
1016        ins.op = op;
1017        ins.outmod = outmod;
1018
1019        /* Late fixup for emulated instructions */
1020
1021        if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
1022                /* Presently, our second argument is an inline #0 constant.
1023                 * Switch over to an embedded 1.0 constant (that can't fit
1024                 * inline, since we're 32-bit, not 16-bit like the inline
1025                 * constants) */
1026
1027                ins.has_inline_constant = false;
1028                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1029                ins.src_types[1] = nir_type_float32;
1030                ins.has_constants = true;
1031
1032                if (instr->op == nir_op_b2f32)
1033                        ins.constants.f32[0] = 1.0f;
1034                else
1035                        ins.constants.i32[0] = 1;
1036
1037                for (unsigned c = 0; c < 16; ++c)
1038                        ins.swizzle[1][c] = 0;
1039        } else if (instr->op == nir_op_b2f16) {
1040                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1041                ins.src_types[1] = nir_type_float16;
1042                ins.has_constants = true;
1043                ins.constants.i16[0] = _mesa_float_to_half(1.0);
1044
1045                for (unsigned c = 0; c < 16; ++c)
1046                        ins.swizzle[1][c] = 0;
1047        } else if (nr_inputs == 1 && !quirk_flipped_r24) {
1048                /* Lots of instructions need a 0 plonked in */
1049                ins.has_inline_constant = false;
1050                ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
1051                ins.src_types[1] = ins.src_types[0];
1052                ins.has_constants = true;
1053                ins.constants.u32[0] = 0;
1054
1055                for (unsigned c = 0; c < 16; ++c)
1056                        ins.swizzle[1][c] = 0;
1057        } else if (instr->op == nir_op_pack_32_2x16) {
1058                ins.dest_type = nir_type_uint16;
1059                ins.mask = mask_of(nr_components * 2);
1060                ins.is_pack = true;
1061        } else if (instr->op == nir_op_pack_32_4x8) {
1062                ins.dest_type = nir_type_uint8;
1063                ins.mask = mask_of(nr_components * 4);
1064                ins.is_pack = true;
1065        } else if (instr->op == nir_op_unpack_32_2x16) {
1066                ins.dest_type = nir_type_uint32;
1067                ins.mask = mask_of(nr_components >> 1);
1068                ins.is_pack = true;
1069        } else if (instr->op == nir_op_unpack_32_4x8) {
1070                ins.dest_type = nir_type_uint32;
1071                ins.mask = mask_of(nr_components >> 2);
1072                ins.is_pack = true;
1073        }
1074
1075        if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
1076                /* To avoid duplicating the lookup tables (probably), true LUT
1077                 * instructions can only operate as if they were scalars. Lower
1078                 * them here by changing the component. */
1079
1080                unsigned orig_mask = ins.mask;
1081
1082                unsigned swizzle_back[MIR_VEC_COMPONENTS];
1083                memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back));
1084
1085                midgard_instruction ins_split[MIR_VEC_COMPONENTS];
1086                unsigned ins_count = 0;
1087
1088                for (int i = 0; i < nr_components; ++i) {
1089                        /* Mask the associated component, dropping the
1090                         * instruction if needed */
1091
1092                        ins.mask = 1 << i;
1093                        ins.mask &= orig_mask;
1094
1095                        for (unsigned j = 0; j < ins_count; ++j) {
1096                                if (swizzle_back[i] == ins_split[j].swizzle[0][0]) {
1097                                        ins_split[j].mask |= ins.mask;
1098                                        ins.mask = 0;
1099                                        break;
1100                                }
1101                        }
1102
1103                        if (!ins.mask)
1104                                continue;
1105
1106                        for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j)
1107                                ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */
1108
1109                        ins_split[ins_count] = ins;
1110
1111                        ++ins_count;
1112                }
1113
1114                for (unsigned i = 0; i < ins_count; ++i) {
1115                        emit_mir_instruction(ctx, ins_split[i]);
1116                }
1117        } else {
1118                emit_mir_instruction(ctx, ins);
1119        }
1120}
1121
1122#undef ALU_CASE
1123
1124static void
1125mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
1126{
1127        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1128        unsigned nir_mask = 0;
1129        unsigned dsize = 0;
1130
1131        if (is_read) {
1132                nir_mask = mask_of(nir_intrinsic_dest_components(intr));
1133                dsize = nir_dest_bit_size(intr->dest);
1134        } else {
1135                nir_mask = nir_intrinsic_write_mask(intr);
1136                dsize = 32;
1137        }
1138
1139        /* Once we have the NIR mask, we need to normalize to work in 32-bit space */
1140        unsigned bytemask = pan_to_bytemask(dsize, nir_mask);
1141        ins->dest_type = nir_type_uint | dsize;
1142        mir_set_bytemask(ins, bytemask);
1143}
1144
1145/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
1146 * optimized) versions of UBO #0 */
1147
1148static midgard_instruction *
1149emit_ubo_read(
1150        compiler_context *ctx,
1151        nir_instr *instr,
1152        unsigned dest,
1153        unsigned offset,
1154        nir_src *indirect_offset,
1155        unsigned indirect_shift,
1156        unsigned index,
1157        unsigned nr_comps)
1158{
1159        midgard_instruction ins;
1160
1161        unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ?
1162                nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32;
1163
1164        unsigned bitsize = dest_size * nr_comps;
1165
1166        /* Pick the smallest intrinsic to avoid out-of-bounds reads */
1167        if (bitsize <= 32)
1168                ins = m_ld_ubo_32(dest, 0);
1169        else if (bitsize <= 64)
1170                ins = m_ld_ubo_64(dest, 0);
1171        else if (bitsize <= 128)
1172                ins = m_ld_ubo_128(dest, 0);
1173        else
1174                unreachable("Invalid UBO read size");
1175
1176        ins.constants.u32[0] = offset;
1177
1178        if (instr->type == nir_instr_type_intrinsic)
1179                mir_set_intr_mask(instr, &ins, true);
1180
1181        if (indirect_offset) {
1182                ins.src[2] = nir_src_index(ctx, indirect_offset);
1183                ins.src_types[2] = nir_type_uint32;
1184                ins.load_store.index_shift = indirect_shift;
1185
1186                /* X component for the whole swizzle to prevent register
1187                 * pressure from ballooning from the extra components */
1188                for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i)
1189                        ins.swizzle[2][i] = 0;
1190        } else {
1191                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1192        }
1193
1194        if (indirect_offset && indirect_offset->is_ssa && !indirect_shift)
1195                mir_set_ubo_offset(&ins, indirect_offset, offset);
1196
1197        midgard_pack_ubo_index_imm(&ins.load_store, index);
1198
1199        return emit_mir_instruction(ctx, ins);
1200}
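
/* E.g. emit_sysval_read() below funnels driver sysvals through this same
 * helper: a direct read passes indirect_offset = NULL, taking the
 * REGISTER_LDST_ZERO path above, with the byte offset carried in
 * ins.constants.u32[0] and the UBO index packed by
 * midgard_pack_ubo_index_imm(). */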
1201
1202/* Globals are like UBOs if you squint. And shared memory is like globals if
1203 * you squint even harder */
1204
1205static void
1206emit_global(
1207        compiler_context *ctx,
1208        nir_instr *instr,
1209        bool is_read,
1210        unsigned srcdest,
1211        nir_src *offset,
1212        unsigned seg)
1213{
1214        midgard_instruction ins;
1215
1216        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1217        if (is_read) {
1218                unsigned bitsize = nir_dest_bit_size(intr->dest) *
1219                        nir_dest_num_components(intr->dest);
1220
1221                if (bitsize <= 32)
1222                        ins = m_ld_32(srcdest, 0);
1223                else if (bitsize <= 64)
1224                        ins = m_ld_64(srcdest, 0);
1225                else if (bitsize <= 128)
1226                        ins = m_ld_128(srcdest, 0);
1227                else
1228                        unreachable("Invalid global read size");
1229        } else {
1230                unsigned bitsize = nir_src_bit_size(intr->src[0]) *
1231                        nir_src_num_components(intr->src[0]);
1232
1233                if (bitsize <= 32)
1234                        ins = m_st_32(srcdest, 0);
1235                else if (bitsize <= 64)
1236                        ins = m_st_64(srcdest, 0);
1237                else if (bitsize <= 128)
1238                        ins = m_st_128(srcdest, 0);
1239                else
1240                        unreachable("Invalid global store size");
1241        }
1242
1243        mir_set_offset(ctx, &ins, offset, seg);
1244        mir_set_intr_mask(instr, &ins, is_read);
1245
1246        /* Set a valid swizzle for masked out components */
1247        assert(ins.mask);
1248        unsigned first_component = __builtin_ffs(ins.mask) - 1;
1249
1250        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) {
1251                if (!(ins.mask & (1 << i)))
1252                        ins.swizzle[0][i] = first_component;
1253        }
1254
1255        emit_mir_instruction(ctx, ins);
1256}
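
/* Swizzle fixup example: if the mask works out to 0b0110 (only components y
 * and z live), first_component is 1, so lanes x and w are redirected to read
 * component y as well; this keeps the swizzle pointing at live components
 * without changing the mask itself. */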
1257
1258/* If is_shared is off, the only other possible value is globals, since
1259 * SSBOs are lowered to globals through a NIR pass.
1260 * `image_direct_address` should be ~0 when instr is not an image_atomic
1261 * and the destination register of a lea_image op when it is an image_atomic. */
1262static void
1263emit_atomic(
1264        compiler_context *ctx,
1265        nir_intrinsic_instr *instr,
1266        bool is_shared,
1267        midgard_load_store_op op,
1268        unsigned image_direct_address)
1269{
1270        nir_alu_type type =
1271                (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ?
1272                nir_type_int : nir_type_uint;
1273
1274        bool is_image = image_direct_address != ~0;
1275
1276        unsigned dest = nir_dest_index(&instr->dest);
1277        unsigned val_src = is_image ? 3 : 1;
1278        unsigned val = nir_src_index(ctx, &instr->src[val_src]);
1279        unsigned bitsize = nir_src_bit_size(instr->src[val_src]);
1280        emit_explicit_constant(ctx, val, val);
1281
1282        midgard_instruction ins = {
1283                .type = TAG_LOAD_STORE_4,
1284                .mask = 0xF,
1285                .dest = dest,
1286                .src = { ~0, ~0, ~0, val },
1287                .src_types = { 0, 0, 0, type | bitsize },
1288                .op = op
1289        };
1290
1291        nir_src *src_offset = nir_get_io_offset_src(instr);
1292
1293        if (op == midgard_op_atomic_cmpxchg) {
1294                unsigned xchg_val_src = is_image ? 4 : 2;
1295                unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]);
1296                emit_explicit_constant(ctx, xchg_val, xchg_val);
1297
1298                ins.src[2] = val;
1299                ins.src_types[2] = type | bitsize;
1300                ins.src[3] = xchg_val;
1301
1302                if (is_shared) {
1303                        ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR;
1304                        ins.load_store.arg_comp = COMPONENT_Z;
1305                        ins.load_store.bitsize_toggle = true;
1306                } else {
1307                        for(unsigned i = 0; i < 2; ++i)
1308                                ins.swizzle[1][i] = i;
1309
1310                        ins.src[1] = is_image ? image_direct_address :
1311                                                nir_src_index(ctx, src_offset);
1312                        ins.src_types[1] = nir_type_uint64;
1313                }
1314        } else if (is_image) {
1315                for(unsigned i = 0; i < 2; ++i)
1316                        ins.swizzle[2][i] = i;
1317
1318                ins.src[2] = image_direct_address;
1319                ins.src_types[2] = nir_type_uint64;
1320
1321                ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1322                ins.load_store.bitsize_toggle = true;
1323                ins.load_store.index_format = midgard_index_address_u64;
1324        } else
1325                mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL);
1326
1327        mir_set_intr_mask(&instr->instr, &ins, true);
1328
1329        emit_mir_instruction(ctx, ins);
1330}
1331
1332static void
1333emit_varying_read(
1334        compiler_context *ctx,
1335        unsigned dest, unsigned offset,
1336        unsigned nr_comp, unsigned component,
1337        nir_src *indirect_offset, nir_alu_type type, bool flat)
1338{
1339        /* XXX: Half-floats? */
1340        /* TODO: swizzle, mask */
1341
1342        midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1343        ins.mask = mask_of(nr_comp);
1344        ins.dest_type = type;
1345
1346        if (type == nir_type_float16) {
1347                /* Ensure we are aligned so we can pack it later */
1348                ins.mask = mask_of(ALIGN_POT(nr_comp, 2));
1349        }
1350
1351        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)
1352                ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);
1353
1354
1355        midgard_varying_params p = {
1356                .flat_shading = flat,
1357                .perspective_correction = 1,
1358                .interpolate_sample = true,
1359        };
1360        midgard_pack_varying_params(&ins.load_store, p);
1361
1362        if (indirect_offset) {
1363                ins.src[2] = nir_src_index(ctx, indirect_offset);
1364                ins.src_types[2] = nir_type_uint32;
1365        } else
1366                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1367
1368        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1369        ins.load_store.index_format = midgard_index_address_u32;
1370
1371        /* Use the type appropriate load */
1372        switch (type) {
1373        case nir_type_uint32:
1374        case nir_type_bool32:
1375                ins.op = midgard_op_ld_vary_32u;
1376                break;
1377        case nir_type_int32:
1378                ins.op = midgard_op_ld_vary_32i;
1379                break;
1380        case nir_type_float32:
1381                ins.op = midgard_op_ld_vary_32;
1382                break;
1383        case nir_type_float16:
1384                ins.op = midgard_op_ld_vary_16;
1385                break;
1386        default:
1387                unreachable("Attempted to load unknown type");
1388                break;
1389        }
1390
1391        emit_mir_instruction(ctx, ins);
1392}
1393
1394
1395/* If `is_atomic` is true, we emit a `lea_image` since Midgard doesn't have special
1396 * image_atomic opcodes. The caller can then use that address to emit a normal atomic opcode. */
1397static midgard_instruction
1398emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)
1399{
1400        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1401        unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?
1402                util_bitcount64(ctx->nir->info.inputs_read) : 0;
1403        unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1404        bool is_array = nir_intrinsic_image_array(instr);
1405        bool is_store = instr->intrinsic == nir_intrinsic_image_store;
1406
1407        /* TODO: MSAA */
1408        assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1409
1410        unsigned coord_reg = nir_src_index(ctx, &instr->src[1]);
1411        emit_explicit_constant(ctx, coord_reg, coord_reg);
1412
1413        nir_src *index = &instr->src[0];
1414        bool is_direct = nir_src_is_const(*index);
1415
1416        /* For image opcodes, address is used as an index into the attribute descriptor */
1417        unsigned address = nr_attr;
1418        if (is_direct)
1419                address += nir_src_as_uint(*index);
1420
1421        midgard_instruction ins;
1422        if (is_store) { /* emit st_image_* */
1423                unsigned val = nir_src_index(ctx, &instr->src[3]);
1424                emit_explicit_constant(ctx, val, val);
1425
1426                nir_alu_type type = nir_intrinsic_src_type(instr);
1427                ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address));
1428                nir_alu_type base_type = nir_alu_type_get_base_type(type);
1429                ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]);
1430        } else if (is_atomic) { /* emit lea_image */
1431                unsigned dest = make_compiler_temp_reg(ctx);
1432                ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address));
1433                ins.mask = mask_of(2); /* 64-bit memory address */
1434        } else { /* emit ld_image_* */
1435                nir_alu_type type = nir_intrinsic_dest_type(instr);
1436                ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address));
1437                ins.mask = mask_of(nir_intrinsic_dest_components(instr));
1438                ins.dest_type = type;
1439        }
1440
1441        /* Coord reg */
1442        ins.src[1] = coord_reg;
1443        ins.src_types[1] = nir_type_uint16;
1444        if (nr_dim == 3 || is_array) {
1445                ins.load_store.bitsize_toggle = true;
1446        }
1447
1448        /* Image index reg */
1449        if (!is_direct) {
1450                ins.src[2] = nir_src_index(ctx, index);
1451                ins.src_types[2] = nir_type_uint32;
1452        } else
1453                ins.load_store.index_reg = REGISTER_LDST_ZERO;
1454
1455        emit_mir_instruction(ctx, ins);
1456
1457        return ins;
1458}
1459
1460static void
1461emit_attr_read(
1462        compiler_context *ctx,
1463        unsigned dest, unsigned offset,
1464        unsigned nr_comp, nir_alu_type t)
1465{
1466        midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset));
1467        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
1468        ins.load_store.index_reg = REGISTER_LDST_ZERO;
1469        ins.mask = mask_of(nr_comp);
1470
1471        /* Use the type appropriate load */
1472        switch (t) {
1473        case nir_type_uint:
1474        case nir_type_bool:
1475                ins.op = midgard_op_ld_attr_32u;
1476                break;
1477        case nir_type_int:
1478                ins.op = midgard_op_ld_attr_32i;
1479                break;
1480        case nir_type_float:
1481                ins.op = midgard_op_ld_attr_32;
1482                break;
1483        default:
1484                unreachable("Attempted to load unknown type");
1485                break;
1486        }
1487
1488        emit_mir_instruction(ctx, ins);
1489}
1490
1491static void
1492emit_sysval_read(compiler_context *ctx, nir_instr *instr,
1493                unsigned nr_components, unsigned offset)
1494{
1495        nir_dest nir_dest;
1496
1497        /* Figure out which uniform this is */
1498        unsigned sysval_ubo =
1499                MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos);
1500        int sysval = panfrost_sysval_for_instr(instr, &nir_dest);
1501        unsigned dest = nir_dest_index(&nir_dest);
1502        unsigned uniform =
1503                pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval);
1504
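        /* Sysvals appear to be packed as vec4 slots, hence the 16-byte stride
         * (uniform * 16) plus a small component offset below. */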
1505        /* Emit the read itself -- this is never indirect */
1506        midgard_instruction *ins =
1507                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
1508                              sysval_ubo, nr_components);
1509
1510        ins->mask = mask_of(nr_components);
1511}
1512
1513static unsigned
1514compute_builtin_arg(nir_intrinsic_op op)
1515{
1516        switch (op) {
1517        case nir_intrinsic_load_workgroup_id:
1518                return REGISTER_LDST_GROUP_ID;
1519        case nir_intrinsic_load_local_invocation_id:
1520                return REGISTER_LDST_LOCAL_THREAD_ID;
1521        case nir_intrinsic_load_global_invocation_id:
1522        case nir_intrinsic_load_global_invocation_id_zero_base:
1523                return REGISTER_LDST_GLOBAL_THREAD_ID;
1524        default:
1525                unreachable("Invalid compute parameter loaded");
1526        }
1527}
1528
1529static void
1530emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,
1531                    enum midgard_rt_id rt, unsigned sample_iter)
1532{
1533        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
1534        assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));
1535
1536        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
1537
1538        assert(!br);
1539
1540        emit_explicit_constant(ctx, src, src);
1541
1542        struct midgard_instruction ins =
1543                v_branch(false, false);
1544
1545        bool depth_only = (rt == MIDGARD_ZS_RT);
1546
1547        ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;
1548
1549        /* Add dependencies */
1550        ins.src[0] = src;
1551        ins.src_types[0] = nir_type_uint32;
1552
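        /* The branch constant packs the colour RT index into the upper byte and
         * the sample iteration into the low byte; 0xFF is used for the
         * depth/stencil-only case. */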
1553        if (depth_only)
1554                ins.constants.u32[0] = 0xFF;
1555        else
1556                ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;
1557
1558        for (int i = 0; i < 4; ++i)
1559                ins.swizzle[0][i] = i;
1560
1561        if (~src_z) {
1562                emit_explicit_constant(ctx, src_z, src_z);
1563                ins.src[2] = src_z;
1564                ins.src_types[2] = nir_type_uint32;
1565                ins.writeout |= PAN_WRITEOUT_Z;
1566        }
1567        if (~src_s) {
1568                emit_explicit_constant(ctx, src_s, src_s);
1569                ins.src[3] = src_s;
1570                ins.src_types[3] = nir_type_uint32;
1571                ins.writeout |= PAN_WRITEOUT_S;
1572        }
1573
1574        /* Emit the branch */
1575        br = emit_mir_instruction(ctx, ins);
1576        schedule_barrier(ctx);
1577        ctx->writeout_branch[rt][sample_iter] = br;
1578
1579        /* Push our current location = current block count - 1 = where we'll
1580         * jump to. Maybe a bit too clever for my own good */
1581
1582        br->branch.target_block = ctx->block_count - 1;
1583}
1584
1585static void
1586emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1587{
1588        unsigned reg = nir_dest_index(&instr->dest);
1589        midgard_instruction ins = m_ldst_mov(reg, 0);
1590        ins.mask = mask_of(3);
1591        ins.swizzle[0][3] = COMPONENT_X; /* xyzx */
1592        ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic);
1593        emit_mir_instruction(ctx, ins);
1594}
1595
1596static unsigned
1597vertex_builtin_arg(nir_intrinsic_op op)
1598{
1599        switch (op) {
1600        case nir_intrinsic_load_vertex_id_zero_base:
1601                return PAN_VERTEX_ID;
1602        case nir_intrinsic_load_instance_id:
1603                return PAN_INSTANCE_ID;
1604        default:
1605                unreachable("Invalid vertex builtin");
1606        }
1607}
1608
1609static void
1610emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr)
1611{
1612        unsigned reg = nir_dest_index(&instr->dest);
1613        emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int);
1614}
1615
1616static void
1617emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx)
1618{
1619        unsigned reg = nir_dest_index(&instr->dest);
1620
1621        midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1622        ld.op = midgard_op_ld_special_32u;
1623        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx);
1624        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1625
1626        for (int i = 0; i < 4; ++i)
1627                ld.swizzle[0][i] = COMPONENT_X;
1628
1629        emit_mir_instruction(ctx, ld);
1630}
1631
1632static void
1633emit_control_barrier(compiler_context *ctx)
1634{
1635        midgard_instruction ins = {
1636                .type = TAG_TEXTURE_4,
1637                .dest = ~0,
1638                .src = { ~0, ~0, ~0, ~0 },
1639                .op = midgard_tex_op_barrier,
1640        };
1641
1642        emit_mir_instruction(ctx, ins);
1643}
1644
1645static unsigned
1646mir_get_branch_cond(nir_src *src, bool *invert)
1647{
1648        /* Wrap it. No swizzle since it's a scalar */
1649
1650        nir_alu_src alu = {
1651                .src = *src
1652        };
1653
1654        *invert = pan_has_source_mod(&alu, nir_op_inot);
1655        return nir_src_index(NULL, &alu.src);
1656}
1657
1658static uint8_t
1659output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr)
1660{
1661        if (ctx->inputs->is_blend)
1662                return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt;
1663
1664        const nir_variable *var;
1665        var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr));
1666        assert(var);
1667
1668        unsigned loc = var->data.location;
1669
1670        if (loc >= FRAG_RESULT_DATA0)
1671                return loc - FRAG_RESULT_DATA0;
1672
1673        if (loc == FRAG_RESULT_DEPTH)
1674                return 0x1F;
1675        if (loc == FRAG_RESULT_STENCIL)
1676                return 0x1E;
1677
1678        unreachable("Invalid RT to load from");
1679}
1680
1681static void
1682emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
1683{
1684        unsigned offset = 0, reg;
1685
1686        switch (instr->intrinsic) {
1687        case nir_intrinsic_discard_if:
1688        case nir_intrinsic_discard: {
1689                bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
1690                struct midgard_instruction discard = v_branch(conditional, false);
1691                discard.branch.target_type = TARGET_DISCARD;
1692
1693                if (conditional) {
1694                        discard.src[0] = mir_get_branch_cond(&instr->src[0],
1695                                        &discard.branch.invert_conditional);
1696                        discard.src_types[0] = nir_type_uint32;
1697                }
1698
1699                emit_mir_instruction(ctx, discard);
1700                schedule_barrier(ctx);
1701
1702                break;
1703        }
1704
1705        case nir_intrinsic_image_load:
1706        case nir_intrinsic_image_store:
1707                emit_image_op(ctx, instr, false);
1708                break;
1709
1710        case nir_intrinsic_image_size: {
1711                unsigned nr_comp = nir_intrinsic_dest_components(instr);
1712                emit_sysval_read(ctx, &instr->instr, nr_comp, 0);
1713                break;
1714        }
1715
1716        case nir_intrinsic_load_ubo:
1717        case nir_intrinsic_load_global:
1718        case nir_intrinsic_load_global_constant:
1719        case nir_intrinsic_load_shared:
1720        case nir_intrinsic_load_scratch:
1721        case nir_intrinsic_load_input:
1722        case nir_intrinsic_load_kernel_input:
1723        case nir_intrinsic_load_interpolated_input: {
1724                bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
1725                bool is_global = instr->intrinsic == nir_intrinsic_load_global ||
1726                        instr->intrinsic == nir_intrinsic_load_global_constant;
1727                bool is_shared = instr->intrinsic == nir_intrinsic_load_shared;
1728                bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch;
1729                bool is_flat = instr->intrinsic == nir_intrinsic_load_input;
1730                bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input;
1731                bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input;
1732
1733                /* Get the base type of the intrinsic */
1734                /* TODO: Infer type? Does it matter? */
1735                nir_alu_type t =
1736                        (is_interp) ? nir_type_float :
1737                        (is_flat) ? nir_intrinsic_dest_type(instr) :
1738                        nir_type_uint;
1739
1740                t = nir_alu_type_get_base_type(t);
1741
1742                if (!(is_ubo || is_global || is_scratch)) {
1743                        offset = nir_intrinsic_base(instr);
1744                }
1745
1746                unsigned nr_comp = nir_intrinsic_dest_components(instr);
1747
1748                nir_src *src_offset = nir_get_io_offset_src(instr);
1749
1750                bool direct = nir_src_is_const(*src_offset);
1751                nir_src *indirect_offset = direct ? NULL : src_offset;
1752
1753                if (direct)
1754                        offset += nir_src_as_uint(*src_offset);
1755
1756                /* We may need to apply a fractional offset */
1757                int component = (is_flat || is_interp) ?
1758                                nir_intrinsic_component(instr) : 0;
1759                reg = nir_dest_index(&instr->dest);
1760
1761                if (is_kernel) {
1762                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp);
1763                } else if (is_ubo) {
1764                        nir_src index = instr->src[0];
1765
1766                        /* TODO: Is indirect block number possible? */
1767                        assert(nir_src_is_const(index));
1768
1769                        uint32_t uindex = nir_src_as_uint(index);
1770                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp);
1771                } else if (is_global || is_shared || is_scratch) {
1772                        unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
1773                        emit_global(ctx, &instr->instr, true, reg, src_offset, seg);
1774                } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) {
1775                        emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat);
1776                } else if (ctx->inputs->is_blend) {
1777                        /* ctx->blend_input will be precoloured to r0/r2, where
1778                         * the input is preloaded */
1779
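                        /* A nonzero base offset selects the second (dual-source)
                         * blend input rather than the primary colour. */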
1780                        unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input;
1781
1782                        if (*input == ~0)
1783                                *input = reg;
1784                        else
1785                                emit_mir_instruction(ctx, v_mov(*input, reg));
1786                } else if (ctx->stage == MESA_SHADER_VERTEX) {
1787                        emit_attr_read(ctx, reg, offset, nr_comp, t);
1788                } else {
1789                        DBG("Unknown load\n");
1790                        assert(0);
1791                }
1792
1793                break;
1794        }
1795
1796        /* Handled together with load_interpolated_input */
1797        case nir_intrinsic_load_barycentric_pixel:
1798        case nir_intrinsic_load_barycentric_centroid:
1799        case nir_intrinsic_load_barycentric_sample:
1800                break;
1801
1802        /* Reads 128-bit value raw off the tilebuffer during blending, tasty */
1803
1804        case nir_intrinsic_load_raw_output_pan: {
1805                reg = nir_dest_index(&instr->dest);
1806
1807                /* T720 and below use different blend opcodes with slightly
1808                 * different semantics than T760 and up */
1809
1810                midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0);
1811
1812                unsigned target = output_load_rt_addr(ctx, instr);
1813                ld.load_store.index_comp = target & 0x3;
1814                ld.load_store.index_reg = target >> 2;
1815
1816                if (nir_src_is_const(instr->src[0])) {
1817                        unsigned sample = nir_src_as_uint(instr->src[0]);
1818                        ld.load_store.arg_comp = sample & 0x3;
1819                        ld.load_store.arg_reg = sample >> 2;
1820                } else {
1821                        /* Enable sample index via register. */
1822                        ld.load_store.signed_offset |= 1;
1823                        ld.src[1] = nir_src_index(ctx, &instr->src[0]);
1824                        ld.src_types[1] = nir_type_int32;
1825                }
1826
1827                if (ctx->quirks & MIDGARD_OLD_BLEND) {
1828                        ld.op = midgard_op_ld_special_32u;
1829                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16);
1830                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1831                }
1832
1833                emit_mir_instruction(ctx, ld);
1834                break;
1835        }
1836
1837        case nir_intrinsic_load_output: {
1838                reg = nir_dest_index(&instr->dest);
1839
1840                unsigned bits = nir_dest_bit_size(instr->dest);
1841
1842                midgard_instruction ld;
1843                if (bits == 16)
1844                        ld = m_ld_tilebuffer_16f(reg, 0);
1845                else
1846                        ld = m_ld_tilebuffer_32f(reg, 0);
1847
1848                unsigned index = output_load_rt_addr(ctx, instr);
1849                ld.load_store.index_comp = index & 0x3;
1850                ld.load_store.index_reg = index >> 2;
1851
1852                for (unsigned c = 4; c < 16; ++c)
1853                        ld.swizzle[0][c] = 0;
1854
1855                if (ctx->quirks & MIDGARD_OLD_BLEND) {
1856                        if (bits == 16)
1857                                ld.op = midgard_op_ld_special_16f;
1858                        else
1859                                ld.op = midgard_op_ld_special_32f;
1860                        ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1);
1861                        ld.load_store.index_reg = REGISTER_LDST_ZERO;
1862                }
1863
1864                emit_mir_instruction(ctx, ld);
1865                break;
1866        }
1867
1868        case nir_intrinsic_store_output:
1869        case nir_intrinsic_store_combined_output_pan:
1870                assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
1871
1872                offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
1873
1874                reg = nir_src_index(ctx, &instr->src[0]);
1875
1876                if (ctx->stage == MESA_SHADER_FRAGMENT) {
1877                        bool combined = instr->intrinsic ==
1878                                nir_intrinsic_store_combined_output_pan;
1879
1880                        const nir_variable *var;
1881                        var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out,
1882                                         nir_intrinsic_base(instr));
1883                        assert(var);
1884
1885                        /* Dual-source blend writeout is done by leaving the
1886                         * value in r2 for the blend shader to use. */
1887                        if (var->data.index) {
1888                                if (instr->src[0].is_ssa) {
1889                                        emit_explicit_constant(ctx, reg, reg);
1890
1891                                        unsigned out = make_compiler_temp(ctx);
1892
1893                                        midgard_instruction ins = v_mov(reg, out);
1894                                        emit_mir_instruction(ctx, ins);
1895
1896                                        ctx->blend_src1 = out;
1897                                } else {
1898                                        ctx->blend_src1 = reg;
1899                                }
1900
1901                                break;
1902                        }
1903
1904                        enum midgard_rt_id rt;
1905                        if (var->data.location >= FRAG_RESULT_DATA0)
1906                                rt = MIDGARD_COLOR_RT0 + var->data.location -
1907                                     FRAG_RESULT_DATA0;
1908                        else if (combined)
1909                                rt = MIDGARD_ZS_RT;
1910                        else
1911                                unreachable("bad rt");
1912
1913                        unsigned reg_z = ~0, reg_s = ~0;
1914                        if (combined) {
1915                                unsigned writeout = nir_intrinsic_component(instr);
1916                                if (writeout & PAN_WRITEOUT_Z)
1917                                        reg_z = nir_src_index(ctx, &instr->src[2]);
1918                                if (writeout & PAN_WRITEOUT_S)
1919                                        reg_s = nir_src_index(ctx, &instr->src[3]);
1920                        }
1921
1922                        emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0);
1923                } else if (ctx->stage == MESA_SHADER_VERTEX) {
1924                        assert(instr->intrinsic == nir_intrinsic_store_output);
1925
1926                        /* We should have been vectorized, though we don't
1927                         * currently check that st_vary is emitted only once
1928                         * per slot (this is relevant, since there's not a mask
1929                         * parameter available on the store [set to 0 by the
1930                         * blob]). We do respect the component by adjusting the
1931                         * swizzle. If this is a constant source, we'll need to
1932                         * emit that explicitly. */
1933
1934                        emit_explicit_constant(ctx, reg, reg);
1935
1936                        unsigned dst_component = nir_intrinsic_component(instr);
1937                        unsigned nr_comp = nir_src_num_components(instr->src[0]);
1938
1939                        midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset));
1940                        st.load_store.arg_reg = REGISTER_LDST_ZERO;
1941                        st.load_store.index_format = midgard_index_address_u32;
1942                        st.load_store.index_reg = REGISTER_LDST_ZERO;
1943
1944                        switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) {
1945                        case nir_type_uint:
1946                        case nir_type_bool:
1947                                st.op = midgard_op_st_vary_32u;
1948                                break;
1949                        case nir_type_int:
1950                                st.op = midgard_op_st_vary_32i;
1951                                break;
1952                        case nir_type_float:
1953                                st.op = midgard_op_st_vary_32;
1954                                break;
1955                        default:
1956                                unreachable("Attempted to store unknown type");
1957                                break;
1958                        }
1959
1960                        /* nir_intrinsic_component(store_intr) encodes the
1961                         * destination component start. Source component offset
1962                         * adjustment is taken care of in
1963                         * install_registers_instr(), when offset_swizzle() is
1964                         * called.
1965                         */
1966                        unsigned src_component = COMPONENT_X;
1967
1968                        assert(nr_comp > 0);
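                        /* Illustrative example: storing a vec2 with dst_component = 2
                         * produces the swizzle x, x, x, y, so the source's x/y land in
                         * the slot's z/w and the unwritten lanes just repeat x. */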
1969                        for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {
1970                                st.swizzle[0][i] = src_component;
1971                                if (i >= dst_component && i < dst_component + nr_comp - 1)
1972                                        src_component++;
1973                        }
1974
1975                        emit_mir_instruction(ctx, st);
1976                } else {
1977                        DBG("Unknown store\n");
1978                        assert(0);
1979                }
1980
1981                break;
1982
1983        /* Special case of store_output for lowered blend shaders */
1984        case nir_intrinsic_store_raw_output_pan:
1985                assert (ctx->stage == MESA_SHADER_FRAGMENT);
1986                reg = nir_src_index(ctx, &instr->src[0]);
1987                for (unsigned s = 0; s < ctx->blend_sample_iterations; s++)
1988                        emit_fragment_store(ctx, reg, ~0, ~0,
1989                                            ctx->inputs->blend.rt + MIDGARD_COLOR_RT0,
1990                                            s);
1991                break;
1992
1993        case nir_intrinsic_store_global:
1994        case nir_intrinsic_store_shared:
1995        case nir_intrinsic_store_scratch:
1996                reg = nir_src_index(ctx, &instr->src[0]);
1997                emit_explicit_constant(ctx, reg, reg);
1998
1999                unsigned seg;
2000                if (instr->intrinsic == nir_intrinsic_store_global)
2001                        seg = LDST_GLOBAL;
2002                else if (instr->intrinsic == nir_intrinsic_store_shared)
2003                        seg = LDST_SHARED;
2004                else
2005                        seg = LDST_SCRATCH;
2006
2007                emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg);
2008                break;
2009
2010        case nir_intrinsic_load_first_vertex:
2011        case nir_intrinsic_load_ssbo_address:
2012        case nir_intrinsic_load_work_dim:
2013                emit_sysval_read(ctx, &instr->instr, 1, 0);
2014                break;
2015
2016        case nir_intrinsic_load_base_vertex:
2017                emit_sysval_read(ctx, &instr->instr, 1, 4);
2018                break;
2019
2020        case nir_intrinsic_load_base_instance:
2021                emit_sysval_read(ctx, &instr->instr, 1, 8);
2022                break;
2023
2024        case nir_intrinsic_load_sample_positions_pan:
2025                emit_sysval_read(ctx, &instr->instr, 2, 0);
2026                break;
2027
2028        case nir_intrinsic_get_ssbo_size:
2029                emit_sysval_read(ctx, &instr->instr, 1, 8);
2030                break;
2031
2032        case nir_intrinsic_load_viewport_scale:
2033        case nir_intrinsic_load_viewport_offset:
2034        case nir_intrinsic_load_num_workgroups:
2035        case nir_intrinsic_load_sampler_lod_parameters_pan:
2036        case nir_intrinsic_load_workgroup_size:
2037                emit_sysval_read(ctx, &instr->instr, 3, 0);
2038                break;
2039
2040        case nir_intrinsic_load_blend_const_color_rgba:
2041                emit_sysval_read(ctx, &instr->instr, 4, 0);
2042                break;
2043
2044        case nir_intrinsic_load_workgroup_id:
2045        case nir_intrinsic_load_local_invocation_id:
2046        case nir_intrinsic_load_global_invocation_id:
2047        case nir_intrinsic_load_global_invocation_id_zero_base:
2048                emit_compute_builtin(ctx, instr);
2049                break;
2050
2051        case nir_intrinsic_load_vertex_id_zero_base:
2052        case nir_intrinsic_load_instance_id:
2053                emit_vertex_builtin(ctx, instr);
2054                break;
2055
2056        case nir_intrinsic_load_sample_mask_in:
2057                emit_special(ctx, instr, 96);
2058                break;
2059
2060        case nir_intrinsic_load_sample_id:
2061                emit_special(ctx, instr, 97);
2062                break;
2063
2064        /* Midgard doesn't seem to want special handling */
2065        case nir_intrinsic_memory_barrier:
2066        case nir_intrinsic_memory_barrier_buffer:
2067        case nir_intrinsic_memory_barrier_image:
2068        case nir_intrinsic_memory_barrier_shared:
2069        case nir_intrinsic_group_memory_barrier:
2070                break;
2071
2072        case nir_intrinsic_control_barrier:
2073                schedule_barrier(ctx);
2074                emit_control_barrier(ctx);
2075                schedule_barrier(ctx);
2076                break;
2077
2078        ATOMIC_CASE(ctx, instr, add, add);
2079        ATOMIC_CASE(ctx, instr, and, and);
2080        ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2081        ATOMIC_CASE(ctx, instr, exchange, xchg);
2082        ATOMIC_CASE(ctx, instr, imax, imax);
2083        ATOMIC_CASE(ctx, instr, imin, imin);
2084        ATOMIC_CASE(ctx, instr, or, or);
2085        ATOMIC_CASE(ctx, instr, umax, umax);
2086        ATOMIC_CASE(ctx, instr, umin, umin);
2087        ATOMIC_CASE(ctx, instr, xor, xor);
2088
2089        IMAGE_ATOMIC_CASE(ctx, instr, add, add);
2090        IMAGE_ATOMIC_CASE(ctx, instr, and, and);
2091        IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg);
2092        IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg);
2093        IMAGE_ATOMIC_CASE(ctx, instr, imax, imax);
2094        IMAGE_ATOMIC_CASE(ctx, instr, imin, imin);
2095        IMAGE_ATOMIC_CASE(ctx, instr, or, or);
2096        IMAGE_ATOMIC_CASE(ctx, instr, umax, umax);
2097        IMAGE_ATOMIC_CASE(ctx, instr, umin, umin);
2098        IMAGE_ATOMIC_CASE(ctx, instr, xor, xor);
2099
2100        default:
2101                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
2102                assert(0);
2103                break;
2104        }
2105}
2106
2107/* Returns the dimension, with 0 special-casing cubemaps */
2108static unsigned
2109midgard_tex_format(enum glsl_sampler_dim dim)
2110{
2111        switch (dim) {
2112        case GLSL_SAMPLER_DIM_1D:
2113        case GLSL_SAMPLER_DIM_BUF:
2114                return 1;
2115
2116        case GLSL_SAMPLER_DIM_2D:
2117        case GLSL_SAMPLER_DIM_MS:
2118        case GLSL_SAMPLER_DIM_EXTERNAL:
2119        case GLSL_SAMPLER_DIM_RECT:
2120                return 2;
2121
2122        case GLSL_SAMPLER_DIM_3D:
2123                return 3;
2124
2125        case GLSL_SAMPLER_DIM_CUBE:
2126                return 0;
2127
2128        default:
2129                DBG("Unknown sampler dim type\n");
2130                assert(0);
2131                return 0;
2132        }
2133}
2134
2135/* Tries to attach an explicit LOD or bias as a constant. Returns whether this
2136 * was successful */
2137
2138static bool
2139pan_attach_constant_bias(
2140        compiler_context *ctx,
2141        nir_src lod,
2142        midgard_texture_word *word)
2143{
2144        /* To attach as constant, it has to *be* constant */
2145
2146        if (!nir_src_is_const(lod))
2147                return false;
2148
2149        float f = nir_src_as_float(lod);
2150
2151        /* Break into fixed-point */
2152        signed lod_int = f;
2153        float lod_frac = f - lod_int;
2154
2155        /* Carry over negative fractions */
2156        if (lod_frac < 0.0) {
2157                lod_int--;
2158                lod_frac += 1.0;
2159        }
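        /* Worked example: lod = -1.25 gives lod_int = -1, lod_frac = -0.25;
         * after the carry above, lod_int = -2 and lod_frac = 0.75, which still
         * sums to -1.25 and is what gets encoded below. */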
2160
2161        /* Encode */
2162        word->bias = float_to_ubyte(lod_frac);
2163        word->bias_int = lod_int;
2164
2165        return true;
2166}
2167
2168static enum mali_texture_mode
2169mdg_texture_mode(nir_tex_instr *instr)
2170{
2171        if (instr->op == nir_texop_tg4 && instr->is_shadow)
2172                return TEXTURE_GATHER_SHADOW;
2173        else if (instr->op == nir_texop_tg4)
2174                return TEXTURE_GATHER_X + instr->component;
2175        else if (instr->is_shadow)
2176                return TEXTURE_SHADOW;
2177        else
2178                return TEXTURE_NORMAL;
2179}
2180
2181static void
2182set_tex_coord(compiler_context *ctx, nir_tex_instr *instr,
2183              midgard_instruction *ins)
2184{
2185        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
2186
2187        assert(coord_idx >= 0);
2188
2189        int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator);
2190        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
2191        assert(comparator_idx < 0 || ms_idx < 0);
2192        int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx;
2193
2194        unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src);
2195
2196        emit_explicit_constant(ctx, coords, coords);
2197
2198        ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) |
2199                            nir_src_bit_size(instr->src[coord_idx].src);
2200
2201        unsigned nr_comps = instr->coord_components;
2202        unsigned written_mask = 0, write_mask = 0;
2203
2204        /* Initialize all components to coord.x which is expected to always be
2205         * present. Swizzle is updated below based on the texture dimension
2206         * and extra attributes that are packed in the coordinate argument.
2207         */
2208        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2209                ins->swizzle[1][c] = COMPONENT_X;
2210
2211        /* Shadow ref value is part of the coordinates if there's no comparator
2212         * source; in that case it's always placed in the last component.
2213         * Midgard wants the ref value in coord.z.
2214         */
2215        if (instr->is_shadow && comparator_idx < 0) {
2216                ins->swizzle[1][COMPONENT_Z] = --nr_comps;
2217                write_mask |= 1 << COMPONENT_Z;
2218        }
2219
2220        /* The array index is the last component if there's no shadow ref value
2221         * or second last if there's one. We already decremented the number of
2222         * components to account for the shadow ref value above.
2223         * Midgard wants the array index in coord.w.
2224         */
2225        if (instr->is_array) {
2226                ins->swizzle[1][COMPONENT_W] = --nr_comps;
2227                write_mask |= 1 << COMPONENT_W;
2228        }
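        /* Illustrative example: a 2D array shadow lookup with the ref packed in
         * the coordinates starts with four components (x, y, layer, ref); the two
         * blocks above route ref (component 3) to coord.z and the layer
         * (component 2) to coord.w, leaving nr_comps = 2 for the x/y copy below. */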
2229
2230        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2231                /* texelFetch is undefined on samplerCube */
2232                assert(ins->op != midgard_tex_op_fetch);
2233
2234                ins->src[1] = make_compiler_temp_reg(ctx);
2235
2236                /* For cubemaps, we use a special ld/st op to select the face
2237                 * and copy the xy into the texture register
2238                 */
2239                midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0);
2240                ld.src[1] = coords;
2241                ld.src_types[1] = ins->src_types[1];
2242                ld.mask = 0x3; /* xy */
2243                ld.load_store.bitsize_toggle = true;
2244                ld.swizzle[1][3] = COMPONENT_X;
2245                emit_mir_instruction(ctx, ld);
2246
2247                /* We packed cube coordinates (X,Y,Z) into (X,Y), update the
2248                 * written mask accordingly and decrement the number of
2249                 * components
2250                 */
2251                nr_comps--;
2252                written_mask |= 3;
2253        }
2254
2255        /* Now flag tex coord components that have not been written yet */
2256        write_mask |= mask_of(nr_comps) & ~written_mask;
2257        for (unsigned c = 0; c < nr_comps; c++)
2258                ins->swizzle[1][c] = c;
2259
2260        /* Sample index and shadow ref are expected in coord.z */
2261        if (ms_or_comparator_idx >= 0) {
2262                assert(!((write_mask | written_mask) & (1 << COMPONENT_Z)));
2263
2264                unsigned sample_or_ref =
2265                        nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src);
2266
2267                emit_explicit_constant(ctx, sample_or_ref, sample_or_ref);
2268
2269                if (ins->src[1] == ~0)
2270                        ins->src[1] = make_compiler_temp_reg(ctx);
2271
2272                midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]);
2273
2274                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++)
2275                        mov.swizzle[1][c] = COMPONENT_X;
2276
2277                mov.mask = 1 << COMPONENT_Z;
2278                written_mask |= 1 << COMPONENT_Z;
2279                ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z;
2280                emit_mir_instruction(ctx, mov);
2281        }
2282
2283        /* texelFetch coordinates use all four elements (xyz/index) regardless
2284         * of texture dimensionality, which means it's necessary to zero the
2285         * unused components to keep everything happy.
2286         */
2287        if (ins->op == midgard_tex_op_fetch &&
2288            (written_mask | write_mask) != 0xF) {
2289                if (ins->src[1] == ~0)
2290                        ins->src[1] = make_compiler_temp_reg(ctx);
2291
2292                /* mov index.zw, #0, or generalized */
2293                midgard_instruction mov =
2294                        v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]);
2295                mov.has_constants = true;
2296                mov.mask = (written_mask | write_mask) ^ 0xF;
2297                emit_mir_instruction(ctx, mov);
2298                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2299                        if (mov.mask & (1 << c))
2300                                ins->swizzle[1][c] = c;
2301                }
2302        }
2303
2304        if (ins->src[1] == ~0) {
2305                /* No temporary reg created, use the src coords directly */
2306                ins->src[1] = coords;
2307        } else if (write_mask) {
2308                /* Move the remaining coordinates to the temporary reg */
2309                midgard_instruction mov = v_mov(coords, ins->src[1]);
2310
2311                for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) {
2312                        if ((1 << c) & write_mask) {
2313                                mov.swizzle[1][c] = ins->swizzle[1][c];
2314                                ins->swizzle[1][c] = c;
2315                        } else {
2316                                mov.swizzle[1][c] = COMPONENT_X;
2317                        }
2318                }
2319
2320                mov.mask = write_mask;
2321                emit_mir_instruction(ctx, mov);
2322        }
2323}
2324
2325static void
2326emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
2327                  unsigned midgard_texop)
2328{
2329        /* TODO */
2330        //assert (!instr->sampler);
2331
2332        nir_dest *dest = &instr->dest;
2333
2334        int texture_index = instr->texture_index;
2335        int sampler_index = instr->sampler_index;
2336
2337        nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type);
2338
2339        /* texture instructions support float outmods */
2340        unsigned outmod = midgard_outmod_none;
2341        if (dest_base == nir_type_float) {
2342                outmod = mir_determine_float_outmod(ctx, &dest, 0);
2343        }
2344
2345        midgard_instruction ins = {
2346                .type = TAG_TEXTURE_4,
2347                .mask = 0xF,
2348                .dest = nir_dest_index(dest),
2349                .src = { ~0, ~0, ~0, ~0 },
2350                .dest_type = instr->dest_type,
2351                .swizzle = SWIZZLE_IDENTITY_4,
2352                .outmod = outmod,
2353                .op = midgard_texop,
2354                .texture = {
2355                        .format = midgard_tex_format(instr->sampler_dim),
2356                        .texture_handle = texture_index,
2357                        .sampler_handle = sampler_index,
2358                        .mode = mdg_texture_mode(instr)
2359                }
2360        };
2361
2362        if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4)
2363                for (int i = 0; i < 4; ++i)
2364                        ins.swizzle[0][i] = COMPONENT_X;
2365
2366        for (unsigned i = 0; i < instr->num_srcs; ++i) {
2367                int index = nir_src_index(ctx, &instr->src[i].src);
2368                unsigned sz = nir_src_bit_size(instr->src[i].src);
2369                nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz;
2370
2371                switch (instr->src[i].src_type) {
2372                case nir_tex_src_coord:
2373                        set_tex_coord(ctx, instr, &ins);
2374                        break;
2375
2376                case nir_tex_src_bias:
2377                case nir_tex_src_lod: {
2378                        /* Try as a constant if we can */
2379
2380                        bool is_txf = midgard_texop == midgard_tex_op_fetch;
2381                        if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
2382                                break;
2383
2384                        ins.texture.lod_register = true;
2385                        ins.src[2] = index;
2386                        ins.src_types[2] = T;
2387
2388                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2389                                ins.swizzle[2][c] = COMPONENT_X;
2390
2391                        emit_explicit_constant(ctx, index, index);
2392
2393                        break;
2394                };
2395
2396                case nir_tex_src_offset: {
2397                        ins.texture.offset_register = true;
2398                        ins.src[3] = index;
2399                        ins.src_types[3] = T;
2400
2401                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c)
2402                                ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c;
2403
2404                        emit_explicit_constant(ctx, index, index);
2405                        break;
2406                };
2407
2408                case nir_tex_src_comparator:
2409                case nir_tex_src_ms_index:
2410                        /* Nothing to do, handled in set_tex_coord() */
2411                        break;
2412
2413                default: {
2414                        fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type);
2415                        assert(0);
2416                }
2417                }
2418        }
2419
2420        emit_mir_instruction(ctx, ins);
2421}
2422
2423static void
2424emit_tex(compiler_context *ctx, nir_tex_instr *instr)
2425{
2426        switch (instr->op) {
2427        case nir_texop_tex:
2428        case nir_texop_txb:
2429                emit_texop_native(ctx, instr, midgard_tex_op_normal);
2430                break;
2431        case nir_texop_txl:
2432        case nir_texop_tg4:
2433                emit_texop_native(ctx, instr, midgard_tex_op_gradient);
2434                break;
2435        case nir_texop_txf:
2436        case nir_texop_txf_ms:
2437                emit_texop_native(ctx, instr, midgard_tex_op_fetch);
2438                break;
2439        case nir_texop_txs:
2440                emit_sysval_read(ctx, &instr->instr, 4, 0);
2441                break;
2442        default: {
2443                fprintf(stderr, "Unhandled texture op: %d\n", instr->op);
2444                assert(0);
2445        }
2446        }
2447}
2448
2449static void
2450emit_jump(compiler_context *ctx, nir_jump_instr *instr)
2451{
2452        switch (instr->type) {
2453        case nir_jump_break: {
2454                /* Emit a branch out of the loop */
2455                struct midgard_instruction br = v_branch(false, false);
2456                br.branch.target_type = TARGET_BREAK;
2457                br.branch.target_break = ctx->current_loop_depth;
2458                emit_mir_instruction(ctx, br);
2459                break;
2460        }
2461
2462        default:
2463                DBG("Unknown jump type %d\n", instr->type);
2464                break;
2465        }
2466}
2467
2468static void
2469emit_instr(compiler_context *ctx, struct nir_instr *instr)
2470{
2471        switch (instr->type) {
2472        case nir_instr_type_load_const:
2473                emit_load_const(ctx, nir_instr_as_load_const(instr));
2474                break;
2475
2476        case nir_instr_type_intrinsic:
2477                emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2478                break;
2479
2480        case nir_instr_type_alu:
2481                emit_alu(ctx, nir_instr_as_alu(instr));
2482                break;
2483
2484        case nir_instr_type_tex:
2485                emit_tex(ctx, nir_instr_as_tex(instr));
2486                break;
2487
2488        case nir_instr_type_jump:
2489                emit_jump(ctx, nir_instr_as_jump(instr));
2490                break;
2491
2492        case nir_instr_type_ssa_undef:
2493                /* Spurious */
2494                break;
2495
2496        default:
2497                DBG("Unhandled instruction type\n");
2498                break;
2499        }
2500}
2501
2502
2503/* ALU instructions can inline or embed constants, which decreases register
2504 * pressure and saves space. */
2505
2506#define CONDITIONAL_ATTACH(idx) { \
2507	void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \
2508\
2509	if (entry) { \
2510		attach_constants(ctx, alu, entry, alu->src[idx] + 1); \
2511		alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
2512	} \
2513}
2514
2515static void
2516inline_alu_constants(compiler_context *ctx, midgard_block *block)
2517{
2518        mir_foreach_instr_in_block(block, alu) {
2519                /* Other instructions cannot inline constants */
2520                if (alu->type != TAG_ALU_4) continue;
2521                if (alu->compact_branch) continue;
2522
2523                /* If there is already a constant here, we can do nothing */
2524                if (alu->has_constants) continue;
2525
2526                CONDITIONAL_ATTACH(0);
2527
2528                if (!alu->has_constants) {
2529                        CONDITIONAL_ATTACH(1)
2530                } else if (!alu->inline_constant) {
2531                        /* Corner case: _two_ vec4 constants, for instance with a
2532                         * csel. For this case, we can only use a constant
2533                         * register for one, we'll have to emit a move for the
2534                         * other. */
2535
2536                        void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1);
2537                        unsigned scratch = make_compiler_temp(ctx);
2538
2539                        if (entry) {
2540                                midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch);
2541                                attach_constants(ctx, &ins, entry, alu->src[1] + 1);
2542
2543                                /* Set the source */
2544                                alu->src[1] = scratch;
2545
2546                                /* Inject us -before- the last instruction which set r31 */
2547                                mir_insert_instruction_before(ctx, mir_prev_op(alu), ins);
2548                        }
2549                }
2550        }
2551}
2552
2553unsigned
2554max_bitsize_for_alu(midgard_instruction *ins)
2555{
2556        unsigned max_bitsize = 0;
2557        for (int i = 0; i < MIR_SRC_COUNT; i++) {
2558                if (ins->src[i] == ~0) continue;
2559                unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]);
2560                max_bitsize = MAX2(src_bitsize, max_bitsize);
2561        }
2562        unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type);
2563        max_bitsize = MAX2(dst_bitsize, max_bitsize);
2564
2565        /* We don't have fp16 LUTs, so we'll want to emit code like:
2566         *
2567         *      vlut.fsinr hr0, hr0
2568         *
2569         * where both input and output are 16-bit but the operation is carried
2570         * out in 32-bit
2571         */
2572
2573        switch (ins->op) {
2574        case midgard_alu_op_fsqrt:
2575        case midgard_alu_op_frcp:
2576        case midgard_alu_op_frsqrt:
2577        case midgard_alu_op_fsinpi:
2578        case midgard_alu_op_fcospi:
2579        case midgard_alu_op_fexp2:
2580        case midgard_alu_op_flog2:
2581                max_bitsize = MAX2(max_bitsize, 32);
2582                break;
2583
2584        default:
2585                break;
2586        }
2587
2588        /* High implies computing at a higher bitsize, e.g. umul_high of 32-bit
2589         * requires computing at 64-bit */
2590        if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) {
2591                max_bitsize *= 2;
2592                assert(max_bitsize <= 64);
2593        }
2594
2595        return max_bitsize;
2596}
2597
2598midgard_reg_mode
2599reg_mode_for_bitsize(unsigned bitsize)
2600{
2601        switch (bitsize) {
2602                /* use the 16-bit pipe for 8-bit since we don't support vec16 yet */
2603        case 8:
2604        case 16:
2605                return midgard_reg_mode_16;
2606        case 32:
2607                return midgard_reg_mode_32;
2608        case 64:
2609                return midgard_reg_mode_64;
2610        default:
2611                unreachable("invalid bit size");
2612        }
2613}
2614
2615/* Midgard supports two types of constants, embedded constants (128-bit) and
2616 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
2617 * constants can be demoted to inline constants, for space savings and
2618 * sometimes a performance boost */
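/* For example, a scalar float constant of 3.0 round-trips exactly through fp16
 * and so can be carried inline (0x4200), while something like 0.1 does not
 * survive the half-float round trip and has to stay embedded. */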
2619
2620static void
2621embedded_to_inline_constant(compiler_context *ctx, midgard_block *block)
2622{
2623        mir_foreach_instr_in_block(block, ins) {
2624                if (!ins->has_constants) continue;
2625                if (ins->has_inline_constant) continue;
2626
2627                unsigned max_bitsize = max_bitsize_for_alu(ins);
2628
2629                /* We can inline 32-bit (sometimes) or 16-bit (usually) */
2630                bool is_16 = max_bitsize == 16;
2631                bool is_32 = max_bitsize == 32;
2632
2633                if (!(is_16 || is_32))
2634                        continue;
2635
2636                /* src1 cannot be an inline constant due to encoding
2637                 * restrictions. So, if possible we try to flip the arguments
2638                 * in that case */
2639
2640                int op = ins->op;
2641
2642                if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) &&
2643                                alu_opcode_props[op].props & OP_COMMUTES) {
2644                        mir_flip(ins);
2645                }
2646
2647                if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
2648                        /* Component is from the swizzle. Take a nonzero component */
2649                        assert(ins->mask);
2650                        unsigned first_comp = ffs(ins->mask) - 1;
2651                        unsigned component = ins->swizzle[1][first_comp];
2652
2653                        /* Scale constant appropriately, if we can legally */
2654                        int16_t scaled_constant = 0;
2655
2656                        if (is_16) {
2657                                scaled_constant = ins->constants.u16[component];
2658                        } else if (midgard_is_integer_op(op)) {
2659                                scaled_constant = ins->constants.u32[component];
2660
2661                                /* Constant overflow after resize */
2662                                if (scaled_constant != ins->constants.u32[component])
2663                                        continue;
2664                        } else {
2665                                float original = ins->constants.f32[component];
2666                                scaled_constant = _mesa_float_to_half(original);
2667
2668                                /* Check for loss of precision. If this is
2669                                 * mediump, we don't care, but for a highp
2670                                 * shader, we need to pay attention. NIR
2671                                 * doesn't yet tell us which mode we're in!
2672                                 * Practically this prevents most constants
2673                                 * from being inlined, sadly. */
2674
2675                                float fp32 = _mesa_half_to_float(scaled_constant);
2676
2677                                if (fp32 != original)
2678                                        continue;
2679                        }
2680
2681                        /* Should've been const folded */
2682                        if (ins->src_abs[1] || ins->src_neg[1])
2683                                continue;
2684
2685                        /* Make sure that the constant is not itself a vector
2686                         * by checking if all accessed values are the same. */
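                        /* (e.g. a constant whose .x is 1.0 but .y is 2.0, read by
                         * a vec2 op, has to stay embedded.) */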
2687
2688                        const midgard_constants *cons = &ins->constants;
2689                        uint32_t value = is_16 ? cons->u16[component] : cons->u32[component];
2690
2691                        bool is_vector = false;
2692                        unsigned mask = effective_writemask(ins->op, ins->mask);
2693
2694                        for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) {
2695                                /* We only care if this component is actually used */
2696                                if (!(mask & (1 << c)))
2697                                        continue;
2698
2699                                uint32_t test = is_16 ?
2700                                                cons->u16[ins->swizzle[1][c]] :
2701                                                cons->u32[ins->swizzle[1][c]];
2702
2703                                if (test != value) {
2704                                        is_vector = true;
2705                                        break;
2706                                }
2707                        }
2708
2709                        if (is_vector)
2710                                continue;
2711
2712                        /* Get rid of the embedded constant */
2713                        ins->has_constants = false;
2714                        ins->src[1] = ~0;
2715                        ins->has_inline_constant = true;
2716                        ins->inline_constant = scaled_constant;
2717                }
2718        }
2719}
2720
2721/* Dead code elimination for branches at the end of a block - only one branch
2722 * per block is legal semantically */
2723
2724static void
2725midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block)
2726{
2727        bool branched = false;
2728
2729        mir_foreach_instr_in_block_safe(block, ins) {
2730                if (!midgard_is_branch_unit(ins->unit)) continue;
2731
2732                if (branched)
2733                        mir_remove_instruction(ins);
2734
2735                branched = true;
2736        }
2737}
2738
2739/* We want to force the invert on AND/OR to the second slot to legalize into
2740 * iandnot/iornot. The relevant patterns are for AND (and OR respectively)
2741 *
2742 *   ~a & #b = ~a & ~(#~b)
2743 *   ~a & b = b & ~a
2744 */
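/* For example, with a 16-bit inline constant, iand(~a, #0x00ff) can be rewritten
 * as iand(~a, ~#0xff00): the result is unchanged, but the invert now also sits in
 * the second slot, which is the encodable form. */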
2745
2746static void
2747midgard_legalize_invert(compiler_context *ctx, midgard_block *block)
2748{
2749        mir_foreach_instr_in_block(block, ins) {
2750                if (ins->type != TAG_ALU_4) continue;
2751
2752                if (ins->op != midgard_alu_op_iand &&
2753                    ins->op != midgard_alu_op_ior) continue;
2754
2755                if (ins->src_invert[1] || !ins->src_invert[0]) continue;
2756
2757                if (ins->has_inline_constant) {
2758                        /* ~(#~a) = ~(~#a) = a, so valid, and forces both
2759                         * inverts on */
2760                        ins->inline_constant = ~ins->inline_constant;
2761                        ins->src_invert[1] = true;
2762                } else {
2763                        /* Flip to the right invert order. Note
2764                         * has_inline_constant false by assumption on the
2765                         * branch, so flipping makes sense. */
2766                        mir_flip(ins);
2767                }
2768        }
2769}
2770
2771static unsigned
2772emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter)
2773{
2774        /* Loop to ourselves */
2775        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];
2776        struct midgard_instruction ins = v_branch(false, false);
2777        ins.writeout = br->writeout;
2778        ins.branch.target_block = ctx->block_count - 1;
2779        ins.constants.u32[0] = br->constants.u32[0];
2780        memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types));
2781        emit_mir_instruction(ctx, ins);
2782
2783        ctx->current_block->epilogue = true;
2784        schedule_barrier(ctx);
2785        return ins.branch.target_block;
2786}
2787
2788static midgard_block *
2789emit_block_init(compiler_context *ctx)
2790{
2791        midgard_block *this_block = ctx->after_block;
2792        ctx->after_block = NULL;
2793
2794        if (!this_block)
2795                this_block = create_empty_block(ctx);
2796
2797        list_addtail(&this_block->base.link, &ctx->blocks);
2798
2799        this_block->scheduled = false;
2800        ++ctx->block_count;
2801
2802        /* Set up current block */
2803        list_inithead(&this_block->base.instructions);
2804        ctx->current_block = this_block;
2805
2806        return this_block;
2807}
2808
2809static midgard_block *
2810emit_block(compiler_context *ctx, nir_block *block)
2811{
2812        midgard_block *this_block = emit_block_init(ctx);
2813
2814        nir_foreach_instr(instr, block) {
2815                emit_instr(ctx, instr);
2816                ++ctx->instruction_count;
2817        }
2818
2819        return this_block;
2820}
2821
2822static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
2823
2824static void
2825emit_if(struct compiler_context *ctx, nir_if *nif)
2826{
2827        midgard_block *before_block = ctx->current_block;
2828
2829        /* Speculatively emit the branch, but we can't fill it in until later */
2830        bool inv = false;
2831        EMIT(branch, true, true);
2832        midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
2833        then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv);
2834        then_branch->src_types[0] = nir_type_uint32;
2835        then_branch->branch.invert_conditional = !inv;
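        /* (The then_branch skips over the then-block, so it is taken when the
         * NIR condition is false, hence the inverted conditional unless
         * mir_get_branch_cond already folded in an inversion.) */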
2836
2837        /* Emit the two subblocks. */
2838        midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
2839        midgard_block *end_then_block = ctx->current_block;
2840
2841        /* Emit a jump from the end of the then block to the end of the else */
2842        EMIT(branch, false, false);
2843        midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
2844
2845        /* Emit second block, and check if it's empty */
2846
2847        int else_idx = ctx->block_count;
2848        int count_in = ctx->instruction_count;
2849        midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
2850        midgard_block *end_else_block = ctx->current_block;
2851        int after_else_idx = ctx->block_count;
2852
2853        /* Now that we have the subblocks emitted, fix up the branches */
2854
2855        assert(then_block);
2856        assert(else_block);
2857
2858        if (ctx->instruction_count == count_in) {
2859                /* The else block is empty, so don't emit an exit jump */
2860                mir_remove_instruction(then_exit);
2861                then_branch->branch.target_block = after_else_idx;
2862        } else {
2863                then_branch->branch.target_block = else_idx;
2864                then_exit->branch.target_block = after_else_idx;
2865        }
2866
2867        /* Wire up the successors */
2868
2869        ctx->after_block = create_empty_block(ctx);
2870
2871        pan_block_add_successor(&before_block->base, &then_block->base);
2872        pan_block_add_successor(&before_block->base, &else_block->base);
2873
2874        pan_block_add_successor(&end_then_block->base, &ctx->after_block->base);
2875        pan_block_add_successor(&end_else_block->base, &ctx->after_block->base);
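
        /* The wiring above gives the usual diamond shape (sketch):
         *
         *              before_block
         *               /        \
         *         then_block   else_block
         *               \        /
         *            ctx->after_block
         */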
2876}
2877
2878static void
2879emit_loop(struct compiler_context *ctx, nir_loop *nloop)
2880{
2881        /* Remember where we are */
2882        midgard_block *start_block = ctx->current_block;
2883
2884        /* Allocate a loop number, growing the current inner loop depth */
2885        int loop_idx = ++ctx->current_loop_depth;
2886
2887        /* Get index from before the body so we can loop back later */
2888        int start_idx = ctx->block_count;
2889
2890        /* Emit the body itself */
2891        midgard_block *loop_block = emit_cf_list(ctx, &nloop->body);
2892
2893        /* Unconditionally branch back to the top of the loop */
2894        struct midgard_instruction br_back = v_branch(false, false);
2895        br_back.branch.target_block = start_idx;
2896        emit_mir_instruction(ctx, br_back);
2897
2898        /* Mark down that branch in the graph. */
2899        pan_block_add_successor(&start_block->base, &loop_block->base);
2900        pan_block_add_successor(&ctx->current_block->base, &loop_block->base);
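        /* (Sketch: both start_block and the block holding the back-branch now
         * point at loop_block, forming the loop's back edge; break branches
         * are wired up to the after_block allocated below.) */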
2901
2902        /* Find the index of the block about to follow us (we don't add one:
2903         * blocks are 0-indexed, so block_count is already the next index) */
2904        int break_block_idx = ctx->block_count;
2905
2906        /* Fix up the break statements we emitted to point to the right place,
2907         * now that we can allocate a block number for them */
2908        ctx->after_block = create_empty_block(ctx);
2909
2910        mir_foreach_block_from(ctx, start_block, _block) {
2911                mir_foreach_instr_in_block(((midgard_block *) _block), ins) {
2912                        if (ins->type != TAG_ALU_4) continue;
2913                        if (!ins->compact_branch) continue;
2914
2915                        /* We found a branch -- check the type to see if we need to do anything */
2916                        if (ins->branch.target_type != TARGET_BREAK) continue;
2917
2918                        /* It's a break! Check if it's our break */
2919                        if (ins->branch.target_break != loop_idx) continue;
2920
2921                        /* Okay, cool, we're breaking out of this loop.
2922                         * Rewrite from a break to a goto */
2923
2924                        ins->branch.target_type = TARGET_GOTO;
2925                        ins->branch.target_block = break_block_idx;
2926
2927                        pan_block_add_successor(_block, &ctx->after_block->base);
2928                }
2929        }
2930
2931        /* Now that we've finished emitting the loop, free up the depth again
2932         * so we play nice with recursion amid nested loops */
2933        --ctx->current_loop_depth;
2934
2935        /* Dump loop stats */
2936        ++ctx->loop_count;
2937}
2938
2939static midgard_block *
2940emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
2941{
2942        midgard_block *start_block = NULL;
2943
2944        foreach_list_typed(nir_cf_node, node, node, list) {
2945                switch (node->type) {
2946                case nir_cf_node_block: {
2947                        midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2948
2949                        if (!start_block)
2950                                start_block = block;
2951
2952                        break;
2953                }
2954
2955                case nir_cf_node_if:
2956                        emit_if(ctx, nir_cf_node_as_if(node));
2957                        break;
2958
2959                case nir_cf_node_loop:
2960                        emit_loop(ctx, nir_cf_node_as_loop(node));
2961                        break;
2962
2963                case nir_cf_node_function:
2964                        assert(0);
2965                        break;
2966                }
2967        }
2968
2969        return start_block;
2970}
2971
2972/* Due to lookahead, we need to report the first tag executed in the command
2973 * stream and in branch targets. An initial block might be empty, so iterate
2974 * until we find one that 'works' */
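/* (For illustration: if the initial block ends up with no quadwords after
 * scheduling and the next block begins with a texture bundle, the texture tag
 * is what gets reported.) */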
2975
2976unsigned
2977midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
2978{
2979        midgard_block *initial_block = mir_get_block(ctx, block_idx);
2980
2981        mir_foreach_block_from(ctx, initial_block, _v) {
2982                midgard_block *v = (midgard_block *) _v;
2983                if (v->quadword_count) {
2984                        midgard_bundle *initial_bundle =
2985                                util_dynarray_element(&v->bundles, midgard_bundle, 0);
2986
2987                        return initial_bundle->tag;
2988                }
2989        }
2990
2991        /* Default to tag 1, which will break from the shader, in case we jump
2992         * to the exit block (i.e. `return` in a compute shader) */
2993
2994        return 1;
2995}
2996
2997/* For each fragment writeout instruction, generate a writeout loop to
2998 * associate with it */
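/* (Roughly: each writeout branch is retargeted at a freshly emitted epilogue
 * block that branches to itself, and when further RT/sample writeouts follow,
 * an unconditional branch jumps back to the block the writeout originally
 * targeted so normal execution resumes.) */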
2999
3000static void
3001mir_add_writeout_loops(compiler_context *ctx)
3002{
3003        for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
3004                for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) {
3005                        midgard_instruction *br = ctx->writeout_branch[rt][s];
3006                        if (!br) continue;
3007
3008                        unsigned popped = br->branch.target_block;
3009                        pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base),
3010                                                &ctx->current_block->base);
3011                        br->branch.target_block = emit_fragment_epilogue(ctx, rt, s);
3012                        br->branch.target_type = TARGET_GOTO;
3013
3014                        /* If we have more RTs or samples to write out, we'll
3015                         * need to jump back after our loop terminates */
3016                        midgard_instruction *next_br = NULL;
3017
3018                        if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER)
3019                                next_br = ctx->writeout_branch[rt][s + 1];
3020
3021                        if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch))
3022                                next_br = ctx->writeout_branch[rt + 1][0];
3023
3024                        if (next_br) {
3025                                midgard_instruction uncond = v_branch(false, false);
3026                                uncond.branch.target_block = popped;
3027                                uncond.branch.target_type = TARGET_GOTO;
3028                                emit_mir_instruction(ctx, uncond);
3029                                pan_block_add_successor(&ctx->current_block->base,
3030                                                        &(mir_get_block(ctx, popped)->base));
3031                                schedule_barrier(ctx);
3032                        } else {
3033                                /* We're last, so we can terminate here */
3034                                br->last_writeout = true;
3035                        }
3036                }
3037        }
3038}
3039
3040void
3041midgard_compile_shader_nir(nir_shader *nir,
3042                           const struct panfrost_compile_inputs *inputs,
3043                           struct util_dynarray *binary,
3044                           struct pan_shader_info *info)
3045{
3046        midgard_debug = debug_get_option_midgard_debug();
3047
3048        /* TODO: Bound against what? */
3049        compiler_context *ctx = rzalloc(NULL, compiler_context);
3050        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
3051
3052        ctx->inputs = inputs;
3053        ctx->nir = nir;
3054        ctx->info = info;
3055        ctx->stage = nir->info.stage;
3056
3057        if (inputs->is_blend) {
3058                unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
3059                const struct util_format_description *desc =
3060                        util_format_description(inputs->rt_formats[inputs->blend.rt]);
3061
3062                /* We have to split writeout into 128-bit chunks */
3063                ctx->blend_sample_iterations =
3064                        DIV_ROUND_UP(desc->block.bits * nr_samples, 128);
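                /* (Illustrative numbers: a 64-bit-per-pixel RT such as RGBA16F
                 * at 4 samples is 256 bits of writeout, so
                 * DIV_ROUND_UP(256, 128) = 2 iterations.) */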
3065        }
3066        ctx->blend_input = ~0;
3067        ctx->blend_src1 = ~0;
3068        ctx->quirks = midgard_get_quirks(inputs->gpu_id);
3069
3070        /* Initialize the global (not per-block) hash tables */
3071
3072        ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
3073
3074        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
3075         * (so we don't accidentally duplicate the epilogue since mesa/st has
3076         * messed with our I/O quite a bit already) */
3077
3078        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3079
3080        if (ctx->stage == MESA_SHADER_VERTEX) {
3081                NIR_PASS_V(nir, nir_lower_viewport_transform);
3082                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
3083        }
3084
3085        NIR_PASS_V(nir, nir_lower_var_copies);
3086        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3087        NIR_PASS_V(nir, nir_split_var_copies);
3088        NIR_PASS_V(nir, nir_lower_var_copies);
3089        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3090        NIR_PASS_V(nir, nir_lower_var_copies);
3091        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3092
3093        unsigned pan_quirks = panfrost_get_quirks(inputs->gpu_id, 0);
3094        NIR_PASS_V(nir, pan_lower_framebuffer,
3095                   inputs->rt_formats, inputs->raw_fmt_mask,
3096                   inputs->is_blend, pan_quirks);
3097
3098        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3099                        glsl_type_size, 0);
3100        NIR_PASS_V(nir, nir_lower_ssbo);
3101        NIR_PASS_V(nir, pan_nir_lower_zs_store);
3102
3103        NIR_PASS_V(nir, pan_nir_lower_64bit_intrin);
3104
3105        /* Optimisation passes */
3106
3107        optimise_nir(nir, ctx->quirks, inputs->is_blend);
3108
3109        NIR_PASS_V(nir, pan_nir_reorder_writeout);
3110
3111        if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
3112            ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
3113                nir_print_shader(nir, stdout);
3114        }
3115
3116        info->tls_size = nir->scratch_size;
3117
3118        nir_foreach_function(func, nir) {
3119                if (!func->impl)
3120                        continue;
3121
3122                list_inithead(&ctx->blocks);
3123                ctx->block_count = 0;
3124                ctx->func = func;
3125                ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD));
3126
3127                if (nir->info.outputs_read && !inputs->is_blend) {
3128                        emit_block_init(ctx);
3129
3130                        struct midgard_instruction wait = v_branch(false, false);
3131                        wait.branch.target_type = TARGET_TILEBUF_WAIT;
3132
3133                        emit_mir_instruction(ctx, wait);
3134
3135                        ++ctx->instruction_count;
3136                }
3137
3138                emit_cf_list(ctx, &func->impl->body);
3139                free(ctx->already_emitted);
3140                break; /* TODO: Multi-function shaders */
3141        }
3142
3143        /* Per-block lowering before opts */
3144
3145        mir_foreach_block(ctx, _block) {
3146                midgard_block *block = (midgard_block *) _block;
3147                inline_alu_constants(ctx, block);
3148                embedded_to_inline_constant(ctx, block);
3149        }
3150        /* MIR-level optimizations */
3151
3152        bool progress = false;
3153
3154        do {
3155                progress = false;
3156                progress |= midgard_opt_dead_code_eliminate(ctx);
3157
3158                mir_foreach_block(ctx, _block) {
3159                        midgard_block *block = (midgard_block *) _block;
3160                        progress |= midgard_opt_copy_prop(ctx, block);
3161                        progress |= midgard_opt_combine_projection(ctx, block);
3162                        progress |= midgard_opt_varying_projection(ctx, block);
3163                }
3164        } while (progress);
3165
3166        mir_foreach_block(ctx, _block) {
3167                midgard_block *block = (midgard_block *) _block;
3168                midgard_lower_derivatives(ctx, block);
3169                midgard_legalize_invert(ctx, block);
3170                midgard_cull_dead_branch(ctx, block);
3171        }
3172
3173        if (ctx->stage == MESA_SHADER_FRAGMENT)
3174                mir_add_writeout_loops(ctx);
3175
3176        /* Analyze now that the code is known but before scheduling creates
3177         * pipeline registers which are harder to track */
3178        mir_analyze_helper_requirements(ctx);
3179
3180        /* Schedule! */
3181        midgard_schedule_program(ctx);
3182        mir_ra(ctx);
3183
3184        /* Analyze after scheduling since this is order-dependent */
3185        mir_analyze_helper_terminate(ctx);
3186
3187        /* Emit flat binary from the instruction arrays. Iterate each block in
3188         * sequence. Save instruction boundaries such that lookahead tags can
3189         * be assigned easily */
3190
3191        /* Cache _all_ bundles in source order for lookahead across failed branches */
3192
3193        int bundle_count = 0;
3194        mir_foreach_block(ctx, _block) {
3195                midgard_block *block = (midgard_block *) _block;
3196                bundle_count += block->bundles.size / sizeof(midgard_bundle);
3197        }
3198        midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
3199        int bundle_idx = 0;
3200        mir_foreach_block(ctx, _block) {
3201                midgard_block *block = (midgard_block *) _block;
3202                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
3203                        source_order_bundles[bundle_idx++] = bundle;
3204                }
3205        }
3206
3207        int current_bundle = 0;
3208
3209        /* Midgard prefetches instruction types, so during emission we
3210         * need to look ahead. Unless this is the last bundle, in which
3211         * case we use a lookahead tag of 1. */
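        /* (For example: if bundle N is an ALU bundle and bundle N + 1 is a
         * texture bundle, bundle N is emitted with the texture tag as its
         * lookahead so the hardware knows what to prefetch next.) */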
3212
3213        mir_foreach_block(ctx, _block) {
3214                midgard_block *block = (midgard_block *) _block;
3215                mir_foreach_bundle_in_block(block, bundle) {
3216                        int lookahead = 1;
3217
3218                        if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
3219                                lookahead = source_order_bundles[current_bundle + 1]->tag;
3220
3221                        emit_binary_bundle(ctx, block, bundle, binary, lookahead);
3222                        ++current_bundle;
3223                }
3224
3225                /* TODO: Free deeper */
3226                //util_dynarray_fini(&block->instructions);
3227        }
3228
3229        free(source_order_bundles);
3230
3231        /* Report the very first tag executed */
3232        info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0);
3233
3234        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
3235
3236        if ((midgard_debug & MIDGARD_DBG_SHADERS) &&
3237            ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) {
3238                disassemble_midgard(stdout, binary->data,
3239                                    binary->size, inputs->gpu_id,
3240                                    midgard_debug & MIDGARD_DBG_VERBOSE);
3241                fflush(stdout);
3242        }
3243
3244        /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults;
3245         * work around this by adding some padding to the end of the shader.
3246         * (The kernel makes sure shader BOs can't cross 16MB boundaries.) */
3247        if (binary->size)
3248                memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);
3249
3250        if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) &&
3251            !nir->info.internal) {
3252                unsigned nr_bundles = 0, nr_ins = 0;
3253
3254                /* Count instructions and bundles */
3255
3256                mir_foreach_block(ctx, _block) {
3257                        midgard_block *block = (midgard_block *) _block;
3258                        nr_bundles += util_dynarray_num_elements(
3259                                              &block->bundles, midgard_bundle);
3260
3261                        mir_foreach_bundle_in_block(block, bun)
3262                                nr_ins += bun->instruction_count;
3263                }
3264
3265                /* Calculate thread count. The hardware lowers the thread
3266                 * count at certain work register count cutoffs */
3267
3268                unsigned nr_registers = info->work_reg_count;
3269
3270                unsigned nr_threads =
3271                        (nr_registers <= 4) ? 4 :
3272                        (nr_registers <= 8) ? 2 :
3273                        1;
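                /* (So, e.g., a shader fitting in 4 work registers can run 4
                 * threads, one needing 6 registers can run 2, and anything
                 * over 8 registers drops to a single thread.) */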
3274
3275                /* Dump stats */
3276
3277                fprintf(stderr, "%s - %s shader: "
3278                        "%u inst, %u bundles, %u quadwords, "
3279                        "%u registers, %u threads, %u loops, "
3280                        "%u:%u spills:fills\n",
3281                        ctx->nir->info.label ?: "",
3282                        ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
3283                        gl_shader_stage_name(ctx->stage),
3284                        nr_ins, nr_bundles, ctx->quadword_count,
3285                        nr_registers, nr_threads,
3286                        ctx->loop_count,
3287                        ctx->spills, ctx->fills);
3288        }
3289
3290        _mesa_hash_table_u64_destroy(ctx->ssa_constants);
3291        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
3292
3293        ralloc_free(ctx);
3294}
3295