/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "main/mtypes.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"

#include "disassemble.h"
#include "bifrost_compile.h"
#include "compiler.h"
#include "bi_quirks.h"
#include "bi_builder.h"
#include "bifrost_nir.h"

static const struct debug_named_value bifrost_debug_options[] = {
        {"msgs",       BIFROST_DBG_MSGS,       "Print debug messages"},
        {"shaders",    BIFROST_DBG_SHADERS,    "Dump shaders in NIR and MIR"},
        {"shaderdb",   BIFROST_DBG_SHADERDB,   "Print statistics"},
        {"verbose",    BIFROST_DBG_VERBOSE,    "Disassemble verbosely"},
        {"internal",   BIFROST_DBG_INTERNAL,   "Dump even internal shaders"},
        {"nosched",    BIFROST_DBG_NOSCHED,    "Force trivial bundling"},
        {"inorder",    BIFROST_DBG_INORDER,    "Force in-order bundling"},
        {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
        {"noopt",      BIFROST_DBG_NOOPT,      "Skip optimization passes"},
        DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)

/* How many bytes are prefetched by the Bifrost shader core. Beyond the final
 * clause of the shader, this range must contain valid instructions or zeroes. */
#define BIFROST_SHADER_PREFETCH 128

int bifrost_debug = 0;

#define DBG(fmt, ...) \
        do { if (bifrost_debug & BIFROST_DBG_MSGS) \
                fprintf(stderr, "%s:%d: "fmt, \
                        __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)

static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);

static void
bi_block_add_successor(bi_block *block, bi_block *successor)
{
        assert(block != NULL && successor != NULL);

        /* Cull impossible edges */
        if (block->unconditional_jumps)
                return;

        for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
                if (block->successors[i]) {
                        if (block->successors[i] == successor)
                                return;
                        else
                                continue;
                }

                block->successors[i] = successor;
                _mesa_set_add(successor->predecessors, block);
                return;
        }

        unreachable("Too many successors");
}

static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
{
        bi_instr *branch = bi_jump(b, bi_zero());

        switch (instr->type) {
        case nir_jump_break:
                branch->branch_target = b->shader->break_block;
                break;
        case nir_jump_continue:
                branch->branch_target = b->shader->continue_block;
                break;
        default:
                unreachable("Unhandled jump type");
        }

        bi_block_add_successor(b->shader->current_block, branch->branch_target);
        b->shader->current_block->unconditional_jumps = true;
}

static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
                return bi_register(61);

        /* Need to put the sample ID in the top 16 bits */
        case nir_intrinsic_load_barycentric_at_sample:
                return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
                                bi_half(bi_src_index(&intr->src[0]), false));

        /* Interpret as 8:8 signed fixed point positions in pixels along X and
         * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
         * is the center of the pixel so we first fix up and then convert. For
         * fp16 input:
         *
         * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
         * f2i16((256 * (x, y)) + (128, 128)) =
         * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
         *
         * For fp32 input, that lacks enough precision for MSAA 16x, but the
         * idea is the same. FIXME: still doesn't pass
         */
        case nir_intrinsic_load_barycentric_at_offset: {
                bi_index offset = bi_src_index(&intr->src[0]);
                bi_index f16 = bi_null();
                unsigned sz = nir_src_bit_size(intr->src[0]);

                if (sz == 16) {
                        f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
                                        bi_imm_f16(128.0), BI_ROUND_NONE);
                } else {
                        assert(sz == 32);
                        bi_index f[2];
                        for (unsigned i = 0; i < 2; ++i) {
                                f[i] = bi_fadd_rscale_f32(b,
                                                bi_word(offset, i),
                                                bi_imm_f32(0.5), bi_imm_u32(8),
                                                BI_ROUND_NONE, BI_SPECIAL_NONE);
                        }

                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
                }

                return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
        }

        case nir_intrinsic_load_barycentric_pixel:
        default:
                return bi_dontcare();
        }
}

static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_load_barycentric_centroid:
                return BI_SAMPLE_CENTROID;
        case nir_intrinsic_load_barycentric_sample:
        case nir_intrinsic_load_barycentric_at_sample:
                return BI_SAMPLE_SAMPLE;
        case nir_intrinsic_load_barycentric_at_offset:
                return BI_SAMPLE_EXPLICIT;
        case nir_intrinsic_load_barycentric_pixel:
        default:
                return BI_SAMPLE_CENTER;
        }
}

/* Maps a NIR ALU type to a register format; auto and 64-bit are omitted */
static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
{
        switch (T) {
        case nir_type_float16: return BI_REGISTER_FORMAT_F16;
        case nir_type_float32: return BI_REGISTER_FORMAT_F32;
        case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
        case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
        case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
        case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
        default: unreachable("Invalid type for register format");
        }
}

/* Checks if the _IMM variant of an intrinsic can be used, returning in
 * *immediate the immediate to be used (which applies even if the _IMM variant
 * cannot be used) */

static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
{
        nir_src *offset = nir_get_io_offset_src(instr);

        if (!nir_src_is_const(*offset))
                return false;

        *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
        return (*immediate) < max;
}
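
/* For example (illustrative, not from the original source): a load with
 * nir_intrinsic_base 2 and a constant offset of 1 yields *immediate = 3,
 * within the 16-slot limit bi_emit_load_attr passes as max, so the _IMM
 * variant is selected. */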

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize);

/* Bifrost's load instructions lack a component offset despite operating in
 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
 * but they may be unavoidable with separate shaders in use. To solve this, we
 * lower to a larger load and an explicit copy of the desired components. */

static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
{
        unsigned component = nir_intrinsic_component(instr);

        if (component == 0)
                return;

        bi_index srcs[] = { tmp, tmp, tmp, tmp };
        unsigned channels[] = { component, component + 1, component + 2 };

        bi_make_vec_to(b,
                        bi_dest_index(&instr->dest),
                        srcs, channels, instr->num_components,
                        nir_dest_bit_size(instr->dest));
}
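
/* Worked example (illustrative): a vec2 load with component = 1 is widened to
 * a 3-component load into a temporary, and the copy above then moves channels
 * 1 and 2 of the temporary into the real destination. */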

static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_alu_type T = nir_intrinsic_dest_type(instr);
        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        unsigned imm_index = 0;
        unsigned base = nir_intrinsic_base(instr);
        bool constant = nir_src_is_const(*offset);
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        if (immediate) {
                bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
                                regfmt, vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(&instr->src[0]);

                if (constant)
                        idx = bi_imm_u32(imm_index);
                else if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
                                idx, regfmt, vecsize);
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum bi_sample sample = BI_SAMPLE_CENTER;
        enum bi_update update = BI_UPDATE_STORE;
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
        bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
        bi_index src0 = bi_null();

        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        unsigned sz = nir_dest_bit_size(instr->dest);

        if (smooth) {
                nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
                assert(parent);

                sample = bi_interp_for_intrinsic(parent->intrinsic);
                src0 = bi_varying_src0_for_barycentric(b, parent);

                assert(sz == 16 || sz == 32);
                regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
                        : BI_REGISTER_FORMAT_F32;
        } else {
                assert(sz == 32);
                regfmt = BI_REGISTER_FORMAT_U32;
        }

        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);

        if (immediate && smooth) {
                bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                vecsize, imm_index);
        } else if (immediate && !smooth) {
                bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
                                vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(offset);
                unsigned base = nir_intrinsic_base(instr);

                if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                if (smooth) {
                        bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                        update, vecsize);
                } else {
                        bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
                                        regfmt, vecsize);
                }
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
                unsigned *channel, unsigned count)
{
        for (unsigned i = 0; i < count; i += 2) {
                bool next = (i + 1) < count;

                unsigned chan = channel ? channel[i] : 0;
                unsigned nextc = next && channel ? channel[i + 1] : 0;

                bi_index w0 = bi_word(src[i], chan >> 1);
                bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();

                bi_index h0 = bi_half(w0, chan & 1);
                bi_index h1 = bi_half(w1, nextc & 1);

                bi_index to = bi_word(dst, i >> 1);

                if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
                        bi_mov_i32_to(b, to, w0);
                else if (bi_is_word_equiv(w0, w1))
                        bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
                else
                        bi_mkvec_v2i16_to(b, to, h0, h1);
        }
}
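
/* E.g. (illustrative): 16-bit channel 3 of a source lives in half 1 of word
 * 1, since each 32-bit word holds two halves. When both halves of an output
 * word come from the same source word, the loop above emits a cheaper MOV or
 * SWZ instead of MKVEC. */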

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize)
{
        /* If we read our own output, we need a temporary move to allow for
         * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
         * vectors */
        bool reads_self = false;

        for (unsigned i = 0; i < count; ++i)
                reads_self |= bi_is_equiv(final_dst, src[i]);

        /* SSA can't read itself */
        assert(!reads_self || final_dst.reg);

        bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;

        if (bitsize == 32) {
                for (unsigned i = 0; i < count; ++i) {
                        bi_mov_i32_to(b, bi_word(dst, i),
                                        bi_word(src[i], channel ? channel[i] : 0));
                }
        } else if (bitsize == 16) {
                bi_make_vec16_to(b, dst, src, channel, count);
        } else if (bitsize == 8 && count == 1) {
                bi_swz_v4i8_to(b, dst, bi_byte(
                                        bi_word(src[0], channel[0] >> 2),
                                        channel[0] & 3));
        } else {
                unreachable("8-bit mkvec not yet supported");
        }

        /* Emit an explicit copy if needed */
        if (!bi_is_equiv(dst, final_dst)) {
                unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0;
                unsigned vec = (1 << shift);

                for (unsigned i = 0; i < count; i += vec) {
                        bi_mov_i32_to(b, bi_word(final_dst, i >> shift),
                                        bi_word(dst, i >> shift));
                }
        }
}

static bi_instr *
bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
                unsigned nr_components, unsigned offset)
{
        unsigned sysval_ubo =
                MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
        unsigned uniform =
                pan_lookup_sysval(b->shader->sysval_to_id,
                                  &b->shader->info->sysvals,
                                  sysval);
        unsigned idx = (uniform * 16) + offset;

        return bi_load_to(b, nr_components * 32, dest,
                        bi_imm_u32(idx),
                        bi_imm_u32(sysval_ubo), BI_SEG_UBO);
}
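
/* Each sysval occupies a vec4 (16-byte) slot in the sysval UBO, hence the
 * (uniform * 16) + offset byte address above; e.g. (illustrative) the second
 * word of sysval slot 3 is loaded from byte 3 * 16 + 4. */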

static void
bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
                unsigned nr_components, unsigned offset)
{
        bi_load_sysval_to(b, bi_dest_index(&intr->dest),
                        panfrost_sysval_for_instr(&intr->instr, NULL),
                        nr_components, offset);
}

static bi_index
bi_load_sysval(bi_builder *b, int sysval,
                unsigned nr_components, unsigned offset)
{
        bi_index tmp = bi_temp(b->shader);
        bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
        return tmp;
}

static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
{
        /* r61[16:23] contains the sample ID, mask it out. Upper bits seem to
         * read garbage (despite being architecturally defined as zero), so use
         * a 5-bit mask instead of 8 bits */

        bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
                                bi_imm_u8(16));
}
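
/* Net effect, assuming RSHIFT_AND computes (s0 >> shift) & mask:
 * sample_id = (r61 >> 16) & 0x1f */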

static bi_index
bi_load_sample_id(bi_builder *b)
{
        bi_index sample_id = bi_temp(b->shader);
        bi_load_sample_id_to(b, sample_id);
        return sample_id;
}

static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
{
        /* We want to load the current pixel. */
        struct bifrost_pixel_indices pix = {
                .y = BIFROST_CURRENT_PIXEL,
                .rt = rt
        };

        uint32_t indices_u32 = 0;
        memcpy(&indices_u32, &pix, sizeof(indices_u32));
        bi_index indices = bi_imm_u32(indices_u32);

        /* Sample index above is left as zero. For multisampling, we need to
         * fill in the actual sample ID in the lower byte */

        if (b->shader->inputs->blend.nr_samples > 1)
                indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);

        return indices;
}

static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
        ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);

        /* Source color is passed through r0-r3, or r4-r7 for the second
         * source when dual-source blending.  TODO: Precolour instead */
        bi_index srcs[] = {
                bi_register(0), bi_register(1), bi_register(2), bi_register(3)
        };
        bi_index srcs2[] = {
                bi_register(4), bi_register(5), bi_register(6), bi_register(7)
        };

        bool second_source = (sem.location == VARYING_SLOT_VAR0);

        bi_make_vec_to(b, bi_dest_index(&instr->dest),
                       second_source ? srcs2 : srcs,
                       NULL, 4, 32);
}

static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
{
        /* Reads 2 or 4 staging registers to cover the input */
        unsigned size = nir_alu_type_get_type_size(T);
        unsigned sr_count = (size <= 16) ? 2 : 4;
        const struct panfrost_compile_inputs *inputs = b->shader->inputs;
        uint64_t blend_desc = inputs->blend.bifrost_blend_desc;

        if (inputs->is_blend && inputs->blend.nr_samples > 1) {
                /* Conversion descriptor comes from the compile inputs, pixel
                 * indices derived at run time based on sample ID */
                bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60),
                                bi_imm_u32(blend_desc >> 32), BI_VECSIZE_V4);
        } else if (b->shader->inputs->is_blend) {
                /* Blend descriptor comes from the compile inputs */
                /* Put the result in r0 */
                bi_blend_to(b, bi_register(0), rgba,
                                bi_register(60),
                                bi_imm_u32(blend_desc & 0xffffffff),
                                bi_imm_u32(blend_desc >> 32), sr_count);
        } else {
                /* Blend descriptor comes from the FAU RAM. By convention, the
                 * return address is stored in r48 and will be used by the
                 * blend shader to jump back to the fragment shader after */
                bi_blend_to(b, bi_register(48), rgba,
                                bi_register(60),
                                bi_fau(BIR_FAU_BLEND_0 + rt, false),
                                bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);
        }

        assert(rt < 8);
        b->shader->info->bifrost.blend[rt].type = T;
}
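
/* Summarizing the three cases above (sketch): multisampled blend shaders
 * store through ST_TILE with the descriptor's high word, single sampled blend
 * shaders consume the descriptor baked into the compile inputs as immediates,
 * and ordinary fragment shaders fetch the per-render-target descriptor from
 * FAU at BIR_FAU_BLEND_0 + rt (low word, then high word). */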

/* Blend shaders do not need to run ATEST since they are dependent on a
 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
 * there are no writes to the coverage mask. The latter two are satisfied for
 * all blit shaders, so we just care about early-z, which blit shaders force
 * iff they do not write depth or stencil */

static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
{
        return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
}

static void
bi_emit_atest(bi_builder *b, bi_index alpha)
{
        bi_index coverage = bi_register(60);
        bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
        b->shader->emitted_atest = true;

        /* Pseudo-source to encode in the tuple */
        atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
}

static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
{
        bool combined = instr->intrinsic ==
                nir_intrinsic_store_combined_output_pan;

        unsigned writeout = combined ? nir_intrinsic_component(instr) :
                PAN_WRITEOUT_C;

        bool emit_blend = writeout & (PAN_WRITEOUT_C);
        bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);

        const nir_variable *var =
                nir_find_variable_with_driver_location(b->shader->nir,
                                nir_var_shader_out, nir_intrinsic_base(instr));
        assert(var);

        unsigned loc = var->data.location;
        bi_index src0 = bi_src_index(&instr->src[0]);

        /* By ISA convention, the coverage mask is stored in R60. The store
         * itself will be handled by a subsequent ATEST instruction */
        if (loc == FRAG_RESULT_SAMPLE_MASK) {
                bi_index orig = bi_register(60);
                bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
                bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
                bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
                return;
        }

        /* Dual-source blending is implemented by putting the color in
         * registers r4-r7. */
        if (var->data.index) {
                unsigned count = nir_src_num_components(instr->src[0]);

                for (unsigned i = 0; i < count; ++i)
                        bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));

                b->shader->info->bifrost.blend_src1_type =
                        nir_intrinsic_src_type(instr);

                return;
        }

        /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
         * value, but render target #0 might not be floating point. However the
         * alpha value is only used for alpha-to-coverage, a stage which is
         * skipped for pure integer framebuffers, so the issue is moot. */

        if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
                nir_alu_type T = nir_intrinsic_src_type(instr);

                bi_index rgba = bi_src_index(&instr->src[0]);
                bi_index alpha =
                        (T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :
                        (T == nir_type_float32) ? bi_word(rgba, 3) :
                        bi_dontcare();

                /* Don't read out-of-bounds */
                if (nir_src_num_components(instr->src[0]) < 4)
                        alpha = bi_imm_f32(1.0);

                bi_emit_atest(b, alpha);
        }

        if (emit_zs) {
                bi_index z = { 0 }, s = { 0 };

                if (writeout & PAN_WRITEOUT_Z)
                        z = bi_src_index(&instr->src[2]);

                if (writeout & PAN_WRITEOUT_S)
                        s = bi_src_index(&instr->src[3]);

                bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
                                writeout & PAN_WRITEOUT_S,
                                writeout & PAN_WRITEOUT_Z);
        }

        if (emit_blend) {
                assert(loc >= FRAG_RESULT_DATA0);

                unsigned rt = (loc - FRAG_RESULT_DATA0);
                bi_index color = bi_src_index(&instr->src[0]);

                /* Explicit copy since BLEND inputs are precoloured to R0-R3,
                 * TODO: maybe schedule around this or implement in RA as a
                 * spill */
                bool has_mrt = false;

                nir_foreach_shader_out_variable(var, b->shader->nir)
                        has_mrt |= (var->data.location > FRAG_RESULT_DATA0);

                if (has_mrt) {
                        bi_index srcs[4] = { color, color, color, color };
                        unsigned channels[4] = { 0, 1, 2, 3 };
                        color = bi_temp(b->shader);
                        bi_make_vec_to(b, color, srcs, channels,
                                       nir_src_num_components(instr->src[0]),
                                       nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
                }

                bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);
        }

        if (b->shader->inputs->is_blend) {
                /* Jump back to the fragment shader, return address is stored
                 * in r48 (see above).
                 */
                bi_jump(b, bi_register(48));
        }
}

static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        /* In principle we can do better for 16-bit. At the moment we require
         * 32-bit to permit the use of .auto, in order to force .u32 for flat
         * varyings, to handle internal TGSI shaders that set flat in the VS
         * but smooth in the FS */

        ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
        assert(nir_alu_type_get_type_size(T) == 32);
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;

        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);

        bi_index address;
        if (immediate) {
                address = bi_lea_attr_imm(b,
                                          bi_register(61), bi_register(62),
                                          regfmt, imm_index);
        } else {
                bi_index idx =
                        bi_iadd_u32(b,
                                    bi_src_index(nir_get_io_offset_src(instr)),
                                    bi_imm_u32(nir_intrinsic_base(instr)),
                                    false);
                address = bi_lea_attr(b,
                                      bi_register(61), bi_register(62),
                                      idx, regfmt);
        }

        /* Only look at the total components needed. In effect, we fill in all
         * the intermediate "holes" in the write mask, since we can't mask off
         * stores. Since nir_lower_io_to_temporaries ensures each varying is
         * written at most once, anything that's masked out is undefined, so it
         * doesn't matter what we write there. So we may as well do the
         * simplest thing possible. */
        unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
        assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));

        bi_st_cvt(b, bi_src_index(&instr->src[0]), address,
                        bi_word(address, 1), bi_word(address, 2),
                        regfmt, nr - 1);
}

static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_src *offset = nir_get_io_offset_src(instr);

        bool offset_is_const = nir_src_is_const(*offset);
        bi_index dyn_offset = bi_src_index(offset);
        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
        bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);

        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
                        bi_dest_index(&instr->dest), offset_is_const ?
                        bi_imm_u32(const_offset) : dyn_offset,
                        kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),
                        BI_SEG_UBO);
}

static bi_index
bi_addr_high(nir_src *src)
{
        return (nir_src_bit_size(*src) == 64) ?
                bi_word(bi_src_index(src), 1) : bi_zero();
}

static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
                   bi_dest_index(&instr->dest),
                   bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),
                   seg);
}

static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
        /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
        assert(nir_intrinsic_write_mask(instr) ==
                        BITFIELD_MASK(instr->num_components));

        bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
                    bi_src_index(&instr->src[0]),
                    bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),
                    seg);
}

/* Exchanges the staging register with memory */

static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
{
        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

        unsigned sz = nir_src_bit_size(*arg);
        assert(sz == 32 || sz == 64);

        bi_index data = bi_src_index(arg);

        bi_index data_words[] = {
                bi_word(data, 0),
                bi_word(data, 1),
        };

        bi_index inout = bi_temp_reg(b->shader);
        bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);

        bi_axchg_to(b, sz, inout, inout,
                        bi_word(addr, 0),
                        (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
                        seg);

        bi_index inout_words[] = {
                bi_word(inout, 0),
                bi_word(inout, 1),
        };

        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}

/* Exchanges the second staging register with memory if comparison with first
 * staging register passes */

static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
{
        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

        /* The hardware's operand order is swapped relative to NIR's */
        bi_index src0 = bi_src_index(arg_2);
        bi_index src1 = bi_src_index(arg_1);

        unsigned sz = nir_src_bit_size(*arg_1);
        assert(sz == 32 || sz == 64);

        bi_index data_words[] = {
                bi_word(src0, 0),
                sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),

                /* 64-bit */
                bi_word(src1, 0),
                bi_word(src1, 1),
        };

        bi_index inout = bi_temp_reg(b->shader);
        bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);

        bi_acmpxchg_to(b, sz, inout, inout,
                        bi_word(addr, 0),
                        (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
                        seg);

        bi_index inout_words[] = {
                bi_word(inout, 0),
                bi_word(inout, 1),
        };

        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}

/* Extracts an atomic opcode */

static enum bi_atom_opc
bi_atom_opc_for_nir(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_add:
        case nir_intrinsic_shared_atomic_add:
        case nir_intrinsic_image_atomic_add:
                return BI_ATOM_OPC_AADD;

        case nir_intrinsic_global_atomic_imin:
        case nir_intrinsic_shared_atomic_imin:
        case nir_intrinsic_image_atomic_imin:
                return BI_ATOM_OPC_ASMIN;

        case nir_intrinsic_global_atomic_umin:
        case nir_intrinsic_shared_atomic_umin:
        case nir_intrinsic_image_atomic_umin:
                return BI_ATOM_OPC_AUMIN;

        case nir_intrinsic_global_atomic_imax:
        case nir_intrinsic_shared_atomic_imax:
        case nir_intrinsic_image_atomic_imax:
                return BI_ATOM_OPC_ASMAX;

        case nir_intrinsic_global_atomic_umax:
        case nir_intrinsic_shared_atomic_umax:
        case nir_intrinsic_image_atomic_umax:
                return BI_ATOM_OPC_AUMAX;

        case nir_intrinsic_global_atomic_and:
        case nir_intrinsic_shared_atomic_and:
        case nir_intrinsic_image_atomic_and:
                return BI_ATOM_OPC_AAND;

        case nir_intrinsic_global_atomic_or:
        case nir_intrinsic_shared_atomic_or:
        case nir_intrinsic_image_atomic_or:
                return BI_ATOM_OPC_AOR;

        case nir_intrinsic_global_atomic_xor:
        case nir_intrinsic_shared_atomic_xor:
        case nir_intrinsic_image_atomic_xor:
                return BI_ATOM_OPC_AXOR;

        default:
                unreachable("Unexpected computational atomic");
        }
}

/* Optimized unary atomics are available with an implied #1 argument */

static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
{
        /* Check we have a compatible constant */
        if (arg.type != BI_INDEX_CONSTANT)
                return false;

        if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
                return false;

        /* Check for a compatible operation */
        switch (op) {
        case BI_ATOM_OPC_AADD:
                *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
                return true;
        case BI_ATOM_OPC_ASMAX:
                *out = BI_ATOM_OPC_ASMAX1;
                return true;
        case BI_ATOM_OPC_AUMAX:
                *out = BI_ATOM_OPC_AUMAX1;
                return true;
        case BI_ATOM_OPC_AOR:
                *out = BI_ATOM_OPC_AOR1;
                return true;
        default:
                return false;
        }
}
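
/* E.g. (illustrative): an atomic add of constant 1 promotes to AINC and of
 * constant -1 to ADEC; other values fall back to the general ATOM_C path with
 * the argument in a staging register. */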

/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */

static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
                    unsigned coord_comps, bool is_array)
{
        assert(coord_comps > 0 && coord_comps <= 3);

        if (src_idx == 0) {
                if (coord_comps == 1 || (coord_comps == 2 && is_array))
                        return bi_word(coord, 0);
                else
                        return bi_mkvec_v2i16(b,
                                              bi_half(bi_word(coord, 0), false),
                                              bi_half(bi_word(coord, 1), false));
        } else {
                if (coord_comps == 3)
                        return bi_word(coord, 2);
                else if (coord_comps == 2 && is_array)
                        return bi_word(coord, 1);
                else
                        return bi_zero();
        }
}
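
/* E.g. (illustrative): for a 2D array image (coord_comps = 3), src_idx = 0
 * packs (x, y) into a v2i16 and src_idx = 1 returns the array layer; for a 1D
 * array, x and the layer each occupy a full source slot. */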

static bi_index
bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_src src = instr->src[0];
        bi_index index = bi_src_index(&src);
        bi_context *ctx = b->shader;

        /* Images come after vertex attributes, so handle an explicit offset */
        unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
                util_bitcount64(ctx->nir->info.inputs_read) : 0;

        if (offset == 0)
                return index;
        else if (nir_src_is_const(src))
                return bi_imm_u32(nir_src_as_uint(src) + offset);
        else
                return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
}

static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
{
        ASSERTED enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
        bool array = nir_intrinsic_image_array(instr);

        bi_index coords = bi_src_index(&instr->src[1]);

        /* TODO: MSAA */
        assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");

        bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),
                          bi_emit_image_coord(b, coords, 0, coord_comps, array),
                          bi_emit_image_coord(b, coords, 1, coord_comps, array),
                          bi_emit_image_index(b, instr),
                          bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),
                          instr->num_components - 1);
}

static bi_index
bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
{
        ASSERTED enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        bool array = nir_intrinsic_image_array(instr);
        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);

        /* TODO: MSAA */
        assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");

        enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
                bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
                BI_REGISTER_FORMAT_AUTO;

        bi_index coords = bi_src_index(&instr->src[1]);
        bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
        bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);

        bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,
                        bi_emit_image_index(b, instr), type);

        /* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI
         * has all images in the primary attribute table */
        I->table = BI_TABLE_ATTRIBUTE_1;

        return I->dest[0];
}
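
/* Note the returned index spans multiple words: ST_CVT in bi_emit_image_store
 * below consumes words 0-2, i.e. the 64-bit address plus a third word
 * (presumably conversion state produced by LEA_ATTR_TEX). */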

static void
bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index addr = bi_emit_lea_image(b, instr);

        bi_st_cvt(b, bi_src_index(&instr->src[3]),
                     addr, bi_word(addr, 1), bi_word(addr, 2),
                     bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
                     instr->num_components - 1);
}

static void
bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
                bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
{
        /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
         * take any vector but can still output in RETURN mode */
        bi_index sr = bi_temp_reg(b->shader);

        enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
        enum bi_atom_opc post_opc = opc;

        /* Generate either ATOM_C or ATOM_C1 as required */
        if (bi_promote_atom_c1(opc, arg, &opc)) {
                bi_patom_c1_i32_to(b, sr, bi_word(addr, 0),
                                bi_word(addr, 1), opc, 2);
        } else {
                bi_mov_i32_to(b, sr, arg);
                bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),
                                bi_word(addr, 1), opc, 2);
        }

        /* Post-process it */
        bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);
}

/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
 * gl_FragCoord.z = ld_vary(fragz)
 * gl_FragCoord.w = ld_vary(fragw)
 */

static void
bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index src[4] = {};

        for (unsigned i = 0; i < 2; ++i) {
                src[i] = bi_fadd_f32(b,
                                bi_u16_to_f32(b, bi_half(bi_register(59), i)),
                                bi_imm_f32(0.5f), BI_ROUND_NONE);
        }

        for (unsigned i = 0; i < 2; ++i) {
                src[2 + i] = bi_ld_var_special(b, bi_zero(),
                                BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
                                BI_UPDATE_CLOBBER,
                                (i == 0) ? BI_VARYING_NAME_FRAG_Z :
                                        BI_VARYING_NAME_FRAG_W,
                                BI_VECSIZE_NONE);
        }

        bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
}

static void
bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
{
        unsigned rt = b->shader->inputs->blend.rt;
        unsigned size = nir_dest_bit_size(instr->dest);

        /* Get the render target */
        if (!b->shader->inputs->is_blend) {
                const nir_variable *var =
                        nir_find_variable_with_driver_location(b->shader->nir,
                                        nir_var_shader_out, nir_intrinsic_base(instr));
                unsigned loc = var->data.location;
                assert(loc >= FRAG_RESULT_DATA0);
                rt = (loc - FRAG_RESULT_DATA0);
        }

        bi_index desc = b->shader->inputs->is_blend ?
                bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
                b->shader->inputs->bifrost.static_rt_conv ?
                bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
                bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);

        bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_pixel_indices(b, rt),
                        bi_register(60), desc, (instr->num_components - 1));
}

static void
bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
                bi_dest_index(&instr->dest) : bi_null();
        gl_shader_stage stage = b->shader->stage;

        switch (instr->intrinsic) {
        case nir_intrinsic_load_barycentric_pixel:
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
        case nir_intrinsic_load_barycentric_at_sample:
        case nir_intrinsic_load_barycentric_at_offset:
                /* handled later via load_vary */
                break;
        case nir_intrinsic_load_interpolated_input:
        case nir_intrinsic_load_input:
                if (b->shader->inputs->is_blend)
                        bi_emit_load_blend_input(b, instr);
                else if (stage == MESA_SHADER_FRAGMENT)
                        bi_emit_load_vary(b, instr);
                else if (stage == MESA_SHADER_VERTEX)
                        bi_emit_load_attr(b, instr);
                else
                        unreachable("Unsupported shader stage");
                break;

        case nir_intrinsic_store_output:
                if (stage == MESA_SHADER_FRAGMENT)
                        bi_emit_fragment_out(b, instr);
                else if (stage == MESA_SHADER_VERTEX)
                        bi_emit_store_vary(b, instr);
                else
                        unreachable("Unsupported shader stage");
                break;

        case nir_intrinsic_store_combined_output_pan:
                assert(stage == MESA_SHADER_FRAGMENT);
                bi_emit_fragment_out(b, instr);
                break;

        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_kernel_input:
                bi_emit_load_ubo(b, instr);
                break;

        case nir_intrinsic_load_global:
        case nir_intrinsic_load_global_constant:
                bi_emit_load(b, instr, BI_SEG_NONE);
                break;

        case nir_intrinsic_store_global:
                bi_emit_store(b, instr, BI_SEG_NONE);
                break;

        case nir_intrinsic_load_scratch:
                bi_emit_load(b, instr, BI_SEG_TL);
                break;

        case nir_intrinsic_store_scratch:
                bi_emit_store(b, instr, BI_SEG_TL);
                break;

        case nir_intrinsic_load_shared:
                bi_emit_load(b, instr, BI_SEG_WLS);
                break;

        case nir_intrinsic_store_shared:
                bi_emit_store(b, instr, BI_SEG_WLS);
                break;

        /* The blob compiler doesn't seem to emit anything for memory
         * barriers; note that +BARRIER is illegal in fragment shaders */
1181        case nir_intrinsic_memory_barrier:
1182        case nir_intrinsic_memory_barrier_buffer:
1183        case nir_intrinsic_memory_barrier_image:
1184        case nir_intrinsic_memory_barrier_shared:
1185        case nir_intrinsic_group_memory_barrier:
1186                break;
1187
1188        case nir_intrinsic_control_barrier:
1189                assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1190                bi_barrier(b);
1191                break;
1192
1193        case nir_intrinsic_shared_atomic_add:
1194        case nir_intrinsic_shared_atomic_imin:
1195        case nir_intrinsic_shared_atomic_umin:
1196        case nir_intrinsic_shared_atomic_imax:
1197        case nir_intrinsic_shared_atomic_umax:
1198        case nir_intrinsic_shared_atomic_and:
1199        case nir_intrinsic_shared_atomic_or:
1200        case nir_intrinsic_shared_atomic_xor: {
1201                assert(nir_src_bit_size(instr->src[1]) == 32);
1202
1203                bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),
1204                                bi_zero(), false, BI_SEG_WLS);
1205
1206                bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
1207                                instr->intrinsic);
1208                break;
1209        }
1210
1211        case nir_intrinsic_image_atomic_add:
1212        case nir_intrinsic_image_atomic_imin:
1213        case nir_intrinsic_image_atomic_umin:
1214        case nir_intrinsic_image_atomic_imax:
1215        case nir_intrinsic_image_atomic_umax:
1216        case nir_intrinsic_image_atomic_and:
1217        case nir_intrinsic_image_atomic_or:
1218        case nir_intrinsic_image_atomic_xor:
1219                assert(nir_src_bit_size(instr->src[3]) == 32);
1220
1221                bi_emit_atomic_i32_to(b, dst,
1222                                bi_emit_lea_image(b, instr),
1223                                bi_src_index(&instr->src[3]),
1224                                instr->intrinsic);
1225                break;
1226
1227        case nir_intrinsic_global_atomic_add:
1228        case nir_intrinsic_global_atomic_imin:
1229        case nir_intrinsic_global_atomic_umin:
1230        case nir_intrinsic_global_atomic_imax:
1231        case nir_intrinsic_global_atomic_umax:
1232        case nir_intrinsic_global_atomic_and:
1233        case nir_intrinsic_global_atomic_or:
1234        case nir_intrinsic_global_atomic_xor:
1235                assert(nir_src_bit_size(instr->src[1]) == 32);
1236
1237                bi_emit_atomic_i32_to(b, dst,
1238                                bi_src_index(&instr->src[0]),
1239                                bi_src_index(&instr->src[1]),
1240                                instr->intrinsic);
1241                break;
1242
1243        case nir_intrinsic_image_load:
1244                bi_emit_image_load(b, instr);
1245                break;
1246
1247        case nir_intrinsic_image_store:
1248                bi_emit_image_store(b, instr);
1249                break;
1250
1251        case nir_intrinsic_global_atomic_exchange:
1252                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1253                                &instr->src[1], BI_SEG_NONE);
1254                break;
1255
1256        case nir_intrinsic_image_atomic_exchange:
1257                bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
1258                                &instr->src[3], BI_SEG_NONE);
1259                break;
1260
1261        case nir_intrinsic_shared_atomic_exchange:
1262                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1263                                &instr->src[1], BI_SEG_WLS);
1264                break;
1265
1266        case nir_intrinsic_global_atomic_comp_swap:
1267                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1268                                &instr->src[1], &instr->src[2], BI_SEG_NONE);
1269                break;
1270
1271        case nir_intrinsic_image_atomic_comp_swap:
1272                bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
1273                                &instr->src[3], &instr->src[4], BI_SEG_NONE);
1274                break;
1275
1276        case nir_intrinsic_shared_atomic_comp_swap:
1277                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1278                                &instr->src[1], &instr->src[2], BI_SEG_WLS);
1279                break;
1280
1281        case nir_intrinsic_load_frag_coord:
1282                bi_emit_load_frag_coord(b, instr);
1283                break;
1284
1285        case nir_intrinsic_load_output:
1286                bi_emit_ld_tile(b, instr);
1287                break;
1288
1289        case nir_intrinsic_discard_if: {
1290                bi_index src = bi_src_index(&instr->src[0]);
1291                assert(nir_src_bit_size(instr->src[0]) == 1);
1292                bi_discard_b32(b, bi_half(src, false));
1293                break;
1294        }
1295
1296        case nir_intrinsic_discard:
1297                bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1298                break;
1299
1300        case nir_intrinsic_load_ssbo_address:
1301                bi_load_sysval_nir(b, instr, 2, 0);
                break;

        case nir_intrinsic_load_work_dim:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_load_first_vertex:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_load_base_vertex:
                bi_load_sysval_nir(b, instr, 1, 4);
                break;

        case nir_intrinsic_load_base_instance:
                bi_load_sysval_nir(b, instr, 1, 8);
                break;

        case nir_intrinsic_load_draw_id:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_get_ssbo_size:
                bi_load_sysval_nir(b, instr, 1, 8);
                break;

        case nir_intrinsic_load_viewport_scale:
        case nir_intrinsic_load_viewport_offset:
        case nir_intrinsic_load_num_workgroups:
        case nir_intrinsic_load_workgroup_size:
                bi_load_sysval_nir(b, instr, 3, 0);
                break;

        case nir_intrinsic_image_size:
                bi_load_sysval_nir(b, instr,
                                nir_dest_num_components(instr->dest), 0);
                break;

        case nir_intrinsic_load_blend_const_color_rgba:
                bi_load_sysval_nir(b, instr,
                                   nir_dest_num_components(instr->dest), 0);
                break;

        case nir_intrinsic_load_sample_positions_pan:
                bi_mov_i32_to(b, bi_word(dst, 0),
                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));
                bi_mov_i32_to(b, bi_word(dst, 1),
                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
                break;

        case nir_intrinsic_load_sample_mask_in:
                /* r61[0:15] contains the coverage bitmap */
                bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
                break;

        case nir_intrinsic_load_sample_id:
                bi_load_sample_id_to(b, dst);
                break;

        case nir_intrinsic_load_front_face:
                /* r58 == 0 means primitive is front facing */
                bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
                                BI_RESULT_TYPE_M1);
                break;

        case nir_intrinsic_load_point_coord:
                bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
                                BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
                                BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
                break;

        case nir_intrinsic_load_vertex_id_zero_base:
                bi_mov_i32_to(b, dst, bi_register(61));
                break;

        case nir_intrinsic_load_instance_id:
                bi_mov_i32_to(b, dst, bi_register(62));
                break;

        case nir_intrinsic_load_subgroup_invocation:
                bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
                break;

        case nir_intrinsic_load_local_invocation_id:
                for (unsigned i = 0; i < 3; ++i)
                        bi_u16_to_u32_to(b, bi_word(dst, i),
                                         bi_half(bi_register(55 + i / 2), i % 2));
                break;

        case nir_intrinsic_load_workgroup_id:
                for (unsigned i = 0; i < 3; ++i)
                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));
                break;

        case nir_intrinsic_load_global_invocation_id:
        case nir_intrinsic_load_global_invocation_id_zero_base:
                for (unsigned i = 0; i < 3; ++i)
                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));
                break;

        case nir_intrinsic_shader_clock:
                bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
                break;

        default:
                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
                assert(0);
        }
}

static void
bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
{
        /* Make sure we've been lowered */
        assert(instr->def.num_components <= (32 / instr->def.bit_size));

        /* Accumulate all the channels of the constant, as if we did an
         * implicit SEL over them */
        uint32_t acc = 0;

        for (unsigned i = 0; i < instr->def.num_components; ++i) {
                uint32_t v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);

                v = bi_extend_constant(v, instr->def.bit_size);
                acc |= (v << (i * instr->def.bit_size));
        }

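        /* e.g. a v2i16 constant (1, 2) accumulates to
         * 0x0001 | (0x0002 << 16) = 0x00020001, so the whole vector is
         * materialized with the single MOV.i32 below */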
        bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
}

static bi_index
bi_alu_src_index(nir_alu_src src, unsigned comps)
{
        /* Modifiers are handled in the backend rather than lowered in NIR,
         * so sources should never carry negate/abs here */
        assert(!(src.negate || src.abs));

        unsigned bitsize = nir_src_bit_size(src.src);

        /* TODO: Do we need to do something more clever with 1-bit bools? */
        if (bitsize == 1)
                bitsize = 16;

        /* The bi_index carries the 32-bit (word) offset separately from the
         * subword swizzle; handle the word offset first */

        unsigned offset = 0;

        assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
        unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;

        for (unsigned i = 0; i < comps; ++i) {
                unsigned new_offset = (src.swizzle[i] >> subword_shift);

                if (i > 0)
                        assert(offset == new_offset && "wrong vectorization");

                offset = new_offset;
        }

        bi_index idx = bi_word(bi_src_index(&src.src), offset);

        /* Compose the subword swizzle with existing (identity) swizzle */
        assert(idx.swizzle == BI_SWIZZLE_H01);

        /* Bigger vectors should have been lowered */
        assert(comps <= (1 << subword_shift));

        if (bitsize == 16) {
                unsigned c0 = src.swizzle[0] & 1;
                unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
                idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
        } else if (bitsize == 8) {
                /* 8-bit vectors not yet supported */
                assert(comps == 1 && "8-bit vectors not supported");
                assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
                idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
        }

        return idx;
}
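
/* For example, a two-component 16-bit source read with NIR swizzle .yx has
 * swizzle[0] = 1 and swizzle[1] = 0, so c0 = 1 and c1 = 0 above, composing
 * to BI_SWIZZLE_H00 + 0 + (1 << 1) = BI_SWIZZLE_H10, with no word offset
 * since both halves live in word 0 */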

static enum bi_round
bi_nir_round(nir_op op)
{
        switch (op) {
        case nir_op_fround_even: return BI_ROUND_NONE;
        case nir_op_ftrunc: return BI_ROUND_RTZ;
        case nir_op_fceil: return BI_ROUND_RTP;
        case nir_op_ffloor: return BI_ROUND_RTN;
        default: unreachable("invalid nir round op");
        }
}

/* Convenience for lowered transcendentals */

static bi_index
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
{
        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
}
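
/* Note the -0.0 addend: under round-to-nearest, x * y + (-0.0) returns x * y
 * exactly, including the sign of zero results, whereas a +0.0 addend would
 * turn a -0.0 product into +0.0 */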

/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
 * Newton-Raphson to improve precision */
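/* (For reference, one Newton-Raphson step for y = 1/x is
 * x2 = x1 + x1 * (1 - x * x1). The FREXP* ops below split off the exponent
 * so the residual is formed on the mantissa, with the final FMA_RSCALE
 * folding the exponent back in.) */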

static void
bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index x1 = bi_frcp_approx_f32(b, s0);
        bi_index m  = bi_frexpm_f32(b, s0, false, false);
        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
        bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
                        bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
                        BI_ROUND_NONE, BI_SPECIAL_NONE);
}

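/* Similarly approximate with FRSQ_APPROX.f32 and refine with one
 * Newton-Raphson step: for y = 1/sqrt(x), x2 = x1 + x1 * (1 - x * x1^2) / 2,
 * with the divide-by-two seemingly folded into the FMA_RSCALE scale of -1 */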
static void
bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index x1 = bi_frsq_approx_f32(b, s0);
        bi_index m  = bi_frexpm_f32(b, s0, false, true);
        bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
        bi_index t1 = bi_fmul_f32(b, x1, x1);
        bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
                        BI_ROUND_NONE, BI_SPECIAL_N);
}

/* More complex transcendentals, see
 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
 * for documentation */

static void
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index t1 = bi_temp(b->shader);
        bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
                        s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
        t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;

        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);

        bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
                        s0, bi_neg(t2), BI_ROUND_NONE);
        a2->clamp = BI_CLAMP_CLAMP_M1_1;

        bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
        bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
        bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
        bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
                        bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
                        bi_imm_u32(0x3f317218), BI_ROUND_NONE);
        bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
        bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
                        p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
        x->clamp = BI_CLAMP_CLAMP_0_INF;

        bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
        max->sem = BI_SEM_NAN_PROPAGATE;
}

static void
bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
{
        /* Scale by the base, multiply by 2^24, and convert to integer to get
         * an 8:24 fixed-point input */
        bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
                        bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
        bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
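
        /* e.g. with log2_base = 1.0 and s0 = 0.5, scale = 0.5 * 2^24 =
         * 0x00800000, an 8:24 fixed-point 0.5: integer part in the top
         * 8 bits, fraction in the low 24 */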

        /* Compute the result for the fixed-point input, but pass along
         * the floating-point scale for correct NaN propagation */
        bi_fexp_f32_to(b, dst, fixed_pt, scale);
}

static void
bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
        bi_index a1 = bi_frexpm_f32(b, s0, true, false);
        bi_index ei = bi_frexpe_f32(b, s0, true, false);
        bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);

        /* xt estimates -log(r1), a coarse approximation of log(a1) */
        bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
        bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);

        /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
         * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
         * and then log(s0) = x1 + x2 */
        bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);

        /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
         * polynomial approximation around 1. The series is expressed around
         * 1, so set y = (a1 * r1) - 1.0 */
        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);

        /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
         * log_e(1 + y) by the Taylor series (lower precision than the blob):
         * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
        bi_index loge = bi_fmul_f32(b, y,
                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));

        bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));

        /* log(s0) = x1 + x2 */
        bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
}

static void
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index frexp = bi_frexpe_f32(b, s0, true, false);
        bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
        bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
                        BI_ROUND_NONE);
}

static void
bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
        bi_index log2_base = bi_null();

        if (base.type == BI_INDEX_CONSTANT) {
                log2_base = bi_imm_f32(log2f(uif(base.value)));
        } else {
                log2_base = bi_temp(b->shader);
                bi_lower_flog2_32(b, log2_base, base);
        }

        bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
}

static void
bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
        bi_index log2_base = bi_null();

        if (base.type == BI_INDEX_CONSTANT) {
                log2_base = bi_imm_f32(log2f(uif(base.value)));
        } else {
                log2_base = bi_temp(b->shader);
                bi_flog2_32(b, log2_base, base);
        }

        bi_fexp_32(b, dst, exp, log2_base);
}

/* Bifrost has extremely coarse tables for approximating sin/cos, accessible
 * as FSIN/COS_TABLE.u6, which interpret the bottom 6 bits of their input as
 * a multiple of pi/32 and return the corresponding value. We refine the
 * lookup with a second-order Taylor expansion:
 *
 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
 */

#define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
#define SINCOS_BIAS  bi_imm_u32(0x49400000)
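
/* SINCOS_BIAS is 0x49400000 = 786432.0f = 1.5 * 2^19, a binade where the f32
 * spacing is 2^-4. Adding it therefore rounds s0 * (2/pi) to a multiple of
 * 1/16, i.e. quantizes s0 to multiples of pi/32, leaving the low mantissa
 * bits of the sum as a direct index into the u6 tables */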

static void
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
{
        /* The bottom 6 bits of the result, times pi/32, approximate
         * s0 mod 2pi */
        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);

        /* Approximate domain error (small) */
        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
                                BI_ROUND_NONE),
                        MPI_OVER_TWO, s0, BI_ROUND_NONE);

        /* Lookup sin(x), cos(x) */
        bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
        bi_index cosx = bi_fcos_table_u6(b, x_u6, false);

        /* e^2 / 2 */
        bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);

        /* (-e^2)/2 f''(x) */
        bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
                        cos ? cosx : sinx,
                        bi_negzero(), BI_ROUND_NONE);

        /* e f'(x) - (e^2/2) f''(x) */
        bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
                        cos ? bi_neg(sinx) : cosx,
                        quadratic, BI_ROUND_NONE);
        I->clamp = BI_CLAMP_CLAMP_M1_1;

        /* f(x) + e f'(x) - (e^2/2) f''(x) */
        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
}

/* The XOR lane op is useful for derivative calculation, but was added in v7.
 * Add a safe helper that will do the appropriate lowering on v6 */

static bi_index
bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
{
        if (b->shader->arch >= 7) {
                return bi_clper_i32(b, s0, s1,
                                BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
                                BI_SUBGROUP_SUBGROUP4);
        }

        bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
        bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
        return bi_clper_v6_i32(b, s0, lane);
}
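
/* e.g. s1 = 1 pairs each lane with its horizontal neighbour in the quad; on
 * v6 the same shuffle is achieved by XORing the literal lane ID ourselves
 * and issuing the plain CLPER with an absolute lane index */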

static bi_instr *
bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
      bi_index dst, bi_index s0, bi_index s1, bi_index s2)
{
        /* Handle 1-bit bools as 0/~0 by default and let the optimizer deal
         * with the bit patterns later. 0/~0 has the nice property of being
         * independent of replicated vectorization. */
        if (sz == 1) sz = 16;
        bi_index f = bi_zero();
        bi_index t = bi_imm_u16(0xFFFF);

        switch (op) {
        case nir_op_feq:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
        case nir_op_flt:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_fge:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
        case nir_op_fneu:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);

        case nir_op_ieq:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
        case nir_op_ine:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
        case nir_op_ilt:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_ige:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
        case nir_op_ult:
                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_uge:
                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);

        case nir_op_iand:
                return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_ior:
                return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_ixor:
                return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_inot:
                return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));

        case nir_op_f2b1:
        case nir_op_i2b1:
        case nir_op_b2b1:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);

        case nir_op_bcsel:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);

        default:
                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
                unreachable("Unhandled boolean ALU instruction");
        }
}
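
/* e.g. a 16-bit i2b1 above becomes CSEL.i16 computing
 * (s0 == 0x0000) ? 0x0000 : 0xFFFF, yielding the all-zeros/all-ones
 * convention the compares and bcsel rely on */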

static void
bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
{
        bi_index dst = bi_dest_index(&instr->dest.dest);
        unsigned srcs = nir_op_infos[instr->op].num_inputs;
        unsigned sz = nir_dest_bit_size(instr->dest.dest);
        unsigned comps = nir_dest_num_components(instr->dest.dest);
        unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
        unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0;
        bool is_bool = (sz == 1);

        /* TODO: Anything else? */
        if (sz == 1)
                sz = 16;

        /* Indicate scalarness */
        if (sz == 16 && comps == 1)
                dst.swizzle = BI_SWIZZLE_H00;

        if (!instr->dest.dest.is_ssa) {
                for (unsigned i = 0; i < comps; ++i)
                        assert(instr->dest.write_mask & (1 << i));
        }

        /* First, match against the various moves in NIR. These are
         * special-cased because they can operate on vectors even after
         * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
         * instruction is no "bigger" than SIMD-within-a-register. These moves
         * are the exceptions that need to handle swizzles specially. */

        switch (instr->op) {
        case nir_op_pack_32_2x16:
        case nir_op_vec2:
        case nir_op_vec3:
        case nir_op_vec4: {
                bi_index unoffset_srcs[4] = {
                        srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
                        srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
                        srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
                        srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
                };

                unsigned channels[4] = {
                        instr->src[0].swizzle[0],
                        instr->src[1].swizzle[0],
                        srcs > 2 ? instr->src[2].swizzle[0] : 0,
                        srcs > 3 ? instr->src[3].swizzle[0] : 0,
                };

                bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
                return;
        }

        case nir_op_vec8:
        case nir_op_vec16:
                unreachable("should've been lowered");

        case nir_op_unpack_32_2x16:
        case nir_op_unpack_64_2x32_split_x:
                bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src));
                return;

        case nir_op_unpack_64_2x32_split_y:
                bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1));
                return;

        case nir_op_pack_64_2x32_split:
                bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src));
                bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src));
                return;

        case nir_op_pack_64_2x32:
                bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0));
                bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1));
                return;

        case nir_op_mov: {
                bi_index idx = bi_src_index(&instr->src[0].src);
                bi_index unoffset_srcs[4] = { idx, idx, idx, idx };

                unsigned channels[4] = {
                        comps > 0 ? instr->src[0].swizzle[0] : 0,
                        comps > 1 ? instr->src[0].swizzle[1] : 0,
                        comps > 2 ? instr->src[0].swizzle[2] : 0,
                        comps > 3 ? instr->src[0].swizzle[3] : 0,
                };

                bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);
                return;
        }

        case nir_op_f2f16: {
                assert(src_sz == 32);
                bi_index idx = bi_src_index(&instr->src[0].src);
                bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
                bi_index s1 = comps > 1 ?
                        bi_word(idx, instr->src[0].swizzle[1]) : s0;

                bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
                return;
        }

        /* Vectorized downcasts */
        case nir_op_u2u16:
        case nir_op_i2i16: {
                if (!(src_sz == 32 && comps == 2))
                        break;

                bi_index idx = bi_src_index(&instr->src[0].src);
                bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
                bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]);

                bi_mkvec_v2i16_to(b, dst,
                                bi_half(s0, false), bi_half(s1, false));
                return;
        }

        case nir_op_i2i8:
        case nir_op_u2u8: {
                /* Acts like an 8-bit swizzle */
                bi_index idx = bi_src_index(&instr->src[0].src);
                unsigned factor = src_sz / 8;
                unsigned chan[4] = { 0 };

                for (unsigned i = 0; i < comps; ++i)
                        chan[i] = instr->src[0].swizzle[i] * factor;

                bi_make_vec_to(b, dst, &idx, chan, comps, 8);
                return;
        }

        default:
                break;
        }

        bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();
        bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();
        bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null();

        if (is_bool) {
                bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2);
                return;
        }

        switch (instr->op) {
        case nir_op_ffma:
                bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
                break;

        case nir_op_fmul:
                bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
                break;

        case nir_op_fsub:
                s1 = bi_neg(s1);
                FALLTHROUGH;
        case nir_op_fadd:
                bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
                break;

        case nir_op_fsat: {
                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
                I->clamp = BI_CLAMP_CLAMP_0_1;
                break;
        }

        case nir_op_fsat_signed_mali: {
                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
                I->clamp = BI_CLAMP_CLAMP_M1_1;
                break;
        }

        case nir_op_fclamp_pos_mali: {
                bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
                I->clamp = BI_CLAMP_CLAMP_0_INF;
                break;
        }

        case nir_op_fneg:
                bi_fabsneg_to(b, sz, dst, bi_neg(s0));
                break;

        case nir_op_fabs:
                bi_fabsneg_to(b, sz, dst, bi_abs(s0));
                break;

        case nir_op_fsin:
                bi_lower_fsincos_32(b, dst, s0, false);
                break;

        case nir_op_fcos:
                bi_lower_fsincos_32(b, dst, s0, true);
                break;

        case nir_op_fexp2:
                assert(sz == 32); /* should've been lowered */

                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
                        bi_lower_fexp2_32(b, dst, s0);
                else
                        bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));

                break;

        case nir_op_flog2:
                assert(sz == 32); /* should've been lowered */

                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
                        bi_lower_flog2_32(b, dst, s0);
                else
                        bi_flog2_32(b, dst, s0);

                break;

        case nir_op_fpow:
                assert(sz == 32); /* should've been lowered */

                if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
                        bi_lower_fpow_32(b, dst, s0, s1);
                else
                        bi_fpow_32(b, dst, s0, s1);

                break;

        case nir_op_bcsel:
                if (src1_sz == 8)
                        bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
                else
                        bi_csel_to(b, nir_type_int, src1_sz,
                                        dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);
                break;

        case nir_op_ishl:
                bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
                break;
        case nir_op_ushr:
                bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
                break;

        case nir_op_ishr:
                bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
                break;

        case nir_op_imin:
        case nir_op_umin:
                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
                                s0, s1, s0, s1, BI_CMPF_LT);
                break;

        case nir_op_imax:
        case nir_op_umax:
                bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
                                s0, s1, s0, s1, BI_CMPF_GT);
                break;

        case nir_op_fddx_must_abs_mali:
        case nir_op_fddy_must_abs_mali: {
                bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
                bi_index adjacent = bi_clper_xor(b, s0, bit);
                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
                break;
        }

        case nir_op_fddx:
        case nir_op_fddy: {
                bi_index lane1 = bi_lshift_and_i32(b,
                                bi_fau(BIR_FAU_LANE_ID, false),
                                bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),
                                bi_imm_u8(0));

                bi_index lane2 = bi_iadd_u32(b, lane1,
                                bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2),
                                false);

                bi_index left, right;

                if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
                        left = bi_clper_v6_i32(b, s0, lane1);
                        right = bi_clper_v6_i32(b, s0, lane2);
                } else {
                        left = bi_clper_i32(b, s0, lane1,
                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
                                        BI_SUBGROUP_SUBGROUP4);

                        right = bi_clper_i32(b, s0, lane2,
                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
                                        BI_SUBGROUP_SUBGROUP4);
                }

                bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
                break;
        }
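
        /* e.g. for fddx, lane1 = lane_id & 2 evaluates to { 0, 0, 2, 2 }
         * across the quad and lane2 = lane1 + 1 to { 1, 1, 3, 3 }, so each
         * lane computes right - left for its row of the quad */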

        case nir_op_f2f32:
                bi_f16_to_f32_to(b, dst, s0);
                break;

        case nir_op_f2i32:
                if (src_sz == 32)
                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
                else
                        bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
                break;

        /* Note 32-bit sources => no vectorization, so 32-bit works */
        case nir_op_f2u16:
                if (src_sz == 32)
                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
                else
                        bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
                break;

        case nir_op_f2i16:
                if (src_sz == 32)
                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
                else
                        bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
                break;

        case nir_op_f2u32:
                if (src_sz == 32)
                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
                else
                        bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
                break;

        case nir_op_u2f16:
                if (src_sz == 32)
                        bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
                else if (src_sz == 16)
                        bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
                else if (src_sz == 8)
                        bi_v2u8_to_v2f16_to(b, dst, s0);
                break;

        case nir_op_u2f32:
                if (src_sz == 32)
                        bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
                else if (src_sz == 16)
                        bi_u16_to_f32_to(b, dst, s0);
                else
                        bi_u8_to_f32_to(b, dst, s0);
                break;

        case nir_op_i2f16:
                if (src_sz == 32)
                        bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
                else if (src_sz == 16)
                        bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
                else if (src_sz == 8)
                        bi_v2s8_to_v2f16_to(b, dst, s0);
                break;

        case nir_op_i2f32:
                if (src_sz == 32)
                        bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
                else if (src_sz == 16)
                        bi_s16_to_f32_to(b, dst, s0);
                else if (src_sz == 8)
                        bi_s8_to_f32_to(b, dst, s0);
                break;

        case nir_op_i2i32:
                if (src_sz == 16)
                        bi_s16_to_s32_to(b, dst, s0);
                else
                        bi_s8_to_s32_to(b, dst, s0);
                break;

        case nir_op_u2u32:
                if (src_sz == 16)
                        bi_u16_to_u32_to(b, dst, s0);
                else
                        bi_u8_to_u32_to(b, dst, s0);
                break;

        case nir_op_i2i16:
                assert(src_sz == 8 || src_sz == 32);

                if (src_sz == 8)
                        bi_v2s8_to_v2s16_to(b, dst, s0);
                else
                        bi_mov_i32_to(b, dst, s0);
                break;

        case nir_op_u2u16:
                assert(src_sz == 8 || src_sz == 32);

                if (src_sz == 8)
                        bi_v2u8_to_v2u16_to(b, dst, s0);
                else
                        bi_mov_i32_to(b, dst, s0);
                break;

        case nir_op_b2f16:
        case nir_op_b2f32:
                bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
                                (sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0),
                                (sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0),
                                BI_CMPF_NE);
                break;

        case nir_op_b2b32:
                bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
                                bi_imm_u32(~0), bi_zero(), BI_CMPF_NE);
                break;

        case nir_op_b2i8:
        case nir_op_b2i16:
        case nir_op_b2i32:
                bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0));
                break;

        case nir_op_fround_even:
        case nir_op_fceil:
        case nir_op_ffloor:
        case nir_op_ftrunc:
                bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
                break;

        case nir_op_fmin:
                bi_fmin_to(b, sz, dst, s0, s1);
                break;

        case nir_op_fmax:
                bi_fmax_to(b, sz, dst, s0, s1);
                break;

        case nir_op_iadd:
                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
                break;

        case nir_op_iadd_sat:
                bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
                break;

        case nir_op_uadd_sat:
                bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
                break;

        case nir_op_ihadd:
                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
                break;

        case nir_op_irhadd:
                bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
                break;

        case nir_op_ineg:
                bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
                break;

        case nir_op_isub:
                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
                break;

        case nir_op_isub_sat:
                bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
                break;

        case nir_op_usub_sat:
                bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
                break;

        case nir_op_imul:
                bi_imul_to(b, sz, dst, s0, s1);
                break;

        case nir_op_iabs:
                bi_iabs_to(b, sz, dst, s0);
                break;

        case nir_op_iand:
                bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
                break;

        case nir_op_ior:
                bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
                break;

        case nir_op_ixor:
                bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
                break;

        case nir_op_inot:
                bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
                break;

        case nir_op_frsq:
                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
                        bi_lower_frsq_32(b, dst, s0);
                else
                        bi_frsq_to(b, sz, dst, s0);
                break;

        case nir_op_frcp:
                if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
                        bi_lower_frcp_32(b, dst, s0);
                else
                        bi_frcp_to(b, sz, dst, s0);
                break;

        case nir_op_uclz:
                bi_clz_to(b, sz, dst, s0, false);
                break;

        case nir_op_bit_count:
                bi_popcount_i32_to(b, dst, s0);
                break;

        case nir_op_bitfield_reverse:
                bi_bitrev_i32_to(b, dst, s0);
                break;

        case nir_op_ufind_msb: {
                bi_index clz = bi_clz(b, src_sz, s0, false);

                if (sz == 8)
                        clz = bi_byte(clz, 0);
                else if (sz == 16)
                        clz = bi_half(clz, false);

                bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
                break;
        }

        default:
                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
                unreachable("Unknown ALU op");
        }
}

/* Returns the texture dimension, with 0 special-casing cubemaps. Shamelessly
 * copied from Midgard */
static unsigned
bifrost_tex_format(enum glsl_sampler_dim dim)
{
        switch (dim) {
        case GLSL_SAMPLER_DIM_1D:
        case GLSL_SAMPLER_DIM_BUF:
                return 1;

        case GLSL_SAMPLER_DIM_2D:
        case GLSL_SAMPLER_DIM_MS:
        case GLSL_SAMPLER_DIM_EXTERNAL:
        case GLSL_SAMPLER_DIM_RECT:
                return 2;

        case GLSL_SAMPLER_DIM_3D:
                return 3;

        case GLSL_SAMPLER_DIM_CUBE:
                return 0;

        default:
                DBG("Unknown sampler dim type\n");
                assert(0);
                return 0;
        }
}

static enum bifrost_texture_format_full
bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
{
        switch (T) {
        case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
        case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
        case nir_type_uint16:  return BIFROST_TEXTURE_FORMAT_U16;
        case nir_type_int16:   return BIFROST_TEXTURE_FORMAT_S16;
        case nir_type_uint32:  return BIFROST_TEXTURE_FORMAT_U32;
        case nir_type_int32:   return BIFROST_TEXTURE_FORMAT_S32;
        default:               unreachable("Invalid type for texturing");
        }
}

/* Array indices are specified as 32-bit uints and arrive in the .z component
 * from NIR; float indices need converting */
static bi_index
bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
{
        /* For (u)int we can just passthrough */
        nir_alu_type base = nir_alu_type_get_base_type(T);
        if (base == nir_type_int || base == nir_type_uint)
                return idx;

        /* Otherwise we convert */
        assert(T == nir_type_float32);

        /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
         * Texel Selection") defines the layer to be taken from clamp(RNE(r),
         * 0, dt - 1). So we round to nearest even; clamping is handled at the
         * data structure level */

        return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
}

/* TEXC's explicit and bias LOD modes require the LOD to be transformed into
 * a 16-bit 8:8 fixed-point format. We lower as:
 *
 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
 */
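
/* For example, an LOD of 2.5 maps to F32_TO_S32(2.5 * 256.0) & 0xFFFF =
 * 640 = 0x0280: integer part in the high byte, fraction in the low byte */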

static bi_index
bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
{
        /* Precompute for constant LODs to avoid general constant folding */
        if (lod.type == BI_INDEX_CONSTANT) {
                uint32_t raw = lod.value;
                float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
                int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
                return bi_imm_u32(s32 & 0xFFFF);
        }

        /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
         * the max LOD (16 since we cap at 2^16 texture dimensions), and
         * preferably small to minimize precision loss */
        const float max_lod = 16.0;

        bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
                        fp16 ? bi_half(lod, false) : lod,
                        bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);

        fsat->clamp = BI_CLAMP_CLAMP_M1_1;

        bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
                        bi_negzero(), BI_ROUND_NONE);

        return bi_mkvec_v2i16(b,
                        bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
                        bi_imm_u16(0));
}

/* FETCH takes a 32-bit staging register containing the LOD as an integer in
 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
 * TODO: Cube face.
 */

static bi_index
bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
{
        return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
}

/* The hardware specifies texel offsets and multisample indices together as a
 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
 * the bits we need and return that to be passed as a staging register.
 * Otherwise we return 0 to avoid allocating a data register when everything
 * is zero. */

static bi_index
bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
{
        bi_index dest = bi_zero();

        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
        if (offs_idx >= 0 &&
            (!nir_src_is_const(instr->src[offs_idx].src) ||
             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
                dest = bi_mkvec_v4i8(b,
                                (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),
                                (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),
                                (nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),
                                bi_imm_u8(0));
        }

        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
        if (ms_idx >= 0 &&
            (!nir_src_is_const(instr->src[ms_idx].src) ||
             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
                dest = bi_lshift_or_i32(b,
                                bi_src_index(&instr->src[ms_idx].src), dest,
                                bi_imm_u8(24));
        }

        return dest;
}
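
/* For instance, an offset of (1, -2) with multisample index 3 packs
 * (assuming least-significant byte first) as { 0x01, 0xFE, 0x00, 0x00 },
 * then ORs in 3 << 24, giving 0x0300FE01 */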

static void
bi_emit_cube_coord(bi_builder *b, bi_index coord,
                    bi_index *face, bi_index *s, bi_index *t)
{
        /* Compute max { |x|, |y|, |z| } */
        bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader),
                        bi_temp(b->shader), coord,
                        bi_word(coord, 1), bi_word(coord, 2));

        /* Select coordinates */

        bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
                        cubeface->dest[1]);

        bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
                        cubeface->dest[1]);

        /* The OpenGL ES specification requires us to transform an input vector
         * (x, y, z) to the coordinate, given the selected S/T:
         *
         * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
         *
         * We implement (s shown, t similar) in a form friendlier to FMA
         * instructions, and clamp coordinates at the end for correct
         * NaN/infinity handling:
         *
         * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
         *
         * Take the reciprocal of max{x, y, z}
         */

        bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);

        /* Calculate 0.5 * (1.0 / max{x, y, z}) */
        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
                        BI_ROUND_NONE);

        /* Transform the coordinates */
        *s = bi_temp(b->shader);
        *t = bi_temp(b->shader);

        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
                        BI_ROUND_NONE);
        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
                        BI_ROUND_NONE);

        S->clamp = BI_CLAMP_CLAMP_0_1;
        T->clamp = BI_CLAMP_CLAMP_0_1;

        /* Face index in bits [31:29], matching the cube map descriptor */
        *face = cubeface->dest[1];
}

/* Emits a cube map descriptor, returning lower 32-bits and putting upper
 * 32-bits in passed pointer t. The packing of the face with the S coordinate
 * exploits the redundancy of floating points with the range restriction of
 * CUBEFACE output.
 *
 *     struct cube_map_descriptor {
 *         float s : 29;
 *         unsigned face : 3;
 *         float t : 32;
 *     }
 *
 * Since the cube face index is preshifted, this is easy to pack with a bitwise
 * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper
 * 3 bits from face.
 */

static bi_index
bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
{
        bi_index face, s;
        bi_emit_cube_coord(b, coord, &face, &s, t);
        bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
        return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
}
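
/* i.e. roughly dst = (s & 0x1FFFFFFF) | (face & 0xE0000000): the low 29 bits
 * of the clamped S coordinate with the preshifted face index on top */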

/* Map to the main texture op used. Some of these (txd in particular) will
 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
 * sequence). We assume that lowering is handled elsewhere.
 */

static enum bifrost_tex_op
bi_tex_op(nir_texop op)
{
        switch (op) {
        case nir_texop_tex:
        case nir_texop_txb:
        case nir_texop_txl:
        case nir_texop_txd:
        case nir_texop_tex_prefetch:
                return BIFROST_TEX_OP_TEX;
        case nir_texop_txf:
        case nir_texop_txf_ms:
        case nir_texop_txf_ms_fb:
        case nir_texop_tg4:
                return BIFROST_TEX_OP_FETCH;
        case nir_texop_txs:
        case nir_texop_lod:
        case nir_texop_query_levels:
        case nir_texop_texture_samples:
        case nir_texop_samples_identical:
                unreachable("should've been lowered");
        default:
                unreachable("unsupported tex op");
        }
}

/* Data registers required by texturing in the order they appear. All are
 * optional; the texture operation descriptor determines which are present.
 * Note that since 3D arrays are not permitted at an API level, Z_COORD and
 * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */

enum bifrost_tex_dreg {
        BIFROST_TEX_DREG_Z_COORD = 0,
        BIFROST_TEX_DREG_Y_DELTAS = 1,
        BIFROST_TEX_DREG_LOD = 2,
        BIFROST_TEX_DREG_GRDESC_HI = 3,
        BIFROST_TEX_DREG_SHADOW = 4,
        BIFROST_TEX_DREG_ARRAY = 5,
        BIFROST_TEX_DREG_OFFSETMS = 6,
        BIFROST_TEX_DREG_SAMPLER = 7,
        BIFROST_TEX_DREG_TEXTURE = 8,
        BIFROST_TEX_DREG_COUNT,
};
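
/* For example, a 2D array fetch with an explicit LOD populates only
 * BIFROST_TEX_DREG_LOD and BIFROST_TEX_DREG_ARRAY; the compaction loop in
 * bi_emit_texc() then packs them into staging registers 0 and 1 */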

static void
bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
{
        bool computed_lod = false;

        struct bifrost_texture_operation desc = {
                .op = bi_tex_op(instr->op),
                .offset_or_bias_disable = false, /* TODO */
                .shadow_or_clamp_disable = instr->is_shadow,
                .array = instr->is_array,
                .dimension = bifrost_tex_format(instr->sampler_dim),
                .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
                .mask = 0xF,
        };

        switch (desc.op) {
        case BIFROST_TEX_OP_TEX:
                desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
                computed_lod = true;
                break;
        case BIFROST_TEX_OP_FETCH:
                desc.lod_or_fetch = (enum bifrost_lod_mode)
                   (instr->op == nir_texop_tg4 ?
                        BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
                        BIFROST_TEXTURE_FETCH_TEXEL);
                break;
        default:
                unreachable("texture op unsupported");
        }

        /* 32-bit indices to be allocated as consecutive staging registers */
        bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
        bi_index cx = bi_null(), cy = bi_null();

        for (unsigned i = 0; i < instr->num_srcs; ++i) {
                bi_index index = bi_src_index(&instr->src[i].src);
                unsigned sz = nir_src_bit_size(instr->src[i].src);
                ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
                nir_alu_type T = base | sz;

                switch (instr->src[i].src_type) {
                case nir_tex_src_coord:
                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                cx = bi_emit_texc_cube_coord(b, index, &cy);
                        } else {
                                unsigned components = nir_src_num_components(instr->src[i].src);

                                /* Copy XY (for 2D+) or XX (for 1D) */
                                cx = index;
                                cy = bi_word(index, MIN2(1, components - 1));

                                assert(components >= 1 && components <= 3);

                                if (components < 3) {
                                        /* nothing to do */
                                } else if (desc.array) {
                                        /* 2D array */
                                        dregs[BIFROST_TEX_DREG_ARRAY] =
                                                bi_emit_texc_array_index(b,
                                                                bi_word(index, 2), T);
                                } else {
                                        /* 3D */
                                        dregs[BIFROST_TEX_DREG_Z_COORD] =
                                                bi_word(index, 2);
                                }
                        }
                        break;

                case nir_tex_src_lod:
                        if (desc.op == BIFROST_TEX_OP_TEX &&
                            nir_src_is_const(instr->src[i].src) &&
                            nir_src_as_uint(instr->src[i].src) == 0) {
                                desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
                        } else if (desc.op == BIFROST_TEX_OP_TEX) {
                                assert(base == nir_type_float);

                                assert(sz == 16 || sz == 32);
                                dregs[BIFROST_TEX_DREG_LOD] =
                                        bi_emit_texc_lod_88(b, index, sz == 16);
                                desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
                        } else {
                                assert(desc.op == BIFROST_TEX_OP_FETCH);
                                assert(base == nir_type_uint || base == nir_type_int);
                                assert(sz == 16 || sz == 32);

                                dregs[BIFROST_TEX_DREG_LOD] =
                                        bi_emit_texc_lod_cube(b, index);
                        }

                        break;

                case nir_tex_src_bias:
                        /* Upper 16-bits interpreted as a clamp, leave zero */
                        assert(desc.op == BIFROST_TEX_OP_TEX);
                        assert(base == nir_type_float);
                        assert(sz == 16 || sz == 32);
                        dregs[BIFROST_TEX_DREG_LOD] =
                                bi_emit_texc_lod_88(b, index, sz == 16);
                        desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
                        computed_lod = true;
                        break;

                case nir_tex_src_ms_index:
                case nir_tex_src_offset:
                        if (desc.offset_or_bias_disable)
                                break;

                        dregs[BIFROST_TEX_DREG_OFFSETMS] =
                                bi_emit_texc_offset_ms_index(b, instr);
                        if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
                                desc.offset_or_bias_disable = true;
                        break;

                case nir_tex_src_comparator:
                        dregs[BIFROST_TEX_DREG_SHADOW] = index;
                        break;

                case nir_tex_src_texture_offset:
                        assert(instr->texture_index == 0);
                        dregs[BIFROST_TEX_DREG_TEXTURE] = index;
                        break;

                case nir_tex_src_sampler_offset:
                        assert(instr->sampler_index == 0);
                        dregs[BIFROST_TEX_DREG_SAMPLER] = index;
                        break;

                default:
                        unreachable("Unhandled src type in texc emit");
                }
        }

        if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
                dregs[BIFROST_TEX_DREG_LOD] =
                        bi_emit_texc_lod_cube(b, bi_zero());
        }
2719
2720        /* Choose an index mode */
2721
2722        bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
2723        bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
2724        bool direct = direct_tex && direct_samp;
2725
2726        desc.immediate_indices = direct && (instr->sampler_index < 16);
2727
2728        if (desc.immediate_indices) {
2729                desc.sampler_index_or_mode = instr->sampler_index;
2730                desc.index = instr->texture_index;
2731        } else {
2732                enum bifrost_index mode = 0;
2733
2734                if (direct && instr->sampler_index == instr->texture_index) {
2735                        mode = BIFROST_INDEX_IMMEDIATE_SHARED;
2736                        desc.index = instr->texture_index;
2737                } else if (direct) {
2738                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2739                        desc.index = instr->sampler_index;
2740                        dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
2741                                        bi_imm_u32(instr->texture_index));
2742                } else if (direct_tex) {
2743                        assert(!direct_samp);
2744                        mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
2745                        desc.index = instr->texture_index;
2746                } else if (direct_samp) {
2747                        assert(!direct_tex);
2748                        mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2749                        desc.index = instr->sampler_index;
2750                } else {
2751                        mode = BIFROST_INDEX_REGISTER;
2752                }
2753
2754                desc.sampler_index_or_mode = mode | (0x3 << 2);
2755        }
2756
2757        /* Allocate staging registers contiguously by compacting the array.
2758         * Index is not SSA (tied operands) */
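        /* For example, if only two of the BIFROST_TEX_DREG_* slots are
         * populated, they are compacted into dregs[0..1] below and then
         * vectorized into one contiguous staging sequence */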

        unsigned sr_count = 0;

        for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
                if (!bi_is_null(dregs[i]))
                        dregs[sr_count++] = dregs[i];
        }

        bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null();

        if (sr_count)
                bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32);

        uint32_t desc_u = 0;
        memcpy(&desc_u, &desc, sizeof(desc_u));
        bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest),
                        idx, cx, cy, bi_imm_u32(desc_u), !computed_lod,
                        sr_count);

        /* Explicit copy to facilitate tied operands */
        if (sr_count) {
                bi_index srcs[4] = { idx, idx, idx, idx };
                unsigned channels[4] = { 0, 1, 2, 3 };
                bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32);
        }
}

/* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
 * textures with sufficiently small immediate indices. Anything else
 * needs a complete texture op. */
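/* For instance, a plain texture(sampler2D, uv) whose texture and sampler
 * indices are below 8 can be emitted as a single TEXS_2D instruction */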

static void
bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
{
        int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
        assert(coord_idx >= 0);
        bi_index coords = bi_src_index(&instr->src[coord_idx].src);

        if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                bi_index face, s, t;
                bi_emit_cube_coord(b, coords, &face, &s, &t);

                bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
                                bi_dest_index(&instr->dest),
                                s, t, face,
                                instr->sampler_index, instr->texture_index);
        } else {
                bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
                                bi_dest_index(&instr->dest),
                                coords, bi_word(coords, 1),
                                instr->op != nir_texop_tex, /* zero LOD */
                                instr->sampler_index, instr->texture_index);
        }
}

static bool
bi_is_simple_tex(nir_tex_instr *instr)
{
        if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
                return false;

        if (instr->dest_type != nir_type_float32 &&
            instr->dest_type != nir_type_float16)
                return false;

        if (instr->is_shadow || instr->is_array)
                return false;

        switch (instr->sampler_dim) {
        case GLSL_SAMPLER_DIM_2D:
        case GLSL_SAMPLER_DIM_EXTERNAL:
        case GLSL_SAMPLER_DIM_RECT:
                break;

        case GLSL_SAMPLER_DIM_CUBE:
                /* LOD can't be specified with TEXS_CUBE */
                if (instr->op == nir_texop_txl)
                        return false;
                break;

        default:
                return false;
        }

        for (unsigned i = 0; i < instr->num_srcs; ++i) {
                if (instr->src[i].src_type != nir_tex_src_lod &&
                    instr->src[i].src_type != nir_tex_src_coord)
                        return false;
        }

        /* Indices need to fit in provided bits */
        unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
        if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
                return false;

        int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
        if (lod_idx < 0)
                return true;

        nir_src lod = instr->src[lod_idx].src;
        return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
}

static void
bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
{
        switch (instr->op) {
        case nir_texop_txs:
                bi_load_sysval_to(b, bi_dest_index(&instr->dest),
                                panfrost_sysval_for_instr(&instr->instr, NULL),
                                4, 0);
                return;
        case nir_texop_tex:
        case nir_texop_txl:
        case nir_texop_txb:
        case nir_texop_txf:
        case nir_texop_txf_ms:
        case nir_texop_tg4:
                break;
        default:
                unreachable("Invalid texture operation");
        }

        if (bi_is_simple_tex(instr))
                bi_emit_texs(b, instr);
        else
                bi_emit_texc(b, instr);
}

static void
bi_emit_instr(bi_builder *b, struct nir_instr *instr)
{
        switch (instr->type) {
        case nir_instr_type_load_const:
                bi_emit_load_const(b, nir_instr_as_load_const(instr));
                break;

        case nir_instr_type_intrinsic:
                bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
                break;

        case nir_instr_type_alu:
                bi_emit_alu(b, nir_instr_as_alu(instr));
                break;

        case nir_instr_type_tex:
                bi_emit_tex(b, nir_instr_as_tex(instr));
                break;

        case nir_instr_type_jump:
                bi_emit_jump(b, nir_instr_as_jump(instr));
                break;

        default:
                unreachable("should've been lowered");
        }
}

static bi_block *
create_empty_block(bi_context *ctx)
{
        bi_block *blk = rzalloc(ctx, bi_block);

        blk->predecessors = _mesa_set_create(blk,
                        _mesa_hash_pointer,
                        _mesa_key_pointer_equal);

        return blk;
}

static bi_block *
emit_block(bi_context *ctx, nir_block *block)
{
        if (ctx->after_block) {
                ctx->current_block = ctx->after_block;
                ctx->after_block = NULL;
        } else {
                ctx->current_block = create_empty_block(ctx);
        }

        list_addtail(&ctx->current_block->link, &ctx->blocks);
        list_inithead(&ctx->current_block->instructions);

        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));

        nir_foreach_instr(instr, block) {
                bi_emit_instr(&_b, instr);
                ++ctx->instruction_count;
        }

        return ctx->current_block;
}

static void
emit_if(bi_context *ctx, nir_if *nif)
{
        bi_block *before_block = ctx->current_block;

        /* Speculatively emit the branch, but we can't fill it in until later */
        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
        bi_instr *then_branch = bi_branchz_i16(&_b,
                        bi_half(bi_src_index(&nif->condition), false),
                        bi_zero(), BI_CMPF_EQ);

        /* Emit the two subblocks. */
        bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
        bi_block *end_then_block = ctx->current_block;

        /* Emit second block, and check if it's empty */

        int count_in = ctx->instruction_count;
        bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
        bi_block *end_else_block = ctx->current_block;
        ctx->after_block = create_empty_block(ctx);

        /* Now that we have the subblocks emitted, fix up the branches */

        assert(then_block);
        assert(else_block);

        if (ctx->instruction_count == count_in) {
                then_branch->branch_target = ctx->after_block;
                bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */
        } else {
                then_branch->branch_target = else_block;

                /* Emit a jump from the end of the then block to the end of the else */
                _b.cursor = bi_after_block(end_then_block);
                bi_instr *then_exit = bi_jump(&_b, bi_zero());
                then_exit->branch_target = ctx->after_block;

                bi_block_add_successor(end_then_block, then_exit->branch_target);
                bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
        }

        bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
        bi_block_add_successor(before_block, then_block); /* fallthrough */
}

static void
emit_loop(bi_context *ctx, nir_loop *nloop)
{
        /* Remember where we are */
        bi_block *start_block = ctx->current_block;

        bi_block *saved_break = ctx->break_block;
        bi_block *saved_continue = ctx->continue_block;

        ctx->continue_block = create_empty_block(ctx);
        ctx->break_block = create_empty_block(ctx);
        ctx->after_block = ctx->continue_block;

        /* Emit the body itself */
        emit_cf_list(ctx, &nloop->body);

        /* Branch back to the loop header */
        bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
        bi_instr *I = bi_jump(&_b, bi_zero());
        I->branch_target = ctx->continue_block;
        bi_block_add_successor(start_block, ctx->continue_block);
        bi_block_add_successor(ctx->current_block, ctx->continue_block);

        ctx->after_block = ctx->break_block;

        /* Pop off */
        ctx->break_block = saved_break;
        ctx->continue_block = saved_continue;
        ++ctx->loop_count;
}

static bi_block *
emit_cf_list(bi_context *ctx, struct exec_list *list)
{
        bi_block *start_block = NULL;

        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block: {
                        bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));

                        if (!start_block)
                                start_block = block;

                        break;
                }

                case nir_cf_node_if:
                        emit_if(ctx, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        emit_loop(ctx, nir_cf_node_as_loop(node));
                        break;

                default:
                        unreachable("Unknown control flow");
                }
        }

        return start_block;
}

/* shader-db stuff */

struct bi_stats {
        unsigned nr_clauses, nr_tuples, nr_ins;
        unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
};

static void
bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
{
        /* Count instructions */
        stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);

        /* Non-message passing tuples are always arithmetic */
        if (tuple->add != clause->message) {
                stats->nr_arith++;
                return;
        }

        /* Message + FMA we'll count as arithmetic _and_ message */
        if (tuple->fma)
                stats->nr_arith++;

        switch (clause->message_type) {
        case BIFROST_MESSAGE_VARYING:
                /* Check components interpolated */
                stats->nr_varying += (clause->message->vecsize + 1) *
                        (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
                break;

        case BIFROST_MESSAGE_VARTEX:
                /* 2 coordinates, fp32 each */
                stats->nr_varying += (2 * 2);
                FALLTHROUGH;
        case BIFROST_MESSAGE_TEX:
                stats->nr_texture++;
                break;

        case BIFROST_MESSAGE_ATTRIBUTE:
        case BIFROST_MESSAGE_LOAD:
        case BIFROST_MESSAGE_STORE:
        case BIFROST_MESSAGE_ATOMIC:
                stats->nr_ldst++;
                break;

        case BIFROST_MESSAGE_NONE:
        case BIFROST_MESSAGE_BARRIER:
        case BIFROST_MESSAGE_BLEND:
        case BIFROST_MESSAGE_TILE:
        case BIFROST_MESSAGE_Z_STENCIL:
        case BIFROST_MESSAGE_ATEST:
        case BIFROST_MESSAGE_JOB:
        case BIFROST_MESSAGE_64BIT:
                /* Nothing to do */
                break;
        }
}

static void
bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
{
        struct bi_stats stats = { 0 };

        /* Count instructions, clauses, and tuples. Also attempt to construct
         * normalized execution engine cycle counts, using the following ratios:
         *
         * 24 arith tuples/cycle
         * 2 texture messages/cycle
         * 16 x 16-bit varying channels interpolated/cycle
         * 1 load/store message/cycle
         *
         * These numbers seem to match Arm Mobile Studio's heuristic. The real
         * cycle counts are surely more complicated.
         */
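
        /* Worked example (made-up numbers): 48 arith tuples, 2 texture
         * messages, 32 16-bit varying channels, and 1 load/store message
         * give max(48/24, max(2/2, 32/16, 1/1)) = max(2.0, 2.0) = 2.0
         * estimated cycles, i.e. jointly arithmetic- and varying-bound */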

        bi_foreach_block(ctx, block) {
                bi_foreach_clause_in_block(block, clause) {
                        stats.nr_clauses++;
                        stats.nr_tuples += clause->tuple_count;

                        for (unsigned i = 0; i < clause->tuple_count; ++i)
                                bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
                }
        }

        float cycles_arith = ((float) stats.nr_arith) / 24.0;
        float cycles_texture = ((float) stats.nr_texture) / 2.0;
        float cycles_varying = ((float) stats.nr_varying) / 16.0;
        float cycles_ldst = ((float) stats.nr_ldst) / 1.0;

        float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
        float cycles_bound = MAX2(cycles_arith, cycles_message);

        /* Thread count and register pressure are traded off only on v7 */
        bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);
        unsigned nr_threads = full_threads ? 2 : 1;

        /* Dump stats */

        fprintf(fp, "%s - %s shader: "
                        "%u inst, %u tuples, %u clauses, "
                        "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
                        "%u quadwords, %u threads, %u loops, "
                        "%u:%u spills:fills\n",
                        ctx->nir->info.label ?: "",
                        ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
                        gl_shader_stage_name(ctx->stage),
                        stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
                        cycles_bound, cycles_arith, cycles_texture,
                        cycles_varying, cycles_ldst,
                        size / 16, nr_threads,
                        ctx->loop_count,
                        ctx->spills, ctx->fills);
}

static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
        return glsl_count_attribute_slots(type, false);
}

/* Split stores to memory. We don't split stores to vertex outputs, since
 * nir_lower_io_to_temporaries will ensure there's only a single write.
 */
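/* e.g. nir_lower_wrmasks rewrites a store_ssbo with write mask 0b101 into
 * separate stores of the contiguous runs: component 0 and component 2 */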

static bool
should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
{
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_global:
        case nir_intrinsic_store_scratch:
                return true;
        default:
                return false;
        }
}

/* Bifrost wants transcendentals as FP32 */
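/* e.g. a 16-bit fsin is evaluated at 32 bits: nir_lower_bit_size wraps the
 * fp32 operation in widening/narrowing conversions */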

static unsigned
bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
{
        if (instr->type != nir_instr_type_alu)
                return 0;

        nir_alu_instr *alu = nir_instr_as_alu(instr);

        switch (alu->op) {
        case nir_op_fexp2:
        case nir_op_flog2:
        case nir_op_fpow:
        case nir_op_fsin:
        case nir_op_fcos:
                return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
        default:
                return 0;
        }
}

/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
 * transcendentals are an exception. Shifts are excluded too, because of a
 * lane size mismatch (8-bit lanes on Bifrost, 32-bit in NIR; TODO: work
 * around this). Some conversions also need to be scalarized due to the
 * type size mismatch. */
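/* e.g. two scalar 16-bit frcps are left as separate ops rather than fused
 * into one packed v2f16 op by nir_opt_vectorize */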

static bool
bi_vectorize_filter(const nir_instr *instr, void *data)
{
        /* Defaults work for everything else */
        if (instr->type != nir_instr_type_alu)
                return true;

        const nir_alu_instr *alu = nir_instr_as_alu(instr);

        switch (alu->op) {
        case nir_op_frcp:
        case nir_op_frsq:
        case nir_op_ishl:
        case nir_op_ishr:
        case nir_op_ushr:
        case nir_op_f2i16:
        case nir_op_f2u16:
        case nir_op_i2f16:
        case nir_op_u2f16:
                return false;
        default:
                return true;
        }
}

/* XXX: This is a kludge to work around NIR's lack of divergence metadata. If
 * we keep divergence info around after we consume it for indirect lowering,
 * nir_convert_from_ssa will regress code quality, since it will avoid
 * coalescing divergent with non-divergent nodes. */

static bool
nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
{
        ssa->divergent = false;
        return true;
}

static bool
nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
                UNUSED void *data)
{
        return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
}

/* Ensure we write exactly 4 components */
static nir_ssa_def *
bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
                          unsigned channel, unsigned first, unsigned mask)
{
        if (!(mask & BITFIELD_BIT(channel)))
                channel = first;

        return nir_channel(b, in, channel);
}

/* Lower fragment store_output instructions to always write 4 components,
 * matching the hardware semantic. This may require additional moves. Skipping
 * these moves is possible in theory, but invokes undefined behaviour in the
 * compiler. The DDK inserts these moves, so we will as well. */
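/* For example, a store of vec2(r, g) with write mask 0x3 becomes a store of
 * vec4(r, g, r, r) with write mask 0xF, replicating the first written
 * component into the unwritten channels */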

static bool
bifrost_nir_lower_blend_components(struct nir_builder *b,
                                   nir_instr *instr, void *data)
{
        if (instr->type != nir_instr_type_intrinsic)
                return false;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        if (intr->intrinsic != nir_intrinsic_store_output)
                return false;

        nir_ssa_def *in = intr->src[0].ssa;
        unsigned first = nir_intrinsic_component(intr);
        unsigned mask = nir_intrinsic_write_mask(intr);

        assert(first == 0 && "shouldn't get nonzero components");

        /* Nothing to do */
        if (mask == BITFIELD_MASK(4))
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        /* Replicate the first valid component instead */
        nir_ssa_def *replicated =
                nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
                            bifrost_nir_valid_channel(b, in, 1, first, mask),
                            bifrost_nir_valid_channel(b, in, 2, first, mask),
                            bifrost_nir_valid_channel(b, in, 3, first, mask));

        /* Rewrite to use our replicated version */
        nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
        nir_intrinsic_set_component(intr, 0);
        nir_intrinsic_set_write_mask(intr, 0xF);
        intr->num_components = 4;

        return true;
}

static void
bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
{
        bool progress;
        unsigned lower_flrp = 16 | 32 | 64;

        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);

        nir_lower_tex_options lower_tex_options = {
                .lower_txs_lod = true,
                .lower_txp = ~0,
                .lower_tg4_broadcom_swizzle = true,
                .lower_txd = true,
        };

        NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
        NIR_PASS(progress, nir, pan_lower_helper_invocation);

        NIR_PASS(progress, nir, nir_lower_int64);

        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);

        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);

        do {
                progress = false;

                NIR_PASS(progress, nir, nir_lower_var_copies);
                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
                NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);

                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_remove_phis);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_dead_cf);
                NIR_PASS(progress, nir, nir_opt_cse);
                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
                NIR_PASS(progress, nir, nir_opt_algebraic);
                NIR_PASS(progress, nir, nir_opt_constant_folding);

                NIR_PASS(progress, nir, nir_lower_alu);

                if (lower_flrp != 0) {
                        bool lower_flrp_progress = false;
                        NIR_PASS(lower_flrp_progress,
                                 nir,
                                 nir_lower_flrp,
                                 lower_flrp,
                                 false /* always_precise */);
                        if (lower_flrp_progress) {
                                NIR_PASS(progress, nir,
                                         nir_opt_constant_folding);
                                progress = true;
                        }

                        /* Nothing should rematerialize any flrps, so we only
                         * need to do this lowering once.
                         */
                        lower_flrp = 0;
                }

                NIR_PASS(progress, nir, nir_opt_undef);
                NIR_PASS(progress, nir, nir_lower_undef_to_zero);

                NIR_PASS(progress, nir, nir_opt_loop_unroll);
        } while (progress);

        /* TODO: Why is 64-bit getting rematerialized?
         * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
        NIR_PASS(progress, nir, nir_lower_int64);

        /* We need to clean up after each iteration of late algebraic
         * optimizations, since otherwise NIR can produce weird edge cases
         * (like fneg of a constant) which we don't handle */
        bool late_algebraic = true;
        while (late_algebraic) {
                late_algebraic = false;
                NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
                NIR_PASS(progress, nir, nir_opt_constant_folding);
                NIR_PASS(progress, nir, nir_copy_prop);
                NIR_PASS(progress, nir, nir_opt_dce);
                NIR_PASS(progress, nir, nir_opt_cse);
        }

        NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
        NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
        NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
        NIR_PASS(progress, nir, nir_opt_dce);

        /* Prepass to simplify instruction selection */
        NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
        NIR_PASS(progress, nir, nir_opt_dce);

        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                           bifrost_nir_lower_blend_components,
                           nir_metadata_block_index | nir_metadata_dominance,
                           NULL);
        }

        /* Backend scheduler is purely local, so do some global optimizations
         * to reduce register pressure. */
        nir_move_options move_all =
                nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

        NIR_PASS_V(nir, nir_opt_sink, move_all);
        NIR_PASS_V(nir, nir_opt_move, move_all);

        /* We might lower attribute, varying, and image indirects. Use the
         * gathered info to skip the extra analysis in the happy path. */
        bool any_indirects =
                nir->info.inputs_read_indirectly ||
                nir->info.outputs_accessed_indirectly ||
                nir->info.patch_inputs_read_indirectly ||
                nir->info.patch_outputs_accessed_indirectly ||
                nir->info.images_used;

        if (any_indirects) {
                nir_convert_to_lcssa(nir, true, true);
                NIR_PASS_V(nir, nir_divergence_analysis);
                NIR_PASS_V(nir, bi_lower_divergent_indirects,
                                bifrost_lanes_per_warp(gpu_id));
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                        nir_invalidate_divergence, nir_metadata_all, NULL);
        }

        /* Take us out of SSA */
        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
}

/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
 * same lowering here to zero-extend correctly */
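/* e.g. an 8-bit integer store_output of value v becomes a 16-bit store of
 * i2i16(v) (or u2u16 for unsigned), with the source type widened to match */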

static bool
bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
                nir_intrinsic_instr *intr, UNUSED void *data)
{
        if (nir_src_bit_size(intr->src[0]) != 8)
                return false;

        nir_alu_type type =
                nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));

        assert(type == nir_type_int || type == nir_type_uint);

        b->cursor = nir_before_instr(&intr->instr);
        nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);

        nir_intrinsic_set_src_type(intr, type | 16);
        nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
        return true;
}

static bool
bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
                nir_intrinsic_instr *intr, UNUSED void *data)
{
        if (nir_dest_bit_size(intr->dest) != 8)
                return false;

        nir_alu_type type =
                nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));

        assert(type == nir_type_int || type == nir_type_uint);

        b->cursor = nir_before_instr(&intr->instr);
        nir_ssa_def *out =
                nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
                        .base = nir_intrinsic_base(intr),
                        .component = nir_intrinsic_component(intr),
                        .dest_type = type | 16,
                        .io_semantics = nir_intrinsic_io_semantics(intr));

        nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
        return true;
}

static bool
bifrost_nir_lower_i8_frag(struct nir_builder *b,
                nir_instr *instr, UNUSED void *data)
{
        if (instr->type != nir_instr_type_intrinsic)
                return false;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
        if (intr->intrinsic == nir_intrinsic_load_output)
                return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
        else if (intr->intrinsic == nir_intrinsic_store_output)
                return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
        else
                return false;
}

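/* Post-RA cleanup: register allocation can leave self-moves (MOV.i32 where
 * the source and destination registers coincide), which are no-ops */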
static void
bi_opt_post_ra(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
                        bi_remove_instruction(ins);
        }
}

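/* Merge multiple store_output intrinsics targeting the same slot into one
 * masked store. For example, separate writes of component 0 and component 1
 * of an output combine into a single vec2 store with mask 0x3 */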
static bool
bifrost_nir_lower_store_component(struct nir_builder *b,
                nir_instr *instr, void *data)
{
        if (instr->type != nir_instr_type_intrinsic)
                return false;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        if (intr->intrinsic != nir_intrinsic_store_output)
                return false;

        struct hash_table_u64 *slots = data;
        unsigned component = nir_intrinsic_component(intr);
        nir_src *slot_src = nir_get_io_offset_src(intr);
        uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);

        nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
        unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);

        nir_ssa_def *value = intr->src[0].ssa;
        b->cursor = nir_before_instr(&intr->instr);

        nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
        nir_ssa_def *channels[4] = { undef, undef, undef, undef };

        /* Copy old */
        u_foreach_bit(i, mask) {
                assert(prev != NULL);
                nir_ssa_def *prev_ssa = prev->src[0].ssa;
                channels[i] = nir_channel(b, prev_ssa, i);
        }

        /* Copy new */
        unsigned new_mask = nir_intrinsic_write_mask(intr);
        mask |= (new_mask << component);

        u_foreach_bit(i, new_mask) {
                assert(component + i < 4);
                channels[component + i] = nir_channel(b, value, i);
        }

        intr->num_components = util_last_bit(mask);
        nir_instr_rewrite_src_ssa(instr, &intr->src[0],
                        nir_vec(b, channels, intr->num_components));

        nir_intrinsic_set_component(intr, 0);
        nir_intrinsic_set_write_mask(intr, mask);

        if (prev) {
                _mesa_hash_table_u64_remove(slots, slot);
                nir_instr_remove(&prev->instr);
        }

        _mesa_hash_table_u64_insert(slots, slot, intr);
        return false;
}

/* Dead code elimination for branches at the end of a block - only one branch
 * per block is legal semantically, but unreachable jumps can be generated.
 * Likewise we can generate jumps to the terminal block which need to be
 * lowered away to a jump to #0x0, which induces successful termination. */
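/* e.g. an unconditional JUMP that follows another branch in the same block
 * is unreachable and deleted; a branch whose target is the terminal block
 * gets branch_target = NULL so it packs as a branch to #0x0 */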

static void
bi_lower_branch(bi_block *block)
{
        bool branched = false;
        ASSERTED bool was_jump = false;

        bi_foreach_instr_in_block_safe(block, ins) {
                if (!ins->branch_target) continue;

                if (branched) {
                        assert(was_jump && (ins->op == BI_OPCODE_JUMP));
                        bi_remove_instruction(ins);
                        continue;
                }

                branched = true;
                was_jump = ins->op == BI_OPCODE_JUMP;

                if (bi_is_terminal_block(ins->branch_target))
                        ins->branch_target = NULL;
        }
}

static void
bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary)
{
        unsigned final_clause = bi_pack(ctx, binary);

        /* If we need to wait for ATEST or BLEND in the first clause, pass the
         * corresponding bits through to the renderer state descriptor */
        bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);

        unsigned first_deps = first_clause ? first_clause->dependencies : 0;
        ctx->info->bifrost.wait_6 = (first_deps & (1 << 6));
        ctx->info->bifrost.wait_7 = (first_deps & (1 << 7));

        /* Pad the shader with enough zero bytes to trick the prefetcher,
         * unless we're compiling an empty shader (in which case we don't pad
         * so the size remains 0) */
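        /* e.g. if bi_pack reports a 64-byte final clause, we append
         * BIFROST_SHADER_PREFETCH - 64 = 128 - 64 = 64 zero bytes */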
        unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;

        if (binary->size) {
                memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
                       0, prefetch_size);
        }
}

void
bifrost_compile_shader_nir(nir_shader *nir,
                           const struct panfrost_compile_inputs *inputs,
                           struct util_dynarray *binary,
                           struct pan_shader_info *info)
{
        bifrost_debug = debug_get_option_bifrost_debug();

        bi_context *ctx = rzalloc(NULL, bi_context);
        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);

        ctx->inputs = inputs;
        ctx->nir = nir;
        ctx->info = info;
        ctx->stage = nir->info.stage;
        ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
        ctx->arch = inputs->gpu_id >> 12;

        /* If nothing is pushed, all UBOs need to be uploaded */
        ctx->ubo_mask = ~0;

        list_inithead(&ctx->blocks);

        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
         * (so we don't accidentally duplicate the epilogue since mesa/st has
         * messed with our I/O quite a bit already) */

        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        if (ctx->stage == MESA_SHADER_VERTEX) {
                NIR_PASS_V(nir, nir_lower_viewport_transform);
                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
        }

        /* Lower large arrays to scratch and small arrays to bcsel (TODO: tune
         * threshold, but not until addresses / csel is optimized better) */
        NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
                        glsl_get_natural_size_align_bytes);
        NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

        NIR_PASS_V(nir, nir_split_var_copies);
        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                        glsl_type_size, 0);

        if (ctx->stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
                                ~0, false);
        } else {
                struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                                bifrost_nir_lower_store_component,
                                nir_metadata_block_index |
                                nir_metadata_dominance, stores);
                _mesa_hash_table_u64_destroy(stores);
        }

        NIR_PASS_V(nir, nir_lower_ssbo);
        NIR_PASS_V(nir, pan_nir_lower_zs_store);
        NIR_PASS_V(nir, pan_lower_sample_pos);
        NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);

        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                        bifrost_nir_lower_i8_frag,
                        nir_metadata_block_index | nir_metadata_dominance,
                        NULL);
        }

        bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);

        NIR_PASS_V(nir, pan_nir_reorder_writeout);

        bool skip_internal = nir->info.internal;
        skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                nir_print_shader(nir, stdout);
        }

        info->tls_size = nir->scratch_size;

        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;

                ctx->ssa_alloc += func->impl->ssa_alloc;
                ctx->reg_alloc += func->impl->reg_alloc;

                emit_cf_list(ctx, &func->impl->body);
                break; /* TODO: Multi-function shaders */
        }

        unsigned block_source_count = 0;

        bi_foreach_block(ctx, block) {
                /* Name blocks now that we're done emitting so the order is
                 * consistent */
                block->name = block_source_count++;
        }

        bi_validate(ctx, "NIR -> BIR");

        /* If the shader doesn't write any colour or depth outputs, it may
         * still need an ATEST at the very end! */
        bool need_dummy_atest =
                (ctx->stage == MESA_SHADER_FRAGMENT) &&
                !ctx->emitted_atest &&
                !bi_skip_atest(ctx, false);

        if (need_dummy_atest) {
                bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
                bi_builder b = bi_init_builder(ctx, bi_after_block(end));
                bi_emit_atest(&b, bi_zero());
        }

        bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);

        /* Runs before constant folding */
        bi_lower_swizzle(ctx);
        bi_validate(ctx, "Early lowering");

        /* Runs before copy prop */
        if (optimize && !ctx->inputs->no_ubo_to_push) {
                bi_opt_push_ubo(ctx);
        }

        if (likely(optimize)) {
                bi_opt_copy_prop(ctx);
                bi_opt_constant_fold(ctx);
                bi_opt_copy_prop(ctx);
                bi_opt_mod_prop_forward(ctx);
                bi_opt_mod_prop_backward(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_opt_cse(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_validate(ctx, "Optimization passes");
        }

        bi_foreach_instr_global(ctx, I) {
                bi_lower_opt_instruction(I);
        }

        bi_foreach_block(ctx, block) {
                bi_lower_branch(block);
        }

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);
        bi_lower_fau(ctx);

        /* Analyze before register allocation to avoid false dependencies. The
         * skip bit is a function of only the data flow graph and is invariant
         * under valid scheduling. */
        bi_analyze_helper_requirements(ctx);
        bi_validate(ctx, "Late lowering");

        bi_register_allocate(ctx);

        if (likely(optimize))
                bi_opt_post_ra(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        if (ctx->arch <= 8) {
                bi_schedule(ctx);
                bi_assign_scoreboard(ctx);
        }

        /* Analyze after scheduling since we depend on instruction order. */
        bi_analyze_helper_terminate(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        if (ctx->arch <= 8) {
                bi_pack_clauses(ctx, binary);
        } else {
                /* TODO: pack flat */
        }

        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                disassemble_bifrost(stdout, binary->data, binary->size,
                                    bifrost_debug & BIFROST_DBG_VERBOSE);
                fflush(stdout);
        }

        if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
            !skip_internal) {
                bi_print_stats(ctx, binary->size, stderr);
        }

        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
        ralloc_free(ctx);
}