1/*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27#include "compiler.h"
28#include "bi_builder.h"
29
30/* Arguments common to worklist, passed by value for convenience */
31
32struct bi_worklist {
33        /* # of instructions in the block */
34        unsigned count;
35
36        /* Instructions in the block */
37        bi_instr **instructions;
38
39        /* Bitset of instructions in the block ready for scheduling */
40        BITSET_WORD *worklist;
41
42        /* The backwards dependency graph. nr_dependencies is the number of
43         * unscheduled instructions that must still be scheduled after (before)
44         * this instruction. dependents are which instructions need to be
45         * scheduled before (after) this instruction. */
46        unsigned *dep_counts;
47        BITSET_WORD **dependents;
48};
49
50/* State of a single tuple and clause under construction */
51
52struct bi_reg_state {
53        /* Number of register writes */
54        unsigned nr_writes;
55
56        /* Register reads, expressed as (equivalence classes of)
57         * sources. Only 3 reads are allowed, but up to 2 may spill as
58         * "forced" for the next scheduled tuple, provided such a tuple
59         * can be constructed */
60        bi_index reads[5];
61        unsigned nr_reads;
62
63        /* The previous tuple scheduled (= the next tuple executed in the
64         * program) may require certain writes, in order to bypass the register
65         * file and use a temporary passthrough for the value. Up to 2 such
66         * constraints are architecturally satisfiable */
67        unsigned forced_count;
68        bi_index forceds[2];
69};
70
71struct bi_tuple_state {
72        /* Is this the last tuple in the clause */
73        bool last;
74
75        /* Scheduled ADD instruction, or null if none */
76        bi_instr *add;
77
78        /* Reads for previous (succeeding) tuple */
79        bi_index prev_reads[5];
80        unsigned nr_prev_reads;
81        bi_tuple *prev;
82
83        /* Register slot state for current tuple */
84        struct bi_reg_state reg;
85
86        /* Constants are shared in the tuple. If constant_count is nonzero, it
87         * is a size for constant count. Otherwise, fau is the slot read from
88         * FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero,
89         * but within a tuple, that should be encoded as constant_count != 0
90         * and constants[0] = constants[1] = 0 */
91        unsigned constant_count;
92
93        union {
94                uint32_t constants[2];
95                enum bir_fau fau;
96        };
97
98        unsigned pcrel_idx;
99};
100
101struct bi_const_state {
102        unsigned constant_count;
103        bool pcrel; /* applies to first const */
104        uint32_t constants[2];
105
106        /* Index of the constant into the clause */
107        unsigned word_idx;
108};
109
110struct bi_clause_state {
111        /* Has a message-passing instruction already been assigned? */
112        bool message;
113
114        /* Indices already accessed, this needs to be tracked to avoid hazards
115         * around message-passing instructions */
116        unsigned access_count;
117        bi_index accesses[(BI_MAX_SRCS + 2) * 16];
118
119        unsigned tuple_count;
120        struct bi_const_state consts[8];
121};
122
123/* Determines messsage type by checking the table and a few special cases. Only
124 * case missing is tilebuffer instructions that access depth/stencil, which
125 * require a Z_STENCIL message (to implement
126 * ARM_shader_framebuffer_fetch_depth_stencil) */
127
128static enum bifrost_message_type
129bi_message_type_for_instr(bi_instr *ins)
130{
131        enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
132        bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
133
134        if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
135                return BIFROST_MESSAGE_Z_STENCIL;
136
137        if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
138                return BIFROST_MESSAGE_ATTRIBUTE;
139
140        return msg;
141}
142
143/* Attribute, texture, and UBO load (attribute message) instructions support
144 * bindless, so just check the message type */
145
146ASSERTED static bool
147bi_supports_dtsel(bi_instr *ins)
148{
149        switch (bi_message_type_for_instr(ins)) {
150        case BIFROST_MESSAGE_ATTRIBUTE:
151                return ins->op != BI_OPCODE_LD_GCLK_U64;
152        case BIFROST_MESSAGE_TEX:
153                return true;
154        default:
155                return false;
156        }
157}
158
159/* Adds an edge to the dependency graph */
160
161static void
162bi_push_dependency(unsigned parent, unsigned child,
163                BITSET_WORD **dependents, unsigned *dep_counts)
164{
165        if (!BITSET_TEST(dependents[parent], child)) {
166                BITSET_SET(dependents[parent], child);
167                dep_counts[child]++;
168        }
169}
170
171static void
172add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
173                BITSET_WORD **dependents, unsigned *dep_counts)
174{
175        assert(index < 64);
176        util_dynarray_foreach(table + index, unsigned, parent)
177                bi_push_dependency(*parent, child, dependents, dep_counts);
178}
179
180static void
181mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
182{
183        assert(index < 64);
184        util_dynarray_append(&table[index], unsigned, parent);
185}
186
187static bool
188bi_is_sched_barrier(bi_instr *I)
189{
190        switch (I->op) {
191        case BI_OPCODE_BARRIER:
192        case BI_OPCODE_DISCARD_F32:
193                return true;
194        default:
195                return false;
196        }
197}
198
199static void
200bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
201{
202        struct util_dynarray last_read[64], last_write[64];
203
204        for (unsigned i = 0; i < 64; ++i) {
205                util_dynarray_init(&last_read[i], NULL);
206                util_dynarray_init(&last_write[i], NULL);
207        }
208
209        /* Initialize dependency graph */
210        for (unsigned i = 0; i < st.count; ++i) {
211                st.dependents[i] =
212                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
213
214                st.dep_counts[i] = 0;
215        }
216
217        unsigned prev_msg = ~0;
218
219        /* Populate dependency graph */
220        for (signed i = st.count - 1; i >= 0; --i) {
221                bi_instr *ins = st.instructions[i];
222
223                bi_foreach_src(ins, s) {
224                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
225                        unsigned count = bi_count_read_registers(ins, s);
226
227                        for (unsigned c = 0; c < count; ++c)
228                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
229                }
230
231                /* Keep message-passing ops in order. (This pass only cares
232                 * about bundling; reordering of message-passing instructions
233                 * happens during earlier scheduling.) */
234
235                if (bi_message_type_for_instr(ins)) {
236                        if (prev_msg != ~0)
237                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
238
239                        prev_msg = i;
240                }
241
242                /* Handle schedule barriers, adding All the deps */
243                if (inorder || bi_is_sched_barrier(ins)) {
244                        for (unsigned j = 0; j < st.count; ++j) {
245                                if (i == j) continue;
246
247                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
248                                                st.dependents, st.dep_counts);
249                        }
250                }
251
252                bi_foreach_dest(ins, d) {
253                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
254                        unsigned dest = ins->dest[d].value;
255
256                        unsigned count = bi_count_write_registers(ins, d);
257
258                        for (unsigned c = 0; c < count; ++c) {
259                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
260                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
261                                mark_access(last_write, dest + c, i);
262                        }
263                }
264
265                /* Blend shaders are allowed to clobber R0-R15. Treat these
266                 * registers like extra destinations for scheduling purposes.
267                 */
268                if (ins->op == BI_OPCODE_BLEND && !is_blend) {
269                        for (unsigned c = 0; c < 16; ++c) {
270                                add_dependency(last_read, c, i, st.dependents, st.dep_counts);
271                                add_dependency(last_write, c, i, st.dependents, st.dep_counts);
272                                mark_access(last_write, c, i);
273                        }
274                }
275
276                bi_foreach_src(ins, s) {
277                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
278
279                        unsigned count = bi_count_read_registers(ins, s);
280
281                        for (unsigned c = 0; c < count; ++c)
282                                mark_access(last_read, ins->src[s].value + c, i);
283                }
284        }
285
286        /* If there is a branch, all instructions depend on it, as interblock
287         * execution must be purely in-order */
288
289        bi_instr *last = st.instructions[st.count - 1];
290        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
291                for (signed i = st.count - 2; i >= 0; --i)
292                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
293        }
294
295        /* Free the intermediate structures */
296        for (unsigned i = 0; i < 64; ++i) {
297                util_dynarray_fini(&last_read[i]);
298                util_dynarray_fini(&last_write[i]);
299        }
300}
301
302/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
303 * Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
304 */
305
306static bi_instr *
307bi_lower_cubeface(bi_context *ctx,
308                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
309{
310        bi_instr *pinstr = tuple->add;
311        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
312        bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
313                        pinstr->src[0], pinstr->src[1], pinstr->src[2]);
314
315        pinstr->op = BI_OPCODE_CUBEFACE2;
316        pinstr->dest[0] = pinstr->dest[1];
317        pinstr->dest[1] = bi_null();
318        pinstr->src[0] = cubeface1->dest[0];
319        pinstr->src[1] = bi_null();
320        pinstr->src[2] = bi_null();
321
322        return cubeface1;
323}
324
325/* Psuedo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
326 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
327 * arguments (rbase, address lo, address hi, rbase) */
328
329static bi_instr *
330bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
331                bi_tuple_state *tuple)
332{
333        bi_instr *pinstr = tuple->add;
334        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
335        bi_instr *atom_c = bi_atom_c_return_i32(&b,
336                        pinstr->src[1], pinstr->src[2], pinstr->src[0],
337                        pinstr->atom_opc);
338
339        if (bi_is_null(pinstr->dest[0]))
340                atom_c->op = BI_OPCODE_ATOM_C_I32;
341
342        pinstr->op = BI_OPCODE_ATOM_CX;
343        pinstr->src[3] = atom_c->src[2];
344
345        return atom_c;
346}
347
348static bi_instr *
349bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
350                bi_tuple_state *tuple)
351{
352        bi_instr *pinstr = tuple->add;
353        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
354        bi_instr *atom_c = bi_atom_c1_return_i32(&b,
355                        pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
356
357        if (bi_is_null(pinstr->dest[0]))
358                atom_c->op = BI_OPCODE_ATOM_C1_I32;
359
360        pinstr->op = BI_OPCODE_ATOM_CX;
361        pinstr->src[2] = pinstr->src[1];
362        pinstr->src[1] = pinstr->src[0];
363        pinstr->src[3] = bi_dontcare();
364        pinstr->src[0] = bi_null();
365
366        return atom_c;
367}
368
369static bi_instr *
370bi_lower_seg_add(bi_context *ctx,
371                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
372{
373        bi_instr *pinstr = tuple->add;
374        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
375
376        bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
377                        pinstr->preserve_null, pinstr->seg);
378
379        pinstr->op = BI_OPCODE_SEG_ADD;
380        pinstr->src[0] = pinstr->src[1];
381        pinstr->src[1] = bi_null();
382
383        assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
384        pinstr->dest[0].value += 1;
385
386        return fma;
387}
388
389static bi_instr *
390bi_lower_dtsel(bi_context *ctx,
391                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
392{
393        bi_instr *add = tuple->add;
394        bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
395
396        bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
397                        add->src[0], add->table);
398        add->src[0] = dtsel->dest[0];
399
400        assert(bi_supports_dtsel(add));
401        return dtsel;
402}
403
404/* Flatten linked list to array for O(1) indexing */
405
406static bi_instr **
407bi_flatten_block(bi_block *block, unsigned *len)
408{
409        if (list_is_empty(&block->instructions))
410                return NULL;
411
412        *len = list_length(&block->instructions);
413        bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
414
415        unsigned i = 0;
416
417        bi_foreach_instr_in_block(block, ins)
418                instructions[i++] = ins;
419
420        return instructions;
421}
422
423/* The worklist would track instructions without outstanding dependencies. For
424 * debug, force in-order scheduling (no dependency graph is constructed).
425 */
426
427static struct bi_worklist
428bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
429{
430        struct bi_worklist st = { };
431        st.instructions = bi_flatten_block(block, &st.count);
432
433        if (!st.count)
434                return st;
435
436        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
437        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
438
439        bi_create_dependency_graph(st, inorder, is_blend);
440        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
441
442        for (unsigned i = 0; i < st.count; ++i) {
443                if (st.dep_counts[i] == 0)
444                        BITSET_SET(st.worklist, i);
445        }
446
447        return st;
448}
449
450static void
451bi_free_worklist(struct bi_worklist st)
452{
453        free(st.dep_counts);
454        free(st.dependents);
455        free(st.instructions);
456        free(st.worklist);
457}
458
459static void
460bi_update_worklist(struct bi_worklist st, unsigned idx)
461{
462        assert(st.dep_counts[idx] == 0);
463
464        if (!st.dependents[idx])
465                return;
466
467        /* Iterate each dependent to remove one dependency (`done`),
468         * adding dependents to the worklist where possible. */
469
470        unsigned i;
471        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
472                assert(st.dep_counts[i] != 0);
473                unsigned new_deps = --st.dep_counts[i];
474
475                if (new_deps == 0)
476                        BITSET_SET(st.worklist, i);
477        }
478
479        free(st.dependents[idx]);
480}
481
482/* Scheduler predicates */
483
484/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
485static bool
486bi_can_iaddc(bi_instr *ins)
487{
488        return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
489                ins->src[0].swizzle == BI_SWIZZLE_H01 &&
490                ins->src[1].swizzle == BI_SWIZZLE_H01);
491}
492
493/*
494 * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
495 * encodings are permitted by swapping operands; however, this scheme fails if
496 * both operands are equal. Test for this case.
497 */
498static bool
499bi_impacted_abs(bi_instr *I)
500{
501        return I->src[0].abs && I->src[1].abs &&
502               bi_is_word_equiv(I->src[0], I->src[1]);
503}
504
505bool
506bi_can_fma(bi_instr *ins)
507{
508        /* +IADD.i32 -> *IADDC.i32 */
509        if (bi_can_iaddc(ins))
510                return true;
511
512        /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
513        if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
514                return false;
515
516        /* TODO: some additional fp16 constraints */
517        return bi_opcode_props[ins->op].fma;
518}
519
520static bool
521bi_impacted_fadd_widens(bi_instr *I)
522{
523        enum bi_swizzle swz0 = I->src[0].swizzle;
524        enum bi_swizzle swz1 = I->src[1].swizzle;
525
526        return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
527                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
528                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
529}
530
531bool
532bi_can_add(bi_instr *ins)
533{
534        /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
535        if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
536                return false;
537
538        /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
539        if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
540                return false;
541
542        /* +FADD.f32 has restricted widens, use +FADD.f32 for the full set */
543        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
544               return false;
545
546        /* TODO: some additional fp16 constraints */
547        return bi_opcode_props[ins->op].add;
548}
549
550/* Architecturally, no single instruction has a "not last" constraint. However,
551 * pseudoinstructions writing multiple destinations (expanding to multiple
552 * paired instructions) can run afoul of the "no two writes on the last clause"
553 * constraint, so we check for that here.
554 */
555
556static bool
557bi_must_not_last(bi_instr *ins)
558{
559        return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
560}
561
562/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
563 * treat it as a message-passing instruction for the purpose of scheduling
564 * despite no passing no logical message. Otherwise invalid encoding faults may
565 * be raised for unknown reasons (possibly an errata).
566 */
567
568bool
569bi_must_message(bi_instr *ins)
570{
571        return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
572                (ins->op == BI_OPCODE_DISCARD_F32);
573}
574
575static bool
576bi_fma_atomic(enum bi_opcode op)
577{
578        switch (op) {
579        case BI_OPCODE_ATOM_C_I32:
580        case BI_OPCODE_ATOM_C_I64:
581        case BI_OPCODE_ATOM_C1_I32:
582        case BI_OPCODE_ATOM_C1_I64:
583        case BI_OPCODE_ATOM_C1_RETURN_I32:
584        case BI_OPCODE_ATOM_C1_RETURN_I64:
585        case BI_OPCODE_ATOM_C_RETURN_I32:
586        case BI_OPCODE_ATOM_C_RETURN_I64:
587        case BI_OPCODE_ATOM_POST_I32:
588        case BI_OPCODE_ATOM_POST_I64:
589        case BI_OPCODE_ATOM_PRE_I64:
590                return true;
591        default:
592                return false;
593        }
594}
595
596bool
597bi_reads_zero(bi_instr *ins)
598{
599        return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
600}
601
602bool
603bi_reads_temps(bi_instr *ins, unsigned src)
604{
605        switch (ins->op) {
606        /* Cannot permute a temporary */
607        case BI_OPCODE_CLPER_I32:
608        case BI_OPCODE_CLPER_V6_I32:
609                return src != 0;
610        case BI_OPCODE_IMULD:
611                return false;
612        default:
613                return true;
614        }
615}
616
617static bool
618bi_impacted_t_modifiers(bi_instr *I, unsigned src)
619{
620        enum bi_swizzle swizzle = I->src[src].swizzle;
621
622        switch (I->op) {
623        case BI_OPCODE_F16_TO_F32:
624        case BI_OPCODE_F16_TO_S32:
625        case BI_OPCODE_F16_TO_U32:
626        case BI_OPCODE_MKVEC_V2I16:
627        case BI_OPCODE_S16_TO_F32:
628        case BI_OPCODE_S16_TO_S32:
629        case BI_OPCODE_U16_TO_F32:
630        case BI_OPCODE_U16_TO_U32:
631                return (swizzle != BI_SWIZZLE_H00);
632
633        case BI_OPCODE_BRANCH_F32:
634        case BI_OPCODE_LOGB_F32:
635        case BI_OPCODE_ILOGB_F32:
636        case BI_OPCODE_FADD_F32:
637        case BI_OPCODE_FCMP_F32:
638        case BI_OPCODE_FREXPE_F32:
639        case BI_OPCODE_FREXPM_F32:
640        case BI_OPCODE_FROUND_F32:
641                return (swizzle != BI_SWIZZLE_H01);
642
643        case BI_OPCODE_IADD_S32:
644        case BI_OPCODE_IADD_U32:
645        case BI_OPCODE_ISUB_S32:
646        case BI_OPCODE_ISUB_U32:
647        case BI_OPCODE_IADD_V4S8:
648        case BI_OPCODE_IADD_V4U8:
649        case BI_OPCODE_ISUB_V4S8:
650        case BI_OPCODE_ISUB_V4U8:
651                return (src == 1) && (swizzle != BI_SWIZZLE_H01);
652
653        case BI_OPCODE_S8_TO_F32:
654        case BI_OPCODE_S8_TO_S32:
655        case BI_OPCODE_U8_TO_F32:
656        case BI_OPCODE_U8_TO_U32:
657                return (swizzle != BI_SWIZZLE_B0000);
658
659        case BI_OPCODE_V2S8_TO_V2F16:
660        case BI_OPCODE_V2S8_TO_V2S16:
661        case BI_OPCODE_V2U8_TO_V2F16:
662        case BI_OPCODE_V2U8_TO_V2U16:
663                return (swizzle != BI_SWIZZLE_B0022);
664
665        case BI_OPCODE_IADD_V2S16:
666        case BI_OPCODE_IADD_V2U16:
667        case BI_OPCODE_ISUB_V2S16:
668        case BI_OPCODE_ISUB_V2U16:
669                return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
670
671#if 0
672        /* Restriction on IADD in 64-bit clauses on G72 */
673        case BI_OPCODE_IADD_S64:
674        case BI_OPCODE_IADD_U64:
675                return (src == 1) && (swizzle != BI_SWIZZLE_D0);
676#endif
677
678        default:
679                return false;
680        }
681}
682
683bool
684bi_reads_t(bi_instr *ins, unsigned src)
685{
686        /* Branch offset cannot come from passthrough */
687        if (bi_opcode_props[ins->op].branch)
688                return src != 2;
689
690        /* Table can never read passthrough */
691        if (bi_opcode_props[ins->op].table)
692                return false;
693
694        /* Staging register reads may happen before the succeeding register
695         * block encodes a write, so effectively there is no passthrough */
696        if (src == 0 && bi_opcode_props[ins->op].sr_read)
697                return false;
698
699        /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
700         * same-cycle temporaries. Check the list for these hazards. */
701        if (bi_impacted_t_modifiers(ins, src))
702                return false;
703
704        /* Descriptor must not come from a passthrough */
705        switch (ins->op) {
706        case BI_OPCODE_LD_CVT:
707        case BI_OPCODE_LD_TILE:
708        case BI_OPCODE_ST_CVT:
709        case BI_OPCODE_ST_TILE:
710        case BI_OPCODE_TEXC:
711                return src != 2;
712        case BI_OPCODE_BLEND:
713                return src != 2 && src != 3;
714
715        /* Else, just check if we can read any temps */
716        default:
717                return bi_reads_temps(ins, src);
718        }
719}
720
721/* Counts the number of 64-bit constants required by a clause. TODO: We
722 * might want to account for merging, right now we overestimate, but
723 * that's probably fine most of the time */
724
725static unsigned
726bi_nconstants(struct bi_clause_state *clause)
727{
728        unsigned count_32 = 0;
729
730        for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
731                count_32 += clause->consts[i].constant_count;
732
733        return DIV_ROUND_UP(count_32, 2);
734}
735
736/* Would there be space for constants if we added one tuple? */
737
738static bool
739bi_space_for_more_constants(struct bi_clause_state *clause)
740{
741        return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
742}
743
744/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
745 * possible (as a precondition), though not necessarily on the selected unit;
746 * this is gauranteed per-instruction by bi_lower_fau and per-tuple by
747 * bi_instr_schedulable */
748
749static bool
750bi_update_fau(struct bi_clause_state *clause,
751                struct bi_tuple_state *tuple,
752                bi_instr *instr, bool fma, bool destructive)
753{
754        /* Maintain our own constants, for nondestructive mode */
755        uint32_t copied_constants[2], copied_count;
756        unsigned *constant_count = &tuple->constant_count;
757        uint32_t *constants = tuple->constants;
758        enum bir_fau fau = tuple->fau;
759
760        if (!destructive) {
761                memcpy(copied_constants, tuple->constants,
762                                (*constant_count) * sizeof(constants[0]));
763                copied_count = tuple->constant_count;
764
765                constant_count = &copied_count;
766                constants = copied_constants;
767        }
768
769        bi_foreach_src(instr, s) {
770                bi_index src = instr->src[s];
771
772                if (src.type == BI_INDEX_FAU) {
773                        bool no_constants = *constant_count == 0;
774                        bool no_other_fau = (fau == src.value) || !fau;
775                        bool mergable = no_constants && no_other_fau;
776
777                        if (destructive) {
778                                assert(mergable);
779                                tuple->fau = src.value;
780                        } else if (!mergable) {
781                                return false;
782                        }
783
784                        fau = src.value;
785                } else if (src.type == BI_INDEX_CONSTANT) {
786                        /* No need to reserve space if we have a fast 0 */
787                        if (src.value == 0 && fma && bi_reads_zero(instr))
788                                continue;
789
790                        /* If there is a branch target, #0 by convention is the
791                         * PC-relative offset to the target */
792                        bool pcrel = instr->branch_target && src.value == 0;
793                        bool found = false;
794
795                        for (unsigned i = 0; i < *constant_count; ++i) {
796                                found |= (constants[i] == src.value) &&
797                                        (i != tuple->pcrel_idx);
798                        }
799
800                        /* pcrel constants are unique, so don't match */
801                        if (found && !pcrel)
802                                continue;
803
804                        bool no_fau = (*constant_count > 0) || !fau;
805                        bool mergable = no_fau && ((*constant_count) < 2);
806
807                        if (destructive) {
808                                assert(mergable);
809
810                                if (pcrel)
811                                        tuple->pcrel_idx = *constant_count;
812                        } else if (!mergable)
813                                return false;
814
815                        constants[(*constant_count)++] = src.value;
816                }
817        }
818
819        /* Constants per clause may be limited by tuple count */
820        bool room_for_constants = (*constant_count == 0) ||
821                bi_space_for_more_constants(clause);
822
823        if (destructive)
824                assert(room_for_constants);
825        else if (!room_for_constants)
826                return false;
827
828        return true;
829}
830
831/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
832 * and a source (index) from that candidate, determine whether this source is
833 * "new", in the sense of requiring an additional read slot. That is, checks
834 * whether the specified source reads from the register file via a read slot
835 * (determined by its type and placement) and whether the source was already
836 * specified by a prior read slot (to avoid double counting) */
837
838static bool
839bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
840{
841        bi_index src = instr->src[src_idx];
842
843        /* Only consider sources which come from the register file */
844        if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
845                return false;
846
847        /* Staging register reads bypass the usual register file mechanism */
848        if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
849                return false;
850
851        /* If a source is already read in the tuple, it is already counted */
852        for (unsigned t = 0; t < reg->nr_reads; ++t)
853                if (bi_is_word_equiv(src, reg->reads[t]))
854                        return false;
855
856        /* If a source is read in _this instruction_, it is already counted */
857        for (unsigned t = 0; t < src_idx; ++t)
858                if (bi_is_word_equiv(src, instr->src[t]))
859                        return false;
860
861        return true;
862}
863
864/* Given two tuples in source order, count the number of register reads of the
865 * successor, determined as the number of unique words accessed that aren't
866 * written by the predecessor (since those are tempable).
867 */
868
869static unsigned
870bi_count_succ_reads(bi_index t0, bi_index t1,
871                bi_index *succ_reads, unsigned nr_succ_reads)
872{
873        unsigned reads = 0;
874
875        for (unsigned i = 0; i < nr_succ_reads; ++i) {
876                bool unique = true;
877
878                for (unsigned j = 0; j < i; ++j)
879                        if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
880                                unique = false;
881
882                if (!unique)
883                        continue;
884
885                if (bi_is_word_equiv(succ_reads[i], t0))
886                        continue;
887
888                if (bi_is_word_equiv(succ_reads[i], t1))
889                        continue;
890
891                reads++;
892        }
893
894        return reads;
895}
896
897/* Not all instructions can read from the staging passthrough (as determined by
898 * reads_t), check if a given pair of instructions has such a restriction. Note
899 * we also use this mechanism to prevent data races around staging register
900 * reads, so we allow the input source to potentially be vector-valued */
901
902static bool
903bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
904{
905        bi_foreach_src(add, s) {
906                bi_index src = add->src[s];
907
908                if (src.type != BI_INDEX_REGISTER)
909                        continue;
910
911                unsigned count = bi_count_read_registers(add, s);
912                bool read = false;
913
914                for (unsigned d = 0; d < count; ++d)
915                        read |= bi_is_equiv(fma, bi_register(src.value + d));
916
917                if (read && !bi_reads_t(add, s))
918                        return true;
919        }
920
921        return false;
922}
923
924/* Likewise for cross-tuple passthrough (reads_temps) */
925
926static bool
927bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
928{
929        bi_foreach_instr_in_tuple(succ, pins) {
930                bi_foreach_src(pins, s) {
931                        if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
932                                        !bi_reads_temps(pins, s))
933                                return true;
934                }
935        }
936
937        return false;
938}
939
940/* Is a register written other than the staging mechanism? ATEST is special,
941 * writing to both a staging register and a regular register (fixed packing).
942 * BLEND is special since it has to write r48 the normal way even if it never
943 * gets read. This depends on liveness analysis, as a register is not needed
944 * for a write that will be discarded after one tuple. */
945
946static unsigned
947bi_write_count(bi_instr *instr, uint64_t live_after_temp)
948{
949        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
950                return 1;
951
952        unsigned count = 0;
953
954        bi_foreach_dest(instr, d) {
955                if (d == 0 && bi_opcode_props[instr->op].sr_write)
956                        continue;
957
958                if (bi_is_null(instr->dest[d]))
959                        continue;
960
961                assert(instr->dest[0].type == BI_INDEX_REGISTER);
962                if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
963                        count++;
964        }
965
966        return count;
967}
968
969/* Instruction placement entails two questions: what subset of instructions in
970 * the block can legally be scheduled? and of those which is the best? That is,
971 * we seek to maximize a cost function on a subset of the worklist satisfying a
972 * particular predicate. The necessary predicate is determined entirely by
973 * Bifrost's architectural limitations and is described in the accompanying
974 * whitepaper. The cost function is a heuristic. */
975
976static bool
977bi_instr_schedulable(bi_instr *instr,
978                struct bi_clause_state *clause,
979                struct bi_tuple_state *tuple,
980                uint64_t live_after_temp,
981                bool fma)
982{
983        /* The units must match */
984        if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
985                return false;
986
987        /* There can only be one message-passing instruction per clause */
988        if (bi_must_message(instr) && clause->message)
989                return false;
990
991        /* Some instructions have placement requirements */
992        if (bi_opcode_props[instr->op].last && !tuple->last)
993                return false;
994
995        if (bi_must_not_last(instr) && tuple->last)
996                return false;
997
998        /* Message-passing instructions are not guaranteed write within the
999         * same clause (most likely they will not), so if a later instruction
1000         * in the clause accesses the destination, the message-passing
1001         * instruction can't be scheduled */
1002        if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
1003                unsigned nr = bi_count_write_registers(instr, 0);
1004                assert(instr->dest[0].type == BI_INDEX_REGISTER);
1005                unsigned reg = instr->dest[0].value;
1006
1007                for (unsigned i = 0; i < clause->access_count; ++i) {
1008                        bi_index idx = clause->accesses[i];
1009                        for (unsigned d = 0; d < nr; ++d) {
1010                                if (bi_is_equiv(bi_register(reg + d), idx))
1011                                        return false;
1012                        }
1013                }
1014        }
1015
1016        if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1017                unsigned nr = bi_count_read_registers(instr, 0);
1018                assert(instr->src[0].type == BI_INDEX_REGISTER);
1019                unsigned reg = instr->src[0].value;
1020
1021                for (unsigned i = 0; i < clause->access_count; ++i) {
1022                        bi_index idx = clause->accesses[i];
1023                        for (unsigned d = 0; d < nr; ++d) {
1024                                if (bi_is_equiv(bi_register(reg + d), idx))
1025                                        return false;
1026                        }
1027                }
1028        }
1029
1030        /* If FAU is already assigned, we may not disrupt that. Do a
1031         * non-disruptive test update */
1032        if (!bi_update_fau(clause, tuple, instr, fma, false))
1033                return false;
1034
1035        /* If this choice of FMA would force a staging passthrough, the ADD
1036         * instruction must support such a passthrough */
1037        if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1038                return false;
1039
1040        /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1041        if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1042                return false;
1043
1044        /* Register file writes are limited */
1045        unsigned total_writes = tuple->reg.nr_writes;
1046        total_writes += bi_write_count(instr, live_after_temp);
1047
1048        /* Last tuple in a clause can only write a single value */
1049        if (tuple->last && total_writes > 1)
1050                return false;
1051
1052        /* Register file reads are limited, so count unique */
1053
1054        unsigned unique_new_srcs = 0;
1055
1056        bi_foreach_src(instr, s) {
1057                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1058                        unique_new_srcs++;
1059        }
1060
1061        unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1062
1063        bool can_spill_to_moves = (!tuple->add);
1064        can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1065        can_spill_to_moves &= (clause->tuple_count < 7);
1066
1067        /* However, we can get an extra 1 or 2 sources by inserting moves */
1068        if (total_srcs > (can_spill_to_moves ? 4 : 3))
1069                return false;
1070
1071        /* Count effective reads for the successor */
1072        unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1073                        tuple->add ? tuple->add->dest[0] : bi_null(),
1074                        tuple->prev_reads, tuple->nr_prev_reads);
1075
1076        /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1077        if ((signed) total_writes > (4 - (signed) succ_reads))
1078                return false;
1079
1080        return true;
1081}
1082
1083static signed
1084bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1085{
1086        signed cost = 0;
1087
1088        /* Instructions that can schedule to either FMA or to ADD should be
1089         * deprioritized since they're easier to reschedule elsewhere */
1090        if (bi_can_fma(instr) && bi_can_add(instr))
1091                cost++;
1092
1093        /* Message-passing instructions impose constraints on the registers
1094         * later in the clause, so schedule them as late within a clause as
1095         * possible (<==> prioritize them since we're backwards <==> decrease
1096         * cost) */
1097        if (bi_must_message(instr))
1098                cost--;
1099
1100        /* Last instructions are big constraints (XXX: no effect on shader-db) */
1101        if (bi_opcode_props[instr->op].last)
1102                cost -= 2;
1103
1104        return cost;
1105}
1106
1107static unsigned
1108bi_choose_index(struct bi_worklist st,
1109                struct bi_clause_state *clause,
1110                struct bi_tuple_state *tuple,
1111                uint64_t live_after_temp,
1112                bool fma)
1113{
1114        unsigned i, best_idx = ~0;
1115        signed best_cost = INT_MAX;
1116
1117        BITSET_FOREACH_SET(i, st.worklist, st.count) {
1118                bi_instr *instr = st.instructions[i];
1119
1120                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1121                        continue;
1122
1123                signed cost = bi_instr_cost(instr, tuple);
1124
1125                /* Tie break in favour of later instructions, under the
1126                 * assumption this promotes temporary usage (reducing pressure
1127                 * on the register file). This is a side effect of a prepass
1128                 * scheduling for pressure. */
1129
1130                if (cost <= best_cost) {
1131                        best_idx = i;
1132                        best_cost = cost;
1133                }
1134        }
1135
1136        return best_idx;
1137}
1138
1139static void
1140bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1141                bi_instr *instr, uint64_t live_after_temp, bool fma)
1142{
1143        bi_update_fau(clause, tuple, instr, fma, true);
1144
1145        /* TODO: maybe opt a bit? or maybe doesn't matter */
1146        assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1147        memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1148        clause->access_count += BI_MAX_SRCS;
1149        memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1150        clause->access_count += BI_MAX_DESTS;
1151        tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1152
1153        bi_foreach_src(instr, s) {
1154                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1155                        tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1156        }
1157}
1158
1159/* Choose the best instruction and pop it off the worklist. Returns NULL if no
1160 * instruction is available. This function is destructive. */
1161
1162static bi_instr *
1163bi_take_instr(bi_context *ctx, struct bi_worklist st,
1164                struct bi_clause_state *clause,
1165                struct bi_tuple_state *tuple,
1166                uint64_t live_after_temp,
1167                bool fma)
1168{
1169        if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1170                return bi_lower_cubeface(ctx, clause, tuple);
1171        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
1172                return bi_lower_atom_c(ctx, clause, tuple);
1173        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
1174                return bi_lower_atom_c1(ctx, clause, tuple);
1175        else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1176                return bi_lower_seg_add(ctx, clause, tuple);
1177        else if (tuple->add && tuple->add->table)
1178                return bi_lower_dtsel(ctx, clause, tuple);
1179
1180        /* TODO: Optimize these moves */
1181        if (!fma && tuple->nr_prev_reads > 3) {
1182                /* Only spill by one source for now */
1183                assert(tuple->nr_prev_reads == 4);
1184
1185                /* Pick a source to spill */
1186                bi_index src = tuple->prev_reads[0];
1187
1188                /* Schedule the spill */
1189                bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1190                bi_instr *mov = bi_mov_i32_to(&b, src, src);
1191                bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1192                return mov;
1193        }
1194
1195#ifndef NDEBUG
1196        /* Don't pair instructions if debugging */
1197        if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1198                return NULL;
1199#endif
1200
1201        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1202
1203        if (idx >= st.count)
1204                return NULL;
1205
1206        /* Update state to reflect taking the instruction */
1207        bi_instr *instr = st.instructions[idx];
1208
1209        BITSET_CLEAR(st.worklist, idx);
1210        bi_update_worklist(st, idx);
1211        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1212
1213        /* Fixups */
1214        if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1215                assert(bi_can_iaddc(instr));
1216                instr->op = BI_OPCODE_IADDC_I32;
1217                instr->src[2] = bi_zero();
1218        }
1219
1220        return instr;
1221}
1222
1223/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1224 * to a passthrough register. If except_zero is true, the zeroth (first) source
1225 * is skipped, so staging register reads are not accidentally encoded as
1226 * passthrough (which is impossible) */
1227
1228static void
1229bi_use_passthrough(bi_instr *ins, bi_index old,
1230                enum bifrost_packed_src new,
1231                bool except_zero)
1232{
1233        /* Optional for convenience */
1234        if (!ins || bi_is_null(old))
1235                return;
1236
1237        bi_foreach_src(ins, i) {
1238                if (i == 0 && except_zero)
1239                        continue;
1240
1241                if (bi_is_word_equiv(ins->src[i], old)) {
1242                        ins->src[i].type = BI_INDEX_PASS;
1243                        ins->src[i].value = new;
1244                        ins->src[i].reg = false;
1245                        ins->src[i].offset = 0;
1246                }
1247        }
1248}
1249
1250/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1251 * intertuple passthroughs where necessary. Passthroughs are allowed as a
1252 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1253 * opposite the order of execution. This is deliberate -- if both FMA and ADD
1254 * write to the same logical register, the next executed tuple will get the
1255 * latter result. There's no interference issue under the assumption of correct
1256 * register allocation. */
1257
1258static void
1259bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1260{
1261        bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1262
1263        if (prec.add) {
1264                bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1265                bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1266        }
1267
1268        if (prec.fma) {
1269                bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1270                bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1271        }
1272}
1273
1274static void
1275bi_rewrite_fau_to_pass(bi_tuple *tuple)
1276{
1277        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1278                if (ins->src[s].type != BI_INDEX_FAU) continue;
1279
1280                bi_index pass = bi_passthrough(ins->src[s].offset ?
1281                                BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1282
1283                ins->src[s] = bi_replace_index(ins->src[s], pass);
1284        }
1285}
1286
1287static void
1288bi_rewrite_zero(bi_instr *ins, bool fma)
1289{
1290        bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1291
1292        bi_foreach_src(ins, s) {
1293                bi_index src = ins->src[s];
1294
1295                if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1296                        ins->src[s] = bi_replace_index(src, zero);
1297        }
1298}
1299
1300/* Assumes #0 to {T, FAU} rewrite has already occurred */
1301
1302static void
1303bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1304{
1305        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1306                if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1307
1308                uint32_t cons = ins->src[s].value;
1309
1310                ASSERTED bool lo = (cons == (constant & 0xffffffff));
1311                bool hi = (cons == (constant >> 32ull));
1312
1313                /* PC offsets always live in the upper half, set to zero by
1314                 * convention before pack time. (This is safe, since if you
1315                 * wanted to compare against zero, you would use a BRANCHZ
1316                 * instruction instead.) */
1317                if (cons == 0 && ins->branch_target != NULL) {
1318                        assert(pcrel);
1319                        hi = true;
1320                        lo = false;
1321                } else if (pcrel) {
1322                        hi = false;
1323                }
1324
1325                assert(lo || hi);
1326
1327                ins->src[s] = bi_replace_index(ins->src[s],
1328                                bi_passthrough(hi ?  BIFROST_SRC_FAU_HI :
1329                                        BIFROST_SRC_FAU_LO));
1330        }
1331}
1332
1333/* Constructs a constant state given a tuple state. This has the
1334 * postcondition that pcrel applies to the first constant by convention,
1335 * and PC-relative constants will be #0 by convention here, so swap to
1336 * match if needed */
1337
1338static struct bi_const_state
1339bi_get_const_state(struct bi_tuple_state *tuple)
1340{
1341        struct bi_const_state consts = {
1342                .constant_count = tuple->constant_count,
1343                .constants[0] = tuple->constants[0],
1344                .constants[1] = tuple->constants[1],
1345                .pcrel = tuple->add && tuple->add->branch_target,
1346        };
1347
1348        /* pcrel applies to the first constant by convention, and
1349         * PC-relative constants will be #0 by convention here, so swap
1350         * to match if needed */
1351        if (consts.pcrel && consts.constants[0]) {
1352                assert(consts.constant_count == 2);
1353                assert(consts.constants[1] == 0);
1354
1355                consts.constants[1] = consts.constants[0];
1356                consts.constants[0] = 0;
1357        }
1358
1359        return consts;
1360}
1361
1362/* Merges constants in a clause, satisfying the following rules, assuming no
1363 * more than one tuple has pcrel:
1364 *
1365 * 1. If a tuple has two constants, they must be packed together. If one is
1366 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
1367 * (PC << 32)]. Otherwise choose an arbitrary order.
1368 *
1369 * 4. If a tuple has one constant, it may be shared with an existing
1370 * pair that already contains that constant, or it may be combined with another
1371 * (distinct) tuple of a single constant.
1372 *
1373 * This gaurantees a packing is possible. The next routine handles modification
1374 * related swapping, to satisfy format 12 and the lack of modification for
1375 * tuple count 5/8 in EC0.
1376 */
1377
1378static uint64_t
1379bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1380{
1381        /* At this point in the constant merge algorithm, pcrel constants are
1382         * treated as zero, so pcrel implies at least one constants is zero */
1383        assert(!pcrel || (c0 == 0 || c1 == 0));
1384
1385        /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1386        uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1387        uint32_t lo = (c0 == hi) ? c1 : c0;
1388
1389        /* Merge in the selected order */
1390        return lo | (((uint64_t) hi) << 32ull);
1391}
1392
1393static unsigned
1394bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1395                uint64_t *merged, unsigned *pcrel_pair)
1396{
1397        unsigned merge_count = 0;
1398
1399        for (unsigned t = 0; t < tuple_count; ++t) {
1400                if (consts[t].constant_count != 2) continue;
1401
1402                unsigned idx = ~0;
1403                uint64_t val = bi_merge_u32(consts[t].constants[0],
1404                                consts[t].constants[1], consts[t].pcrel);
1405
1406                /* Skip the pcrel pair if assigned, because if one is assigned,
1407                 * this one is not pcrel by uniqueness so it's a mismatch */
1408                for (unsigned s = 0; s < merge_count; ++s) {
1409                        if (merged[s] == val && (*pcrel_pair) != s) {
1410                                idx = s;
1411                                break;
1412                        }
1413                }
1414
1415                if (idx == ~0) {
1416                        idx = merge_count++;
1417                        merged[idx] = val;
1418
1419                        if (consts[t].pcrel)
1420                                (*pcrel_pair) = idx;
1421                }
1422
1423                consts[t].word_idx = idx;
1424        }
1425
1426        return merge_count;
1427}
1428
1429static unsigned
1430bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1431                uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1432{
1433        bool pending = false, pending_pcrel = false;
1434        uint32_t pending_single = 0;
1435
1436        for (unsigned t = 0; t < tuple_count; ++t) {
1437                if (consts[t].constant_count != 1) continue;
1438
1439                uint32_t val = consts[t].constants[0];
1440                unsigned idx = ~0;
1441
1442                /* Try to match, but don't match pcrel with non-pcrel, even
1443                 * though we can merge a pcrel with a non-pcrel single */
1444                for (unsigned i = 0; i < pair_count; ++i) {
1445                        bool lo = ((pairs[i] & 0xffffffff) == val);
1446                        bool hi = ((pairs[i] >> 32) == val);
1447                        bool match = (lo || hi);
1448                        match &= ((*pcrel_pair) != i);
1449                        if (match && !consts[t].pcrel) {
1450                                idx = i;
1451                                break;
1452                        }
1453                }
1454
1455                if (idx == ~0) {
1456                        idx = pair_count;
1457
1458                        if (pending && pending_single != val) {
1459                                assert(!(pending_pcrel && consts[t].pcrel));
1460                                bool pcrel = pending_pcrel || consts[t].pcrel;
1461
1462                                if (pcrel)
1463                                        *pcrel_pair = idx;
1464
1465                                pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1466
1467                                pending = pending_pcrel = false;
1468                        } else {
1469                                pending = true;
1470                                pending_pcrel = consts[t].pcrel;
1471                                pending_single = val;
1472                        }
1473                }
1474
1475                consts[t].word_idx = idx;
1476        }
1477
        /* Flush any pending single. Shifting it into the high half works
         * whether or not pending_pcrel is set */
        if (pending) {
                if (pending_pcrel)
                        *pcrel_pair = pair_count;

                pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
        }

        return pair_count;
}
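
/* Worked example for bi_merge_singles above, with hypothetical values: a
 * tuple embedding the single constant 0x20 first tries to reuse either half
 * of an existing pair (e.g. one holding 0x10 and 0x20), unless PC-relative
 * constraints forbid the match. If no half matches, the single is held as
 * pending; a later unmatched single (say 0x30) is then combined with it via
 * bi_merge_u32 into a new pair, and both tuples end up pointing at that
 * pair's word_idx. A pending single left over at the end is flushed into the
 * high half of a pair of its own. */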

static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
{
        unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
        return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
}

/* Swap two constants at word i and i+1 by swapping their actual positions and
 * swapping all references so the meaning of the clause is preserved */

static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
{
        uint64_t tmp_pair = pairs[i + 0];
        pairs[i + 0] = pairs[i + 1];
        pairs[i + 1] = tmp_pair;

        for (unsigned t = 0; t < 8; ++t) {
                if (consts[t].word_idx == i)
                        consts[t].word_idx = (i + 1);
                else if (consts[t].word_idx == (i + 1))
                        consts[t].word_idx = i;
        }
}

/* Given merged constants, one of which might be PC-relative, fix up the M
 * values so the PC-relative constant (if it exists) has the M1=4 modification
 * and other constants are used as-is (which might require swapping) */
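/* For illustration (hypothetical top nibbles): consider a non-PC-relative
 * pair with A1 = 0x9 and B1 = 0x3. Then M1 = (A1 - B1) mod 16 = 6, which lies
 * strictly between 0 and 8, so the constants are swapped; after the swap the
 * difference becomes (0x3 - 0x9) mod 16 = 10 >= 8 and both constants can be
 * used unchanged. A PC-relative constant instead gets its top nibble set to
 * (B + 4) mod 16 so that M1 = 4. */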

static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                uint64_t *pairs, unsigned *pcrel_idx,
                unsigned tuple_count, unsigned constant_count)
{
        unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;

        /* Clauses with these tuple counts lack an M field for the packed EC0,
         * so EC0 cannot be PC-relative, which might require swapping (and
         * possibly adding an unused constant) to fit */

        if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
                constant_count = MAX2(constant_count, 2);
                *pcrel_idx = 1;
                bi_swap_constants(consts, pairs, 0);
        }

        /* EC0 might be packed for free; after that, constants are packed in
         * pairs (with clause format 12), with M1 values computed from each
         * pair */

        for (unsigned i = start; i < constant_count; i += 2) {
                bool swap = false;
                bool last = (i + 1) == constant_count;

                unsigned A1 = (pairs[i] >> 60);
                unsigned B1 = (pairs[i + 1] >> 60);

                if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
                        /* PC-relative constant must be E0, not E1 */
                        swap = (*pcrel_idx == (i + 1));

                        /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
                         * equivalent to A = (B + 4) mod 16 and that we can
                         * control A */
                        unsigned B = swap ? A1 : B1;
                        unsigned A = (B + 4) & 0xF;
                        pairs[*pcrel_idx] |= ((uint64_t) A) << 60;

                        /* Either way the PC-relative constant ends up at word
                         * i: directly if we don't swap, or via the swap at the
                         * bottom of the loop if we do */
                        *pcrel_idx = i;
                } else {
                        /* Compute the M1 value if we don't swap */
                        unsigned M1 = (16 + A1 - B1) & 0xF;

                        /* For M1 = 0 or M1 >= 8, the constants are unchanged.
                         * Otherwise we have 0 < (A1 - B1) % 16 < 8, which
                         * implies (B1 - A1) % 16 >= 8, so swapping will let
                         * them be used unchanged */
                        swap = (M1 != 0) && (M1 < 8);

                        /* However, we can't swap the last constant, so we
                         * force M1 = 0 instead for this case */
                        if (last && swap) {
                                pairs[i + 1] |= pairs[i] & (0xfull << 60);
                                swap = false;
                        }
                }

                if (swap) {
                        assert(!last);
                        bi_swap_constants(consts, pairs, i);
                }
        }

        return constant_count;
}

/* Schedule a single clause. If no instructions remain, return NULL. */

static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
{
        struct bi_clause_state clause_state = { 0 };
        bi_clause *clause = rzalloc(ctx, bi_clause);
        bi_tuple *tuple = NULL;

        const unsigned max_tuples = ARRAY_SIZE(clause->tuples);

        /* TODO: Decide flow control better */
        clause->flow_control = BIFROST_FLOW_NBTB;

        /* The last clause can only write one instruction, so initialize that */
        struct bi_reg_state reg_state = {};
        bi_index prev_reads[5] = { bi_null() };
        unsigned nr_prev_reads = 0;

        /* We need to track future liveness. The main *live set tracks what is
         * live at the current point in the program we are scheduling, but to
         * determine temp eligibility, we instead want what will be live after
         * the next tuple in the program. If we scheduled forwards, we'd need
         * a crystal ball for this. Luckily we schedule backwards, so we just
         * delay updates to live_after_temp by an extra tuple. */
        uint64_t live_after_temp = *live;
        uint64_t live_next_tuple = live_after_temp;

        do {
                struct bi_tuple_state tuple_state = {
                        .last = (clause->tuple_count == 0),
                        .reg = reg_state,
                        .nr_prev_reads = nr_prev_reads,
                        .prev = tuple,
                        .pcrel_idx = ~0,
                };

                assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
                memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));

                unsigned idx = max_tuples - clause->tuple_count - 1;

                tuple = &clause->tuples[idx];

                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
                        unsigned nr = bi_count_read_registers(clause->message, 0);
                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
                }

                /* Since we schedule backwards, we schedule ADD first */
                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                tuple->add = tuple_state.add;

                /* Update liveness from the new instructions */
                if (tuple->add)
                        *live = bi_postra_liveness_ins(*live, tuple->add);

                if (tuple->fma)
                        *live = bi_postra_liveness_ins(*live, tuple->fma);

                /* Rotate in the new per-tuple liveness */
                live_after_temp = live_next_tuple;
                live_next_tuple = *live;

                /* We may have a message, but only one per clause */
                if (tuple->add && bi_must_message(tuple->add)) {
                        assert(!clause_state.message);
                        clause_state.message = true;

                        clause->message_type =
                                bi_message_type_for_instr(tuple->add);
                        clause->message = tuple->add;

                        /* We don't need to set dependencies for blend shaders
                         * because the BLEND instruction in the fragment
                         * shader should have already done the wait */
                        if (!ctx->inputs->is_blend) {
                                switch (tuple->add->op) {
                                case BI_OPCODE_ATEST:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                        break;
                                case BI_OPCODE_LD_TILE:
                                case BI_OPCODE_ST_TILE:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                        break;
                                case BI_OPCODE_BLEND:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                        break;
                                default:
                                        break;
                                }
                        }
                }

                clause_state.consts[idx] = bi_get_const_state(&tuple_state);

                /* Before merging constants, eliminate zeroes, otherwise the
                 * merging will fight over the #0 that never gets read (and is
                 * never marked as read by update_fau) */
                if (tuple->fma && bi_reads_zero(tuple->fma))
                        bi_rewrite_zero(tuple->fma, true);

                /* Rewrite away FAU, constant write is deferred */
                if (!tuple_state.constant_count) {
                        tuple->fau_idx = tuple_state.fau;
                        bi_rewrite_fau_to_pass(tuple);
                }

                /* Use passthrough register for cross-stage accesses. Since
                 * there are just FMA and ADD stages, that means we rewrite to
                 * passthrough the sources of the ADD that read from the
                 * destination of the FMA */
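                /* For illustration (hypothetical tuple): if the FMA writes r0
                 * and the ADD in the same tuple reads r0, the ADD's source is
                 * rewritten to the BIFROST_SRC_STAGE passthrough below, so
                 * the value is forwarded from the FMA stage rather than going
                 * through the register file. */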

                if (tuple->fma) {
                        bi_use_passthrough(tuple->add, tuple->fma->dest[0],
                                        BIFROST_SRC_STAGE, false);
                }

                /* Don't add an empty tuple, unless the worklist has nothing
                 * but a (pseudo)instruction failing to schedule due to a "not
                 * last instruction" constraint */

                int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
                bool not_last = (some_instruction > 0) &&
                        bi_must_not_last(st.instructions[some_instruction - 1]);

                bool insert_empty = tuple_state.last && not_last;

                if (!(tuple->fma || tuple->add || insert_empty))
                        break;

                clause->tuple_count++;

                /* Adding enough tuples might overflow constants */
                if (!bi_space_for_more_constants(&clause_state))
                        break;

#ifndef NDEBUG
                /* Don't schedule more than 1 tuple if debugging */
                if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
                        break;
#endif

                /* Link through the register state */
                STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
                memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
                nr_prev_reads = tuple_state.reg.nr_reads;
                clause_state.tuple_count++;
        } while(clause->tuple_count < 8);

        /* Don't schedule an empty clause */
        if (!clause->tuple_count)
                return NULL;

        /* Before merging, rewrite away the constants in any tuple that reads
         * only zero */
        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];
                struct bi_const_state *st = &clause_state.consts[i];

                if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
                        continue;

                bi_foreach_instr_in_tuple(tuple, ins)
                        bi_rewrite_zero(ins, false);

                /* Constant has been demoted to FAU, so don't pack it separately */
                st->constant_count = 0;

                /* Default */
                assert(tuple->fau_idx == BIR_FAU_ZERO);
        }

        uint64_t constant_pairs[8] = { 0 };
        unsigned pcrel_idx = ~0;
        unsigned constant_words =
                bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);

        constant_words = bi_apply_constant_modifiers(clause_state.consts,
                        constant_pairs, &pcrel_idx, clause->tuple_count,
                        constant_words);

        clause->pcrel_idx = pcrel_idx;

        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];

                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
                if (clause_state.consts[i].constant_count == 0)
                        continue;

                /* FAU is already handled */
                assert(!tuple->fau_idx);

                unsigned word_idx = clause_state.consts[i].word_idx;
                assert(word_idx < 8);

                /* We could try to merge regardless of bottom bits as well, but
                 * that's probably diminishing returns */
                uint64_t pair = constant_pairs[word_idx];
                unsigned lo = pair & 0xF;

                tuple->fau_idx = bi_constant_field(word_idx) | lo;
                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
        }

        clause->constant_count = constant_words;
        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));

        /* Branches must be last, so this can be factored out */
        bi_instr *last = clause->tuples[max_tuples - 1].add;
        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
        clause->block = block;

        /* TODO: scoreboard assignment post-sched */
        clause->dependencies |= (1 << 0);

        /* We scheduled in reverse, emitting to the back of the tuples array,
         * so move the tuples up front for easy indexing */
        memmove(clause->tuples,
                       clause->tuples + (max_tuples - clause->tuple_count),
                       clause->tuple_count * sizeof(clause->tuples[0]));

        /* Use passthrough register for cross-tuple accesses. Note this is
         * after the memmove, so this is forwards. Skip the first tuple since
         * there is nothing before it to passthrough */

        for (unsigned t = 1; t < clause->tuple_count; ++t)
                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);

        return clause;
}

static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER,
                        ctx->inputs->is_blend);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* Back-to-back bit affects only the last clause of a block,
         * the rest are implicitly true */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (bi_reconverge_branches(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */

        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}

static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (s == 0 && bi_opcode_props[ins->op].sr_read)
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return true;
}
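
/* bi_check_fau_src above enforces the per-instruction FAU budget: at most two
 * distinct inline 32-bit constants, or a single pair of FAU words, but not
 * both. As a hypothetical example, an instruction reading both an inline
 * constant and a uniform exceeds that budget, so bi_lower_fau below copies
 * the offending source into a register with MOV.i32 and rewrites the
 * instruction to read the copy instead. */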

void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}
1951
1952/* Only v7 allows specifying a dependency on the tilebuffer for the first
1953 * clause of a shader. v6 requires adding a NOP clause with the depedency. */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
                                                 (1 << BIFROST_SLOT_ELDEST_COLOUR))))
                return;

        /* Add a NOP so we can wait for the dependencies required by the first
         * clause */

        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}

void
bi_schedule(bi_context *ctx)
{
        /* Fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_schedule_block(ctx, block);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}