qpu_schedule.c revision 01e04c3f
1/*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014-2017 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25/**
26 * @file
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30 * pick one of the heads to schedule, then add any of its children that have
31 * become DAG heads to the list of things left to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
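/* As a rough illustration (not exact disassembler syntax), an instruction
 * that only uses the add ALU and one that only uses the mul ALU, e.g.
 *
 *     fadd  rf10, rf1, rf2
 *     fmul  rf11, rf3, rf4
 *
 * can be folded by qpu_merge_inst() into a single QPU instruction that issues
 * both operations in the same cycle, as long as neither needs the other's
 * ALU, at most one touches a peripheral, and their raddrs are compatible.
 */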
36
37#include "qpu/qpu_disasm.h"
38#include "v3d_compiler.h"
39#include "util/ralloc.h"
40
41static bool debug;
42
43struct schedule_node_child;
44
45struct schedule_node {
46        struct list_head link;
47        struct qinst *inst;
48        struct schedule_node_child *children;
49        uint32_t child_count;
50        uint32_t child_array_size;
51        uint32_t parent_count;
52
53        /* Latest (schedule time + instruction_latency()) of any parent of this node. */
54        uint32_t unblocked_time;
55
56        /**
57         * Minimum number of cycles from scheduling this instruction until the
58         * end of the program, based on the slowest dependency chain through
59         * the children.
60         */
61        uint32_t delay;
62
63        /**
64         * Cycles between this instruction being scheduled and when its result
65         * can be consumed.
66         */
67        uint32_t latency;
68};
69
70struct schedule_node_child {
71        struct schedule_node *node;
72        bool write_after_read;
73};
74
75/* When walking the instructions in reverse, we need to swap before/after in
76 * add_dep().
77 */
78enum direction { F, R };
79
80struct schedule_state {
81        const struct v3d_device_info *devinfo;
82        struct schedule_node *last_r[6];
83        struct schedule_node *last_rf[64];
84        struct schedule_node *last_sf;
85        struct schedule_node *last_vpm_read;
86        struct schedule_node *last_tmu_write;
87        struct schedule_node *last_tmu_config;
88        struct schedule_node *last_tlb;
89        struct schedule_node *last_vpm;
90        struct schedule_node *last_unif;
91        struct schedule_node *last_rtop;
92        enum direction dir;
93        /* Estimated cycle when the current instruction would start. */
94        uint32_t time;
95};
96
97static void
98add_dep(struct schedule_state *state,
99        struct schedule_node *before,
100        struct schedule_node *after,
101        bool write)
102{
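        /* A dependency recorded as a read while walking in reverse (dir == R)
         * is really a write-after-read constraint: the later write only has
         * to stay after this read, so it carries no latency and the writer
         * may even be scheduled in the same cycle as the reader (see
         * mark_instruction_scheduled()).
         */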
103        bool write_after_read = !write && state->dir == R;
104
105        if (!before || !after)
106                return;
107
108        assert(before != after);
109
110        if (state->dir == R) {
111                struct schedule_node *t = before;
112                before = after;
113                after = t;
114        }
115
116        for (int i = 0; i < before->child_count; i++) {
117                if (before->children[i].node == after &&
118                    (before->children[i].write_after_read == write_after_read)) {
119                        return;
120                }
121        }
122
123        if (before->child_array_size <= before->child_count) {
124                before->child_array_size = MAX2(before->child_array_size * 2, 16);
125                before->children = reralloc(before, before->children,
126                                            struct schedule_node_child,
127                                            before->child_array_size);
128        }
129
130        before->children[before->child_count].node = after;
131        before->children[before->child_count].write_after_read =
132                write_after_read;
133        before->child_count++;
134        after->parent_count++;
135}
136
137static void
138add_read_dep(struct schedule_state *state,
139              struct schedule_node *before,
140              struct schedule_node *after)
141{
142        add_dep(state, before, after, false);
143}
144
145static void
146add_write_dep(struct schedule_state *state,
147              struct schedule_node **before,
148              struct schedule_node *after)
149{
150        add_dep(state, *before, after, true);
151        *before = after;
152}
153
154static bool
155qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
156{
157        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
158                return false;
159
160        if (inst->alu.add.magic_write &&
161            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
162             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
163                return true;
164
165        if (inst->alu.mul.magic_write &&
166            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
167             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
168                return true;
169
170        return false;
171}
172
173static void
174process_mux_deps(struct schedule_state *state, struct schedule_node *n,
175                 enum v3d_qpu_mux mux)
176{
177        switch (mux) {
178        case V3D_QPU_MUX_A:
179                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
180                break;
181        case V3D_QPU_MUX_B:
182                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
183                break;
184        default:
185                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
186                break;
187        }
188}
189
190
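/* Adds write dependencies for one destination of an instruction.  Writes to
 * the physical register file serialize through last_rf[], while magic waddrs
 * are mapped onto the unit they touch: TMU writes (and the subset that also
 * updates TMU config), VPM, TLB, or the accumulators.  SFU writes are covered
 * by the v3d_qpu_writes_r4() check in calculate_deps().
 */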
191static void
192process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
193                   uint32_t waddr, bool magic)
194{
195        if (!magic) {
196                add_write_dep(state, &state->last_rf[waddr], n);
197        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
198                add_write_dep(state, &state->last_tmu_write, n);
199                switch (waddr) {
200                case V3D_QPU_WADDR_TMUS:
201                case V3D_QPU_WADDR_TMUSCM:
202                case V3D_QPU_WADDR_TMUSF:
203                case V3D_QPU_WADDR_TMUSLOD:
204                        add_write_dep(state, &state->last_tmu_config, n);
205                        break;
206                default:
207                        break;
208                }
209        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
210                /* Handled by v3d_qpu_writes_r4() check. */
211        } else {
212                switch (waddr) {
213                case V3D_QPU_WADDR_R0:
214                case V3D_QPU_WADDR_R1:
215                case V3D_QPU_WADDR_R2:
216                        add_write_dep(state,
217                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
218                                      n);
219                        break;
220                case V3D_QPU_WADDR_R3:
221                case V3D_QPU_WADDR_R4:
222                case V3D_QPU_WADDR_R5:
223                        /* Handled by v3d_qpu_writes_r*() checks below. */
224                        break;
225
226                case V3D_QPU_WADDR_VPM:
227                case V3D_QPU_WADDR_VPMU:
228                        add_write_dep(state, &state->last_vpm, n);
229                        break;
230
231                case V3D_QPU_WADDR_TLB:
232                case V3D_QPU_WADDR_TLBU:
233                        add_write_dep(state, &state->last_tlb, n);
234                        break;
235
236                case V3D_QPU_WADDR_NOP:
237                        break;
238
239                default:
240                        fprintf(stderr, "Unknown waddr %d\n", waddr);
241                        abort();
242                }
243        }
244}
245
246static void
247process_cond_deps(struct schedule_state *state, struct schedule_node *n,
248                  enum v3d_qpu_cond cond)
249{
250        if (cond != V3D_QPU_COND_NONE)
251                add_read_dep(state, state->last_sf, n);
252}
253
254static void
255process_pf_deps(struct schedule_state *state, struct schedule_node *n,
256                enum v3d_qpu_pf pf)
257{
258        if (pf != V3D_QPU_PF_NONE)
259                add_write_dep(state, &state->last_sf, n);
260}
261
262static void
263process_uf_deps(struct schedule_state *state, struct schedule_node *n,
264                enum v3d_qpu_uf uf)
265{
266        if (uf != V3D_QPU_UF_NONE)
267                add_write_dep(state, &state->last_sf, n);
268}
269
270/**
271 * Common code for dependencies that need to be tracked both forward and
272 * backward.
273 *
274 * This is for things like "all reads of r4 have to happen between the r4
275 * writes that surround them".
276 */
277static void
278calculate_deps(struct schedule_state *state, struct schedule_node *n)
279{
280        const struct v3d_device_info *devinfo = state->devinfo;
281        struct qinst *qinst = n->inst;
282        struct v3d_qpu_instr *inst = &qinst->qpu;
283
284        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
285                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
286                        add_read_dep(state, state->last_sf, n);
287
288                /* XXX: BDI */
289                /* XXX: BDU */
290                /* XXX: ub */
291                /* XXX: raddr_a */
292
293                add_write_dep(state, &state->last_unif, n);
294                return;
295        }
296
297        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
298
299        /* XXX: LOAD_IMM */
300
301        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
302                process_mux_deps(state, n, inst->alu.add.a);
303        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
304                process_mux_deps(state, n, inst->alu.add.b);
305
306        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
307                process_mux_deps(state, n, inst->alu.mul.a);
308        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
309                process_mux_deps(state, n, inst->alu.mul.b);
310
311        switch (inst->alu.add.op) {
312        case V3D_QPU_A_VPMSETUP:
313                /* Could distinguish read/write by unpacking the uniform. */
314                add_write_dep(state, &state->last_vpm, n);
315                add_write_dep(state, &state->last_vpm_read, n);
316                break;
317
318        case V3D_QPU_A_STVPMV:
319        case V3D_QPU_A_STVPMD:
320        case V3D_QPU_A_STVPMP:
321                add_write_dep(state, &state->last_vpm, n);
322                break;
323
324        case V3D_QPU_A_VPMWT:
325                add_read_dep(state, state->last_vpm, n);
326                break;
327
328        case V3D_QPU_A_MSF:
329                add_read_dep(state, state->last_tlb, n);
330                break;
331
332        case V3D_QPU_A_SETMSF:
333        case V3D_QPU_A_SETREVF:
334                add_write_dep(state, &state->last_tlb, n);
335                break;
336
337        case V3D_QPU_A_FLAPUSH:
338        case V3D_QPU_A_FLBPUSH:
339        case V3D_QPU_A_VFLA:
340        case V3D_QPU_A_VFLNA:
341        case V3D_QPU_A_VFLB:
342        case V3D_QPU_A_VFLNB:
343                add_read_dep(state, state->last_sf, n);
344                break;
345
346        case V3D_QPU_A_FLPOP:
347                add_write_dep(state, &state->last_sf, n);
348                break;
349
350        default:
351                break;
352        }
353
354        switch (inst->alu.mul.op) {
355        case V3D_QPU_M_MULTOP:
356        case V3D_QPU_M_UMUL24:
357                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
358                 * resets it to 0.  We could possibly reorder umul24s relative
359                 * to each other, but for now just keep all the MUL parts in
360                 * order.
361                 */
362                add_write_dep(state, &state->last_rtop, n);
363                break;
364        default:
365                break;
366        }
367
368        if (inst->alu.add.op != V3D_QPU_A_NOP) {
369                process_waddr_deps(state, n, inst->alu.add.waddr,
370                                   inst->alu.add.magic_write);
371        }
372        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
373                process_waddr_deps(state, n, inst->alu.mul.waddr,
374                                   inst->alu.mul.magic_write);
375        }
376        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
377                process_waddr_deps(state, n, inst->sig_addr,
378                                   inst->sig_magic);
379        }
380
381        if (v3d_qpu_writes_r3(devinfo, inst))
382                add_write_dep(state, &state->last_r[3], n);
383        if (v3d_qpu_writes_r4(devinfo, inst))
384                add_write_dep(state, &state->last_r[4], n);
385        if (v3d_qpu_writes_r5(devinfo, inst))
386                add_write_dep(state, &state->last_r[5], n);
387
388        if (inst->sig.thrsw) {
389                /* All accumulator contents and flags are undefined after the
390                 * switch.
391                 */
392                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
393                        add_write_dep(state, &state->last_r[i], n);
394                add_write_dep(state, &state->last_sf, n);
395                add_write_dep(state, &state->last_rtop, n);
396
397                /* Scoreboard-locking operations have to stay after the last
398                 * thread switch.
399                 */
400                add_write_dep(state, &state->last_tlb, n);
401
402                add_write_dep(state, &state->last_tmu_write, n);
403                add_write_dep(state, &state->last_tmu_config, n);
404        }
405
406        if (v3d_qpu_waits_on_tmu(inst)) {
407                /* TMU loads are coming from a FIFO, so ordering is important.
408                 */
409                add_write_dep(state, &state->last_tmu_write, n);
410        }
411
412        if (inst->sig.wrtmuc)
413                add_write_dep(state, &state->last_tmu_config, n);
414
415        if (inst->sig.ldtlb || inst->sig.ldtlbu)
416                add_read_dep(state, state->last_tlb, n);
417
418        if (inst->sig.ldvpm)
419                add_write_dep(state, &state->last_vpm_read, n);
420
421        /* inst->sig.ldunif or sideband uniform read */
422        if (qinst->uniform != ~0)
423                add_write_dep(state, &state->last_unif, n);
424
425        process_cond_deps(state, n, inst->flags.ac);
426        process_cond_deps(state, n, inst->flags.mc);
427        process_pf_deps(state, n, inst->flags.apf);
428        process_pf_deps(state, n, inst->flags.mpf);
429        process_uf_deps(state, n, inst->flags.auf);
430        process_uf_deps(state, n, inst->flags.muf);
431}
432
433static void
434calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
435{
436        struct schedule_state state;
437
438        memset(&state, 0, sizeof(state));
439        state.devinfo = c->devinfo;
440        state.dir = F;
441
442        list_for_each_entry(struct schedule_node, node, schedule_list, link)
443                calculate_deps(&state, node);
444}
445
446static void
447calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
448{
449        struct list_head *node;
450        struct schedule_state state;
451
452        memset(&state, 0, sizeof(state));
453        state.devinfo = c->devinfo;
454        state.dir = R;
455
456        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
457                calculate_deps(&state, (struct schedule_node *)node);
458        }
459}
460
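/* Cross-instruction state consulted while picking instructions: the current
 * tick plus the ticks of recent writes that open hazard windows (SFU result
 * landing in r4, ldvary landing in r5, uniforms resets, thread switches), and
 * whether a TLB access has already locked the scoreboard for this shader.
 */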
461struct choose_scoreboard {
462        int tick;
463        int last_magic_sfu_write_tick;
464        int last_ldvary_tick;
465        int last_uniforms_reset_tick;
466        int last_thrsw_tick;
467        bool tlb_locked;
468};
469
470static bool
471mux_reads_too_soon(struct choose_scoreboard *scoreboard,
472                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
473{
474        switch (mux) {
475        case V3D_QPU_MUX_R4:
476                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
477                        return true;
478                break;
479
480        case V3D_QPU_MUX_R5:
481                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
482                        return true;
483                break;
484        default:
485                break;
486        }
487
488        return false;
489}
490
491static bool
492reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
493                           struct qinst *qinst)
494{
495        const struct v3d_qpu_instr *inst = &qinst->qpu;
496
497        /* XXX: Branching off of raddr. */
498        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
499                return false;
500
501        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
502
503        if (inst->alu.add.op != V3D_QPU_A_NOP) {
504                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
505                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
506                        return true;
507                }
508                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
509                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
510                        return true;
511                }
512        }
513
514        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
515                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
516                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
517                        return true;
518                }
519                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
520                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
521                        return true;
522                }
523        }
524
525        /* XXX: imm */
526
527        return false;
528}
529
530static bool
531writes_too_soon_after_write(const struct v3d_device_info *devinfo,
532                            struct choose_scoreboard *scoreboard,
533                            struct qinst *qinst)
534{
535        const struct v3d_qpu_instr *inst = &qinst->qpu;
536
537        /* Don't schedule any other r4 write too soon after an SFU write.
538         * This would normally be prevented by dependency tracking, but might
539         * occur if a dead SFU computation makes it to scheduling.
540         */
541        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
542            v3d_qpu_writes_r4(devinfo, inst))
543                return true;
544
545        return false;
546}
547
548static bool
549pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
550                          const struct v3d_qpu_instr *inst)
551{
552        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
553}
554
555static int
556get_instruction_priority(const struct v3d_qpu_instr *inst)
557{
558        uint32_t baseline_score;
559        uint32_t next_score = 0;
560
561        /* Schedule TLB operations as late as possible, to get more
562         * parallelism between shaders.
563         */
564        if (qpu_inst_is_tlb(inst))
565                return next_score;
566        next_score++;
567
568        /* Schedule texture read results collection late to hide latency. */
569        if (v3d_qpu_waits_on_tmu(inst))
570                return next_score;
571        next_score++;
572
573        /* Default score for things that aren't otherwise special. */
574        baseline_score = next_score;
575        next_score++;
576
577        /* Schedule texture read setup early to hide their latency better. */
578        if (v3d_qpu_writes_tmu(inst))
579                return next_score;
580        next_score++;
581
582        return baseline_score;
583}
584
585static bool
586qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
587{
588        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
589                v3d_qpu_magic_waddr_is_sfu(waddr) ||
590                v3d_qpu_magic_waddr_is_tlb(waddr) ||
591                v3d_qpu_magic_waddr_is_vpm(waddr) ||
592                v3d_qpu_magic_waddr_is_tsy(waddr));
593}
594
595static bool
596qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
597{
598        if (v3d_qpu_uses_vpm(inst))
599                return true;
600        if (v3d_qpu_uses_sfu(inst))
601                return true;
602
603        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
604                if (inst->alu.add.op != V3D_QPU_A_NOP &&
605                    inst->alu.add.magic_write &&
606                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
607                        return true;
608                }
609
610                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
611                        return true;
612
613                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
614                    inst->alu.mul.magic_write &&
615                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
616                        return true;
617                }
618        }
619
620        return (inst->sig.ldvpm ||
621                inst->sig.ldtmu ||
622                inst->sig.ldtlb ||
623                inst->sig.ldtlbu ||
624                inst->sig.wrtmuc);
625}
626
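/* Tries to fold instruction b into a: at most one of them may supply the
 * add-ALU op and at most one the mul-ALU op, at most one may access a
 * peripheral, raddrs must agree wherever both are used, and the combined
 * signal set still has to pack into a valid encoding.  On success the merged
 * instruction is returned through *result.
 */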
627static bool
628qpu_merge_inst(const struct v3d_device_info *devinfo,
629               struct v3d_qpu_instr *result,
630               const struct v3d_qpu_instr *a,
631               const struct v3d_qpu_instr *b)
632{
633        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
634            b->type != V3D_QPU_INSTR_TYPE_ALU) {
635                return false;
636        }
637
638        /* Can't do more than one peripheral access in an instruction.
639         *
640         * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
641         * WRTMUC with a TMU magic register write (other than tmuc).
642         */
643        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
644                return false;
645
646        struct v3d_qpu_instr merge = *a;
647
648        if (b->alu.add.op != V3D_QPU_A_NOP) {
649                if (a->alu.add.op != V3D_QPU_A_NOP)
650                        return false;
651                merge.alu.add = b->alu.add;
652
653                merge.flags.ac = b->flags.ac;
654                merge.flags.apf = b->flags.apf;
655                merge.flags.auf = b->flags.auf;
656        }
657
658        if (b->alu.mul.op != V3D_QPU_M_NOP) {
659                if (a->alu.mul.op != V3D_QPU_M_NOP)
660                        return false;
661                merge.alu.mul = b->alu.mul;
662
663                merge.flags.mc = b->flags.mc;
664                merge.flags.mpf = b->flags.mpf;
665                merge.flags.muf = b->flags.muf;
666        }
667
668        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
669                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
670                    a->raddr_a != b->raddr_a) {
671                        return false;
672                }
673                merge.raddr_a = b->raddr_a;
674        }
675
676        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
677                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
678                    (a->raddr_b != b->raddr_b ||
679                     a->sig.small_imm != b->sig.small_imm)) {
680                        return false;
681                }
682                merge.raddr_b = b->raddr_b;
683        }
684
685        merge.sig.thrsw |= b->sig.thrsw;
686        merge.sig.ldunif |= b->sig.ldunif;
687        merge.sig.ldunifrf |= b->sig.ldunifrf;
688        merge.sig.ldunifa |= b->sig.ldunifa;
689        merge.sig.ldunifarf |= b->sig.ldunifarf;
690        merge.sig.ldtmu |= b->sig.ldtmu;
691        merge.sig.ldvary |= b->sig.ldvary;
692        merge.sig.ldvpm |= b->sig.ldvpm;
693        merge.sig.small_imm |= b->sig.small_imm;
694        merge.sig.ldtlb |= b->sig.ldtlb;
695        merge.sig.ldtlbu |= b->sig.ldtlbu;
696        merge.sig.ucb |= b->sig.ucb;
697        merge.sig.rotate |= b->sig.rotate;
698        merge.sig.wrtmuc |= b->sig.wrtmuc;
699
700        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
701            v3d_qpu_sig_writes_address(devinfo, &b->sig))
702                return false;
703        merge.sig_addr |= b->sig_addr;
704        merge.sig_magic |= b->sig_magic;
705
706        uint64_t packed;
707        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
708
709        *result = merge;
710        /* No modifying the real instructions on failure. */
711        assert(ok || (a != result && b != result));
712
713        return ok;
714}
715
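/* Picks the next DAG head to schedule from schedule_list, or NULL if nothing
 * can legally go in this slot.  When prev_inst is non-NULL we are looking for
 * a second instruction to pair with it, so candidates must also merge cleanly
 * with prev_inst.  Among valid candidates, prefer higher priority and then
 * the longer critical-path delay.
 */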
716static struct schedule_node *
717choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
718                               struct choose_scoreboard *scoreboard,
719                               struct list_head *schedule_list,
720                               struct schedule_node *prev_inst)
721{
722        struct schedule_node *chosen = NULL;
723        int chosen_prio = 0;
724
725        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
726         * will handle pairing it along with filling the delay slots.
727         */
728        if (prev_inst) {
729                if (prev_inst->inst->qpu.sig.thrsw)
730                        return NULL;
731        }
732
733        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
734                const struct v3d_qpu_instr *inst = &n->inst->qpu;
735
736                /* Don't choose the branch instruction until it's the last one
737                 * left.  We'll move it up to fit its delay slots after we
738                 * choose it.
739                 */
740                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
741                    !list_is_singular(schedule_list)) {
742                        continue;
743                }
744
745                /* "An instruction must not read from a location in physical
746                 *  regfile A or B that was written to by the previous
747                 *  instruction."
748                 */
749                if (reads_too_soon_after_write(scoreboard, n->inst))
750                        continue;
751
752                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
753                        continue;
754
755                /* "A scoreboard wait must not occur in the first two
756                 *  instructions of a fragment shader. This is either the
757                 *  explicit Wait for Scoreboard signal or an implicit wait
758                 *  with the first tile-buffer read or write instruction."
759                 */
760                if (pixel_scoreboard_too_soon(scoreboard, inst))
761                        continue;
762
763                /* ldunif and ldvary both write r5, but ldunif does so a tick
764                 * sooner.  If the ldvary's r5 result is never read, dependency
765                 * tracking alone wouldn't stop an ldunif from being scheduled
766                 * here, and both writes would land on r5 in the same tick.
767                 */
768                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
769                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
770                        continue;
771                }
772
773                /* If we're trying to pair with another instruction, check
774                 * that they're compatible.
775                 */
776                if (prev_inst) {
777                        /* Don't pair up a thread switch signal -- we'll
778                         * handle pairing it when we pick it on its own.
779                         */
780                        if (inst->sig.thrsw)
781                                continue;
782
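                        /* A single instruction can consume at most one
                         * sideband uniform, so don't pair two
                         * uniform-reading instructions.
                         */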
783                        if (prev_inst->inst->uniform != -1 &&
784                            n->inst->uniform != -1)
785                                continue;
786
787                        /* Don't merge in something that will lock the TLB.
788                         * Hopefully what we have in inst will release some
789                         * other instructions, allowing us to delay the
790                         * TLB-locking instruction until later.
791                         */
792                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
793                                continue;
794
795                        struct v3d_qpu_instr merged_inst;
796                        if (!qpu_merge_inst(devinfo, &merged_inst,
797                                            &prev_inst->inst->qpu, inst)) {
798                                continue;
799                        }
800                }
801
802                int prio = get_instruction_priority(inst);
803
804                /* Found a valid instruction.  If nothing better comes along,
805                 * this one works.
806                 */
807                if (!chosen) {
808                        chosen = n;
809                        chosen_prio = prio;
810                        continue;
811                }
812
813                if (prio > chosen_prio) {
814                        chosen = n;
815                        chosen_prio = prio;
816                } else if (prio < chosen_prio) {
817                        continue;
818                }
819
820                if (n->delay > chosen->delay) {
821                        chosen = n;
822                        chosen_prio = prio;
823                } else if (n->delay < chosen->delay) {
824                        continue;
825                }
826        }
827
828        return chosen;
829}
830
831static void
832update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
833                                  enum v3d_qpu_waddr waddr)
834{
835        if (v3d_qpu_magic_waddr_is_sfu(waddr))
836                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
837}
838
839static void
840update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
841                             const struct v3d_qpu_instr *inst)
842{
843        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
844                return;
845
846        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
847
848        if (inst->alu.add.op != V3D_QPU_A_NOP)  {
849                if (inst->alu.add.magic_write) {
850                        update_scoreboard_for_magic_waddr(scoreboard,
851                                                          inst->alu.add.waddr);
852                }
853        }
854
855        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
856                if (inst->alu.mul.magic_write) {
857                        update_scoreboard_for_magic_waddr(scoreboard,
858                                                          inst->alu.mul.waddr);
859                }
860        }
861
862        if (inst->sig.ldvary)
863                scoreboard->last_ldvary_tick = scoreboard->tick;
864
865        if (qpu_inst_is_tlb(inst))
866                scoreboard->tlb_locked = true;
867}
868
869static void
870dump_state(const struct v3d_device_info *devinfo,
871           struct list_head *schedule_list)
872{
873        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
874                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
875                v3d_qpu_dump(devinfo, &n->inst->qpu);
876                fprintf(stderr, "\n");
877
878                for (int i = 0; i < n->child_count; i++) {
879                        struct schedule_node *child = n->children[i].node;
880                        if (!child)
881                                continue;
882
883                        fprintf(stderr, "                 - ");
884                        v3d_qpu_dump(devinfo, &child->inst->qpu);
885                        fprintf(stderr, " (%d parents, %c)\n",
886                                child->parent_count,
887                                n->children[i].write_after_read ? 'w' : 'r');
888                }
889        }
890}
891
892static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
893                                    const struct v3d_qpu_instr *after)
894{
895        /* Apply some huge latency between texture fetch requests and getting
896         * their results back.
897         *
898         * FIXME: This is actually pretty bogus.  If we do:
899         *
900         * mov tmu0_s, a
901         * <a bit of math>
902         * mov tmu0_s, b
903         * load_tmu0
904         * <more math>
905         * load_tmu0
906         *
907         * we count that as worse than
908         *
909         * mov tmu0_s, a
910         * mov tmu0_s, b
911         * <lots of math>
912         * load_tmu0
913         * <more math>
914         * load_tmu0
915         *
916         * because we associate the first load_tmu0 with the *second* tmu0_s.
917         */
918        if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after))
919                return 100;
920
921        /* Assume that anything depending on us is consuming the SFU result. */
922        if (v3d_qpu_magic_waddr_is_sfu(waddr))
923                return 3;
924
925        return 1;
926}
927
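/* Estimated cycles from 'before' issuing until 'after' can consume its
 * result, derived only from magic waddr destinations: TMU requests that
 * 'after' waits on are treated as very slow, SFU results take a few cycles,
 * and everything else is a single cycle.
 */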
928static uint32_t
929instruction_latency(struct schedule_node *before, struct schedule_node *after)
930{
931        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
932        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
933        uint32_t latency = 1;
934
935        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
936            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
937                return latency;
938
939        if (before_inst->alu.add.magic_write) {
940                latency = MAX2(latency,
941                               magic_waddr_latency(before_inst->alu.add.waddr,
942                                                   after_inst));
943        }
944
945        if (before_inst->alu.mul.magic_write) {
946                latency = MAX2(latency,
947                               magic_waddr_latency(before_inst->alu.mul.waddr,
948                                                   after_inst));
949        }
950
951        return latency;
952}
953
954/** Recursive computation of the delay member of a node. */
955static void
956compute_delay(struct schedule_node *n)
957{
958        if (!n->child_count) {
959                n->delay = 1;
960        } else {
961                for (int i = 0; i < n->child_count; i++) {
962                        if (!n->children[i].node->delay)
963                                compute_delay(n->children[i].node);
964                        n->delay = MAX2(n->delay,
965                                        n->children[i].node->delay +
966                                        instruction_latency(n, n->children[i].node));
967                }
968        }
969}
970
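/* Releases the dependency edges of a node that has just been scheduled at
 * 'time': each child's unblocked_time is raised to cover this node's latency,
 * its parent_count drops, and children left with no parents become new DAG
 * heads on schedule_list.  With war_only set, only write-after-read edges are
 * handled (done before searching for an instruction to pair with this one).
 */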
971static void
972mark_instruction_scheduled(struct list_head *schedule_list,
973                           uint32_t time,
974                           struct schedule_node *node,
975                           bool war_only)
976{
977        if (!node)
978                return;
979
980        for (int i = node->child_count - 1; i >= 0; i--) {
981                struct schedule_node *child =
982                        node->children[i].node;
983
984                if (!child)
985                        continue;
986
987                if (war_only && !node->children[i].write_after_read)
988                        continue;
989
990                /* If the requirement is only that the node not appear before
991                 * the last read of its destination, then it can be scheduled
992                 * immediately after (or paired with!) the thing reading the
993                 * destination.
994                 */
995                uint32_t latency = 0;
996                if (!war_only) {
997                        latency = instruction_latency(node,
998                                                      node->children[i].node);
999                }
1000
1001                child->unblocked_time = MAX2(child->unblocked_time,
1002                                             time + latency);
1003                child->parent_count--;
1004                if (child->parent_count == 0)
1005                        list_add(&child->link, schedule_list);
1006
1007                node->children[i].node = NULL;
1008        }
1009}
1010
1011static void
1012insert_scheduled_instruction(struct v3d_compile *c,
1013                             struct qblock *block,
1014                             struct choose_scoreboard *scoreboard,
1015                             struct qinst *inst)
1016{
1017        list_addtail(&inst->link, &block->instructions);
1018
1019        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
1020        c->qpu_inst_count++;
1021        scoreboard->tick++;
1022}
1023
1024static struct qinst *
1025vir_nop()
1026{
1027        struct qreg undef = { QFILE_NULL, 0 };
1028        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1029
1030        return qinst;
1031}
1032
1033static void
1034emit_nop(struct v3d_compile *c, struct qblock *block,
1035         struct choose_scoreboard *scoreboard)
1036{
1037        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1038}
1039
1040static bool
1041qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
1042                                     const struct qinst *qinst, int slot)
1043{
1044        const struct v3d_qpu_instr *inst = &qinst->qpu;
1045
1046        /* Only TLB Z writes are prohibited in the last slot, but we don't
1047         * have those flagged so prohibit all TLB ops for now.
1048         */
1049        if (slot == 2 && qpu_inst_is_tlb(inst))
1050                return false;
1051
1052        if (slot > 0 && qinst->uniform != ~0)
1053                return false;
1054
1055        if (v3d_qpu_uses_vpm(inst))
1056                return false;
1057
1058        if (inst->sig.ldvary)
1059                return false;
1060
1061        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1062                /* GFXH-1625: TMUWT not allowed in the final instruction. */
1063                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
1064                        return false;
1065
1066                /* No writing physical registers at the end. */
1067                if (!inst->alu.add.magic_write ||
1068                    !inst->alu.mul.magic_write) {
1069                        return false;
1070                }
1071
1072                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
1073                        return false;
1074
1075                /* RF0-2 might be overwritten during the delay slots by
1076                 * fragment shader setup.
1077                 */
1078                if (inst->raddr_a < 3 &&
1079                    (inst->alu.add.a == V3D_QPU_MUX_A ||
1080                     inst->alu.add.b == V3D_QPU_MUX_A ||
1081                     inst->alu.mul.a == V3D_QPU_MUX_A ||
1082                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
1083                        return false;
1084                }
1085
1086                if (inst->raddr_b < 3 &&
1087                    !inst->sig.small_imm &&
1088                    (inst->alu.add.a == V3D_QPU_MUX_B ||
1089                     inst->alu.add.b == V3D_QPU_MUX_B ||
1090                     inst->alu.mul.a == V3D_QPU_MUX_B ||
1091                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
1092                        return false;
1093                }
1094        }
1095
1096        return true;
1097}
1098
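/* Checks whether a thrsw signal may be folded onto the instruction
 * 'instructions_in_sequence' slots back: the previous thrsw must already have
 * taken effect, and none of the instructions landing in the delay slots may
 * start an SFU op, use ldvary, or (for the final thrsw) break the thread-end
 * slot restrictions.
 */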
1099static bool
1100valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
1101                     struct qinst *qinst, int instructions_in_sequence,
1102                     bool is_thrend)
1103{
1104        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
1105        if (scoreboard->last_thrsw_tick + 3 >
1106            scoreboard->tick - instructions_in_sequence) {
1107                return false;
1108        }
1109
1110        for (int slot = 0; slot < instructions_in_sequence; slot++) {
1111                /* No scheduling SFU when the result would land in the other
1112                 * thread.  The simulator complains for safety, though it
1113                 * would only occur for dead code in our case.
1114                 */
1115                if (slot > 0 &&
1116                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1117                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
1118                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
1119                        return false;
1120                }
1121
1122                if (slot > 0 && qinst->qpu.sig.ldvary)
1123                        return false;
1124
1125                if (is_thrend &&
1126                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
1127                        return false;
1128                }
1129
1130                /* Note that the list is circular, so we can only do this up
1131                 * to instructions_in_sequence.
1132                 */
1133                qinst = (struct qinst *)qinst->link.next;
1134        }
1135
1136        return true;
1137}
1138
1139/**
1140 * Emits a THRSW signal in the stream, trying to move it up to pair with
1141 * another instruction.
1142 */
1143static int
1144emit_thrsw(struct v3d_compile *c,
1145           struct qblock *block,
1146           struct choose_scoreboard *scoreboard,
1147           struct qinst *inst,
1148           bool is_thrend)
1149{
1150        int time = 0;
1151
1152        /* There should be nothing in a thrsw inst being scheduled other than
1153         * the signal bits.
1154         */
1155        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
1156        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
1157        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
1158
1159        /* Find how far back into previous instructions we can put the THRSW. */
1160        int slots_filled = 0;
1161        struct qinst *merge_inst = NULL;
1162        vir_for_each_inst_rev(prev_inst, block) {
1163                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
1164                sig.thrsw = true;
1165                uint32_t packed_sig;
1166
1167                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
1168                        break;
1169
1170                if (!valid_thrsw_sequence(c, scoreboard,
1171                                          prev_inst, slots_filled + 1,
1172                                          is_thrend)) {
1173                        break;
1174                }
1175
1176                merge_inst = prev_inst;
1177                if (++slots_filled == 3)
1178                        break;
1179        }
1180
1181        bool needs_free = false;
1182        if (merge_inst) {
1183                merge_inst->qpu.sig.thrsw = true;
1184                needs_free = true;
1185                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
1186        } else {
1187                scoreboard->last_thrsw_tick = scoreboard->tick;
1188                insert_scheduled_instruction(c, block, scoreboard, inst);
1189                time++;
1190                slots_filled++;
1191                merge_inst = inst;
1192        }
1193
1194        /* Insert any extra delay slot NOPs we need. */
1195        for (int i = 0; i < 3 - slots_filled; i++) {
1196                emit_nop(c, block, scoreboard);
1197                time++;
1198        }
1199
1200        /* If we're emitting the last THRSW (other than program end), then
1201         * signal that to the HW by emitting two THRSWs in a row.
1202         */
1203        if (inst->is_last_thrsw) {
1204                struct qinst *second_inst =
1205                        (struct qinst *)merge_inst->link.next;
1206                second_inst->qpu.sig.thrsw = true;
1207        }
1208
1209        /* If we put our THRSW into another instruction, free up the
1210         * instruction that didn't end up scheduled into the list.
1211         */
1212        if (needs_free)
1213                free(inst);
1214
1215        return time;
1216}
1217
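/* Main list-scheduling loop for one block: repeatedly pick a DAG head (or
 * drop in a NOP when nothing is ready), try to merge a second instruction
 * into it, rewrite its uniform-stream index, release its children, and emit
 * it, expanding thrsw and branch delay slots as needed.  Returns the
 * estimated cycle count for the block.
 */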
1218static uint32_t
1219schedule_instructions(struct v3d_compile *c,
1220                      struct choose_scoreboard *scoreboard,
1221                      struct qblock *block,
1222                      struct list_head *schedule_list,
1223                      enum quniform_contents *orig_uniform_contents,
1224                      uint32_t *orig_uniform_data,
1225                      uint32_t *next_uniform)
1226{
1227        const struct v3d_device_info *devinfo = c->devinfo;
1228        uint32_t time = 0;
1229
1230        if (debug) {
1231                fprintf(stderr, "initial deps:\n");
1232                dump_state(devinfo, schedule_list);
1233                fprintf(stderr, "\n");
1234        }
1235
1236        /* Remove non-DAG heads from the list. */
1237        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
1238                if (n->parent_count != 0)
1239                        list_del(&n->link);
1240        }
1241
1242        while (!list_empty(schedule_list)) {
1243                struct schedule_node *chosen =
1244                        choose_instruction_to_schedule(devinfo,
1245                                                       scoreboard,
1246                                                       schedule_list,
1247                                                       NULL);
1248                struct schedule_node *merge = NULL;
1249
1250                /* If there are no valid instructions to schedule, drop a NOP
1251                 * in.
1252                 */
1253                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
1254                struct v3d_qpu_instr *inst = &qinst->qpu;
1255
1256                if (debug) {
1257                        fprintf(stderr, "t=%4d: current list:\n",
1258                                time);
1259                        dump_state(devinfo, schedule_list);
1260                        fprintf(stderr, "t=%4d: chose:   ", time);
1261                        v3d_qpu_dump(devinfo, inst);
1262                        fprintf(stderr, "\n");
1263                }
1264
1265                /* We can't mark_instruction_scheduled() the chosen inst until
1266                 * we're done identifying instructions to merge, so put the
1267                 * merged instructions on a list for a moment.
1268                 */
1269                struct list_head merged_list;
1270                list_inithead(&merged_list);
1271
1272                /* Schedule this instruction onto the QPU list. Also try to
1273                 * find an instruction to pair with it.
1274                 */
1275                if (chosen) {
1276                        time = MAX2(chosen->unblocked_time, time);
1277                        list_del(&chosen->link);
1278                        mark_instruction_scheduled(schedule_list, time,
1279                                                   chosen, true);
1280
1281                        while ((merge =
1282                                choose_instruction_to_schedule(devinfo,
1283                                                               scoreboard,
1284                                                               schedule_list,
1285                                                               chosen))) {
1286                                time = MAX2(merge->unblocked_time, time);
1287                                list_del(&merge->link);
1288                                list_addtail(&merge->link, &merged_list);
1289                                (void)qpu_merge_inst(devinfo, inst,
1290                                                     inst, &merge->inst->qpu);
1291                                if (merge->inst->uniform != -1) {
1292                                        chosen->inst->uniform =
1293                                                merge->inst->uniform;
1294                                }
1295
1296                                if (debug) {
1297                                        fprintf(stderr, "t=%4d: merging: ",
1298                                                time);
1299                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
1300                                        fprintf(stderr, "\n");
1301                                        fprintf(stderr, "         result: ");
1302                                        v3d_qpu_dump(devinfo, inst);
1303                                        fprintf(stderr, "\n");
1304                                }
1305                        }
1306                }
1307
1308                /* Update the uniform index for the rewritten location --
1309                 * branch target updating will still need to change
1310                 * c->uniform_data[] using this index.
1311                 */
1312                if (qinst->uniform != -1) {
1313                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1314                                block->branch_uniform = *next_uniform;
1315
1316                        c->uniform_data[*next_uniform] =
1317                                orig_uniform_data[qinst->uniform];
1318                        c->uniform_contents[*next_uniform] =
1319                                orig_uniform_contents[qinst->uniform];
1320                        qinst->uniform = *next_uniform;
1321                        (*next_uniform)++;
1322                }
1323
1324                if (debug) {
1325                        fprintf(stderr, "\n");
1326                }
1327
1328                /* Now that we've scheduled a new instruction, some of its
1329                 * children can be promoted to the list of instructions ready to
1330                 * be scheduled.  Update the children's unblocked time for this
1331                 * DAG edge as we do so.
1332                 */
1333                mark_instruction_scheduled(schedule_list, time, chosen, false);
1334                list_for_each_entry(struct schedule_node, merge, &merged_list,
1335                                    link) {
1336                        mark_instruction_scheduled(schedule_list, time, merge,
1337                                                   false);
1338
1339                        /* The merged VIR instruction doesn't get re-added to the
1340                         * block, so free it now.
1341                         */
1342                        free(merge->inst);
1343                }
1344
1345                if (inst->sig.thrsw) {
1346                        time += emit_thrsw(c, block, scoreboard, qinst, false);
1347                } else {
1348                        insert_scheduled_instruction(c, block,
1349                                                     scoreboard, qinst);
1350
1351                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1352                                block->branch_qpu_ip = c->qpu_inst_count - 1;
1353                                /* Fill the delay slots.
1354                                 *
1355                                 * We should fill these with actual instructions
1356                                 * instead, but that will probably need to be done
1357                                 * after this pass, once we know what the leading
1358                                 * instructions of the successors are (so we can
1359                                 * handle A/B register file write latency).
1360                                 */
1361                                for (int i = 0; i < 3; i++)
1362                                        emit_nop(c, block, scoreboard);
1363                        }
1364                }
1365        }
1366
1367        return time;
1368}
1369
1370static uint32_t
1371qpu_schedule_instructions_block(struct v3d_compile *c,
1372                                struct choose_scoreboard *scoreboard,
1373                                struct qblock *block,
1374                                enum quniform_contents *orig_uniform_contents,
1375                                uint32_t *orig_uniform_data,
1376                                uint32_t *next_uniform)
1377{
1378        void *mem_ctx = ralloc_context(NULL);
1379        struct list_head schedule_list;
1380
1381        list_inithead(&schedule_list);
1382
1383        /* Wrap each instruction in a scheduler structure. */
1384        while (!list_empty(&block->instructions)) {
1385                struct qinst *qinst = (struct qinst *)block->instructions.next;
1386                struct schedule_node *n =
1387                        rzalloc(mem_ctx, struct schedule_node);
1388
1389                n->inst = qinst;
1390
1391                list_del(&qinst->link);
1392                list_addtail(&n->link, &schedule_list);
1393        }
1394
1395        calculate_forward_deps(c, &schedule_list);
1396        calculate_reverse_deps(c, &schedule_list);
1397
1398        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
1399                compute_delay(n);
1400        }
1401
1402        uint32_t cycles = schedule_instructions(c, scoreboard, block,
1403                                                &schedule_list,
1404                                                orig_uniform_contents,
1405                                                orig_uniform_data,
1406                                                next_uniform);
1407
1408        ralloc_free(mem_ctx);
1409
1410        return cycles;
1411}
1412
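/* After all blocks are scheduled, resolves each block's branch: the
 * instruction offset is taken relative to the end of the branch's delay
 * slots (branch_qpu_ip + 4), and the branch's uniform is rewritten with the
 * successor's uniform-stream position relative to the uniform following the
 * branch's own.
 */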
1413static void
1414qpu_set_branch_targets(struct v3d_compile *c)
1415{
1416        vir_for_each_block(block, c) {
1417                /* The end block of the program has no branch. */
1418                if (!block->successors[0])
1419                        continue;
1420
1421                /* If there was no branch instruction, then the successor
1422                 * block must follow immediately after this one.
1423                 */
1424                if (block->branch_qpu_ip == ~0) {
1425                        assert(block->end_qpu_ip + 1 ==
1426                               block->successors[0]->start_qpu_ip);
1427                        continue;
1428                }
1429
1430                /* Walk back through the delay slots to find the branch
1431                 * instr.
1432                 */
1433                struct list_head *entry = block->instructions.prev;
1434                for (int i = 0; i < 3; i++)
1435                        entry = entry->prev;
1436                struct qinst *branch = container_of(entry, branch, link);
1437                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1438
1439                /* Make sure that the if-we-don't-jump
1440                 * successor was scheduled just after the
1441                 * delay slots.
1442                 */
1443                assert(!block->successors[1] ||
1444                       block->successors[1]->start_qpu_ip ==
1445                       block->branch_qpu_ip + 4);
1446
1447                branch->qpu.branch.offset =
1448                        ((block->successors[0]->start_qpu_ip -
1449                          (block->branch_qpu_ip + 4)) *
1450                         sizeof(uint64_t));
1451
1452                /* Set up the relative offset to jump in the
1453                 * uniform stream.
1454                 *
1455                 * Use a temporary here, because
1456                 * uniform_data[inst->uniform] may be shared
1457                 * between multiple instructions.
1458                 */
1459                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
1460                c->uniform_data[branch->uniform] =
1461                        (block->successors[0]->start_uniform -
1462                         (block->branch_uniform + 1)) * 4;
1463        }
1464}
1465
1466uint32_t
1467v3d_qpu_schedule_instructions(struct v3d_compile *c)
1468{
1469        const struct v3d_device_info *devinfo = c->devinfo;
1470        struct qblock *end_block = list_last_entry(&c->blocks,
1471                                                   struct qblock, link);
1472
1473        /* We reorder the uniforms as we schedule instructions, so save the
1474         * old data off and replace it.
1475         */
1476        uint32_t *uniform_data = c->uniform_data;
1477        enum quniform_contents *uniform_contents = c->uniform_contents;
1478        c->uniform_contents = ralloc_array(c, enum quniform_contents,
1479                                           c->num_uniforms);
1480        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1481        c->uniform_array_size = c->num_uniforms;
1482        uint32_t next_uniform = 0;
1483
1484        struct choose_scoreboard scoreboard;
1485        memset(&scoreboard, 0, sizeof(scoreboard));
1486        scoreboard.last_ldvary_tick = -10;
1487        scoreboard.last_magic_sfu_write_tick = -10;
1488        scoreboard.last_uniforms_reset_tick = -10;
1489        scoreboard.last_thrsw_tick = -10;
1490
1491        if (debug) {
1492                fprintf(stderr, "Pre-schedule instructions\n");
1493                vir_for_each_block(block, c) {
1494                        fprintf(stderr, "BLOCK %d\n", block->index);
1495                        list_for_each_entry(struct qinst, qinst,
1496                                            &block->instructions, link) {
1497                                v3d_qpu_dump(devinfo, &qinst->qpu);
1498                                fprintf(stderr, "\n");
1499                        }
1500                }
1501                fprintf(stderr, "\n");
1502        }
1503
1504        uint32_t cycles = 0;
1505        vir_for_each_block(block, c) {
1506                block->start_qpu_ip = c->qpu_inst_count;
1507                block->branch_qpu_ip = ~0;
1508                block->start_uniform = next_uniform;
1509
1510                cycles += qpu_schedule_instructions_block(c,
1511                                                          &scoreboard,
1512                                                          block,
1513                                                          uniform_contents,
1514                                                          uniform_data,
1515                                                          &next_uniform);
1516
1517                block->end_qpu_ip = c->qpu_inst_count - 1;
1518        }
1519
1520        /* Emit the program-end THRSW instruction. */
1521        struct qinst *thrsw = vir_nop();
1522        thrsw->qpu.sig.thrsw = true;
1523        emit_thrsw(c, end_block, &scoreboard, thrsw, true);
1524
1525        qpu_set_branch_targets(c);
1526
1527        assert(next_uniform == c->num_uniforms);
1528
1529        return cycles;
1530}
1531