qpu_schedule.c revision b8e80941
1/*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014-2017 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25/**
26 * @file
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30 * pick a DAG head, then put all the children that are now DAG heads into the
31 * list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
36
37#include "qpu/qpu_disasm.h"
38#include "v3d_compiler.h"
39#include "util/ralloc.h"
40#include "util/dag.h"
41
/* When set, enables scheduler debug output — presumably consulted by code
 * later in this file (not visible in this chunk); see dump_state().
 */
static bool debug;

struct schedule_node_child;
45
/* Per-instruction node in the scheduling DAG. */
struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        /* The IR instruction this node schedules. */
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};
67
/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };  /* F = forward walk, R = reverse walk */
72
/* Tracks the most recent writer of every schedulable resource while walking
 * the instruction list, so calculate_deps() can add DAG edges against them.
 */
struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        /* Last writer of each accumulator r0-r5. */
        struct schedule_node *last_r[6];
        /* Last writer of each physical regfile location. */
        struct schedule_node *last_rf[64];
        /* Last instruction to update the condition flags. */
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        /* Last writer of the rtop register (MULTOP/UMUL24 state). */
        struct schedule_node *last_rtop;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
90
91static void
92add_dep(struct schedule_state *state,
93        struct schedule_node *before,
94        struct schedule_node *after,
95        bool write)
96{
97        bool write_after_read = !write && state->dir == R;
98        void *edge_data = (void *)(uintptr_t)write_after_read;
99
100        if (!before || !after)
101                return;
102
103        assert(before != after);
104
105        if (state->dir == F)
106                dag_add_edge(&before->dag, &after->dag, edge_data);
107        else
108                dag_add_edge(&after->dag, &before->dag, edge_data);
109}
110
/* Adds a read dependency: "after" must come after "before" (the tracked
 * last writer), without becoming the new tracked writer itself.
 */
static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}
118
/* Adds a write dependency on *before, then makes "after" the new tracked
 * writer that subsequent reads and writes will depend on.
 */
static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}
127
128static bool
129qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
130{
131        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
132                return false;
133
134        if (inst->alu.add.magic_write &&
135            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
136             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
137                return true;
138
139        if (inst->alu.mul.magic_write &&
140            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
141             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
142                return true;
143
144        return false;
145}
146
147static void
148process_mux_deps(struct schedule_state *state, struct schedule_node *n,
149                 enum v3d_qpu_mux mux)
150{
151        switch (mux) {
152        case V3D_QPU_MUX_A:
153                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
154                break;
155        case V3D_QPU_MUX_B:
156                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
157                break;
158        default:
159                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
160                break;
161        }
162}
163
164
165static void
166process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
167                   uint32_t waddr, bool magic)
168{
169        if (!magic) {
170                add_write_dep(state, &state->last_rf[waddr], n);
171        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
172                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
173                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
174                 */
175                add_write_dep(state, &state->last_tmu_write, n);
176                switch (waddr) {
177                case V3D_QPU_WADDR_TMUS:
178                case V3D_QPU_WADDR_TMUSCM:
179                case V3D_QPU_WADDR_TMUSF:
180                case V3D_QPU_WADDR_TMUSLOD:
181                        add_write_dep(state, &state->last_tmu_config, n);
182                        break;
183                default:
184                        break;
185                }
186        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
187                /* Handled by v3d_qpu_writes_r4() check. */
188        } else {
189                switch (waddr) {
190                case V3D_QPU_WADDR_R0:
191                case V3D_QPU_WADDR_R1:
192                case V3D_QPU_WADDR_R2:
193                        add_write_dep(state,
194                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
195                                      n);
196                        break;
197                case V3D_QPU_WADDR_R3:
198                case V3D_QPU_WADDR_R4:
199                case V3D_QPU_WADDR_R5:
200                        /* Handled by v3d_qpu_writes_r*() checks below. */
201                        break;
202
203                case V3D_QPU_WADDR_VPM:
204                case V3D_QPU_WADDR_VPMU:
205                        add_write_dep(state, &state->last_vpm, n);
206                        break;
207
208                case V3D_QPU_WADDR_TLB:
209                case V3D_QPU_WADDR_TLBU:
210                        add_write_dep(state, &state->last_tlb, n);
211                        break;
212
213                case V3D_QPU_WADDR_SYNC:
214                case V3D_QPU_WADDR_SYNCB:
215                case V3D_QPU_WADDR_SYNCU:
216                        /* For CS barrier(): Sync against any other memory
217                         * accesses.  There doesn't appear to be any need for
218                         * barriers to affect ALU operations.
219                         */
220                        add_write_dep(state, &state->last_tmu_write, n);
221                        break;
222
223                case V3D_QPU_WADDR_NOP:
224                        break;
225
226                default:
227                        fprintf(stderr, "Unknown waddr %d\n", waddr);
228                        abort();
229                }
230        }
231}
232
233/**
234 * Common code for dependencies that need to be tracked both forward and
235 * backward.
236 *
237 * This is for things like "all reads of r4 have to happen between the r4
238 * writes that surround them".
239 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes.  We handle this by
         * serializing all VPM operations for now.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                /* Conditional branches read the condition flags. */
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        /* Read dependencies for the source muxes of both ALU halves. */
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        /* Add-ALU ops with implicit VPM/TLB side effects. */
        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        /* Write dependencies for each ALU half's destination, plus any
         * signal-driven write address.
         */
        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        /* Implicit accumulator writes (SFU results, ldvary, etc.). */
        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
        }

        if (inst->sig.wrtmuc)
                add_write_dep(state, &state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_read_dep(state, state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Flag reads/writes serialize against the last flag writer. */
        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}
400
401static void
402calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
403                       struct list_head *schedule_list)
404{
405        struct schedule_state state;
406
407        memset(&state, 0, sizeof(state));
408        state.dag = dag;
409        state.devinfo = c->devinfo;
410        state.dir = F;
411
412        list_for_each_entry(struct schedule_node, node, schedule_list, link)
413                calculate_deps(&state, node);
414}
415
416static void
417calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
418                       struct list_head *schedule_list)
419{
420        struct schedule_state state;
421
422        memset(&state, 0, sizeof(state));
423        state.dag = dag;
424        state.devinfo = c->devinfo;
425        state.dir = R;
426
427        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
428                                link) {
429                calculate_deps(&state, (struct schedule_node *)node);
430        }
431}
432
/* Tracking state used while picking instructions, mostly recording the tick
 * of recent events that impose minimum spacing on later instructions.
 */
struct choose_scoreboard {
        struct dag *dag;
        /* Current instruction slot being filled. */
        int tick;
        int last_magic_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        /* Set once a TLB-accessing instruction has been scheduled. */
        bool tlb_locked;
};
442
443static bool
444mux_reads_too_soon(struct choose_scoreboard *scoreboard,
445                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
446{
447        switch (mux) {
448        case V3D_QPU_MUX_R4:
449                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
450                        return true;
451                break;
452
453        case V3D_QPU_MUX_R5:
454                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
455                        return true;
456                break;
457        default:
458                break;
459        }
460
461        return false;
462}
463
464static bool
465reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
466                           struct qinst *qinst)
467{
468        const struct v3d_qpu_instr *inst = &qinst->qpu;
469
470        /* XXX: Branching off of raddr. */
471        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
472                return false;
473
474        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
475
476        if (inst->alu.add.op != V3D_QPU_A_NOP) {
477                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
478                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
479                        return true;
480                }
481                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
482                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
483                        return true;
484                }
485        }
486
487        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
488                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
489                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
490                        return true;
491                }
492                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
493                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
494                        return true;
495                }
496        }
497
498        /* XXX: imm */
499
500        return false;
501}
502
503static bool
504writes_too_soon_after_write(const struct v3d_device_info *devinfo,
505                            struct choose_scoreboard *scoreboard,
506                            struct qinst *qinst)
507{
508        const struct v3d_qpu_instr *inst = &qinst->qpu;
509
510        /* Don't schedule any other r4 write too soon after an SFU write.
511         * This would normally be prevented by dependency tracking, but might
512         * occur if a dead SFU computation makes it to scheduling.
513         */
514        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
515            v3d_qpu_writes_r4(devinfo, inst))
516                return true;
517
518        return false;
519}
520
521static bool
522pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
523                          const struct v3d_qpu_instr *inst)
524{
525        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
526}
527
/* Returns a scheduling priority for the instruction (higher = schedule
 * earlier).
 *
 * The tiers, lowest to highest:
 *   0 - TLB ops, as late as possible for more inter-shader parallelism.
 *   1 - TMU result collection, late to hide latency.
 *   2 - everything else.
 *   3 - TMU setup writes, early to hide their latency better.
 *
 * XXX perf: We should schedule SFU ALU ops so that the reader is 2
 * instructions after the producer if possible, not just 1.
 */
static int
get_instruction_priority(const struct v3d_qpu_instr *inst)
{
        if (qpu_inst_is_tlb(inst))
                return 0;

        if (v3d_qpu_waits_on_tmu(inst))
                return 1;

        if (v3d_qpu_writes_tmu(inst))
                return 3;

        return 2;
}
561
562static bool
563qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
564{
565        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
566                v3d_qpu_magic_waddr_is_sfu(waddr) ||
567                v3d_qpu_magic_waddr_is_tlb(waddr) ||
568                v3d_qpu_magic_waddr_is_vpm(waddr) ||
569                v3d_qpu_magic_waddr_is_tsy(waddr));
570}
571
572static bool
573qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
574{
575        if (v3d_qpu_uses_vpm(inst))
576                return true;
577        if (v3d_qpu_uses_sfu(inst))
578                return true;
579
580        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
581                if (inst->alu.add.op != V3D_QPU_A_NOP &&
582                    inst->alu.add.magic_write &&
583                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
584                        return true;
585                }
586
587                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
588                        return true;
589
590                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
591                    inst->alu.mul.magic_write &&
592                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
593                        return true;
594                }
595        }
596
597        return (inst->sig.ldvpm ||
598                inst->sig.ldtmu ||
599                inst->sig.ldtlb ||
600                inst->sig.ldtlbu ||
601                inst->sig.wrtmuc);
602}
603
/**
 * Attempts to merge the two ALU instructions into a single dual-issued
 * instruction, storing the combination in *result on success.
 *
 * Fails (returns false) if both use the same ALU half, conflict on
 * raddr_a/raddr_b, both access a peripheral, both write a signal address,
 * or the merged instruction doesn't pack into a valid QPU encoding.
 */
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        /* Can't do more than one peripheral access in an instruction.
         *
         * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
         * WRTMUC with a TMU magic register write (other than tmuc).
         */
        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
                return false;

        /* Start from a, then fold in b's halves and signals. */
        struct v3d_qpu_instr merge = *a;

        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op != V3D_QPU_A_NOP)
                        return false;
                merge.alu.add = b->alu.add;

                /* The add op's condition/flag state comes with it. */
                merge.flags.ac = b->flags.ac;
                merge.flags.apf = b->flags.apf;
                merge.flags.auf = b->flags.auf;
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;
        }

        /* The shared raddrs must agree when both instructions use them. */
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
                    a->raddr_a != b->raddr_a) {
                        return false;
                }
                merge.raddr_a = b->raddr_a;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
                    (a->raddr_b != b->raddr_b ||
                     a->sig.small_imm != b->sig.small_imm)) {
                        return false;
                }
                merge.raddr_b = b->raddr_b;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        /* Only one of the pair may drive the signal write address. */
        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        /* Final check: the combination must encode to a valid instruction. */
        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}
692
/**
 * Heuristically picks the next DAG head to schedule, or NULL if nothing is
 * currently schedulable.
 *
 * When prev_inst is non-NULL we are trying to fill the second half of a
 * dual-issue pair, so candidates must also merge cleanly with it.
 */
static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 *
                 * XXX perf: To get good pipelining of a sequence of varying
                 * loads, we need to figure out how to pair the ldvary signal
                 * up to the instruction before the last r5 user in the
                 * previous ldvary sequence.  Currently, it usually pairs with
                 * the last r5 user.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        /* Both halves can't each consume a uniform. */
                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                /* Prefer higher priority; on a tie, fall through to the
                 * critical-path (delay) comparison below.
                 */
                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                /* Tie-break equal priorities by the longest dependency chain
                 * to the end of the program.
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}
813
/* Records the tick of an SFU magic-register write, so later r4 consumers
 * can be held back (see mux_reads_too_soon()).
 */
static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
}
821
822static void
823update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
824                             const struct v3d_qpu_instr *inst)
825{
826        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
827                return;
828
829        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
830
831        if (inst->alu.add.op != V3D_QPU_A_NOP)  {
832                if (inst->alu.add.magic_write) {
833                        update_scoreboard_for_magic_waddr(scoreboard,
834                                                          inst->alu.add.waddr);
835                }
836        }
837
838        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
839                if (inst->alu.mul.magic_write) {
840                        update_scoreboard_for_magic_waddr(scoreboard,
841                                                          inst->alu.mul.waddr);
842                }
843        }
844
845        if (inst->sig.ldvary)
846                scoreboard->last_ldvary_tick = scoreboard->tick;
847
848        if (qpu_inst_is_tlb(inst))
849                scoreboard->tlb_locked = true;
850}
851
/* Debug helper: dumps every current DAG head and its outgoing edges to
 * stderr, along with each child's remaining parent count.
 */
static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        /* Removed edges leave a NULL child behind. */
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        /* edge->data flags write (WAR) edges; see
                         * pre_remove_head().
                         */
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}
874
875static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
876                                    const struct v3d_qpu_instr *after)
877{
878        /* Apply some huge latency between texture fetch requests and getting
879         * their results back.
880         *
881         * FIXME: This is actually pretty bogus.  If we do:
882         *
883         * mov tmu0_s, a
884         * <a bit of math>
885         * mov tmu0_s, b
886         * load_tmu0
887         * <more math>
888         * load_tmu0
889         *
890         * we count that as worse than
891         *
892         * mov tmu0_s, a
893         * mov tmu0_s, b
894         * <lots of math>
895         * load_tmu0
896         * <more math>
897         * load_tmu0
898         *
899         * because we associate the first load_tmu0 with the *second* tmu0_s.
900         */
901        if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after))
902                return 100;
903
904        /* Assume that anything depending on us is consuming the SFU result. */
905        if (v3d_qpu_magic_waddr_is_sfu(waddr))
906                return 3;
907
908        return 1;
909}
910
911static uint32_t
912instruction_latency(struct schedule_node *before, struct schedule_node *after)
913{
914        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
915        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
916        uint32_t latency = 1;
917
918        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
919            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
920                return latency;
921
922        if (before_inst->alu.add.magic_write) {
923                latency = MAX2(latency,
924                               magic_waddr_latency(before_inst->alu.add.waddr,
925                                                   after_inst));
926        }
927
928        if (before_inst->alu.mul.magic_write) {
929                latency = MAX2(latency,
930                               magic_waddr_latency(before_inst->alu.mul.waddr,
931                                                   after_inst));
932        }
933
934        return latency;
935}
936
937/** Recursive computation of the delay member of a node. */
938static void
939compute_delay(struct dag_node *node, void *state)
940{
941        struct schedule_node *n = (struct schedule_node *)node;
942
943        n->delay = 1;
944
945        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
946                struct schedule_node *child =
947                        (struct schedule_node *)edge->child;
948
949                n->delay = MAX2(n->delay, (child->delay +
950                                           instruction_latency(n, child)));
951        }
952}
953
/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
 * should be called on it later to finish pruning the other edges).
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        /* delinit (rather than del) so any later list_del on this node is
         * harmless.
         */
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                /* edge->data is set on write-after-read edges. */
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}
967
968static void
969mark_instruction_scheduled(struct dag *dag,
970                           uint32_t time,
971                           struct schedule_node *node)
972{
973        if (!node)
974                return;
975
976        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
977                struct schedule_node *child =
978                        (struct schedule_node *)edge->child;
979
980                if (!child)
981                        continue;
982
983                uint32_t latency = instruction_latency(node, child);
984
985                child->unblocked_time = MAX2(child->unblocked_time,
986                                             time + latency);
987        }
988        dag_prune_head(dag, &node->dag);
989}
990
991static void
992insert_scheduled_instruction(struct v3d_compile *c,
993                             struct qblock *block,
994                             struct choose_scoreboard *scoreboard,
995                             struct qinst *inst)
996{
997        list_addtail(&inst->link, &block->instructions);
998
999        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
1000        c->qpu_inst_count++;
1001        scoreboard->tick++;
1002}
1003
1004static struct qinst *
1005vir_nop()
1006{
1007        struct qreg undef = vir_nop_reg();
1008        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1009
1010        return qinst;
1011}
1012
/* Emits a scheduled NOP into the block, advancing the scoreboard tick. */
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}
1019
/* Returns whether @qinst may occupy delay slot @slot (0..2) of a
 * thread-end THRSW sequence, per the hardware's restrictions on the last
 * instructions of a fragment shader thread.
 */
static bool
qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
                                     const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        /* No uniform reads after the first thrend slot (~0 means "no
         * uniform").
         */
        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* GFXH-1625: TMUWT not allowed in the final instruction. */
                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return false;

                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                /* NOTE(review): SETMSF restriction applies to pre-4.x
                 * hardware only.
                 */
                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                /* Same for raddr_b, unless it actually encodes a small
                 * immediate rather than a register number.
                 */
                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}
1078
/* Checks whether a THRSW signal could legally be merged into @qinst, with
 * @instructions_in_sequence instructions (starting at @qinst) becoming the
 * THRSW delay slots.  @is_thrend additionally applies the thread-end slot
 * restrictions.
 */
static bool
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread.  The simulator complains for safety, though it
                 * would only occur for dead code in our case.
                 */
                if (slot > 0 &&
                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                        return false;
                }

                /* No varying loads once the thread switch is pending. */
                if (slot > 0 && qinst->qpu.sig.ldvary)
                        return false;

                if (is_thrend &&
                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}
1118
1119/**
1120 * Emits a THRSW signal in the stream, trying to move it up to pair with
1121 * another instruction.
1122 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        /* Number of instructions this call added to the block. */
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Find how far back into previous instructions we can put the THRSW. */
        int slots_filled = 0;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;

                /* Stop if adding thrsw to this instruction's signals can't
                 * be encoded.
                 */
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, scoreboard,
                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                merge_inst = prev_inst;
                /* A THRSW has at most 3 delay slots. */
                if (++slots_filled == 3)
                        break;
        }

        bool needs_free = false;
        if (merge_inst) {
                /* Fold the THRSW signal into an already-emitted instruction. */
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
                /* Couldn't pair: emit the standalone THRSW instruction. */
                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        /* Insert any extra delay slot NOPs we need. */
        for (int i = 0; i < 3 - slots_filled; i++) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}
1197
1198static uint32_t
1199schedule_instructions(struct v3d_compile *c,
1200                      struct choose_scoreboard *scoreboard,
1201                      struct qblock *block,
1202                      enum quniform_contents *orig_uniform_contents,
1203                      uint32_t *orig_uniform_data,
1204                      uint32_t *next_uniform)
1205{
1206        const struct v3d_device_info *devinfo = c->devinfo;
1207        uint32_t time = 0;
1208
1209        while (!list_empty(&scoreboard->dag->heads)) {
1210                struct schedule_node *chosen =
1211                        choose_instruction_to_schedule(devinfo,
1212                                                       scoreboard,
1213                                                       NULL);
1214                struct schedule_node *merge = NULL;
1215
1216                /* If there are no valid instructions to schedule, drop a NOP
1217                 * in.
1218                 */
1219                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
1220                struct v3d_qpu_instr *inst = &qinst->qpu;
1221
1222                if (debug) {
1223                        fprintf(stderr, "t=%4d: current list:\n",
1224                                time);
1225                        dump_state(devinfo, scoreboard->dag);
1226                        fprintf(stderr, "t=%4d: chose:   ", time);
1227                        v3d_qpu_dump(devinfo, inst);
1228                        fprintf(stderr, "\n");
1229                }
1230
1231                /* We can't mark_instruction_scheduled() the chosen inst until
1232                 * we're done identifying instructions to merge, so put the
1233                 * merged instructions on a list for a moment.
1234                 */
1235                struct list_head merged_list;
1236                list_inithead(&merged_list);
1237
1238                /* Schedule this instruction onto the QPU list. Also try to
1239                 * find an instruction to pair with it.
1240                 */
1241                if (chosen) {
1242                        time = MAX2(chosen->unblocked_time, time);
1243                        pre_remove_head(scoreboard->dag, chosen);
1244
1245                        while ((merge =
1246                                choose_instruction_to_schedule(devinfo,
1247                                                               scoreboard,
1248                                                               chosen))) {
1249                                time = MAX2(merge->unblocked_time, time);
1250                                pre_remove_head(scoreboard->dag, chosen);
1251                                list_addtail(&merge->link, &merged_list);
1252                                (void)qpu_merge_inst(devinfo, inst,
1253                                                     inst, &merge->inst->qpu);
1254                                if (merge->inst->uniform != -1) {
1255                                        chosen->inst->uniform =
1256                                                merge->inst->uniform;
1257                                }
1258
1259                                if (debug) {
1260                                        fprintf(stderr, "t=%4d: merging: ",
1261                                                time);
1262                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
1263                                        fprintf(stderr, "\n");
1264                                        fprintf(stderr, "         result: ");
1265                                        v3d_qpu_dump(devinfo, inst);
1266                                        fprintf(stderr, "\n");
1267                                }
1268                        }
1269                }
1270
1271                /* Update the uniform index for the rewritten location --
1272                 * branch target updating will still need to change
1273                 * c->uniform_data[] using this index.
1274                 */
1275                if (qinst->uniform != -1) {
1276                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1277                                block->branch_uniform = *next_uniform;
1278
1279                        c->uniform_data[*next_uniform] =
1280                                orig_uniform_data[qinst->uniform];
1281                        c->uniform_contents[*next_uniform] =
1282                                orig_uniform_contents[qinst->uniform];
1283                        qinst->uniform = *next_uniform;
1284                        (*next_uniform)++;
1285                }
1286
1287                if (debug) {
1288                        fprintf(stderr, "\n");
1289                }
1290
1291                /* Now that we've scheduled a new instruction, some of its
1292                 * children can be promoted to the list of instructions ready to
1293                 * be scheduled.  Update the children's unblocked time for this
1294                 * DAG edge as we do so.
1295                 */
1296                mark_instruction_scheduled(scoreboard->dag, time, chosen);
1297                list_for_each_entry(struct schedule_node, merge, &merged_list,
1298                                    link) {
1299                        mark_instruction_scheduled(scoreboard->dag, time, merge);
1300
1301                        /* The merged VIR instruction doesn't get re-added to the
1302                         * block, so free it now.
1303                         */
1304                        free(merge->inst);
1305                }
1306
1307                if (inst->sig.thrsw) {
1308                        time += emit_thrsw(c, block, scoreboard, qinst, false);
1309                } else {
1310                        insert_scheduled_instruction(c, block,
1311                                                     scoreboard, qinst);
1312
1313                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1314                                block->branch_qpu_ip = c->qpu_inst_count - 1;
1315                                /* Fill the delay slots.
1316                                 *
1317                                 * We should fill these with actual instructions,
1318                                 * instead, but that will probably need to be done
1319                                 * after this, once we know what the leading
1320                                 * instructions of the successors are (so we can
1321                                 * handle A/B register file write latency)
1322                                 */
1323                                for (int i = 0; i < 3; i++)
1324                                        emit_nop(c, block, scoreboard);
1325                        }
1326                }
1327        }
1328
1329        return time;
1330}
1331
1332static uint32_t
1333qpu_schedule_instructions_block(struct v3d_compile *c,
1334                                struct choose_scoreboard *scoreboard,
1335                                struct qblock *block,
1336                                enum quniform_contents *orig_uniform_contents,
1337                                uint32_t *orig_uniform_data,
1338                                uint32_t *next_uniform)
1339{
1340        void *mem_ctx = ralloc_context(NULL);
1341        scoreboard->dag = dag_create(mem_ctx);
1342        struct list_head setup_list;
1343
1344        list_inithead(&setup_list);
1345
1346        /* Wrap each instruction in a scheduler structure. */
1347        while (!list_empty(&block->instructions)) {
1348                struct qinst *qinst = (struct qinst *)block->instructions.next;
1349                struct schedule_node *n =
1350                        rzalloc(mem_ctx, struct schedule_node);
1351
1352                dag_init_node(scoreboard->dag, &n->dag);
1353                n->inst = qinst;
1354
1355                list_del(&qinst->link);
1356                list_addtail(&n->link, &setup_list);
1357        }
1358
1359        calculate_forward_deps(c, scoreboard->dag, &setup_list);
1360        calculate_reverse_deps(c, scoreboard->dag, &setup_list);
1361
1362        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
1363
1364        uint32_t cycles = schedule_instructions(c, scoreboard, block,
1365                                                orig_uniform_contents,
1366                                                orig_uniform_data,
1367                                                next_uniform);
1368
1369        ralloc_free(mem_ctx);
1370        scoreboard->dag = NULL;
1371
1372        return cycles;
1373}
1374
/* Patches every scheduled branch instruction with its final instruction
 * offset and rewrites its uniform with the relative uniform-stream offset,
 * now that all blocks' QPU IPs are known.
 */
static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct list_head *entry = block->instructions.prev;
                for (int i = 0; i < 3; i++)
                        entry = entry->prev;
                struct qinst *branch = container_of(entry, branch, link);
                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                /* Branch offsets are in bytes relative to the instruction
                 * after the 3 delay slots (branch IP + 4 instructions).
                 */
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;
        }
}
1427
1428uint32_t
1429v3d_qpu_schedule_instructions(struct v3d_compile *c)
1430{
1431        const struct v3d_device_info *devinfo = c->devinfo;
1432        struct qblock *end_block = list_last_entry(&c->blocks,
1433                                                   struct qblock, link);
1434
1435        /* We reorder the uniforms as we schedule instructions, so save the
1436         * old data off and replace it.
1437         */
1438        uint32_t *uniform_data = c->uniform_data;
1439        enum quniform_contents *uniform_contents = c->uniform_contents;
1440        c->uniform_contents = ralloc_array(c, enum quniform_contents,
1441                                           c->num_uniforms);
1442        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1443        c->uniform_array_size = c->num_uniforms;
1444        uint32_t next_uniform = 0;
1445
1446        struct choose_scoreboard scoreboard;
1447        memset(&scoreboard, 0, sizeof(scoreboard));
1448        scoreboard.last_ldvary_tick = -10;
1449        scoreboard.last_magic_sfu_write_tick = -10;
1450        scoreboard.last_uniforms_reset_tick = -10;
1451        scoreboard.last_thrsw_tick = -10;
1452
1453        if (debug) {
1454                fprintf(stderr, "Pre-schedule instructions\n");
1455                vir_for_each_block(block, c) {
1456                        fprintf(stderr, "BLOCK %d\n", block->index);
1457                        list_for_each_entry(struct qinst, qinst,
1458                                            &block->instructions, link) {
1459                                v3d_qpu_dump(devinfo, &qinst->qpu);
1460                                fprintf(stderr, "\n");
1461                        }
1462                }
1463                fprintf(stderr, "\n");
1464        }
1465
1466        uint32_t cycles = 0;
1467        vir_for_each_block(block, c) {
1468                block->start_qpu_ip = c->qpu_inst_count;
1469                block->branch_qpu_ip = ~0;
1470                block->start_uniform = next_uniform;
1471
1472                cycles += qpu_schedule_instructions_block(c,
1473                                                          &scoreboard,
1474                                                          block,
1475                                                          uniform_contents,
1476                                                          uniform_data,
1477                                                          &next_uniform);
1478
1479                block->end_qpu_ip = c->qpu_inst_count - 1;
1480        }
1481
1482        /* Emit the program-end THRSW instruction. */;
1483        struct qinst *thrsw = vir_nop();
1484        thrsw->qpu.sig.thrsw = true;
1485        emit_thrsw(c, end_block, &scoreboard, thrsw, true);
1486
1487        qpu_set_branch_targets(c);
1488
1489        assert(next_uniform == c->num_uniforms);
1490
1491        return cycles;
1492}
1493