101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2010 Intel Corporation
301e04c3fSmrg * Copyright © 2014-2017 Broadcom
401e04c3fSmrg *
501e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
601e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
701e04c3fSmrg * to deal in the Software without restriction, including without limitation
801e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
901e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
1001e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1101e04c3fSmrg *
1201e04c3fSmrg * The above copyright notice and this permission notice (including the next
1301e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1401e04c3fSmrg * Software.
1501e04c3fSmrg *
1601e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1701e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1801e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1901e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2001e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2101e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2201e04c3fSmrg * IN THE SOFTWARE.
2301e04c3fSmrg */
2401e04c3fSmrg
2501e04c3fSmrg/**
2601e04c3fSmrg * @file
2701e04c3fSmrg *
2801e04c3fSmrg * The basic model of the list scheduler is to take a basic block, compute a
2901e04c3fSmrg * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
3001e04c3fSmrg * pick a DAG head, then put all the children that are now DAG heads into the
3101e04c3fSmrg * list of things to schedule.
3201e04c3fSmrg *
3301e04c3fSmrg * The goal of scheduling here is to pack pairs of operations together in a
3401e04c3fSmrg * single QPU instruction.
3501e04c3fSmrg */
3601e04c3fSmrg
3701e04c3fSmrg#include "qpu/qpu_disasm.h"
3801e04c3fSmrg#include "v3d_compiler.h"
3901e04c3fSmrg#include "util/ralloc.h"
40ed98bd31Smaya#include "util/dag.h"
4101e04c3fSmrg
4201e04c3fSmrgstatic bool debug;
4301e04c3fSmrg
4401e04c3fSmrgstruct schedule_node_child;
4501e04c3fSmrg
struct schedule_node {
        /* Embedded DAG node for dependency tracking (see util/dag.h). */
        struct dag_node dag;
        /* Link in the list of instructions awaiting scheduling. */
        struct list_head link;
        /* The QPU instruction this node represents. */
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};
6701e04c3fSmrg
/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };        /* F = forward walk, R = reverse walk */
7201e04c3fSmrg
struct schedule_state {
        const struct v3d_device_info *devinfo;
        /* DAG being populated with dependency edges (see add_dep()). */
        struct dag *dag;
        /* Last writer of each accumulator r0-r5. */
        struct schedule_node *last_r[6];
        /* Last writer of each register file entry. */
        struct schedule_node *last_rf[64];
        /* Last instruction that set the condition flags. */
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        /* Last TMU sequence terminator (see process_waddr_deps()). */
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        /* Last MULTOP/UMUL24, which share the rtop register. */
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        /* Which way we are walking the block (forward or reverse). */
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
9201e04c3fSmrg
9301e04c3fSmrgstatic void
9401e04c3fSmrgadd_dep(struct schedule_state *state,
9501e04c3fSmrg        struct schedule_node *before,
9601e04c3fSmrg        struct schedule_node *after,
9701e04c3fSmrg        bool write)
9801e04c3fSmrg{
9901e04c3fSmrg        bool write_after_read = !write && state->dir == R;
100ed98bd31Smaya        void *edge_data = (void *)(uintptr_t)write_after_read;
10101e04c3fSmrg
10201e04c3fSmrg        if (!before || !after)
10301e04c3fSmrg                return;
10401e04c3fSmrg
10501e04c3fSmrg        assert(before != after);
10601e04c3fSmrg
107ed98bd31Smaya        if (state->dir == F)
108ed98bd31Smaya                dag_add_edge(&before->dag, &after->dag, edge_data);
109ed98bd31Smaya        else
110ed98bd31Smaya                dag_add_edge(&after->dag, &before->dag, edge_data);
11101e04c3fSmrg}
11201e04c3fSmrg
11301e04c3fSmrgstatic void
11401e04c3fSmrgadd_read_dep(struct schedule_state *state,
11501e04c3fSmrg              struct schedule_node *before,
11601e04c3fSmrg              struct schedule_node *after)
11701e04c3fSmrg{
11801e04c3fSmrg        add_dep(state, before, after, false);
11901e04c3fSmrg}
12001e04c3fSmrg
12101e04c3fSmrgstatic void
12201e04c3fSmrgadd_write_dep(struct schedule_state *state,
12301e04c3fSmrg              struct schedule_node **before,
12401e04c3fSmrg              struct schedule_node *after)
12501e04c3fSmrg{
12601e04c3fSmrg        add_dep(state, *before, after, true);
12701e04c3fSmrg        *before = after;
12801e04c3fSmrg}
12901e04c3fSmrg
13001e04c3fSmrgstatic bool
13101e04c3fSmrgqpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
13201e04c3fSmrg{
1337ec681f3Smrg        if (inst->sig.ldtlb || inst->sig.ldtlbu)
1347ec681f3Smrg                return true;
1357ec681f3Smrg
13601e04c3fSmrg        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
13701e04c3fSmrg                return false;
13801e04c3fSmrg
13901e04c3fSmrg        if (inst->alu.add.magic_write &&
14001e04c3fSmrg            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
14101e04c3fSmrg             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
14201e04c3fSmrg                return true;
14301e04c3fSmrg
14401e04c3fSmrg        if (inst->alu.mul.magic_write &&
14501e04c3fSmrg            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
14601e04c3fSmrg             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
14701e04c3fSmrg                return true;
14801e04c3fSmrg
14901e04c3fSmrg        return false;
15001e04c3fSmrg}
15101e04c3fSmrg
15201e04c3fSmrgstatic void
15301e04c3fSmrgprocess_mux_deps(struct schedule_state *state, struct schedule_node *n,
15401e04c3fSmrg                 enum v3d_qpu_mux mux)
15501e04c3fSmrg{
15601e04c3fSmrg        switch (mux) {
15701e04c3fSmrg        case V3D_QPU_MUX_A:
15801e04c3fSmrg                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
15901e04c3fSmrg                break;
16001e04c3fSmrg        case V3D_QPU_MUX_B:
1617ec681f3Smrg                if (!n->inst->qpu.sig.small_imm) {
1627ec681f3Smrg                        add_read_dep(state,
1637ec681f3Smrg                                     state->last_rf[n->inst->qpu.raddr_b], n);
1647ec681f3Smrg                }
16501e04c3fSmrg                break;
16601e04c3fSmrg        default:
16701e04c3fSmrg                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
16801e04c3fSmrg                break;
16901e04c3fSmrg        }
17001e04c3fSmrg}
17101e04c3fSmrg
1727ec681f3Smrgstatic bool
1737ec681f3Smrgtmu_write_is_sequence_terminator(uint32_t waddr)
1747ec681f3Smrg{
1757ec681f3Smrg        switch (waddr) {
1767ec681f3Smrg        case V3D_QPU_WADDR_TMUS:
1777ec681f3Smrg        case V3D_QPU_WADDR_TMUSCM:
1787ec681f3Smrg        case V3D_QPU_WADDR_TMUSF:
1797ec681f3Smrg        case V3D_QPU_WADDR_TMUSLOD:
1807ec681f3Smrg        case V3D_QPU_WADDR_TMUA:
1817ec681f3Smrg        case V3D_QPU_WADDR_TMUAU:
1827ec681f3Smrg                return true;
1837ec681f3Smrg        default:
1847ec681f3Smrg                return false;
1857ec681f3Smrg        }
1867ec681f3Smrg}
1877ec681f3Smrg
1887ec681f3Smrgstatic bool
1897ec681f3Smrgcan_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
1907ec681f3Smrg{
1917ec681f3Smrg        if (devinfo->ver < 40)
1927ec681f3Smrg                return false;
1937ec681f3Smrg
1947ec681f3Smrg        if (tmu_write_is_sequence_terminator(waddr))
1957ec681f3Smrg                return false;
1967ec681f3Smrg
1977ec681f3Smrg        if (waddr == V3D_QPU_WADDR_TMUD)
1987ec681f3Smrg                return false;
1997ec681f3Smrg
2007ec681f3Smrg        return true;
2017ec681f3Smrg}
20201e04c3fSmrg
/**
 * Adds the dependencies implied by @n writing @waddr.
 *
 * Non-magic waddrs are plain register file writes.  Magic waddrs address
 * hardware units (TMU, SFU, TLB, VPM, sync, unifa, ...) and get unit-
 * specific ordering constraints.
 */
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                /* Reorderable TMU writes only need a read dep on the last
                 * TMU write; everything else must stay strictly ordered.
                 */
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses.  There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        /* unifa only exists on V3D >= 4.0. */
                        if (state->devinfo->ver >= 40)
                                add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}
26901e04c3fSmrg
27001e04c3fSmrg/**
27101e04c3fSmrg * Common code for dependencies that need to be tracked both forward and
27201e04c3fSmrg * backward.
27301e04c3fSmrg *
27401e04c3fSmrg * This is for things like "all reads of r4 have to happen between the r4
27501e04c3fSmrg * writes that surround them".
27601e04c3fSmrg */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes.  We handle this by
         * serializing all VPM operations for now.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                /* Conditional branches read the condition flags. */
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        /* Input dependencies for both source muxes of each ALU pipe. */
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        /* Add-pipe ops with implicit VPM/TLB side effects. */
        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        /* Output dependencies for both ALU pipes and the signal waddr. */
        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        /* Implicit accumulator writes (e.g. SFU results land in r4). */
        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        /* Bitwise | here acts as logical-or over the two signal bits. */
        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}
45001e04c3fSmrg
45101e04c3fSmrgstatic void
452ed98bd31Smayacalculate_forward_deps(struct v3d_compile *c, struct dag *dag,
453ed98bd31Smaya                       struct list_head *schedule_list)
45401e04c3fSmrg{
45501e04c3fSmrg        struct schedule_state state;
45601e04c3fSmrg
45701e04c3fSmrg        memset(&state, 0, sizeof(state));
458ed98bd31Smaya        state.dag = dag;
45901e04c3fSmrg        state.devinfo = c->devinfo;
46001e04c3fSmrg        state.dir = F;
46101e04c3fSmrg
46201e04c3fSmrg        list_for_each_entry(struct schedule_node, node, schedule_list, link)
46301e04c3fSmrg                calculate_deps(&state, node);
46401e04c3fSmrg}
46501e04c3fSmrg
46601e04c3fSmrgstatic void
467ed98bd31Smayacalculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
468ed98bd31Smaya                       struct list_head *schedule_list)
46901e04c3fSmrg{
47001e04c3fSmrg        struct schedule_state state;
47101e04c3fSmrg
47201e04c3fSmrg        memset(&state, 0, sizeof(state));
473ed98bd31Smaya        state.dag = dag;
47401e04c3fSmrg        state.devinfo = c->devinfo;
47501e04c3fSmrg        state.dir = R;
47601e04c3fSmrg
477ed98bd31Smaya        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
478ed98bd31Smaya                                link) {
47901e04c3fSmrg                calculate_deps(&state, (struct schedule_node *)node);
48001e04c3fSmrg        }
48101e04c3fSmrg}
48201e04c3fSmrg
struct choose_scoreboard {
        struct dag *dag;
        /* Tick (instruction slot) currently being filled. */
        int tick;
        /* Tick of the last magic-waddr SFU write; r4 reads too close to it
         * are rejected (see mux_reads_too_soon()). */
        int last_magic_sfu_write_tick;
        /* Register file address and tick of the last SFU op whose result
         * read can stall (see mux_read_stalls()). */
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        /* Tick of the last ldvary; r5 reads too close to it are rejected. */
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        /* Whether the program's first/last thrsw has been emitted; used to
         * decide when the pixel scoreboard is locked. */
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        /* NOTE(review): presumably flags an ldvary that the scheduler needs
         * to patch up later — confirm against the code that sets it. */
        bool fixup_ldvary;
        int ldvary_count;
};
50001e04c3fSmrg
50101e04c3fSmrgstatic bool
50201e04c3fSmrgmux_reads_too_soon(struct choose_scoreboard *scoreboard,
50301e04c3fSmrg                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
50401e04c3fSmrg{
50501e04c3fSmrg        switch (mux) {
50601e04c3fSmrg        case V3D_QPU_MUX_R4:
50701e04c3fSmrg                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
50801e04c3fSmrg                        return true;
50901e04c3fSmrg                break;
51001e04c3fSmrg
51101e04c3fSmrg        case V3D_QPU_MUX_R5:
51201e04c3fSmrg                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
51301e04c3fSmrg                        return true;
51401e04c3fSmrg                break;
51501e04c3fSmrg        default:
51601e04c3fSmrg                break;
51701e04c3fSmrg        }
51801e04c3fSmrg
51901e04c3fSmrg        return false;
52001e04c3fSmrg}
52101e04c3fSmrg
52201e04c3fSmrgstatic bool
52301e04c3fSmrgreads_too_soon_after_write(struct choose_scoreboard *scoreboard,
52401e04c3fSmrg                           struct qinst *qinst)
52501e04c3fSmrg{
52601e04c3fSmrg        const struct v3d_qpu_instr *inst = &qinst->qpu;
52701e04c3fSmrg
52801e04c3fSmrg        /* XXX: Branching off of raddr. */
52901e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
53001e04c3fSmrg                return false;
53101e04c3fSmrg
53201e04c3fSmrg        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
53301e04c3fSmrg
53401e04c3fSmrg        if (inst->alu.add.op != V3D_QPU_A_NOP) {
53501e04c3fSmrg                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
53601e04c3fSmrg                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
53701e04c3fSmrg                        return true;
53801e04c3fSmrg                }
53901e04c3fSmrg                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
54001e04c3fSmrg                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
54101e04c3fSmrg                        return true;
54201e04c3fSmrg                }
54301e04c3fSmrg        }
54401e04c3fSmrg
54501e04c3fSmrg        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
54601e04c3fSmrg                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
54701e04c3fSmrg                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
54801e04c3fSmrg                        return true;
54901e04c3fSmrg                }
55001e04c3fSmrg                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
55101e04c3fSmrg                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
55201e04c3fSmrg                        return true;
55301e04c3fSmrg                }
55401e04c3fSmrg        }
55501e04c3fSmrg
55601e04c3fSmrg        /* XXX: imm */
55701e04c3fSmrg
55801e04c3fSmrg        return false;
55901e04c3fSmrg}
56001e04c3fSmrg
56101e04c3fSmrgstatic bool
56201e04c3fSmrgwrites_too_soon_after_write(const struct v3d_device_info *devinfo,
56301e04c3fSmrg                            struct choose_scoreboard *scoreboard,
56401e04c3fSmrg                            struct qinst *qinst)
56501e04c3fSmrg{
56601e04c3fSmrg        const struct v3d_qpu_instr *inst = &qinst->qpu;
56701e04c3fSmrg
56801e04c3fSmrg        /* Don't schedule any other r4 write too soon after an SFU write.
56901e04c3fSmrg         * This would normally be prevented by dependency tracking, but might
57001e04c3fSmrg         * occur if a dead SFU computation makes it to scheduling.
57101e04c3fSmrg         */
57201e04c3fSmrg        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
57301e04c3fSmrg            v3d_qpu_writes_r4(devinfo, inst))
57401e04c3fSmrg                return true;
57501e04c3fSmrg
57601e04c3fSmrg        return false;
57701e04c3fSmrg}
57801e04c3fSmrg
57901e04c3fSmrgstatic bool
5807ec681f3Smrgscoreboard_is_locked(struct choose_scoreboard *scoreboard,
5817ec681f3Smrg                     bool lock_scoreboard_on_first_thrsw)
5827ec681f3Smrg{
5837ec681f3Smrg        if (lock_scoreboard_on_first_thrsw) {
5847ec681f3Smrg                return scoreboard->first_thrsw_emitted &&
5857ec681f3Smrg                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
5867ec681f3Smrg        }
5877ec681f3Smrg
5887ec681f3Smrg        return scoreboard->last_thrsw_emitted &&
5897ec681f3Smrg               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
5907ec681f3Smrg}
5917ec681f3Smrg
5927ec681f3Smrgstatic bool
5937ec681f3Smrgpixel_scoreboard_too_soon(struct v3d_compile *c,
5947ec681f3Smrg                          struct choose_scoreboard *scoreboard,
59501e04c3fSmrg                          const struct v3d_qpu_instr *inst)
59601e04c3fSmrg{
5977ec681f3Smrg        return qpu_inst_is_tlb(inst) &&
5987ec681f3Smrg               !scoreboard_is_locked(scoreboard,
5997ec681f3Smrg                                     c->lock_scoreboard_on_first_thrsw);
6007ec681f3Smrg}
6017ec681f3Smrg
6027ec681f3Smrgstatic bool
6037ec681f3Smrgqpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
6047ec681f3Smrg                        uint32_t waddr) {
6057ec681f3Smrg
6067ec681f3Smrg        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
6077ec681f3Smrg           return false;
6087ec681f3Smrg
6097ec681f3Smrg        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
6107ec681f3Smrg            inst->raddr_a == waddr)
6117ec681f3Smrg              return true;
6127ec681f3Smrg
6137ec681f3Smrg        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
6147ec681f3Smrg            !inst->sig.small_imm && (inst->raddr_b == waddr))
6157ec681f3Smrg              return true;
6167ec681f3Smrg
6177ec681f3Smrg        return false;
6187ec681f3Smrg}
6197ec681f3Smrg
6207ec681f3Smrgstatic bool
6217ec681f3Smrgmux_read_stalls(struct choose_scoreboard *scoreboard,
6227ec681f3Smrg                const struct v3d_qpu_instr *inst)
6237ec681f3Smrg{
6247ec681f3Smrg        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
6257ec681f3Smrg                qpu_instruction_uses_rf(inst,
6267ec681f3Smrg                                        scoreboard->last_stallable_sfu_reg);
62701e04c3fSmrg}
62801e04c3fSmrg
/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls, so instructions that
 * stall have lower priority than regular instructions. */
6327ec681f3Smrg#define MAX_SCHEDULE_PRIORITY 16
6337ec681f3Smrg
63401e04c3fSmrgstatic int
6357ec681f3Smrgget_instruction_priority(const struct v3d_device_info *devinfo,
6367ec681f3Smrg                         const struct v3d_qpu_instr *inst)
63701e04c3fSmrg{
63801e04c3fSmrg        uint32_t baseline_score;
63901e04c3fSmrg        uint32_t next_score = 0;
64001e04c3fSmrg
64101e04c3fSmrg        /* Schedule TLB operations as late as possible, to get more
64201e04c3fSmrg         * parallelism between shaders.
64301e04c3fSmrg         */
64401e04c3fSmrg        if (qpu_inst_is_tlb(inst))
64501e04c3fSmrg                return next_score;
64601e04c3fSmrg        next_score++;
64701e04c3fSmrg
64801e04c3fSmrg        /* Schedule texture read results collection late to hide latency. */
64901e04c3fSmrg        if (v3d_qpu_waits_on_tmu(inst))
65001e04c3fSmrg                return next_score;
65101e04c3fSmrg        next_score++;
65201e04c3fSmrg
65301e04c3fSmrg        /* Default score for things that aren't otherwise special. */
65401e04c3fSmrg        baseline_score = next_score;
65501e04c3fSmrg        next_score++;
65601e04c3fSmrg
65701e04c3fSmrg        /* Schedule texture read setup early to hide their latency better. */
6587ec681f3Smrg        if (v3d_qpu_writes_tmu(devinfo, inst))
65901e04c3fSmrg                return next_score;
66001e04c3fSmrg        next_score++;
66101e04c3fSmrg
6627ec681f3Smrg        /* We should increase the maximum if we assert here */
6637ec681f3Smrg        assert(next_score < MAX_SCHEDULE_PRIORITY);
6647ec681f3Smrg
66501e04c3fSmrg        return baseline_score;
66601e04c3fSmrg}
66701e04c3fSmrg
66801e04c3fSmrgstatic bool
6697ec681f3Smrgqpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
6707ec681f3Smrg                          enum v3d_qpu_waddr waddr)
67101e04c3fSmrg{
6727ec681f3Smrg        return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
67301e04c3fSmrg                v3d_qpu_magic_waddr_is_sfu(waddr) ||
67401e04c3fSmrg                v3d_qpu_magic_waddr_is_tlb(waddr) ||
67501e04c3fSmrg                v3d_qpu_magic_waddr_is_vpm(waddr) ||
67601e04c3fSmrg                v3d_qpu_magic_waddr_is_tsy(waddr));
67701e04c3fSmrg}
67801e04c3fSmrg
67901e04c3fSmrgstatic bool
6807ec681f3Smrgqpu_accesses_peripheral(const struct v3d_device_info *devinfo,
6817ec681f3Smrg                        const struct v3d_qpu_instr *inst)
68201e04c3fSmrg{
68301e04c3fSmrg        if (v3d_qpu_uses_vpm(inst))
68401e04c3fSmrg                return true;
68501e04c3fSmrg        if (v3d_qpu_uses_sfu(inst))
68601e04c3fSmrg                return true;
68701e04c3fSmrg
68801e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
68901e04c3fSmrg                if (inst->alu.add.op != V3D_QPU_A_NOP &&
69001e04c3fSmrg                    inst->alu.add.magic_write &&
6917ec681f3Smrg                    qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
69201e04c3fSmrg                        return true;
69301e04c3fSmrg                }
69401e04c3fSmrg
69501e04c3fSmrg                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
69601e04c3fSmrg                        return true;
69701e04c3fSmrg
69801e04c3fSmrg                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
69901e04c3fSmrg                    inst->alu.mul.magic_write &&
7007ec681f3Smrg                    qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
70101e04c3fSmrg                        return true;
70201e04c3fSmrg                }
70301e04c3fSmrg        }
70401e04c3fSmrg
70501e04c3fSmrg        return (inst->sig.ldvpm ||
70601e04c3fSmrg                inst->sig.ldtmu ||
70701e04c3fSmrg                inst->sig.ldtlb ||
70801e04c3fSmrg                inst->sig.ldtlbu ||
70901e04c3fSmrg                inst->sig.wrtmuc);
71001e04c3fSmrg}
71101e04c3fSmrg
7127ec681f3Smrgstatic bool
7137ec681f3Smrgqpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
7147ec681f3Smrg                                 const struct v3d_qpu_instr *a,
7157ec681f3Smrg                                 const struct v3d_qpu_instr *b)
7167ec681f3Smrg{
7177ec681f3Smrg        const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
7187ec681f3Smrg        const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
7197ec681f3Smrg
7207ec681f3Smrg        /* We can always do one peripheral access per instruction. */
7217ec681f3Smrg        if (!a_uses_peripheral || !b_uses_peripheral)
7227ec681f3Smrg                return true;
7237ec681f3Smrg
7247ec681f3Smrg        if (devinfo->ver < 41)
7257ec681f3Smrg                return false;
7267ec681f3Smrg
7277ec681f3Smrg        /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
7287ec681f3Smrg         * WRTMUC with a TMU magic register write (other than tmuc).
7297ec681f3Smrg         */
7307ec681f3Smrg        if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
7317ec681f3Smrg            (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
7327ec681f3Smrg                return true;
7337ec681f3Smrg        }
7347ec681f3Smrg
7357ec681f3Smrg        if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
7367ec681f3Smrg            (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
7377ec681f3Smrg                return true;
7387ec681f3Smrg        }
7397ec681f3Smrg
7407ec681f3Smrg        return false;
7417ec681f3Smrg}
7427ec681f3Smrg
7437ec681f3Smrg/* Compute a bitmask of which rf registers are used between
7447ec681f3Smrg * the two instructions.
7457ec681f3Smrg */
7467ec681f3Smrgstatic uint64_t
7477ec681f3Smrgqpu_raddrs_used(const struct v3d_qpu_instr *a,
7487ec681f3Smrg                const struct v3d_qpu_instr *b)
7497ec681f3Smrg{
7507ec681f3Smrg        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
7517ec681f3Smrg        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
7527ec681f3Smrg
7537ec681f3Smrg        uint64_t raddrs_used = 0;
7547ec681f3Smrg        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
7557ec681f3Smrg                raddrs_used |= (1ll << a->raddr_a);
7567ec681f3Smrg        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
7577ec681f3Smrg                raddrs_used |= (1ll << a->raddr_b);
7587ec681f3Smrg        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
7597ec681f3Smrg                raddrs_used |= (1ll << b->raddr_a);
7607ec681f3Smrg        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
7617ec681f3Smrg                raddrs_used |= (1ll << b->raddr_b);
7627ec681f3Smrg
7637ec681f3Smrg        return raddrs_used;
7647ec681f3Smrg}
7657ec681f3Smrg
7667ec681f3Smrg/* Take two instructions and attempt to merge their raddr fields
7677ec681f3Smrg * into one merged instruction. Returns false if the two instructions
7687ec681f3Smrg * access more than two different rf registers between them, or more
7697ec681f3Smrg * than one rf register and one small immediate.
7707ec681f3Smrg */
7717ec681f3Smrgstatic bool
7727ec681f3Smrgqpu_merge_raddrs(struct v3d_qpu_instr *result,
7737ec681f3Smrg                 const struct v3d_qpu_instr *add_instr,
7747ec681f3Smrg                 const struct v3d_qpu_instr *mul_instr)
7757ec681f3Smrg{
7767ec681f3Smrg        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
7777ec681f3Smrg        int naddrs = util_bitcount64(raddrs_used);
7787ec681f3Smrg
7797ec681f3Smrg        if (naddrs > 2)
7807ec681f3Smrg                return false;
7817ec681f3Smrg
7827ec681f3Smrg        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
7837ec681f3Smrg                if (naddrs > 1)
7847ec681f3Smrg                        return false;
7857ec681f3Smrg
7867ec681f3Smrg                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
7877ec681f3Smrg                        if (add_instr->raddr_b != mul_instr->raddr_b)
7887ec681f3Smrg                                return false;
7897ec681f3Smrg
7907ec681f3Smrg                result->sig.small_imm = true;
7917ec681f3Smrg                result->raddr_b = add_instr->sig.small_imm ?
7927ec681f3Smrg                        add_instr->raddr_b : mul_instr->raddr_b;
7937ec681f3Smrg        }
7947ec681f3Smrg
7957ec681f3Smrg        if (naddrs == 0)
7967ec681f3Smrg                return true;
7977ec681f3Smrg
7987ec681f3Smrg        int raddr_a = ffsll(raddrs_used) - 1;
7997ec681f3Smrg        raddrs_used &= ~(1ll << raddr_a);
8007ec681f3Smrg        result->raddr_a = raddr_a;
8017ec681f3Smrg
8027ec681f3Smrg        if (!result->sig.small_imm) {
8037ec681f3Smrg                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
8047ec681f3Smrg                    raddr_a == add_instr->raddr_b) {
8057ec681f3Smrg                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
8067ec681f3Smrg                                result->alu.add.a = V3D_QPU_MUX_A;
8077ec681f3Smrg                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
8087ec681f3Smrg                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
8097ec681f3Smrg                                result->alu.add.b = V3D_QPU_MUX_A;
8107ec681f3Smrg                        }
8117ec681f3Smrg                }
8127ec681f3Smrg                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
8137ec681f3Smrg                    raddr_a == mul_instr->raddr_b) {
8147ec681f3Smrg                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
8157ec681f3Smrg                                result->alu.mul.a = V3D_QPU_MUX_A;
8167ec681f3Smrg                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
8177ec681f3Smrg                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
8187ec681f3Smrg                                result->alu.mul.b = V3D_QPU_MUX_A;
8197ec681f3Smrg                        }
8207ec681f3Smrg                }
8217ec681f3Smrg        }
8227ec681f3Smrg        if (!raddrs_used)
8237ec681f3Smrg                return true;
8247ec681f3Smrg
8257ec681f3Smrg        int raddr_b = ffsll(raddrs_used) - 1;
8267ec681f3Smrg        result->raddr_b = raddr_b;
8277ec681f3Smrg        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
8287ec681f3Smrg            raddr_b == add_instr->raddr_a) {
8297ec681f3Smrg                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
8307ec681f3Smrg                        result->alu.add.a = V3D_QPU_MUX_B;
8317ec681f3Smrg                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
8327ec681f3Smrg                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
8337ec681f3Smrg                        result->alu.add.b = V3D_QPU_MUX_B;
8347ec681f3Smrg                }
8357ec681f3Smrg        }
8367ec681f3Smrg        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
8377ec681f3Smrg            raddr_b == mul_instr->raddr_a) {
8387ec681f3Smrg                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
8397ec681f3Smrg                        result->alu.mul.a = V3D_QPU_MUX_B;
8407ec681f3Smrg                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
8417ec681f3Smrg                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
8427ec681f3Smrg                        result->alu.mul.b = V3D_QPU_MUX_B;
8437ec681f3Smrg                }
8447ec681f3Smrg        }
8457ec681f3Smrg
8467ec681f3Smrg        return true;
8477ec681f3Smrg}
8487ec681f3Smrg
8497ec681f3Smrgstatic bool
8507ec681f3Smrgcan_do_add_as_mul(enum v3d_qpu_add_op op)
8517ec681f3Smrg{
8527ec681f3Smrg        switch (op) {
8537ec681f3Smrg        case V3D_QPU_A_ADD:
8547ec681f3Smrg        case V3D_QPU_A_SUB:
8557ec681f3Smrg                return true;
8567ec681f3Smrg        default:
8577ec681f3Smrg                return false;
8587ec681f3Smrg        }
8597ec681f3Smrg}
8607ec681f3Smrg
8617ec681f3Smrgstatic enum v3d_qpu_mul_op
8627ec681f3Smrgadd_op_as_mul_op(enum v3d_qpu_add_op op)
8637ec681f3Smrg{
8647ec681f3Smrg        switch (op) {
8657ec681f3Smrg        case V3D_QPU_A_ADD:
8667ec681f3Smrg                return V3D_QPU_M_ADD;
8677ec681f3Smrg        case V3D_QPU_A_SUB:
8687ec681f3Smrg                return V3D_QPU_M_SUB;
8697ec681f3Smrg        default:
8707ec681f3Smrg                unreachable("unexpected add opcode");
8717ec681f3Smrg        }
8727ec681f3Smrg}
8737ec681f3Smrg
8747ec681f3Smrgstatic void
8757ec681f3Smrgqpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
8767ec681f3Smrg{
8777ec681f3Smrg        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
8787ec681f3Smrg        assert(inst->alu.add.op != V3D_QPU_A_NOP);
8797ec681f3Smrg        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
8807ec681f3Smrg
8817ec681f3Smrg        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
8827ec681f3Smrg        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
8837ec681f3Smrg        inst->alu.add.op = V3D_QPU_A_NOP;
8847ec681f3Smrg
8857ec681f3Smrg        inst->flags.mc = inst->flags.ac;
8867ec681f3Smrg        inst->flags.mpf = inst->flags.apf;
8877ec681f3Smrg        inst->flags.muf = inst->flags.auf;
8887ec681f3Smrg        inst->flags.ac = V3D_QPU_COND_NONE;
8897ec681f3Smrg        inst->flags.apf = V3D_QPU_PF_NONE;
8907ec681f3Smrg        inst->flags.auf = V3D_QPU_UF_NONE;
8917ec681f3Smrg}
8927ec681f3Smrg
89301e04c3fSmrgstatic bool
89401e04c3fSmrgqpu_merge_inst(const struct v3d_device_info *devinfo,
89501e04c3fSmrg               struct v3d_qpu_instr *result,
89601e04c3fSmrg               const struct v3d_qpu_instr *a,
89701e04c3fSmrg               const struct v3d_qpu_instr *b)
89801e04c3fSmrg{
89901e04c3fSmrg        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
90001e04c3fSmrg            b->type != V3D_QPU_INSTR_TYPE_ALU) {
90101e04c3fSmrg                return false;
90201e04c3fSmrg        }
90301e04c3fSmrg
9047ec681f3Smrg        if (!qpu_compatible_peripheral_access(devinfo, a, b))
90501e04c3fSmrg                return false;
90601e04c3fSmrg
90701e04c3fSmrg        struct v3d_qpu_instr merge = *a;
9087ec681f3Smrg        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
90901e04c3fSmrg
9107ec681f3Smrg        struct v3d_qpu_instr mul_inst;
91101e04c3fSmrg        if (b->alu.add.op != V3D_QPU_A_NOP) {
9127ec681f3Smrg                if (a->alu.add.op == V3D_QPU_A_NOP) {
9137ec681f3Smrg                        merge.alu.add = b->alu.add;
9147ec681f3Smrg
9157ec681f3Smrg                        merge.flags.ac = b->flags.ac;
9167ec681f3Smrg                        merge.flags.apf = b->flags.apf;
9177ec681f3Smrg                        merge.flags.auf = b->flags.auf;
9187ec681f3Smrg
9197ec681f3Smrg                        add_instr = b;
9207ec681f3Smrg                        mul_instr = a;
9217ec681f3Smrg                }
9227ec681f3Smrg                /* If a's add op is used but its mul op is not, then see if we
9237ec681f3Smrg                 * can convert either a's add op or b's add op to a mul op
9247ec681f3Smrg                 * so we can merge.
9257ec681f3Smrg                 */
9267ec681f3Smrg                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
9277ec681f3Smrg                         can_do_add_as_mul(b->alu.add.op)) {
9287ec681f3Smrg                        mul_inst = *b;
9297ec681f3Smrg                        qpu_convert_add_to_mul(&mul_inst);
9307ec681f3Smrg
9317ec681f3Smrg                        merge.alu.mul = mul_inst.alu.mul;
9327ec681f3Smrg
9337ec681f3Smrg                        merge.flags.mc = b->flags.ac;
9347ec681f3Smrg                        merge.flags.mpf = b->flags.apf;
9357ec681f3Smrg                        merge.flags.muf = b->flags.auf;
9367ec681f3Smrg
9377ec681f3Smrg                        add_instr = a;
9387ec681f3Smrg                        mul_instr = &mul_inst;
9397ec681f3Smrg                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
9407ec681f3Smrg                           can_do_add_as_mul(a->alu.add.op)) {
9417ec681f3Smrg                        mul_inst = *a;
9427ec681f3Smrg                        qpu_convert_add_to_mul(&mul_inst);
9437ec681f3Smrg
9447ec681f3Smrg                        merge = mul_inst;
9457ec681f3Smrg                        merge.alu.add = b->alu.add;
94601e04c3fSmrg
9477ec681f3Smrg                        merge.flags.ac = b->flags.ac;
9487ec681f3Smrg                        merge.flags.apf = b->flags.apf;
9497ec681f3Smrg                        merge.flags.auf = b->flags.auf;
9507ec681f3Smrg
9517ec681f3Smrg                        add_instr = b;
9527ec681f3Smrg                        mul_instr = &mul_inst;
9537ec681f3Smrg                } else {
9547ec681f3Smrg                        return false;
9557ec681f3Smrg                }
95601e04c3fSmrg        }
95701e04c3fSmrg
95801e04c3fSmrg        if (b->alu.mul.op != V3D_QPU_M_NOP) {
95901e04c3fSmrg                if (a->alu.mul.op != V3D_QPU_M_NOP)
96001e04c3fSmrg                        return false;
96101e04c3fSmrg                merge.alu.mul = b->alu.mul;
96201e04c3fSmrg
96301e04c3fSmrg                merge.flags.mc = b->flags.mc;
96401e04c3fSmrg                merge.flags.mpf = b->flags.mpf;
96501e04c3fSmrg                merge.flags.muf = b->flags.muf;
96601e04c3fSmrg
9677ec681f3Smrg                mul_instr = b;
9687ec681f3Smrg                add_instr = a;
96901e04c3fSmrg        }
97001e04c3fSmrg
9717ec681f3Smrg        if (add_instr && mul_instr &&
9727ec681f3Smrg            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
97301e04c3fSmrg                        return false;
97401e04c3fSmrg        }
97501e04c3fSmrg
97601e04c3fSmrg        merge.sig.thrsw |= b->sig.thrsw;
97701e04c3fSmrg        merge.sig.ldunif |= b->sig.ldunif;
97801e04c3fSmrg        merge.sig.ldunifrf |= b->sig.ldunifrf;
97901e04c3fSmrg        merge.sig.ldunifa |= b->sig.ldunifa;
98001e04c3fSmrg        merge.sig.ldunifarf |= b->sig.ldunifarf;
98101e04c3fSmrg        merge.sig.ldtmu |= b->sig.ldtmu;
98201e04c3fSmrg        merge.sig.ldvary |= b->sig.ldvary;
98301e04c3fSmrg        merge.sig.ldvpm |= b->sig.ldvpm;
98401e04c3fSmrg        merge.sig.small_imm |= b->sig.small_imm;
98501e04c3fSmrg        merge.sig.ldtlb |= b->sig.ldtlb;
98601e04c3fSmrg        merge.sig.ldtlbu |= b->sig.ldtlbu;
98701e04c3fSmrg        merge.sig.ucb |= b->sig.ucb;
98801e04c3fSmrg        merge.sig.rotate |= b->sig.rotate;
98901e04c3fSmrg        merge.sig.wrtmuc |= b->sig.wrtmuc;
99001e04c3fSmrg
99101e04c3fSmrg        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
99201e04c3fSmrg            v3d_qpu_sig_writes_address(devinfo, &b->sig))
99301e04c3fSmrg                return false;
99401e04c3fSmrg        merge.sig_addr |= b->sig_addr;
99501e04c3fSmrg        merge.sig_magic |= b->sig_magic;
99601e04c3fSmrg
99701e04c3fSmrg        uint64_t packed;
99801e04c3fSmrg        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
99901e04c3fSmrg
100001e04c3fSmrg        *result = merge;
100101e04c3fSmrg        /* No modifying the real instructions on failure. */
100201e04c3fSmrg        assert(ok || (a != result && b != result));
100301e04c3fSmrg
100401e04c3fSmrg        return ok;
100501e04c3fSmrg}
100601e04c3fSmrg
10077ec681f3Smrgstatic inline bool
10087ec681f3Smrgtry_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
10097ec681f3Smrg{
10107ec681f3Smrg        return inst->sig.ldunif || inst->sig.ldunifrf;
10117ec681f3Smrg}
10127ec681f3Smrg
10137ec681f3Smrgstatic bool
10147ec681f3Smrgqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
10157ec681f3Smrg                                         struct choose_scoreboard *scoreboard,
10167ec681f3Smrg                                         const struct qinst *qinst);
10177ec681f3Smrg
101801e04c3fSmrgstatic struct schedule_node *
10197ec681f3Smrgchoose_instruction_to_schedule(struct v3d_compile *c,
102001e04c3fSmrg                               struct choose_scoreboard *scoreboard,
102101e04c3fSmrg                               struct schedule_node *prev_inst)
102201e04c3fSmrg{
102301e04c3fSmrg        struct schedule_node *chosen = NULL;
102401e04c3fSmrg        int chosen_prio = 0;
102501e04c3fSmrg
102601e04c3fSmrg        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
102701e04c3fSmrg         * will handle pairing it along with filling the delay slots.
102801e04c3fSmrg         */
102901e04c3fSmrg        if (prev_inst) {
103001e04c3fSmrg                if (prev_inst->inst->qpu.sig.thrsw)
103101e04c3fSmrg                        return NULL;
103201e04c3fSmrg        }
103301e04c3fSmrg
10347ec681f3Smrg        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
10357ec681f3Smrg                                 scoreboard->ldvary_count < c->num_inputs;
10367ec681f3Smrg        bool skipped_insts_for_ldvary_pipelining = false;
10377ec681f3Smrgretry:
1038ed98bd31Smaya        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1039ed98bd31Smaya                            dag.link) {
104001e04c3fSmrg                const struct v3d_qpu_instr *inst = &n->inst->qpu;
104101e04c3fSmrg
10427ec681f3Smrg                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
10437ec681f3Smrg                        skipped_insts_for_ldvary_pipelining = true;
10447ec681f3Smrg                        continue;
10457ec681f3Smrg                }
10467ec681f3Smrg
104701e04c3fSmrg                /* Don't choose the branch instruction until it's the last one
104801e04c3fSmrg                 * left.  We'll move it up to fit its delay slots after we
104901e04c3fSmrg                 * choose it.
105001e04c3fSmrg                 */
105101e04c3fSmrg                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1052ed98bd31Smaya                    !list_is_singular(&scoreboard->dag->heads)) {
105301e04c3fSmrg                        continue;
105401e04c3fSmrg                }
105501e04c3fSmrg
10567ec681f3Smrg                /* We need to have 3 delay slots between a write to unifa and
10577ec681f3Smrg                 * a follow-up ldunifa.
10587ec681f3Smrg                 */
10597ec681f3Smrg                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
10607ec681f3Smrg                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
10617ec681f3Smrg                        continue;
10627ec681f3Smrg
106301e04c3fSmrg                /* "An instruction must not read from a location in physical
106401e04c3fSmrg                 *  regfile A or B that was written to by the previous
106501e04c3fSmrg                 *  instruction."
106601e04c3fSmrg                 */
106701e04c3fSmrg                if (reads_too_soon_after_write(scoreboard, n->inst))
106801e04c3fSmrg                        continue;
106901e04c3fSmrg
10707ec681f3Smrg                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
107101e04c3fSmrg                        continue;
107201e04c3fSmrg
10737ec681f3Smrg                /* "Before doing a TLB access a scoreboard wait must have been
10747ec681f3Smrg                 *  done. This happens either on the first or last thread
10757ec681f3Smrg                 *  switch, depending on a setting (scb_wait_on_first_thrsw) in
10767ec681f3Smrg                 *  the shader state."
107701e04c3fSmrg                 */
10787ec681f3Smrg                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
107901e04c3fSmrg                        continue;
108001e04c3fSmrg
108101e04c3fSmrg                /* ldunif and ldvary both write r5, but ldunif does so a tick
108201e04c3fSmrg                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
108301e04c3fSmrg                 * otherwise get scheduled so ldunif and ldvary try to update
108401e04c3fSmrg                 * r5 in the same tick.
108501e04c3fSmrg                 */
108601e04c3fSmrg                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
108701e04c3fSmrg                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
108801e04c3fSmrg                        continue;
108901e04c3fSmrg                }
109001e04c3fSmrg
10917ec681f3Smrg                /* If we are in a thrsw delay slot check that this instruction
10927ec681f3Smrg                 * is valid for that.
10937ec681f3Smrg                 */
10947ec681f3Smrg                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
10957ec681f3Smrg                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
10967ec681f3Smrg                                                              n->inst)) {
10977ec681f3Smrg                        continue;
10987ec681f3Smrg                }
10997ec681f3Smrg
11007ec681f3Smrg                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
11017ec681f3Smrg                        /* Don't try to put a branch in the delay slots of another
11027ec681f3Smrg                         * branch or a unifa write.
11037ec681f3Smrg                         */
11047ec681f3Smrg                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
11057ec681f3Smrg                                continue;
11067ec681f3Smrg                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
11077ec681f3Smrg                                continue;
11087ec681f3Smrg
11097ec681f3Smrg                        /* No branch with cond != 0,2,3 and msfign != 0 after
11107ec681f3Smrg                         * setmsf.
11117ec681f3Smrg                         */
11127ec681f3Smrg                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
11137ec681f3Smrg                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
11147ec681f3Smrg                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
11157ec681f3Smrg                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
11167ec681f3Smrg                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
11177ec681f3Smrg                                continue;
11187ec681f3Smrg                        }
11197ec681f3Smrg                }
11207ec681f3Smrg
112101e04c3fSmrg                /* If we're trying to pair with another instruction, check
112201e04c3fSmrg                 * that they're compatible.
112301e04c3fSmrg                 */
112401e04c3fSmrg                if (prev_inst) {
112501e04c3fSmrg                        /* Don't pair up a thread switch signal -- we'll
112601e04c3fSmrg                         * handle pairing it when we pick it on its own.
112701e04c3fSmrg                         */
112801e04c3fSmrg                        if (inst->sig.thrsw)
112901e04c3fSmrg                                continue;
113001e04c3fSmrg
113101e04c3fSmrg                        if (prev_inst->inst->uniform != -1 &&
113201e04c3fSmrg                            n->inst->uniform != -1)
113301e04c3fSmrg                                continue;
113401e04c3fSmrg
11357ec681f3Smrg                       /* Simulator complains if we have two uniforms loaded in
11367ec681f3Smrg                        * the the same instruction, which could happen if we
11377ec681f3Smrg                        * have a ldunif or sideband uniform and we pair that
11387ec681f3Smrg                        * with ldunifa.
11397ec681f3Smrg                        */
11407ec681f3Smrg                        if (vir_has_uniform(prev_inst->inst) &&
11417ec681f3Smrg                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
11427ec681f3Smrg                                continue;
11437ec681f3Smrg                        }
11447ec681f3Smrg
11457ec681f3Smrg                        if ((prev_inst->inst->qpu.sig.ldunifa ||
11467ec681f3Smrg                             prev_inst->inst->qpu.sig.ldunifarf) &&
11477ec681f3Smrg                            vir_has_uniform(n->inst)) {
11487ec681f3Smrg                                continue;
11497ec681f3Smrg                        }
11507ec681f3Smrg
11517ec681f3Smrg                        /* Don't merge TLB instructions before we have acquired
11527ec681f3Smrg                         * the scoreboard lock.
115301e04c3fSmrg                         */
11547ec681f3Smrg                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
115501e04c3fSmrg                                continue;
115601e04c3fSmrg
11577ec681f3Smrg                        /* When we succesfully pair up an ldvary we then try
11587ec681f3Smrg                         * to merge it into the previous instruction if
11597ec681f3Smrg                         * possible to improve pipelining. Don't pick up the
11607ec681f3Smrg                         * ldvary now if the follow-up fixup would place
11617ec681f3Smrg                         * it in the delay slots of a thrsw, which is not
11627ec681f3Smrg                         * allowed and would prevent the fixup from being
11637ec681f3Smrg                         * successul.
11647ec681f3Smrg                         */
11657ec681f3Smrg                        if (inst->sig.ldvary &&
11667ec681f3Smrg                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
11677ec681f3Smrg                                continue;
11687ec681f3Smrg                        }
11697ec681f3Smrg
117001e04c3fSmrg                        struct v3d_qpu_instr merged_inst;
11717ec681f3Smrg                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
117201e04c3fSmrg                                            &prev_inst->inst->qpu, inst)) {
117301e04c3fSmrg                                continue;
117401e04c3fSmrg                        }
117501e04c3fSmrg                }
117601e04c3fSmrg
11777ec681f3Smrg                int prio = get_instruction_priority(c->devinfo, inst);
11787ec681f3Smrg
11797ec681f3Smrg                if (mux_read_stalls(scoreboard, inst)) {
11807ec681f3Smrg                        /* Don't merge an instruction that stalls */
11817ec681f3Smrg                        if (prev_inst)
11827ec681f3Smrg                                continue;
11837ec681f3Smrg                        else {
11847ec681f3Smrg                                /* Any instruction that don't stall will have
11857ec681f3Smrg                                 * higher scheduling priority */
11867ec681f3Smrg                                prio -= MAX_SCHEDULE_PRIORITY;
11877ec681f3Smrg                                assert(prio < 0);
11887ec681f3Smrg                        }
11897ec681f3Smrg                }
119001e04c3fSmrg
119101e04c3fSmrg                /* Found a valid instruction.  If nothing better comes along,
119201e04c3fSmrg                 * this one works.
119301e04c3fSmrg                 */
119401e04c3fSmrg                if (!chosen) {
119501e04c3fSmrg                        chosen = n;
119601e04c3fSmrg                        chosen_prio = prio;
119701e04c3fSmrg                        continue;
119801e04c3fSmrg                }
119901e04c3fSmrg
120001e04c3fSmrg                if (prio > chosen_prio) {
120101e04c3fSmrg                        chosen = n;
120201e04c3fSmrg                        chosen_prio = prio;
120301e04c3fSmrg                } else if (prio < chosen_prio) {
120401e04c3fSmrg                        continue;
120501e04c3fSmrg                }
120601e04c3fSmrg
120701e04c3fSmrg                if (n->delay > chosen->delay) {
120801e04c3fSmrg                        chosen = n;
120901e04c3fSmrg                        chosen_prio = prio;
121001e04c3fSmrg                } else if (n->delay < chosen->delay) {
121101e04c3fSmrg                        continue;
121201e04c3fSmrg                }
121301e04c3fSmrg        }
121401e04c3fSmrg
12157ec681f3Smrg        /* If we did not find any instruction to schedule but we discarded
12167ec681f3Smrg         * some of them to prioritize ldvary pipelining, try again.
12177ec681f3Smrg         */
12187ec681f3Smrg        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
12197ec681f3Smrg                skipped_insts_for_ldvary_pipelining = false;
12207ec681f3Smrg                ldvary_pipelining = false;
12217ec681f3Smrg                goto retry;
12227ec681f3Smrg        }
12237ec681f3Smrg
12247ec681f3Smrg        if (chosen && chosen->inst->qpu.sig.ldvary) {
12257ec681f3Smrg                scoreboard->ldvary_count++;
12267ec681f3Smrg                /* If we are pairing an ldvary, flag it so we can fix it up for
12277ec681f3Smrg                 * optimal pipelining of ldvary sequences.
12287ec681f3Smrg                 */
12297ec681f3Smrg                if (prev_inst)
12307ec681f3Smrg                        scoreboard->fixup_ldvary = true;
12317ec681f3Smrg        }
12327ec681f3Smrg
123301e04c3fSmrg        return chosen;
123401e04c3fSmrg}
123501e04c3fSmrg
123601e04c3fSmrgstatic void
123701e04c3fSmrgupdate_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
12387ec681f3Smrg                                  enum v3d_qpu_waddr waddr,
12397ec681f3Smrg                                  const struct v3d_device_info *devinfo)
124001e04c3fSmrg{
124101e04c3fSmrg        if (v3d_qpu_magic_waddr_is_sfu(waddr))
124201e04c3fSmrg                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
12437ec681f3Smrg        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
12447ec681f3Smrg                scoreboard->last_unifa_write_tick = scoreboard->tick;
12457ec681f3Smrg}
12467ec681f3Smrg
12477ec681f3Smrgstatic void
12487ec681f3Smrgupdate_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
12497ec681f3Smrg                                      const struct v3d_qpu_instr *inst)
12507ec681f3Smrg{
12517ec681f3Smrg        if (v3d_qpu_instr_is_sfu(inst)) {
12527ec681f3Smrg                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
12537ec681f3Smrg                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
12547ec681f3Smrg        }
125501e04c3fSmrg}
125601e04c3fSmrg
125701e04c3fSmrgstatic void
125801e04c3fSmrgupdate_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
12597ec681f3Smrg                             const struct v3d_qpu_instr *inst,
12607ec681f3Smrg                             const struct v3d_device_info *devinfo)
126101e04c3fSmrg{
126201e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
126301e04c3fSmrg                return;
126401e04c3fSmrg
126501e04c3fSmrg        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
126601e04c3fSmrg
126701e04c3fSmrg        if (inst->alu.add.op != V3D_QPU_A_NOP)  {
126801e04c3fSmrg                if (inst->alu.add.magic_write) {
126901e04c3fSmrg                        update_scoreboard_for_magic_waddr(scoreboard,
12707ec681f3Smrg                                                          inst->alu.add.waddr,
12717ec681f3Smrg                                                          devinfo);
12727ec681f3Smrg                } else {
12737ec681f3Smrg                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
12747ec681f3Smrg                                                              inst);
127501e04c3fSmrg                }
12767ec681f3Smrg
12777ec681f3Smrg                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
12787ec681f3Smrg                        scoreboard->last_setmsf_tick = scoreboard->tick;
127901e04c3fSmrg        }
128001e04c3fSmrg
128101e04c3fSmrg        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
128201e04c3fSmrg                if (inst->alu.mul.magic_write) {
128301e04c3fSmrg                        update_scoreboard_for_magic_waddr(scoreboard,
12847ec681f3Smrg                                                          inst->alu.mul.waddr,
12857ec681f3Smrg                                                          devinfo);
128601e04c3fSmrg                }
128701e04c3fSmrg        }
128801e04c3fSmrg
128901e04c3fSmrg        if (inst->sig.ldvary)
129001e04c3fSmrg                scoreboard->last_ldvary_tick = scoreboard->tick;
129101e04c3fSmrg}
129201e04c3fSmrg
129301e04c3fSmrgstatic void
1294ed98bd31Smayadump_state(const struct v3d_device_info *devinfo, struct dag *dag)
129501e04c3fSmrg{
1296ed98bd31Smaya        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
129701e04c3fSmrg                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
129801e04c3fSmrg                v3d_qpu_dump(devinfo, &n->inst->qpu);
129901e04c3fSmrg                fprintf(stderr, "\n");
130001e04c3fSmrg
1301ed98bd31Smaya                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1302ed98bd31Smaya                        struct schedule_node *child =
1303ed98bd31Smaya                                (struct schedule_node *)edge->child;
130401e04c3fSmrg                        if (!child)
130501e04c3fSmrg                                continue;
130601e04c3fSmrg
130701e04c3fSmrg                        fprintf(stderr, "                 - ");
130801e04c3fSmrg                        v3d_qpu_dump(devinfo, &child->inst->qpu);
130901e04c3fSmrg                        fprintf(stderr, " (%d parents, %c)\n",
1310ed98bd31Smaya                                child->dag.parent_count,
1311ed98bd31Smaya                                edge->data ? 'w' : 'r');
131201e04c3fSmrg                }
131301e04c3fSmrg        }
131401e04c3fSmrg}
131501e04c3fSmrg
13167ec681f3Smrgstatic uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
13177ec681f3Smrg                                    enum v3d_qpu_waddr waddr,
131801e04c3fSmrg                                    const struct v3d_qpu_instr *after)
131901e04c3fSmrg{
132001e04c3fSmrg        /* Apply some huge latency between texture fetch requests and getting
132101e04c3fSmrg         * their results back.
132201e04c3fSmrg         *
132301e04c3fSmrg         * FIXME: This is actually pretty bogus.  If we do:
132401e04c3fSmrg         *
132501e04c3fSmrg         * mov tmu0_s, a
132601e04c3fSmrg         * <a bit of math>
132701e04c3fSmrg         * mov tmu0_s, b
132801e04c3fSmrg         * load_tmu0
132901e04c3fSmrg         * <more math>
133001e04c3fSmrg         * load_tmu0
133101e04c3fSmrg         *
133201e04c3fSmrg         * we count that as worse than
133301e04c3fSmrg         *
133401e04c3fSmrg         * mov tmu0_s, a
133501e04c3fSmrg         * mov tmu0_s, b
133601e04c3fSmrg         * <lots of math>
133701e04c3fSmrg         * load_tmu0
133801e04c3fSmrg         * <more math>
133901e04c3fSmrg         * load_tmu0
134001e04c3fSmrg         *
134101e04c3fSmrg         * because we associate the first load_tmu0 with the *second* tmu0_s.
134201e04c3fSmrg         */
13437ec681f3Smrg        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
13447ec681f3Smrg            v3d_qpu_waits_on_tmu(after)) {
134501e04c3fSmrg                return 100;
13467ec681f3Smrg        }
134701e04c3fSmrg
134801e04c3fSmrg        /* Assume that anything depending on us is consuming the SFU result. */
134901e04c3fSmrg        if (v3d_qpu_magic_waddr_is_sfu(waddr))
135001e04c3fSmrg                return 3;
135101e04c3fSmrg
135201e04c3fSmrg        return 1;
135301e04c3fSmrg}
135401e04c3fSmrg
135501e04c3fSmrgstatic uint32_t
13567ec681f3Smrginstruction_latency(const struct v3d_device_info *devinfo,
13577ec681f3Smrg                    struct schedule_node *before, struct schedule_node *after)
135801e04c3fSmrg{
135901e04c3fSmrg        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
136001e04c3fSmrg        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
136101e04c3fSmrg        uint32_t latency = 1;
136201e04c3fSmrg
136301e04c3fSmrg        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
136401e04c3fSmrg            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
136501e04c3fSmrg                return latency;
136601e04c3fSmrg
136701e04c3fSmrg        if (before_inst->alu.add.magic_write) {
136801e04c3fSmrg                latency = MAX2(latency,
13697ec681f3Smrg                               magic_waddr_latency(devinfo,
13707ec681f3Smrg                                                   before_inst->alu.add.waddr,
137101e04c3fSmrg                                                   after_inst));
137201e04c3fSmrg        }
137301e04c3fSmrg
137401e04c3fSmrg        if (before_inst->alu.mul.magic_write) {
137501e04c3fSmrg                latency = MAX2(latency,
13767ec681f3Smrg                               magic_waddr_latency(devinfo,
13777ec681f3Smrg                                                   before_inst->alu.mul.waddr,
137801e04c3fSmrg                                                   after_inst));
137901e04c3fSmrg        }
138001e04c3fSmrg
13817ec681f3Smrg        if (v3d_qpu_instr_is_sfu(before_inst))
13827ec681f3Smrg                return 2;
13837ec681f3Smrg
138401e04c3fSmrg        return latency;
138501e04c3fSmrg}
138601e04c3fSmrg
138701e04c3fSmrg/** Recursive computation of the delay member of a node. */
138801e04c3fSmrgstatic void
1389ed98bd31Smayacompute_delay(struct dag_node *node, void *state)
139001e04c3fSmrg{
1391ed98bd31Smaya        struct schedule_node *n = (struct schedule_node *)node;
13927ec681f3Smrg        struct v3d_compile *c = (struct v3d_compile *) state;
1393ed98bd31Smaya
1394ed98bd31Smaya        n->delay = 1;
1395ed98bd31Smaya
1396ed98bd31Smaya        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1397ed98bd31Smaya                struct schedule_node *child =
1398ed98bd31Smaya                        (struct schedule_node *)edge->child;
1399ed98bd31Smaya
1400ed98bd31Smaya                n->delay = MAX2(n->delay, (child->delay +
14017ec681f3Smrg                                           instruction_latency(c->devinfo, n,
14027ec681f3Smrg                                                               child)));
140301e04c3fSmrg        }
140401e04c3fSmrg}
140501e04c3fSmrg
1406ed98bd31Smaya/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
1407ed98bd31Smaya * should be called on it later to finish pruning the other edges).
1408ed98bd31Smaya */
140901e04c3fSmrgstatic void
1410ed98bd31Smayapre_remove_head(struct dag *dag, struct schedule_node *n)
1411ed98bd31Smaya{
1412ed98bd31Smaya        list_delinit(&n->dag.link);
1413ed98bd31Smaya
1414ed98bd31Smaya        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1415ed98bd31Smaya                if (edge->data)
1416ed98bd31Smaya                        dag_remove_edge(dag, edge);
1417ed98bd31Smaya        }
1418ed98bd31Smaya}
1419ed98bd31Smaya
1420ed98bd31Smayastatic void
14217ec681f3Smrgmark_instruction_scheduled(const struct v3d_device_info *devinfo,
14227ec681f3Smrg                           struct dag *dag,
142301e04c3fSmrg                           uint32_t time,
1424ed98bd31Smaya                           struct schedule_node *node)
142501e04c3fSmrg{
142601e04c3fSmrg        if (!node)
142701e04c3fSmrg                return;
142801e04c3fSmrg
1429ed98bd31Smaya        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
143001e04c3fSmrg                struct schedule_node *child =
1431ed98bd31Smaya                        (struct schedule_node *)edge->child;
143201e04c3fSmrg
143301e04c3fSmrg                if (!child)
143401e04c3fSmrg                        continue;
143501e04c3fSmrg
14367ec681f3Smrg                uint32_t latency = instruction_latency(devinfo, node, child);
143701e04c3fSmrg
143801e04c3fSmrg                child->unblocked_time = MAX2(child->unblocked_time,
143901e04c3fSmrg                                             time + latency);
144001e04c3fSmrg        }
1441ed98bd31Smaya        dag_prune_head(dag, &node->dag);
144201e04c3fSmrg}
144301e04c3fSmrg
144401e04c3fSmrgstatic void
144501e04c3fSmrginsert_scheduled_instruction(struct v3d_compile *c,
144601e04c3fSmrg                             struct qblock *block,
144701e04c3fSmrg                             struct choose_scoreboard *scoreboard,
144801e04c3fSmrg                             struct qinst *inst)
144901e04c3fSmrg{
145001e04c3fSmrg        list_addtail(&inst->link, &block->instructions);
145101e04c3fSmrg
14527ec681f3Smrg        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
145301e04c3fSmrg        c->qpu_inst_count++;
145401e04c3fSmrg        scoreboard->tick++;
145501e04c3fSmrg}
145601e04c3fSmrg
145701e04c3fSmrgstatic struct qinst *
145801e04c3fSmrgvir_nop()
145901e04c3fSmrg{
1460ed98bd31Smaya        struct qreg undef = vir_nop_reg();
146101e04c3fSmrg        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
146201e04c3fSmrg
146301e04c3fSmrg        return qinst;
146401e04c3fSmrg}
146501e04c3fSmrg
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        /* Allocate and schedule a NOP at the current tick. */
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}
147201e04c3fSmrg
147301e04c3fSmrgstatic bool
14747ec681f3Smrgqpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
14757ec681f3Smrg                              const struct qinst *qinst, int slot)
147601e04c3fSmrg{
147701e04c3fSmrg        const struct v3d_qpu_instr *inst = &qinst->qpu;
147801e04c3fSmrg
147901e04c3fSmrg        /* Only TLB Z writes are prohibited in the last slot, but we don't
148001e04c3fSmrg         * have those flagged so prohibit all TLB ops for now.
148101e04c3fSmrg         */
148201e04c3fSmrg        if (slot == 2 && qpu_inst_is_tlb(inst))
148301e04c3fSmrg                return false;
148401e04c3fSmrg
148501e04c3fSmrg        if (slot > 0 && qinst->uniform != ~0)
148601e04c3fSmrg                return false;
148701e04c3fSmrg
148801e04c3fSmrg        if (v3d_qpu_uses_vpm(inst))
148901e04c3fSmrg                return false;
149001e04c3fSmrg
149101e04c3fSmrg        if (inst->sig.ldvary)
149201e04c3fSmrg                return false;
149301e04c3fSmrg
149401e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
149501e04c3fSmrg                /* GFXH-1625: TMUWT not allowed in the final instruction. */
149601e04c3fSmrg                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
149701e04c3fSmrg                        return false;
149801e04c3fSmrg
149901e04c3fSmrg                /* No writing physical registers at the end. */
150001e04c3fSmrg                if (!inst->alu.add.magic_write ||
150101e04c3fSmrg                    !inst->alu.mul.magic_write) {
150201e04c3fSmrg                        return false;
150301e04c3fSmrg                }
150401e04c3fSmrg
15057ec681f3Smrg                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
15067ec681f3Smrg                    !inst->sig_magic) {
15077ec681f3Smrg                        return false;
15087ec681f3Smrg                }
15097ec681f3Smrg
151001e04c3fSmrg                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
151101e04c3fSmrg                        return false;
151201e04c3fSmrg
151301e04c3fSmrg                /* RF0-2 might be overwritten during the delay slots by
151401e04c3fSmrg                 * fragment shader setup.
151501e04c3fSmrg                 */
151601e04c3fSmrg                if (inst->raddr_a < 3 &&
151701e04c3fSmrg                    (inst->alu.add.a == V3D_QPU_MUX_A ||
151801e04c3fSmrg                     inst->alu.add.b == V3D_QPU_MUX_A ||
151901e04c3fSmrg                     inst->alu.mul.a == V3D_QPU_MUX_A ||
152001e04c3fSmrg                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
152101e04c3fSmrg                        return false;
152201e04c3fSmrg                }
152301e04c3fSmrg
152401e04c3fSmrg                if (inst->raddr_b < 3 &&
152501e04c3fSmrg                    !inst->sig.small_imm &&
152601e04c3fSmrg                    (inst->alu.add.a == V3D_QPU_MUX_B ||
152701e04c3fSmrg                     inst->alu.add.b == V3D_QPU_MUX_B ||
152801e04c3fSmrg                     inst->alu.mul.a == V3D_QPU_MUX_B ||
152901e04c3fSmrg                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
153001e04c3fSmrg                        return false;
153101e04c3fSmrg                }
153201e04c3fSmrg        }
153301e04c3fSmrg
153401e04c3fSmrg        return true;
153501e04c3fSmrg}
153601e04c3fSmrg
15377ec681f3Smrg/**
15387ec681f3Smrg * This is called when trying to merge a thrsw back into the instruction stream
15397ec681f3Smrg * of instructions that were scheduled *before* the thrsw signal to fill its
15407ec681f3Smrg * delay slots. Because the actual execution of the thrsw happens after the
15417ec681f3Smrg * delay slots, it is usually safe to do this, but there are some cases that
15427ec681f3Smrg * need special care.
15437ec681f3Smrg */
15447ec681f3Smrgstatic bool
15457ec681f3Smrgqpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
15467ec681f3Smrg                                          const struct qinst *qinst,
15477ec681f3Smrg                                          uint32_t slot)
15487ec681f3Smrg{
15497ec681f3Smrg        /* No scheduling SFU when the result would land in the other
15507ec681f3Smrg         * thread.  The simulator complains for safety, though it
15517ec681f3Smrg         * would only occur for dead code in our case.
15527ec681f3Smrg         */
15537ec681f3Smrg        if (slot > 0 &&
15547ec681f3Smrg            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
15557ec681f3Smrg            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
15567ec681f3Smrg             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
15577ec681f3Smrg                return false;
15587ec681f3Smrg        }
15597ec681f3Smrg
15607ec681f3Smrg        if (slot > 0 && qinst->qpu.sig.ldvary)
15617ec681f3Smrg                return false;
15627ec681f3Smrg
15637ec681f3Smrg        /* unifa and the following 3 instructions can't overlap a
15647ec681f3Smrg         * thread switch/end. The docs further clarify that this means
15657ec681f3Smrg         * the cycle at which the actual thread switch/end happens
15667ec681f3Smrg         * and not when the thrsw instruction is processed, which would
15677ec681f3Smrg         * be after the 2 delay slots following the thrsw instruction.
15687ec681f3Smrg         * This means that we can move up a thrsw up to the instruction
15697ec681f3Smrg         * right after unifa:
15707ec681f3Smrg         *
15717ec681f3Smrg         * unifa, r5
15727ec681f3Smrg         * thrsw
15737ec681f3Smrg         * delay slot 1
15747ec681f3Smrg         * delay slot 2
15757ec681f3Smrg         * Thread switch happens here, 4 instructions away from unifa
15767ec681f3Smrg         */
15777ec681f3Smrg        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
15787ec681f3Smrg                return false;
15797ec681f3Smrg
15807ec681f3Smrg        return true;
15817ec681f3Smrg}
15827ec681f3Smrg
15837ec681f3Smrg/**
15847ec681f3Smrg * This is called for instructions scheduled *after* a thrsw signal that may
15857ec681f3Smrg * land in the delay slots of the thrsw. Because these instructions were
15867ec681f3Smrg * scheduled after the thrsw, we need to be careful when placing them into
15877ec681f3Smrg * the delay slots, since that means that we are moving them ahead of the
15887ec681f3Smrg * thread switch and we need to ensure that is not a problem.
15897ec681f3Smrg */
15907ec681f3Smrgstatic bool
15917ec681f3Smrgqpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
15927ec681f3Smrg                                         struct choose_scoreboard *scoreboard,
15937ec681f3Smrg                                         const struct qinst *qinst)
15947ec681f3Smrg{
15957ec681f3Smrg        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
15967ec681f3Smrg        assert(slot <= 2);
15977ec681f3Smrg
15987ec681f3Smrg        /* We merge thrsw instructions back into the instruction stream
15997ec681f3Smrg         * manually, so any instructions scheduled after a thrsw shold be
16007ec681f3Smrg         * in the actual delay slots and not in the same slot as the thrsw.
16017ec681f3Smrg         */
16027ec681f3Smrg        assert(slot >= 1);
16037ec681f3Smrg
16047ec681f3Smrg        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
16057ec681f3Smrg        if (qinst->qpu.sig.thrsw)
16067ec681f3Smrg                return false;
16077ec681f3Smrg
16087ec681f3Smrg        /* The restrictions for instructions scheduled before the the thrsw
16097ec681f3Smrg         * also apply to instructions scheduled after the thrsw that we want
16107ec681f3Smrg         * to place in its delay slots.
16117ec681f3Smrg         */
16127ec681f3Smrg        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
16137ec681f3Smrg                return false;
16147ec681f3Smrg
16157ec681f3Smrg        /* TLB access is disallowed until scoreboard wait is executed, which
16167ec681f3Smrg         * we do on the last thread switch.
16177ec681f3Smrg         */
16187ec681f3Smrg        if (qpu_inst_is_tlb(&qinst->qpu))
16197ec681f3Smrg                return false;
16207ec681f3Smrg
16217ec681f3Smrg        /* Instruction sequence restrictions: Branch is not allowed in delay
16227ec681f3Smrg         * slots of a thrsw.
16237ec681f3Smrg         */
16247ec681f3Smrg        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
16257ec681f3Smrg                return false;
16267ec681f3Smrg
16277ec681f3Smrg        /* Miscellaneous restrictions: At the point of a thrsw we need to have
16287ec681f3Smrg         * at least one outstanding lookup or TSY wait.
16297ec681f3Smrg         *
16307ec681f3Smrg         * So avoid placing TMU instructions scheduled after the thrsw into
16317ec681f3Smrg         * its delay slots or we may be compromising the integrity of our TMU
16327ec681f3Smrg         * sequences. Also, notice that if we moved these instructions into
16337ec681f3Smrg         * the delay slots of a previous thrsw we could overflow our TMU output
16347ec681f3Smrg         * fifo, since we could be effectively pipelining a lookup scheduled
16357ec681f3Smrg         * after the thrsw into the sequence before the thrsw.
16367ec681f3Smrg         */
16377ec681f3Smrg        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
16387ec681f3Smrg            qinst->qpu.sig.wrtmuc) {
16397ec681f3Smrg                return false;
16407ec681f3Smrg        }
16417ec681f3Smrg
16427ec681f3Smrg        /* Don't move instructions that wait on the TMU before the thread switch
16437ec681f3Smrg         * happens since that would make the current thread stall before the
16447ec681f3Smrg         * switch, which is exactly what we want to avoid with the thrsw
16457ec681f3Smrg         * instruction.
16467ec681f3Smrg         */
16477ec681f3Smrg        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
16487ec681f3Smrg                return false;
16497ec681f3Smrg
16507ec681f3Smrg        /* A thread switch invalidates all accumulators, so don't place any
16517ec681f3Smrg         * instructions that write accumulators into the delay slots.
16527ec681f3Smrg         */
16537ec681f3Smrg        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
16547ec681f3Smrg                return false;
16557ec681f3Smrg
16567ec681f3Smrg        /* Multop has an implicit write to the rtop register which is an
16577ec681f3Smrg         * specialized accumulator that is only used with this instruction.
16587ec681f3Smrg         */
16597ec681f3Smrg        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
16607ec681f3Smrg                return false;
16617ec681f3Smrg
16627ec681f3Smrg        /* Flags are invalidated across a thread switch, so dont' place
16637ec681f3Smrg         * instructions that write flags into delay slots.
16647ec681f3Smrg         */
16657ec681f3Smrg        if (v3d_qpu_writes_flags(&qinst->qpu))
16667ec681f3Smrg                return false;
16677ec681f3Smrg
16687ec681f3Smrg        return true;
16697ec681f3Smrg}
16707ec681f3Smrg
167101e04c3fSmrgstatic bool
167201e04c3fSmrgvalid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
167301e04c3fSmrg                     struct qinst *qinst, int instructions_in_sequence,
167401e04c3fSmrg                     bool is_thrend)
167501e04c3fSmrg{
167601e04c3fSmrg        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
167701e04c3fSmrg        if (scoreboard->last_thrsw_tick + 3 >
167801e04c3fSmrg            scoreboard->tick - instructions_in_sequence) {
167901e04c3fSmrg                return false;
168001e04c3fSmrg        }
168101e04c3fSmrg
168201e04c3fSmrg        for (int slot = 0; slot < instructions_in_sequence; slot++) {
16837ec681f3Smrg                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
168401e04c3fSmrg                        return false;
168501e04c3fSmrg
168601e04c3fSmrg                if (is_thrend &&
16877ec681f3Smrg                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
168801e04c3fSmrg                        return false;
168901e04c3fSmrg                }
169001e04c3fSmrg
169101e04c3fSmrg                /* Note that the list is circular, so we can only do this up
169201e04c3fSmrg                 * to instructions_in_sequence.
169301e04c3fSmrg                 */
169401e04c3fSmrg                qinst = (struct qinst *)qinst->link.next;
169501e04c3fSmrg        }
169601e04c3fSmrg
169701e04c3fSmrg        return true;
169801e04c3fSmrg}
169901e04c3fSmrg
170001e04c3fSmrg/**
170101e04c3fSmrg * Emits a THRSW signal in the stream, trying to move it up to pair with
170201e04c3fSmrg * another instruction.
170301e04c3fSmrg */
170401e04c3fSmrgstatic int
170501e04c3fSmrgemit_thrsw(struct v3d_compile *c,
170601e04c3fSmrg           struct qblock *block,
170701e04c3fSmrg           struct choose_scoreboard *scoreboard,
170801e04c3fSmrg           struct qinst *inst,
170901e04c3fSmrg           bool is_thrend)
171001e04c3fSmrg{
171101e04c3fSmrg        int time = 0;
171201e04c3fSmrg
171301e04c3fSmrg        /* There should be nothing in a thrsw inst being scheduled other than
171401e04c3fSmrg         * the signal bits.
171501e04c3fSmrg         */
171601e04c3fSmrg        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
171701e04c3fSmrg        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
171801e04c3fSmrg        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
171901e04c3fSmrg
17207ec681f3Smrg        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
17217ec681f3Smrg         * or branch.
17227ec681f3Smrg         */
17237ec681f3Smrg        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
17247ec681f3Smrg                emit_nop(c, block, scoreboard);
17257ec681f3Smrg                time++;
17267ec681f3Smrg        }
17277ec681f3Smrg        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
17287ec681f3Smrg                emit_nop(c, block, scoreboard);
17297ec681f3Smrg                time++;
17307ec681f3Smrg        }
17317ec681f3Smrg
173201e04c3fSmrg        /* Find how far back into previous instructions we can put the THRSW. */
173301e04c3fSmrg        int slots_filled = 0;
173401e04c3fSmrg        struct qinst *merge_inst = NULL;
173501e04c3fSmrg        vir_for_each_inst_rev(prev_inst, block) {
173601e04c3fSmrg                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
173701e04c3fSmrg                sig.thrsw = true;
173801e04c3fSmrg                uint32_t packed_sig;
173901e04c3fSmrg
174001e04c3fSmrg                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
174101e04c3fSmrg                        break;
174201e04c3fSmrg
174301e04c3fSmrg                if (!valid_thrsw_sequence(c, scoreboard,
174401e04c3fSmrg                                          prev_inst, slots_filled + 1,
174501e04c3fSmrg                                          is_thrend)) {
174601e04c3fSmrg                        break;
174701e04c3fSmrg                }
174801e04c3fSmrg
174901e04c3fSmrg                merge_inst = prev_inst;
175001e04c3fSmrg                if (++slots_filled == 3)
175101e04c3fSmrg                        break;
175201e04c3fSmrg        }
175301e04c3fSmrg
175401e04c3fSmrg        bool needs_free = false;
175501e04c3fSmrg        if (merge_inst) {
175601e04c3fSmrg                merge_inst->qpu.sig.thrsw = true;
175701e04c3fSmrg                needs_free = true;
175801e04c3fSmrg                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
175901e04c3fSmrg        } else {
176001e04c3fSmrg                scoreboard->last_thrsw_tick = scoreboard->tick;
176101e04c3fSmrg                insert_scheduled_instruction(c, block, scoreboard, inst);
176201e04c3fSmrg                time++;
176301e04c3fSmrg                slots_filled++;
176401e04c3fSmrg                merge_inst = inst;
176501e04c3fSmrg        }
176601e04c3fSmrg
17677ec681f3Smrg        scoreboard->first_thrsw_emitted = true;
176801e04c3fSmrg
176901e04c3fSmrg        /* If we're emitting the last THRSW (other than program end), then
177001e04c3fSmrg         * signal that to the HW by emitting two THRSWs in a row.
177101e04c3fSmrg         */
177201e04c3fSmrg        if (inst->is_last_thrsw) {
17737ec681f3Smrg                if (slots_filled <= 1) {
17747ec681f3Smrg                        emit_nop(c, block, scoreboard);
17757ec681f3Smrg                        time++;
17767ec681f3Smrg                }
177701e04c3fSmrg                struct qinst *second_inst =
177801e04c3fSmrg                        (struct qinst *)merge_inst->link.next;
177901e04c3fSmrg                second_inst->qpu.sig.thrsw = true;
17807ec681f3Smrg                scoreboard->last_thrsw_emitted = true;
17817ec681f3Smrg        }
17827ec681f3Smrg
17837ec681f3Smrg        /* Make sure the thread end executes within the program lifespan */
17847ec681f3Smrg        if (is_thrend) {
17857ec681f3Smrg                for (int i = 0; i < 3 - slots_filled; i++) {
17867ec681f3Smrg                        emit_nop(c, block, scoreboard);
17877ec681f3Smrg                        time++;
17887ec681f3Smrg                }
178901e04c3fSmrg        }
179001e04c3fSmrg
179101e04c3fSmrg        /* If we put our THRSW into another instruction, free up the
179201e04c3fSmrg         * instruction that didn't end up scheduled into the list.
179301e04c3fSmrg         */
179401e04c3fSmrg        if (needs_free)
179501e04c3fSmrg                free(inst);
179601e04c3fSmrg
179701e04c3fSmrg        return time;
179801e04c3fSmrg}
179901e04c3fSmrg
18007ec681f3Smrgstatic bool
18017ec681f3Smrgqpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
18027ec681f3Smrg{
18037ec681f3Smrg        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
18047ec681f3Smrg                return false;
18057ec681f3Smrg
18067ec681f3Smrg        if (inst->qpu.sig.thrsw)
18077ec681f3Smrg                return false;
18087ec681f3Smrg
18097ec681f3Smrg        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
18107ec681f3Smrg                return false;
18117ec681f3Smrg
18127ec681f3Smrg        if (vir_has_uniform(inst))
18137ec681f3Smrg                return false;
18147ec681f3Smrg
18157ec681f3Smrg        return true;
18167ec681f3Smrg}
18177ec681f3Smrg
18187ec681f3Smrgstatic void
18197ec681f3Smrgemit_branch(struct v3d_compile *c,
18207ec681f3Smrg           struct qblock *block,
18217ec681f3Smrg           struct choose_scoreboard *scoreboard,
18227ec681f3Smrg           struct qinst *inst)
18237ec681f3Smrg{
18247ec681f3Smrg        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
18257ec681f3Smrg
18267ec681f3Smrg        /* We should've not picked up a branch for the delay slots of a previous
18277ec681f3Smrg         * thrsw, branch or unifa write instruction.
18287ec681f3Smrg         */
18297ec681f3Smrg        int branch_tick = scoreboard->tick;
18307ec681f3Smrg        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
18317ec681f3Smrg        assert(scoreboard->last_branch_tick + 3 < branch_tick);
18327ec681f3Smrg        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
18337ec681f3Smrg
18347ec681f3Smrg        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
18357ec681f3Smrg         * setmsf.
18367ec681f3Smrg         */
18377ec681f3Smrg        bool is_safe_msf_branch =
18387ec681f3Smrg                inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
18397ec681f3Smrg                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
18407ec681f3Smrg                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
18417ec681f3Smrg                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
18427ec681f3Smrg        assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
18437ec681f3Smrg               is_safe_msf_branch);
18447ec681f3Smrg
18457ec681f3Smrg        /* Insert the branch instruction */
18467ec681f3Smrg        insert_scheduled_instruction(c, block, scoreboard, inst);
18477ec681f3Smrg
18487ec681f3Smrg        /* Now see if we can move the branch instruction back into the
18497ec681f3Smrg         * instruction stream to fill its delay slots
18507ec681f3Smrg         */
18517ec681f3Smrg        int slots_filled = 0;
18527ec681f3Smrg        while (slots_filled < 3 && block->instructions.next != &inst->link) {
18537ec681f3Smrg                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
18547ec681f3Smrg                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
18557ec681f3Smrg
18567ec681f3Smrg                /* Can't move the branch instruction if that would place it
18577ec681f3Smrg                 * in the delay slots of other instructions.
18587ec681f3Smrg                 */
18597ec681f3Smrg                if (scoreboard->last_branch_tick + 3 >=
18607ec681f3Smrg                    branch_tick - slots_filled - 1) {
18617ec681f3Smrg                        break;
18627ec681f3Smrg                }
18637ec681f3Smrg
18647ec681f3Smrg                if (scoreboard->last_thrsw_tick + 2 >=
18657ec681f3Smrg                    branch_tick - slots_filled - 1) {
18667ec681f3Smrg                        break;
18677ec681f3Smrg                }
18687ec681f3Smrg
18697ec681f3Smrg                if (scoreboard->last_unifa_write_tick + 3 >=
18707ec681f3Smrg                    branch_tick - slots_filled - 1) {
18717ec681f3Smrg                        break;
18727ec681f3Smrg                }
18737ec681f3Smrg
18747ec681f3Smrg                /* Can't move a conditional branch before the instruction
18757ec681f3Smrg                 * that writes the flags for its condition.
18767ec681f3Smrg                 */
18777ec681f3Smrg                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
18787ec681f3Smrg                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
18797ec681f3Smrg                        break;
18807ec681f3Smrg                }
18817ec681f3Smrg
18827ec681f3Smrg                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
18837ec681f3Smrg                        break;
18847ec681f3Smrg
18857ec681f3Smrg                if (!is_safe_msf_branch) {
18867ec681f3Smrg                        struct qinst *prev_prev_inst =
18877ec681f3Smrg                                (struct qinst *) prev_inst->link.prev;
18887ec681f3Smrg                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
18897ec681f3Smrg                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
18907ec681f3Smrg                                break;
18917ec681f3Smrg                        }
18927ec681f3Smrg                }
18937ec681f3Smrg
18947ec681f3Smrg                list_del(&prev_inst->link);
18957ec681f3Smrg                list_add(&prev_inst->link, &inst->link);
18967ec681f3Smrg                slots_filled++;
18977ec681f3Smrg        }
18987ec681f3Smrg
18997ec681f3Smrg        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
19007ec681f3Smrg        scoreboard->last_branch_tick = branch_tick - slots_filled;
19017ec681f3Smrg
19027ec681f3Smrg        /* Fill any remaining delay slots.
19037ec681f3Smrg         *
19047ec681f3Smrg         * For unconditional branches we'll try to fill these with the
19057ec681f3Smrg         * first instructions in the successor block after scheduling
19067ec681f3Smrg         * all blocks when setting up branch targets.
19077ec681f3Smrg         */
19087ec681f3Smrg        for (int i = 0; i < 3 - slots_filled; i++)
19097ec681f3Smrg                emit_nop(c, block, scoreboard);
19107ec681f3Smrg}
19117ec681f3Smrg
19127ec681f3Smrgstatic bool
19137ec681f3Smrgalu_reads_register(struct v3d_qpu_instr *inst,
19147ec681f3Smrg                   bool add, bool magic, uint32_t index)
19157ec681f3Smrg{
19167ec681f3Smrg        uint32_t num_src;
19177ec681f3Smrg        enum v3d_qpu_mux mux_a, mux_b;
19187ec681f3Smrg
19197ec681f3Smrg        if (add) {
19207ec681f3Smrg                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
19217ec681f3Smrg                mux_a = inst->alu.add.a;
19227ec681f3Smrg                mux_b = inst->alu.add.b;
19237ec681f3Smrg        } else {
19247ec681f3Smrg                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
19257ec681f3Smrg                mux_a = inst->alu.mul.a;
19267ec681f3Smrg                mux_b = inst->alu.mul.b;
19277ec681f3Smrg        }
19287ec681f3Smrg
19297ec681f3Smrg        for (int i = 0; i < num_src; i++) {
19307ec681f3Smrg                if (magic) {
19317ec681f3Smrg                        if (i == 0 && mux_a == index)
19327ec681f3Smrg                                return true;
19337ec681f3Smrg                        if (i == 1 && mux_b == index)
19347ec681f3Smrg                                return true;
19357ec681f3Smrg                } else {
19367ec681f3Smrg                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
19377ec681f3Smrg                            inst->raddr_a == index) {
19387ec681f3Smrg                                return true;
19397ec681f3Smrg                        }
19407ec681f3Smrg                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
19417ec681f3Smrg                            inst->raddr_b == index) {
19427ec681f3Smrg                                return true;
19437ec681f3Smrg                        }
19447ec681f3Smrg                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
19457ec681f3Smrg                            inst->raddr_a == index) {
19467ec681f3Smrg                                return true;
19477ec681f3Smrg                        }
19487ec681f3Smrg                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
19497ec681f3Smrg                            inst->raddr_b == index) {
19507ec681f3Smrg                                return true;
19517ec681f3Smrg                        }
19527ec681f3Smrg                }
19537ec681f3Smrg        }
19547ec681f3Smrg
19557ec681f3Smrg        return false;
19567ec681f3Smrg}
19577ec681f3Smrg
19587ec681f3Smrg/**
19597ec681f3Smrg * This takes and ldvary signal merged into 'inst' and tries to move it up to
19607ec681f3Smrg * the previous instruction to get good pipelining of ldvary sequences,
19617ec681f3Smrg * transforming this:
19627ec681f3Smrg *
19637ec681f3Smrg * nop                  ; nop               ; ldvary.r4
19647ec681f3Smrg * nop                  ; fmul  r0, r4, rf0 ;
19657ec681f3Smrg * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
19667ec681f3Smrg *
19677ec681f3Smrg * into:
19687ec681f3Smrg *
19697ec681f3Smrg * nop                  ; nop               ; ldvary.r4
19707ec681f3Smrg * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
19717ec681f3Smrg * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
19727ec681f3Smrg *
19737ec681f3Smrg * If we manage to do this successfully (we return true here), then flagging
19747ec681f3Smrg * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
19757ec681f3Smrg * we will be able to pick up to merge into 'inst', leading to code like this:
19767ec681f3Smrg *
19777ec681f3Smrg * nop                  ; nop               ; ldvary.r4
19787ec681f3Smrg * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
19797ec681f3Smrg * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
19807ec681f3Smrg */
19817ec681f3Smrgstatic bool
19827ec681f3Smrgfixup_pipelined_ldvary(struct v3d_compile *c,
19837ec681f3Smrg                       struct choose_scoreboard *scoreboard,
19847ec681f3Smrg                       struct qblock *block,
19857ec681f3Smrg                       struct v3d_qpu_instr *inst)
19867ec681f3Smrg{
19877ec681f3Smrg        /* We only call this if we have successfuly merged an ldvary into a
19887ec681f3Smrg         * previous instruction.
19897ec681f3Smrg         */
19907ec681f3Smrg        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
19917ec681f3Smrg        assert(inst->sig.ldvary);
19927ec681f3Smrg        uint32_t ldvary_magic = inst->sig_magic;
19937ec681f3Smrg        uint32_t ldvary_index = inst->sig_addr;
19947ec681f3Smrg
19957ec681f3Smrg        /* The instruction in which we merged the ldvary cannot read
19967ec681f3Smrg         * the ldvary destination, if it does, then moving the ldvary before
19977ec681f3Smrg         * it would overwrite it.
19987ec681f3Smrg         */
19997ec681f3Smrg        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
20007ec681f3Smrg                return false;
20017ec681f3Smrg        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
20027ec681f3Smrg                return false;
20037ec681f3Smrg
20047ec681f3Smrg        /* The implicit ldvary destination may not be written to by a signal
20057ec681f3Smrg         * in the instruction following ldvary. Since we are planning to move
20067ec681f3Smrg         * ldvary to the previous instruction, this means we need to check if
20077ec681f3Smrg         * the current instruction has any other signal that could create this
20087ec681f3Smrg         * conflict. The only other signal that can write to the implicit
20097ec681f3Smrg         * ldvary destination that is compatible with ldvary in the same
20107ec681f3Smrg         * instruction is ldunif.
20117ec681f3Smrg         */
20127ec681f3Smrg        if (inst->sig.ldunif)
20137ec681f3Smrg                return false;
20147ec681f3Smrg
20157ec681f3Smrg        /* The previous instruction can't write to the same destination as the
20167ec681f3Smrg         * ldvary.
20177ec681f3Smrg         */
20187ec681f3Smrg        struct qinst *prev = (struct qinst *) block->instructions.prev;
20197ec681f3Smrg        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
20207ec681f3Smrg                return false;
20217ec681f3Smrg
20227ec681f3Smrg        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
20237ec681f3Smrg                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
20247ec681f3Smrg                    prev->qpu.alu.add.waddr == ldvary_index) {
20257ec681f3Smrg                        return false;
20267ec681f3Smrg                }
20277ec681f3Smrg        }
20287ec681f3Smrg
20297ec681f3Smrg        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
20307ec681f3Smrg                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
20317ec681f3Smrg                    prev->qpu.alu.mul.waddr == ldvary_index) {
20327ec681f3Smrg                        return false;
20337ec681f3Smrg                }
20347ec681f3Smrg        }
20357ec681f3Smrg
20367ec681f3Smrg        /* The previous instruction cannot have a conflicting signal */
20377ec681f3Smrg        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
20387ec681f3Smrg                return false;
20397ec681f3Smrg
20407ec681f3Smrg        /* The previous instruction cannot use flags since ldvary uses the
20417ec681f3Smrg         * 'cond' instruction field to store the destination.
20427ec681f3Smrg         */
20437ec681f3Smrg        if (v3d_qpu_writes_flags(&prev->qpu))
20447ec681f3Smrg                return false;
20457ec681f3Smrg        if (v3d_qpu_reads_flags(&prev->qpu))
20467ec681f3Smrg                return false;
20477ec681f3Smrg
20487ec681f3Smrg        /* We can't put an ldvary in the delay slots of a thrsw. We should've
20497ec681f3Smrg         * prevented this when pairing up the ldvary with another instruction
20507ec681f3Smrg         * and flagging it for a fixup.
20517ec681f3Smrg         */
20527ec681f3Smrg        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
20537ec681f3Smrg
20547ec681f3Smrg        /* Move the ldvary to the previous instruction and remove it from the
20557ec681f3Smrg         * current one.
20567ec681f3Smrg         */
20577ec681f3Smrg        prev->qpu.sig.ldvary = true;
20587ec681f3Smrg        prev->qpu.sig_magic = ldvary_magic;
20597ec681f3Smrg        prev->qpu.sig_addr = ldvary_index;
20607ec681f3Smrg        scoreboard->last_ldvary_tick = scoreboard->tick - 1;
20617ec681f3Smrg
20627ec681f3Smrg        inst->sig.ldvary = false;
20637ec681f3Smrg        inst->sig_magic = false;
20647ec681f3Smrg        inst->sig_addr = 0;
20657ec681f3Smrg
20667ec681f3Smrg        /* By moving ldvary to the previous instruction we make it update
20677ec681f3Smrg         * r5 in the current one, so nothing else in it should write r5.
20687ec681f3Smrg         * This should've been prevented by our depedency tracking, which
20697ec681f3Smrg         * would not allow ldvary to be paired up with an instruction that
20707ec681f3Smrg         * writes r5 (since our dependency tracking doesn't know that the
20717ec681f3Smrg         * ldvary write r5 happens in the next instruction).
20727ec681f3Smrg         */
20737ec681f3Smrg        assert(!v3d_qpu_writes_r5(c->devinfo, inst));
20747ec681f3Smrg
20757ec681f3Smrg        return true;
20767ec681f3Smrg}
20777ec681f3Smrg
/**
 * Emits the scheduled QPU instruction stream for one basic block.
 *
 * Repeatedly picks a DAG head via choose_instruction_to_schedule(), tries
 * to merge further DAG heads into the same QPU instruction, appends the
 * result to the block (with special handling for thrsw and branch), and
 * rewrites the uniform stream into the new instruction order through
 * *next_uniform.
 *
 * Returns the accumulated schedule time (in ticks) for the block.
 */
static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose:   ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        /* Keep merging DAG heads into 'inst' for as long as
                         * choose_instruction_to_schedule() finds compatible
                         * candidates.
                         */
                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                /* At most one of chosen/merge references a
                                 * uniform, so the merged instruction can
                                 * adopt the merge's uniform index.
                                 */
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "         result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge the
                                                 * follow-up instruction in the
                                                 * the ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                /* NOTE(review): this 'merge' intentionally shadows the outer
                 * declaration; list_for_each_entry introduces its own
                 * iteration variable.
                 */
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}
221301e04c3fSmrg
221401e04c3fSmrgstatic uint32_t
221501e04c3fSmrgqpu_schedule_instructions_block(struct v3d_compile *c,
221601e04c3fSmrg                                struct choose_scoreboard *scoreboard,
221701e04c3fSmrg                                struct qblock *block,
221801e04c3fSmrg                                enum quniform_contents *orig_uniform_contents,
221901e04c3fSmrg                                uint32_t *orig_uniform_data,
222001e04c3fSmrg                                uint32_t *next_uniform)
222101e04c3fSmrg{
222201e04c3fSmrg        void *mem_ctx = ralloc_context(NULL);
2223ed98bd31Smaya        scoreboard->dag = dag_create(mem_ctx);
2224ed98bd31Smaya        struct list_head setup_list;
222501e04c3fSmrg
2226ed98bd31Smaya        list_inithead(&setup_list);
222701e04c3fSmrg
222801e04c3fSmrg        /* Wrap each instruction in a scheduler structure. */
22297ec681f3Smrg        while (!list_is_empty(&block->instructions)) {
223001e04c3fSmrg                struct qinst *qinst = (struct qinst *)block->instructions.next;
223101e04c3fSmrg                struct schedule_node *n =
223201e04c3fSmrg                        rzalloc(mem_ctx, struct schedule_node);
223301e04c3fSmrg
2234ed98bd31Smaya                dag_init_node(scoreboard->dag, &n->dag);
223501e04c3fSmrg                n->inst = qinst;
223601e04c3fSmrg
223701e04c3fSmrg                list_del(&qinst->link);
2238ed98bd31Smaya                list_addtail(&n->link, &setup_list);
223901e04c3fSmrg        }
224001e04c3fSmrg
2241ed98bd31Smaya        calculate_forward_deps(c, scoreboard->dag, &setup_list);
2242ed98bd31Smaya        calculate_reverse_deps(c, scoreboard->dag, &setup_list);
224301e04c3fSmrg
22447ec681f3Smrg        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
224501e04c3fSmrg
224601e04c3fSmrg        uint32_t cycles = schedule_instructions(c, scoreboard, block,
224701e04c3fSmrg                                                orig_uniform_contents,
224801e04c3fSmrg                                                orig_uniform_data,
224901e04c3fSmrg                                                next_uniform);
225001e04c3fSmrg
225101e04c3fSmrg        ralloc_free(mem_ctx);
2252ed98bd31Smaya        scoreboard->dag = NULL;
225301e04c3fSmrg
225401e04c3fSmrg        return cycles;
225501e04c3fSmrg}
225601e04c3fSmrg
/**
 * Resolves branch targets after all blocks have been scheduled.
 *
 * For each block ending in a branch, patches the branch's instruction
 * offset and the uniform-stream offset to point at the successor block,
 * and for unconditional branches additionally tries to copy the first
 * instructions of the successor into any NOP delay slots.
 */
static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.  Along the way, count how many of the slots still
                 * hold NOPs (delay_slot_count) and remember the earliest NOP
                 * slot (delay_slots_start) so they can be filled below.
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                /* Branch offsets are relative to the instruction after the
                 * last delay slot (branch_qpu_ip + 4), in bytes.
                 */
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any remaining
                 * delay slots with the initial instructions of the successor
                 * block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        /* Copy successor instructions into the NOP slots for
                         * as long as they are legal in branch delay slots.
                         * The branch target is advanced by the same amount so
                         * the copied instructions aren't executed twice.
                         */
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}
235601e04c3fSmrg
235701e04c3fSmrguint32_t
235801e04c3fSmrgv3d_qpu_schedule_instructions(struct v3d_compile *c)
235901e04c3fSmrg{
236001e04c3fSmrg        const struct v3d_device_info *devinfo = c->devinfo;
236101e04c3fSmrg        struct qblock *end_block = list_last_entry(&c->blocks,
236201e04c3fSmrg                                                   struct qblock, link);
236301e04c3fSmrg
236401e04c3fSmrg        /* We reorder the uniforms as we schedule instructions, so save the
236501e04c3fSmrg         * old data off and replace it.
236601e04c3fSmrg         */
236701e04c3fSmrg        uint32_t *uniform_data = c->uniform_data;
236801e04c3fSmrg        enum quniform_contents *uniform_contents = c->uniform_contents;
236901e04c3fSmrg        c->uniform_contents = ralloc_array(c, enum quniform_contents,
237001e04c3fSmrg                                           c->num_uniforms);
237101e04c3fSmrg        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
237201e04c3fSmrg        c->uniform_array_size = c->num_uniforms;
237301e04c3fSmrg        uint32_t next_uniform = 0;
237401e04c3fSmrg
237501e04c3fSmrg        struct choose_scoreboard scoreboard;
237601e04c3fSmrg        memset(&scoreboard, 0, sizeof(scoreboard));
237701e04c3fSmrg        scoreboard.last_ldvary_tick = -10;
23787ec681f3Smrg        scoreboard.last_unifa_write_tick = -10;
237901e04c3fSmrg        scoreboard.last_magic_sfu_write_tick = -10;
238001e04c3fSmrg        scoreboard.last_uniforms_reset_tick = -10;
238101e04c3fSmrg        scoreboard.last_thrsw_tick = -10;
23827ec681f3Smrg        scoreboard.last_branch_tick = -10;
23837ec681f3Smrg        scoreboard.last_setmsf_tick = -10;
23847ec681f3Smrg        scoreboard.last_stallable_sfu_tick = -10;
238501e04c3fSmrg
238601e04c3fSmrg        if (debug) {
238701e04c3fSmrg                fprintf(stderr, "Pre-schedule instructions\n");
238801e04c3fSmrg                vir_for_each_block(block, c) {
238901e04c3fSmrg                        fprintf(stderr, "BLOCK %d\n", block->index);
239001e04c3fSmrg                        list_for_each_entry(struct qinst, qinst,
239101e04c3fSmrg                                            &block->instructions, link) {
239201e04c3fSmrg                                v3d_qpu_dump(devinfo, &qinst->qpu);
239301e04c3fSmrg                                fprintf(stderr, "\n");
239401e04c3fSmrg                        }
239501e04c3fSmrg                }
239601e04c3fSmrg                fprintf(stderr, "\n");
239701e04c3fSmrg        }
239801e04c3fSmrg
239901e04c3fSmrg        uint32_t cycles = 0;
240001e04c3fSmrg        vir_for_each_block(block, c) {
240101e04c3fSmrg                block->start_qpu_ip = c->qpu_inst_count;
240201e04c3fSmrg                block->branch_qpu_ip = ~0;
240301e04c3fSmrg                block->start_uniform = next_uniform;
240401e04c3fSmrg
240501e04c3fSmrg                cycles += qpu_schedule_instructions_block(c,
240601e04c3fSmrg                                                          &scoreboard,
240701e04c3fSmrg                                                          block,
240801e04c3fSmrg                                                          uniform_contents,
240901e04c3fSmrg                                                          uniform_data,
241001e04c3fSmrg                                                          &next_uniform);
241101e04c3fSmrg
241201e04c3fSmrg                block->end_qpu_ip = c->qpu_inst_count - 1;
241301e04c3fSmrg        }
241401e04c3fSmrg
241501e04c3fSmrg        /* Emit the program-end THRSW instruction. */;
241601e04c3fSmrg        struct qinst *thrsw = vir_nop();
241701e04c3fSmrg        thrsw->qpu.sig.thrsw = true;
241801e04c3fSmrg        emit_thrsw(c, end_block, &scoreboard, thrsw, true);
241901e04c3fSmrg
242001e04c3fSmrg        qpu_set_branch_targets(c);
242101e04c3fSmrg
242201e04c3fSmrg        assert(next_uniform == c->num_uniforms);
242301e04c3fSmrg
242401e04c3fSmrg        return cycles;
242501e04c3fSmrg}
2426