/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
2301e04c3fSmrg
/**
 * @file
 *
 * Validates the QPU instruction sequence after register allocation and
 * scheduling.
 */
3001e04c3fSmrg
3101e04c3fSmrg#include <assert.h>
3201e04c3fSmrg#include <stdio.h>
3301e04c3fSmrg#include <stdlib.h>
3401e04c3fSmrg#include "v3d_compiler.h"
3501e04c3fSmrg#include "qpu/qpu_disasm.h"
3601e04c3fSmrg
/**
 * Running state carried from one instruction to the next while walking the
 * program in order.
 */
struct v3d_qpu_validate_state {
        /* Compile being validated. */
        struct v3d_compile *c;

        /* Instruction visited just before the current one, or NULL while no
         * instruction has been visited yet.
         */
        const struct v3d_qpu_instr *last;

        /* Index of the current instruction, counted across all blocks. */
        int ip;

        /* Value of ip at the most recent SFU magic write, branch, and THRSW
         * respectively.  The second THRSW of a back-to-back pair does not
         * update last_thrsw_ip.
         */
        int last_sfu_write;
        int last_branch_ip;
        int last_thrsw_ip;

        /* Number of THRSWs seen, not counting the second THRSW of a
         * back-to-back pair.
         */
        int thrsw_count;

        /* Set when we've found the last-THRSW signal, or if we were started
         * in single-segment mode.
         */
        bool last_thrsw_found;

        /* Set when we've found the THRSW after the last THRSW */
        bool thrend_found;
};
5501e04c3fSmrg
5601e04c3fSmrgstatic void
5701e04c3fSmrgfail_instr(struct v3d_qpu_validate_state *state, const char *msg)
5801e04c3fSmrg{
5901e04c3fSmrg        struct v3d_compile *c = state->c;
6001e04c3fSmrg
6101e04c3fSmrg        fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);
6201e04c3fSmrg
6301e04c3fSmrg        int dump_ip = 0;
6401e04c3fSmrg        vir_for_each_inst_inorder(inst, c) {
6501e04c3fSmrg                v3d_qpu_dump(c->devinfo, &inst->qpu);
6601e04c3fSmrg
6701e04c3fSmrg                if (dump_ip++ == state->ip)
6801e04c3fSmrg                        fprintf(stderr, " *** ERROR ***");
6901e04c3fSmrg
7001e04c3fSmrg                fprintf(stderr, "\n");
7101e04c3fSmrg        }
7201e04c3fSmrg
7301e04c3fSmrg        fprintf(stderr, "\n");
7401e04c3fSmrg        abort();
7501e04c3fSmrg}
7601e04c3fSmrg
7701e04c3fSmrgstatic bool
7801e04c3fSmrgin_branch_delay_slots(struct v3d_qpu_validate_state *state)
7901e04c3fSmrg{
8001e04c3fSmrg        return (state->ip - state->last_branch_ip) < 3;
8101e04c3fSmrg}
8201e04c3fSmrg
8301e04c3fSmrgstatic bool
8401e04c3fSmrgin_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
8501e04c3fSmrg{
8601e04c3fSmrg        return (state->ip - state->last_thrsw_ip) < 3;
8701e04c3fSmrg}
8801e04c3fSmrg
8901e04c3fSmrgstatic bool
9001e04c3fSmrgqpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
9101e04c3fSmrg                        bool (*predicate)(enum v3d_qpu_waddr waddr))
9201e04c3fSmrg{
9301e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_ALU)
9401e04c3fSmrg                return false;
9501e04c3fSmrg
9601e04c3fSmrg        if (inst->alu.add.op != V3D_QPU_A_NOP &&
9701e04c3fSmrg            inst->alu.add.magic_write &&
9801e04c3fSmrg            predicate(inst->alu.add.waddr))
9901e04c3fSmrg                return true;
10001e04c3fSmrg
10101e04c3fSmrg        if (inst->alu.mul.op != V3D_QPU_M_NOP &&
10201e04c3fSmrg            inst->alu.mul.magic_write &&
10301e04c3fSmrg            predicate(inst->alu.mul.waddr))
10401e04c3fSmrg                return true;
10501e04c3fSmrg
10601e04c3fSmrg        return false;
10701e04c3fSmrg}
10801e04c3fSmrg
10901e04c3fSmrgstatic void
11001e04c3fSmrgqpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
11101e04c3fSmrg{
11201e04c3fSmrg        const struct v3d_device_info *devinfo = state->c->devinfo;
11301e04c3fSmrg        const struct v3d_qpu_instr *inst = &qinst->qpu;
11401e04c3fSmrg
11501e04c3fSmrg        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
11601e04c3fSmrg                return;
11701e04c3fSmrg
11801e04c3fSmrg        /* LDVARY writes r5 two instructions later and LDUNIF writes
11901e04c3fSmrg         * r5 one instruction later, which is illegal to have
12001e04c3fSmrg         * together.
12101e04c3fSmrg         */
12201e04c3fSmrg        if (state->last && state->last->sig.ldvary &&
12301e04c3fSmrg            (inst->sig.ldunif || inst->sig.ldunifa)) {
12401e04c3fSmrg                fail_instr(state, "LDUNIF after a LDVARY");
12501e04c3fSmrg        }
12601e04c3fSmrg
1277ec681f3Smrg        /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4)
1287ec681f3Smrg         *
1297ec681f3Smrg         * FIXME: This would not check correctly for V3D 4.2 versions lower
1307ec681f3Smrg         * than V3D 4.2.14, but that is not a real issue because the simulator
1317ec681f3Smrg         * will still catch this, and we are not really targetting any such
1327ec681f3Smrg         * versions anyway.
1337ec681f3Smrg         */
1347ec681f3Smrg        if (state->c->devinfo->ver < 42) {
1357ec681f3Smrg                bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
1367ec681f3Smrg                                                          state->last->sig.ldunifrf));
1377ec681f3Smrg                bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
1387ec681f3Smrg                                                           state->last->sig.ldunifarf));
1397ec681f3Smrg                bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
1407ec681f3Smrg                bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
1417ec681f3Smrg                if ((last_reads_ldunif && reads_ldunifa) ||
1427ec681f3Smrg                    (last_reads_ldunifa && reads_ldunif)) {
1437ec681f3Smrg                        fail_instr(state,
1447ec681f3Smrg                                   "LDUNIF and LDUNIFA can't be next to each other");
1457ec681f3Smrg                }
14601e04c3fSmrg        }
14701e04c3fSmrg
14801e04c3fSmrg        int tmu_writes = 0;
14901e04c3fSmrg        int sfu_writes = 0;
15001e04c3fSmrg        int vpm_writes = 0;
15101e04c3fSmrg        int tlb_writes = 0;
15201e04c3fSmrg        int tsy_writes = 0;
15301e04c3fSmrg
15401e04c3fSmrg        if (inst->alu.add.op != V3D_QPU_A_NOP) {
15501e04c3fSmrg                if (inst->alu.add.magic_write) {
1567ec681f3Smrg                        if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
1577ec681f3Smrg                                                       inst->alu.add.waddr)) {
15801e04c3fSmrg                                tmu_writes++;
1597ec681f3Smrg                        }
16001e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
16101e04c3fSmrg                                sfu_writes++;
16201e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
16301e04c3fSmrg                                vpm_writes++;
16401e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
16501e04c3fSmrg                                tlb_writes++;
16601e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
16701e04c3fSmrg                                tsy_writes++;
16801e04c3fSmrg                }
16901e04c3fSmrg        }
17001e04c3fSmrg
17101e04c3fSmrg        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
17201e04c3fSmrg                if (inst->alu.mul.magic_write) {
1737ec681f3Smrg                        if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
1747ec681f3Smrg                                                       inst->alu.mul.waddr)) {
17501e04c3fSmrg                                tmu_writes++;
1767ec681f3Smrg                        }
17701e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
17801e04c3fSmrg                                sfu_writes++;
17901e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
18001e04c3fSmrg                                vpm_writes++;
18101e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
18201e04c3fSmrg                                tlb_writes++;
18301e04c3fSmrg                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
18401e04c3fSmrg                                tsy_writes++;
18501e04c3fSmrg                }
18601e04c3fSmrg        }
18701e04c3fSmrg
18801e04c3fSmrg        if (in_thrsw_delay_slots(state)) {
18901e04c3fSmrg                /* There's no way you want to start SFU during the THRSW delay
19001e04c3fSmrg                 * slots, since the result would land in the other thread.
19101e04c3fSmrg                 */
19201e04c3fSmrg                if (sfu_writes) {
19301e04c3fSmrg                        fail_instr(state,
19401e04c3fSmrg                                   "SFU write started during THRSW delay slots ");
19501e04c3fSmrg                }
19601e04c3fSmrg
19701e04c3fSmrg                if (inst->sig.ldvary)
19801e04c3fSmrg                        fail_instr(state, "LDVARY during THRSW delay slots");
19901e04c3fSmrg        }
20001e04c3fSmrg
20101e04c3fSmrg        (void)qpu_magic_waddr_matches; /* XXX */
20201e04c3fSmrg
20301e04c3fSmrg        /* SFU r4 results come back two instructions later.  No doing
20401e04c3fSmrg         * r4 read/writes or other SFU lookups until it's done.
20501e04c3fSmrg         */
20601e04c3fSmrg        if (state->ip - state->last_sfu_write < 2) {
20701e04c3fSmrg                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
20801e04c3fSmrg                        fail_instr(state, "R4 read too soon after SFU");
20901e04c3fSmrg
21001e04c3fSmrg                if (v3d_qpu_writes_r4(devinfo, inst))
21101e04c3fSmrg                        fail_instr(state, "R4 write too soon after SFU");
21201e04c3fSmrg
21301e04c3fSmrg                if (sfu_writes)
21401e04c3fSmrg                        fail_instr(state, "SFU write too soon after SFU");
21501e04c3fSmrg        }
21601e04c3fSmrg
21701e04c3fSmrg        /* XXX: The docs say VPM can happen with the others, but the simulator
21801e04c3fSmrg         * disagrees.
21901e04c3fSmrg         */
22001e04c3fSmrg        if (tmu_writes +
22101e04c3fSmrg            sfu_writes +
22201e04c3fSmrg            vpm_writes +
22301e04c3fSmrg            tlb_writes +
22401e04c3fSmrg            tsy_writes +
22501e04c3fSmrg            inst->sig.ldtmu +
22601e04c3fSmrg            inst->sig.ldtlb +
22701e04c3fSmrg            inst->sig.ldvpm +
22801e04c3fSmrg            inst->sig.ldtlbu > 1) {
22901e04c3fSmrg                fail_instr(state,
23001e04c3fSmrg                           "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
23101e04c3fSmrg        }
23201e04c3fSmrg
23301e04c3fSmrg        if (sfu_writes)
23401e04c3fSmrg                state->last_sfu_write = state->ip;
23501e04c3fSmrg
23601e04c3fSmrg        if (inst->sig.thrsw) {
23701e04c3fSmrg                if (in_branch_delay_slots(state))
23801e04c3fSmrg                        fail_instr(state, "THRSW in a branch delay slot.");
23901e04c3fSmrg
24001e04c3fSmrg                if (state->last_thrsw_found)
24101e04c3fSmrg                        state->thrend_found = true;
24201e04c3fSmrg
24301e04c3fSmrg                if (state->last_thrsw_ip == state->ip - 1) {
24401e04c3fSmrg                        /* If it's the second THRSW in a row, then it's just a
24501e04c3fSmrg                         * last-thrsw signal.
24601e04c3fSmrg                         */
24701e04c3fSmrg                        if (state->last_thrsw_found)
24801e04c3fSmrg                                fail_instr(state, "Two last-THRSW signals");
24901e04c3fSmrg                        state->last_thrsw_found = true;
25001e04c3fSmrg                } else {
25101e04c3fSmrg                        if (in_thrsw_delay_slots(state)) {
25201e04c3fSmrg                                fail_instr(state,
25301e04c3fSmrg                                           "THRSW too close to another THRSW.");
25401e04c3fSmrg                        }
25501e04c3fSmrg                        state->thrsw_count++;
25601e04c3fSmrg                        state->last_thrsw_ip = state->ip;
25701e04c3fSmrg                }
25801e04c3fSmrg        }
25901e04c3fSmrg
26001e04c3fSmrg        if (state->thrend_found &&
26101e04c3fSmrg            state->last_thrsw_ip - state->ip <= 2 &&
26201e04c3fSmrg            inst->type == V3D_QPU_INSTR_TYPE_ALU) {
26301e04c3fSmrg                if ((inst->alu.add.op != V3D_QPU_A_NOP &&
26401e04c3fSmrg                     !inst->alu.add.magic_write)) {
26501e04c3fSmrg                        fail_instr(state, "RF write after THREND");
26601e04c3fSmrg                }
26701e04c3fSmrg
26801e04c3fSmrg                if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
26901e04c3fSmrg                     !inst->alu.mul.magic_write)) {
27001e04c3fSmrg                        fail_instr(state, "RF write after THREND");
27101e04c3fSmrg                }
27201e04c3fSmrg
2737ec681f3Smrg                if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
2747ec681f3Smrg                    !inst->sig_magic) {
27501e04c3fSmrg                        fail_instr(state, "RF write after THREND");
2767ec681f3Smrg                }
27701e04c3fSmrg
27801e04c3fSmrg                /* GFXH-1625: No TMUWT in the last instruction */
27901e04c3fSmrg                if (state->last_thrsw_ip - state->ip == 2 &&
28001e04c3fSmrg                    inst->alu.add.op == V3D_QPU_A_TMUWT)
28101e04c3fSmrg                        fail_instr(state, "TMUWT in last instruction");
28201e04c3fSmrg        }
28301e04c3fSmrg
28401e04c3fSmrg        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
28501e04c3fSmrg                if (in_branch_delay_slots(state))
28601e04c3fSmrg                        fail_instr(state, "branch in a branch delay slot.");
28701e04c3fSmrg                if (in_thrsw_delay_slots(state))
28801e04c3fSmrg                        fail_instr(state, "branch in a THRSW delay slot.");
28901e04c3fSmrg                state->last_branch_ip = state->ip;
29001e04c3fSmrg        }
29101e04c3fSmrg}
29201e04c3fSmrg
29301e04c3fSmrgstatic void
29401e04c3fSmrgqpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
29501e04c3fSmrg{
29601e04c3fSmrg        vir_for_each_inst(qinst, block) {
29701e04c3fSmrg                qpu_validate_inst(state, qinst);
29801e04c3fSmrg
29901e04c3fSmrg                state->last = &qinst->qpu;
30001e04c3fSmrg                state->ip++;
30101e04c3fSmrg        }
30201e04c3fSmrg}
30301e04c3fSmrg
30401e04c3fSmrg/**
30501e04c3fSmrg * Checks for the instruction restrictions from page 37 ("Summary of
30601e04c3fSmrg * Instruction Restrictions").
30701e04c3fSmrg */
30801e04c3fSmrgvoid
30901e04c3fSmrgqpu_validate(struct v3d_compile *c)
31001e04c3fSmrg{
31101e04c3fSmrg        /* We don't want to do validation in release builds, but we want to
31201e04c3fSmrg         * keep compiling the validation code to make sure it doesn't get
31301e04c3fSmrg         * broken.
31401e04c3fSmrg         */
31501e04c3fSmrg#ifndef DEBUG
31601e04c3fSmrg        return;
31701e04c3fSmrg#endif
31801e04c3fSmrg
31901e04c3fSmrg        struct v3d_qpu_validate_state state = {
32001e04c3fSmrg                .c = c,
32101e04c3fSmrg                .last_sfu_write = -10,
32201e04c3fSmrg                .last_thrsw_ip = -10,
32301e04c3fSmrg                .last_branch_ip = -10,
32401e04c3fSmrg                .ip = 0,
32501e04c3fSmrg
32601e04c3fSmrg                .last_thrsw_found = !c->last_thrsw,
32701e04c3fSmrg        };
32801e04c3fSmrg
32901e04c3fSmrg        vir_for_each_block(block, c) {
33001e04c3fSmrg                qpu_validate_block(&state, block);
33101e04c3fSmrg        }
33201e04c3fSmrg
33301e04c3fSmrg        if (state.thrsw_count > 1 && !state.last_thrsw_found) {
33401e04c3fSmrg                fail_instr(&state,
33501e04c3fSmrg                           "thread switch found without last-THRSW in program");
33601e04c3fSmrg        }
33701e04c3fSmrg
33801e04c3fSmrg        if (!state.thrend_found)
33901e04c3fSmrg                fail_instr(&state, "No program-end THRSW found");
34001e04c3fSmrg}
341