/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * Validates the QPU instruction sequence after register allocation and
 * scheduling.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "v3d_compiler.h"
#include "qpu/qpu_disasm.h"

/* Running state threaded through the validator as it walks the program in
 * instruction order.
 */
struct v3d_qpu_validate_state {
        struct v3d_compile *c;
        /* Previously visited instruction, or NULL at the first one. */
        const struct v3d_qpu_instr *last;
        /* Index (instruction pointer) of the instruction being validated. */
        int ip;
        /* ip of the most recent SFU write (initialized well in the past). */
        int last_sfu_write;
        int last_branch_ip;
        int last_thrsw_ip;

        /* Set when we've found the last-THRSW signal, or if we were started
         * in single-segment mode.
         */
        bool last_thrsw_found;

        /* Set when we've found the THRSW after the last THRSW */
        bool thrend_found;

        int thrsw_count;
};

/* Reports a validation failure: dumps the whole program with the failing
 * instruction marked, then aborts.  Never returns.
 */
static void
fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
{
        struct v3d_compile *c = state->c;

        fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);

        int dump_ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                v3d_qpu_dump(c->devinfo, &inst->qpu);

                /* Tag the instruction at the failing ip as we stream the
                 * disassembly out.
                 */
                if (dump_ip++ == state->ip)
                        fprintf(stderr, " *** ERROR ***");

                fprintf(stderr, "\n");
        }

        fprintf(stderr, "\n");
        abort();
}

/* True while we are within the 3 delay slots following a branch. */
static bool
in_branch_delay_slots(struct v3d_qpu_validate_state *state)
{
        return (state->ip - state->last_branch_ip) < 3;
}

/* True while we are within the 3 delay slots following a THRSW. */
static bool
in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
{
        return (state->ip - state->last_thrsw_ip) < 3;
}
8801e04c3fSmrg 8901e04c3fSmrgstatic bool 9001e04c3fSmrgqpu_magic_waddr_matches(const struct v3d_qpu_instr *inst, 9101e04c3fSmrg bool (*predicate)(enum v3d_qpu_waddr waddr)) 9201e04c3fSmrg{ 9301e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_ALU) 9401e04c3fSmrg return false; 9501e04c3fSmrg 9601e04c3fSmrg if (inst->alu.add.op != V3D_QPU_A_NOP && 9701e04c3fSmrg inst->alu.add.magic_write && 9801e04c3fSmrg predicate(inst->alu.add.waddr)) 9901e04c3fSmrg return true; 10001e04c3fSmrg 10101e04c3fSmrg if (inst->alu.mul.op != V3D_QPU_M_NOP && 10201e04c3fSmrg inst->alu.mul.magic_write && 10301e04c3fSmrg predicate(inst->alu.mul.waddr)) 10401e04c3fSmrg return true; 10501e04c3fSmrg 10601e04c3fSmrg return false; 10701e04c3fSmrg} 10801e04c3fSmrg 10901e04c3fSmrgstatic void 11001e04c3fSmrgqpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) 11101e04c3fSmrg{ 11201e04c3fSmrg const struct v3d_device_info *devinfo = state->c->devinfo; 11301e04c3fSmrg const struct v3d_qpu_instr *inst = &qinst->qpu; 11401e04c3fSmrg 11501e04c3fSmrg if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 11601e04c3fSmrg return; 11701e04c3fSmrg 11801e04c3fSmrg /* LDVARY writes r5 two instructions later and LDUNIF writes 11901e04c3fSmrg * r5 one instruction later, which is illegal to have 12001e04c3fSmrg * together. 12101e04c3fSmrg */ 12201e04c3fSmrg if (state->last && state->last->sig.ldvary && 12301e04c3fSmrg (inst->sig.ldunif || inst->sig.ldunifa)) { 12401e04c3fSmrg fail_instr(state, "LDUNIF after a LDVARY"); 12501e04c3fSmrg } 12601e04c3fSmrg 1277ec681f3Smrg /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4) 1287ec681f3Smrg * 1297ec681f3Smrg * FIXME: This would not check correctly for V3D 4.2 versions lower 1307ec681f3Smrg * than V3D 4.2.14, but that is not a real issue because the simulator 1317ec681f3Smrg * will still catch this, and we are not really targetting any such 1327ec681f3Smrg * versions anyway. 
1337ec681f3Smrg */ 1347ec681f3Smrg if (state->c->devinfo->ver < 42) { 1357ec681f3Smrg bool last_reads_ldunif = (state->last && (state->last->sig.ldunif || 1367ec681f3Smrg state->last->sig.ldunifrf)); 1377ec681f3Smrg bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa || 1387ec681f3Smrg state->last->sig.ldunifarf)); 1397ec681f3Smrg bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf; 1407ec681f3Smrg bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf; 1417ec681f3Smrg if ((last_reads_ldunif && reads_ldunifa) || 1427ec681f3Smrg (last_reads_ldunifa && reads_ldunif)) { 1437ec681f3Smrg fail_instr(state, 1447ec681f3Smrg "LDUNIF and LDUNIFA can't be next to each other"); 1457ec681f3Smrg } 14601e04c3fSmrg } 14701e04c3fSmrg 14801e04c3fSmrg int tmu_writes = 0; 14901e04c3fSmrg int sfu_writes = 0; 15001e04c3fSmrg int vpm_writes = 0; 15101e04c3fSmrg int tlb_writes = 0; 15201e04c3fSmrg int tsy_writes = 0; 15301e04c3fSmrg 15401e04c3fSmrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 15501e04c3fSmrg if (inst->alu.add.magic_write) { 1567ec681f3Smrg if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, 1577ec681f3Smrg inst->alu.add.waddr)) { 15801e04c3fSmrg tmu_writes++; 1597ec681f3Smrg } 16001e04c3fSmrg if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) 16101e04c3fSmrg sfu_writes++; 16201e04c3fSmrg if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) 16301e04c3fSmrg vpm_writes++; 16401e04c3fSmrg if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) 16501e04c3fSmrg tlb_writes++; 16601e04c3fSmrg if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) 16701e04c3fSmrg tsy_writes++; 16801e04c3fSmrg } 16901e04c3fSmrg } 17001e04c3fSmrg 17101e04c3fSmrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 17201e04c3fSmrg if (inst->alu.mul.magic_write) { 1737ec681f3Smrg if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, 1747ec681f3Smrg inst->alu.mul.waddr)) { 17501e04c3fSmrg tmu_writes++; 1767ec681f3Smrg } 17701e04c3fSmrg if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) 
17801e04c3fSmrg sfu_writes++; 17901e04c3fSmrg if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) 18001e04c3fSmrg vpm_writes++; 18101e04c3fSmrg if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) 18201e04c3fSmrg tlb_writes++; 18301e04c3fSmrg if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr)) 18401e04c3fSmrg tsy_writes++; 18501e04c3fSmrg } 18601e04c3fSmrg } 18701e04c3fSmrg 18801e04c3fSmrg if (in_thrsw_delay_slots(state)) { 18901e04c3fSmrg /* There's no way you want to start SFU during the THRSW delay 19001e04c3fSmrg * slots, since the result would land in the other thread. 19101e04c3fSmrg */ 19201e04c3fSmrg if (sfu_writes) { 19301e04c3fSmrg fail_instr(state, 19401e04c3fSmrg "SFU write started during THRSW delay slots "); 19501e04c3fSmrg } 19601e04c3fSmrg 19701e04c3fSmrg if (inst->sig.ldvary) 19801e04c3fSmrg fail_instr(state, "LDVARY during THRSW delay slots"); 19901e04c3fSmrg } 20001e04c3fSmrg 20101e04c3fSmrg (void)qpu_magic_waddr_matches; /* XXX */ 20201e04c3fSmrg 20301e04c3fSmrg /* SFU r4 results come back two instructions later. No doing 20401e04c3fSmrg * r4 read/writes or other SFU lookups until it's done. 20501e04c3fSmrg */ 20601e04c3fSmrg if (state->ip - state->last_sfu_write < 2) { 20701e04c3fSmrg if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4)) 20801e04c3fSmrg fail_instr(state, "R4 read too soon after SFU"); 20901e04c3fSmrg 21001e04c3fSmrg if (v3d_qpu_writes_r4(devinfo, inst)) 21101e04c3fSmrg fail_instr(state, "R4 write too soon after SFU"); 21201e04c3fSmrg 21301e04c3fSmrg if (sfu_writes) 21401e04c3fSmrg fail_instr(state, "SFU write too soon after SFU"); 21501e04c3fSmrg } 21601e04c3fSmrg 21701e04c3fSmrg /* XXX: The docs say VPM can happen with the others, but the simulator 21801e04c3fSmrg * disagrees. 
21901e04c3fSmrg */ 22001e04c3fSmrg if (tmu_writes + 22101e04c3fSmrg sfu_writes + 22201e04c3fSmrg vpm_writes + 22301e04c3fSmrg tlb_writes + 22401e04c3fSmrg tsy_writes + 22501e04c3fSmrg inst->sig.ldtmu + 22601e04c3fSmrg inst->sig.ldtlb + 22701e04c3fSmrg inst->sig.ldvpm + 22801e04c3fSmrg inst->sig.ldtlbu > 1) { 22901e04c3fSmrg fail_instr(state, 23001e04c3fSmrg "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed"); 23101e04c3fSmrg } 23201e04c3fSmrg 23301e04c3fSmrg if (sfu_writes) 23401e04c3fSmrg state->last_sfu_write = state->ip; 23501e04c3fSmrg 23601e04c3fSmrg if (inst->sig.thrsw) { 23701e04c3fSmrg if (in_branch_delay_slots(state)) 23801e04c3fSmrg fail_instr(state, "THRSW in a branch delay slot."); 23901e04c3fSmrg 24001e04c3fSmrg if (state->last_thrsw_found) 24101e04c3fSmrg state->thrend_found = true; 24201e04c3fSmrg 24301e04c3fSmrg if (state->last_thrsw_ip == state->ip - 1) { 24401e04c3fSmrg /* If it's the second THRSW in a row, then it's just a 24501e04c3fSmrg * last-thrsw signal. 24601e04c3fSmrg */ 24701e04c3fSmrg if (state->last_thrsw_found) 24801e04c3fSmrg fail_instr(state, "Two last-THRSW signals"); 24901e04c3fSmrg state->last_thrsw_found = true; 25001e04c3fSmrg } else { 25101e04c3fSmrg if (in_thrsw_delay_slots(state)) { 25201e04c3fSmrg fail_instr(state, 25301e04c3fSmrg "THRSW too close to another THRSW."); 25401e04c3fSmrg } 25501e04c3fSmrg state->thrsw_count++; 25601e04c3fSmrg state->last_thrsw_ip = state->ip; 25701e04c3fSmrg } 25801e04c3fSmrg } 25901e04c3fSmrg 26001e04c3fSmrg if (state->thrend_found && 26101e04c3fSmrg state->last_thrsw_ip - state->ip <= 2 && 26201e04c3fSmrg inst->type == V3D_QPU_INSTR_TYPE_ALU) { 26301e04c3fSmrg if ((inst->alu.add.op != V3D_QPU_A_NOP && 26401e04c3fSmrg !inst->alu.add.magic_write)) { 26501e04c3fSmrg fail_instr(state, "RF write after THREND"); 26601e04c3fSmrg } 26701e04c3fSmrg 26801e04c3fSmrg if ((inst->alu.mul.op != V3D_QPU_M_NOP && 26901e04c3fSmrg !inst->alu.mul.magic_write)) { 27001e04c3fSmrg fail_instr(state, "RF write after 
THREND"); 27101e04c3fSmrg } 27201e04c3fSmrg 2737ec681f3Smrg if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && 2747ec681f3Smrg !inst->sig_magic) { 27501e04c3fSmrg fail_instr(state, "RF write after THREND"); 2767ec681f3Smrg } 27701e04c3fSmrg 27801e04c3fSmrg /* GFXH-1625: No TMUWT in the last instruction */ 27901e04c3fSmrg if (state->last_thrsw_ip - state->ip == 2 && 28001e04c3fSmrg inst->alu.add.op == V3D_QPU_A_TMUWT) 28101e04c3fSmrg fail_instr(state, "TMUWT in last instruction"); 28201e04c3fSmrg } 28301e04c3fSmrg 28401e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 28501e04c3fSmrg if (in_branch_delay_slots(state)) 28601e04c3fSmrg fail_instr(state, "branch in a branch delay slot."); 28701e04c3fSmrg if (in_thrsw_delay_slots(state)) 28801e04c3fSmrg fail_instr(state, "branch in a THRSW delay slot."); 28901e04c3fSmrg state->last_branch_ip = state->ip; 29001e04c3fSmrg } 29101e04c3fSmrg} 29201e04c3fSmrg 29301e04c3fSmrgstatic void 29401e04c3fSmrgqpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block) 29501e04c3fSmrg{ 29601e04c3fSmrg vir_for_each_inst(qinst, block) { 29701e04c3fSmrg qpu_validate_inst(state, qinst); 29801e04c3fSmrg 29901e04c3fSmrg state->last = &qinst->qpu; 30001e04c3fSmrg state->ip++; 30101e04c3fSmrg } 30201e04c3fSmrg} 30301e04c3fSmrg 30401e04c3fSmrg/** 30501e04c3fSmrg * Checks for the instruction restrictions from page 37 ("Summary of 30601e04c3fSmrg * Instruction Restrictions"). 30701e04c3fSmrg */ 30801e04c3fSmrgvoid 30901e04c3fSmrgqpu_validate(struct v3d_compile *c) 31001e04c3fSmrg{ 31101e04c3fSmrg /* We don't want to do validation in release builds, but we want to 31201e04c3fSmrg * keep compiling the validation code to make sure it doesn't get 31301e04c3fSmrg * broken. 
31401e04c3fSmrg */ 31501e04c3fSmrg#ifndef DEBUG 31601e04c3fSmrg return; 31701e04c3fSmrg#endif 31801e04c3fSmrg 31901e04c3fSmrg struct v3d_qpu_validate_state state = { 32001e04c3fSmrg .c = c, 32101e04c3fSmrg .last_sfu_write = -10, 32201e04c3fSmrg .last_thrsw_ip = -10, 32301e04c3fSmrg .last_branch_ip = -10, 32401e04c3fSmrg .ip = 0, 32501e04c3fSmrg 32601e04c3fSmrg .last_thrsw_found = !c->last_thrsw, 32701e04c3fSmrg }; 32801e04c3fSmrg 32901e04c3fSmrg vir_for_each_block(block, c) { 33001e04c3fSmrg qpu_validate_block(&state, block); 33101e04c3fSmrg } 33201e04c3fSmrg 33301e04c3fSmrg if (state.thrsw_count > 1 && !state.last_thrsw_found) { 33401e04c3fSmrg fail_instr(&state, 33501e04c3fSmrg "thread switch found without last-THRSW in program"); 33601e04c3fSmrg } 33701e04c3fSmrg 33801e04c3fSmrg if (!state.thrend_found) 33901e04c3fSmrg fail_instr(&state, "No program-end THRSW found"); 34001e04c3fSmrg} 341