101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2010 Intel Corporation 301e04c3fSmrg * Copyright © 2014-2017 Broadcom 401e04c3fSmrg * 501e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 601e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 701e04c3fSmrg * to deal in the Software without restriction, including without limitation 801e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 901e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 1001e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1101e04c3fSmrg * 1201e04c3fSmrg * The above copyright notice and this permission notice (including the next 1301e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1401e04c3fSmrg * Software. 1501e04c3fSmrg * 1601e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1701e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1801e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1901e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2001e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2101e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2201e04c3fSmrg * IN THE SOFTWARE. 2301e04c3fSmrg */ 2401e04c3fSmrg 2501e04c3fSmrg/** 2601e04c3fSmrg * @file 2701e04c3fSmrg * 2801e04c3fSmrg * The basic model of the list scheduler is to take a basic block, compute a 2901e04c3fSmrg * DAG of the dependencies, and make a list of the DAG heads. Heuristically 3001e04c3fSmrg * pick a DAG head, then put all the children that are now DAG heads into the 3101e04c3fSmrg * list of things to schedule. 
3201e04c3fSmrg * 3301e04c3fSmrg * The goal of scheduling here is to pack pairs of operations together in a 3401e04c3fSmrg * single QPU instruction. 3501e04c3fSmrg */ 3601e04c3fSmrg 3701e04c3fSmrg#include "qpu/qpu_disasm.h" 3801e04c3fSmrg#include "v3d_compiler.h" 3901e04c3fSmrg#include "util/ralloc.h" 40ed98bd31Smaya#include "util/dag.h" 4101e04c3fSmrg 4201e04c3fSmrgstatic bool debug; 4301e04c3fSmrg 4401e04c3fSmrgstruct schedule_node_child; 4501e04c3fSmrg 4601e04c3fSmrgstruct schedule_node { 47ed98bd31Smaya struct dag_node dag; 4801e04c3fSmrg struct list_head link; 4901e04c3fSmrg struct qinst *inst; 5001e04c3fSmrg 5101e04c3fSmrg /* Longest cycles + instruction_latency() of any parent of this node. */ 5201e04c3fSmrg uint32_t unblocked_time; 5301e04c3fSmrg 5401e04c3fSmrg /** 5501e04c3fSmrg * Minimum number of cycles from scheduling this instruction until the 5601e04c3fSmrg * end of the program, based on the slowest dependency chain through 5701e04c3fSmrg * the children. 5801e04c3fSmrg */ 5901e04c3fSmrg uint32_t delay; 6001e04c3fSmrg 6101e04c3fSmrg /** 6201e04c3fSmrg * cycles between this instruction being scheduled and when its result 6301e04c3fSmrg * can be consumed. 6401e04c3fSmrg */ 6501e04c3fSmrg uint32_t latency; 6601e04c3fSmrg}; 6701e04c3fSmrg 6801e04c3fSmrg/* When walking the instructions in reverse, we need to swap before/after in 6901e04c3fSmrg * add_dep(). 
7001e04c3fSmrg */ 7101e04c3fSmrgenum direction { F, R }; 7201e04c3fSmrg 7301e04c3fSmrgstruct schedule_state { 7401e04c3fSmrg const struct v3d_device_info *devinfo; 75ed98bd31Smaya struct dag *dag; 7601e04c3fSmrg struct schedule_node *last_r[6]; 7701e04c3fSmrg struct schedule_node *last_rf[64]; 7801e04c3fSmrg struct schedule_node *last_sf; 7901e04c3fSmrg struct schedule_node *last_vpm_read; 8001e04c3fSmrg struct schedule_node *last_tmu_write; 8101e04c3fSmrg struct schedule_node *last_tmu_config; 827ec681f3Smrg struct schedule_node *last_tmu_read; 8301e04c3fSmrg struct schedule_node *last_tlb; 8401e04c3fSmrg struct schedule_node *last_vpm; 8501e04c3fSmrg struct schedule_node *last_unif; 8601e04c3fSmrg struct schedule_node *last_rtop; 877ec681f3Smrg struct schedule_node *last_unifa; 8801e04c3fSmrg enum direction dir; 8901e04c3fSmrg /* Estimated cycle when the current instruction would start. */ 9001e04c3fSmrg uint32_t time; 9101e04c3fSmrg}; 9201e04c3fSmrg 9301e04c3fSmrgstatic void 9401e04c3fSmrgadd_dep(struct schedule_state *state, 9501e04c3fSmrg struct schedule_node *before, 9601e04c3fSmrg struct schedule_node *after, 9701e04c3fSmrg bool write) 9801e04c3fSmrg{ 9901e04c3fSmrg bool write_after_read = !write && state->dir == R; 100ed98bd31Smaya void *edge_data = (void *)(uintptr_t)write_after_read; 10101e04c3fSmrg 10201e04c3fSmrg if (!before || !after) 10301e04c3fSmrg return; 10401e04c3fSmrg 10501e04c3fSmrg assert(before != after); 10601e04c3fSmrg 107ed98bd31Smaya if (state->dir == F) 108ed98bd31Smaya dag_add_edge(&before->dag, &after->dag, edge_data); 109ed98bd31Smaya else 110ed98bd31Smaya dag_add_edge(&after->dag, &before->dag, edge_data); 11101e04c3fSmrg} 11201e04c3fSmrg 11301e04c3fSmrgstatic void 11401e04c3fSmrgadd_read_dep(struct schedule_state *state, 11501e04c3fSmrg struct schedule_node *before, 11601e04c3fSmrg struct schedule_node *after) 11701e04c3fSmrg{ 11801e04c3fSmrg add_dep(state, before, after, false); 11901e04c3fSmrg} 12001e04c3fSmrg 12101e04c3fSmrgstatic 
void 12201e04c3fSmrgadd_write_dep(struct schedule_state *state, 12301e04c3fSmrg struct schedule_node **before, 12401e04c3fSmrg struct schedule_node *after) 12501e04c3fSmrg{ 12601e04c3fSmrg add_dep(state, *before, after, true); 12701e04c3fSmrg *before = after; 12801e04c3fSmrg} 12901e04c3fSmrg 13001e04c3fSmrgstatic bool 13101e04c3fSmrgqpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 13201e04c3fSmrg{ 1337ec681f3Smrg if (inst->sig.ldtlb || inst->sig.ldtlbu) 1347ec681f3Smrg return true; 1357ec681f3Smrg 13601e04c3fSmrg if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 13701e04c3fSmrg return false; 13801e04c3fSmrg 13901e04c3fSmrg if (inst->alu.add.magic_write && 14001e04c3fSmrg (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 14101e04c3fSmrg inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 14201e04c3fSmrg return true; 14301e04c3fSmrg 14401e04c3fSmrg if (inst->alu.mul.magic_write && 14501e04c3fSmrg (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 14601e04c3fSmrg inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 14701e04c3fSmrg return true; 14801e04c3fSmrg 14901e04c3fSmrg return false; 15001e04c3fSmrg} 15101e04c3fSmrg 15201e04c3fSmrgstatic void 15301e04c3fSmrgprocess_mux_deps(struct schedule_state *state, struct schedule_node *n, 15401e04c3fSmrg enum v3d_qpu_mux mux) 15501e04c3fSmrg{ 15601e04c3fSmrg switch (mux) { 15701e04c3fSmrg case V3D_QPU_MUX_A: 15801e04c3fSmrg add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 15901e04c3fSmrg break; 16001e04c3fSmrg case V3D_QPU_MUX_B: 1617ec681f3Smrg if (!n->inst->qpu.sig.small_imm) { 1627ec681f3Smrg add_read_dep(state, 1637ec681f3Smrg state->last_rf[n->inst->qpu.raddr_b], n); 1647ec681f3Smrg } 16501e04c3fSmrg break; 16601e04c3fSmrg default: 16701e04c3fSmrg add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 16801e04c3fSmrg break; 16901e04c3fSmrg } 17001e04c3fSmrg} 17101e04c3fSmrg 1727ec681f3Smrgstatic bool 1737ec681f3Smrgtmu_write_is_sequence_terminator(uint32_t waddr) 1747ec681f3Smrg{ 1757ec681f3Smrg switch (waddr) { 1767ec681f3Smrg case 
V3D_QPU_WADDR_TMUS: 1777ec681f3Smrg case V3D_QPU_WADDR_TMUSCM: 1787ec681f3Smrg case V3D_QPU_WADDR_TMUSF: 1797ec681f3Smrg case V3D_QPU_WADDR_TMUSLOD: 1807ec681f3Smrg case V3D_QPU_WADDR_TMUA: 1817ec681f3Smrg case V3D_QPU_WADDR_TMUAU: 1827ec681f3Smrg return true; 1837ec681f3Smrg default: 1847ec681f3Smrg return false; 1857ec681f3Smrg } 1867ec681f3Smrg} 1877ec681f3Smrg 1887ec681f3Smrgstatic bool 1897ec681f3Smrgcan_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) 1907ec681f3Smrg{ 1917ec681f3Smrg if (devinfo->ver < 40) 1927ec681f3Smrg return false; 1937ec681f3Smrg 1947ec681f3Smrg if (tmu_write_is_sequence_terminator(waddr)) 1957ec681f3Smrg return false; 1967ec681f3Smrg 1977ec681f3Smrg if (waddr == V3D_QPU_WADDR_TMUD) 1987ec681f3Smrg return false; 1997ec681f3Smrg 2007ec681f3Smrg return true; 2017ec681f3Smrg} 20201e04c3fSmrg 20301e04c3fSmrgstatic void 20401e04c3fSmrgprocess_waddr_deps(struct schedule_state *state, struct schedule_node *n, 20501e04c3fSmrg uint32_t waddr, bool magic) 20601e04c3fSmrg{ 20701e04c3fSmrg if (!magic) { 20801e04c3fSmrg add_write_dep(state, &state->last_rf[waddr], n); 2097ec681f3Smrg } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { 2107ec681f3Smrg if (can_reorder_tmu_write(state->devinfo, waddr)) 2117ec681f3Smrg add_read_dep(state, state->last_tmu_write, n); 2127ec681f3Smrg else 2137ec681f3Smrg add_write_dep(state, &state->last_tmu_write, n); 2147ec681f3Smrg 2157ec681f3Smrg if (tmu_write_is_sequence_terminator(waddr)) 21601e04c3fSmrg add_write_dep(state, &state->last_tmu_config, n); 21701e04c3fSmrg } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 21801e04c3fSmrg /* Handled by v3d_qpu_writes_r4() check. 
*/ 21901e04c3fSmrg } else { 22001e04c3fSmrg switch (waddr) { 22101e04c3fSmrg case V3D_QPU_WADDR_R0: 22201e04c3fSmrg case V3D_QPU_WADDR_R1: 22301e04c3fSmrg case V3D_QPU_WADDR_R2: 22401e04c3fSmrg add_write_dep(state, 22501e04c3fSmrg &state->last_r[waddr - V3D_QPU_WADDR_R0], 22601e04c3fSmrg n); 22701e04c3fSmrg break; 22801e04c3fSmrg case V3D_QPU_WADDR_R3: 22901e04c3fSmrg case V3D_QPU_WADDR_R4: 23001e04c3fSmrg case V3D_QPU_WADDR_R5: 23101e04c3fSmrg /* Handled by v3d_qpu_writes_r*() checks below. */ 23201e04c3fSmrg break; 23301e04c3fSmrg 23401e04c3fSmrg case V3D_QPU_WADDR_VPM: 23501e04c3fSmrg case V3D_QPU_WADDR_VPMU: 23601e04c3fSmrg add_write_dep(state, &state->last_vpm, n); 23701e04c3fSmrg break; 23801e04c3fSmrg 23901e04c3fSmrg case V3D_QPU_WADDR_TLB: 24001e04c3fSmrg case V3D_QPU_WADDR_TLBU: 24101e04c3fSmrg add_write_dep(state, &state->last_tlb, n); 24201e04c3fSmrg break; 24301e04c3fSmrg 244ed98bd31Smaya case V3D_QPU_WADDR_SYNC: 245ed98bd31Smaya case V3D_QPU_WADDR_SYNCB: 246ed98bd31Smaya case V3D_QPU_WADDR_SYNCU: 247ed98bd31Smaya /* For CS barrier(): Sync against any other memory 248ed98bd31Smaya * accesses. There doesn't appear to be any need for 249ed98bd31Smaya * barriers to affect ALU operations. 250ed98bd31Smaya */ 251ed98bd31Smaya add_write_dep(state, &state->last_tmu_write, n); 2527ec681f3Smrg add_write_dep(state, &state->last_tmu_read, n); 2537ec681f3Smrg break; 2547ec681f3Smrg 2557ec681f3Smrg case V3D_QPU_WADDR_UNIFA: 2567ec681f3Smrg if (state->devinfo->ver >= 40) 2577ec681f3Smrg add_write_dep(state, &state->last_unifa, n); 258ed98bd31Smaya break; 259ed98bd31Smaya 26001e04c3fSmrg case V3D_QPU_WADDR_NOP: 26101e04c3fSmrg break; 26201e04c3fSmrg 26301e04c3fSmrg default: 26401e04c3fSmrg fprintf(stderr, "Unknown waddr %d\n", waddr); 26501e04c3fSmrg abort(); 26601e04c3fSmrg } 26701e04c3fSmrg } 26801e04c3fSmrg} 26901e04c3fSmrg 27001e04c3fSmrg/** 27101e04c3fSmrg * Common code for dependencies that need to be tracked both forward and 27201e04c3fSmrg * backward. 
27301e04c3fSmrg * 27401e04c3fSmrg * This is for things like "all reads of r4 have to happen between the r4 27501e04c3fSmrg * writes that surround them". 27601e04c3fSmrg */ 27701e04c3fSmrgstatic void 27801e04c3fSmrgcalculate_deps(struct schedule_state *state, struct schedule_node *n) 27901e04c3fSmrg{ 28001e04c3fSmrg const struct v3d_device_info *devinfo = state->devinfo; 28101e04c3fSmrg struct qinst *qinst = n->inst; 28201e04c3fSmrg struct v3d_qpu_instr *inst = &qinst->qpu; 283ed98bd31Smaya /* If the input and output segments are shared, then all VPM reads to 284ed98bd31Smaya * a location need to happen before all writes. We handle this by 285ed98bd31Smaya * serializing all VPM operations for now. 286ed98bd31Smaya */ 287ed98bd31Smaya bool separate_vpm_segment = false; 28801e04c3fSmrg 28901e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 29001e04c3fSmrg if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 29101e04c3fSmrg add_read_dep(state, state->last_sf, n); 29201e04c3fSmrg 29301e04c3fSmrg /* XXX: BDI */ 29401e04c3fSmrg /* XXX: BDU */ 29501e04c3fSmrg /* XXX: ub */ 29601e04c3fSmrg /* XXX: raddr_a */ 29701e04c3fSmrg 29801e04c3fSmrg add_write_dep(state, &state->last_unif, n); 29901e04c3fSmrg return; 30001e04c3fSmrg } 30101e04c3fSmrg 30201e04c3fSmrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 30301e04c3fSmrg 30401e04c3fSmrg /* XXX: LOAD_IMM */ 30501e04c3fSmrg 30601e04c3fSmrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 30701e04c3fSmrg process_mux_deps(state, n, inst->alu.add.a); 30801e04c3fSmrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 30901e04c3fSmrg process_mux_deps(state, n, inst->alu.add.b); 31001e04c3fSmrg 31101e04c3fSmrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 31201e04c3fSmrg process_mux_deps(state, n, inst->alu.mul.a); 31301e04c3fSmrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 31401e04c3fSmrg process_mux_deps(state, n, inst->alu.mul.b); 31501e04c3fSmrg 31601e04c3fSmrg switch (inst->alu.add.op) { 31701e04c3fSmrg case 
V3D_QPU_A_VPMSETUP: 31801e04c3fSmrg /* Could distinguish read/write by unpacking the uniform. */ 31901e04c3fSmrg add_write_dep(state, &state->last_vpm, n); 32001e04c3fSmrg add_write_dep(state, &state->last_vpm_read, n); 32101e04c3fSmrg break; 32201e04c3fSmrg 32301e04c3fSmrg case V3D_QPU_A_STVPMV: 32401e04c3fSmrg case V3D_QPU_A_STVPMD: 32501e04c3fSmrg case V3D_QPU_A_STVPMP: 32601e04c3fSmrg add_write_dep(state, &state->last_vpm, n); 32701e04c3fSmrg break; 32801e04c3fSmrg 329ed98bd31Smaya case V3D_QPU_A_LDVPMV_IN: 330ed98bd31Smaya case V3D_QPU_A_LDVPMD_IN: 331ed98bd31Smaya case V3D_QPU_A_LDVPMG_IN: 332ed98bd31Smaya case V3D_QPU_A_LDVPMP: 333ed98bd31Smaya if (!separate_vpm_segment) 334ed98bd31Smaya add_write_dep(state, &state->last_vpm, n); 335ed98bd31Smaya break; 336ed98bd31Smaya 33701e04c3fSmrg case V3D_QPU_A_VPMWT: 33801e04c3fSmrg add_read_dep(state, state->last_vpm, n); 33901e04c3fSmrg break; 34001e04c3fSmrg 34101e04c3fSmrg case V3D_QPU_A_MSF: 34201e04c3fSmrg add_read_dep(state, state->last_tlb, n); 34301e04c3fSmrg break; 34401e04c3fSmrg 34501e04c3fSmrg case V3D_QPU_A_SETMSF: 34601e04c3fSmrg case V3D_QPU_A_SETREVF: 34701e04c3fSmrg add_write_dep(state, &state->last_tlb, n); 34801e04c3fSmrg break; 34901e04c3fSmrg 35001e04c3fSmrg default: 35101e04c3fSmrg break; 35201e04c3fSmrg } 35301e04c3fSmrg 35401e04c3fSmrg switch (inst->alu.mul.op) { 35501e04c3fSmrg case V3D_QPU_M_MULTOP: 35601e04c3fSmrg case V3D_QPU_M_UMUL24: 35701e04c3fSmrg /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 35801e04c3fSmrg * resets it to 0. We could possibly reorder umul24s relative 35901e04c3fSmrg * to each other, but for now just keep all the MUL parts in 36001e04c3fSmrg * order. 
36101e04c3fSmrg */ 36201e04c3fSmrg add_write_dep(state, &state->last_rtop, n); 36301e04c3fSmrg break; 36401e04c3fSmrg default: 36501e04c3fSmrg break; 36601e04c3fSmrg } 36701e04c3fSmrg 36801e04c3fSmrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 36901e04c3fSmrg process_waddr_deps(state, n, inst->alu.add.waddr, 37001e04c3fSmrg inst->alu.add.magic_write); 37101e04c3fSmrg } 37201e04c3fSmrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 37301e04c3fSmrg process_waddr_deps(state, n, inst->alu.mul.waddr, 37401e04c3fSmrg inst->alu.mul.magic_write); 37501e04c3fSmrg } 37601e04c3fSmrg if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 37701e04c3fSmrg process_waddr_deps(state, n, inst->sig_addr, 37801e04c3fSmrg inst->sig_magic); 37901e04c3fSmrg } 38001e04c3fSmrg 38101e04c3fSmrg if (v3d_qpu_writes_r3(devinfo, inst)) 38201e04c3fSmrg add_write_dep(state, &state->last_r[3], n); 38301e04c3fSmrg if (v3d_qpu_writes_r4(devinfo, inst)) 38401e04c3fSmrg add_write_dep(state, &state->last_r[4], n); 38501e04c3fSmrg if (v3d_qpu_writes_r5(devinfo, inst)) 38601e04c3fSmrg add_write_dep(state, &state->last_r[5], n); 38701e04c3fSmrg 3887ec681f3Smrg /* If we add any more dependencies here we should consider whether we 3897ec681f3Smrg * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 3907ec681f3Smrg */ 39101e04c3fSmrg if (inst->sig.thrsw) { 39201e04c3fSmrg /* All accumulator contents and flags are undefined after the 39301e04c3fSmrg * switch. 39401e04c3fSmrg */ 39501e04c3fSmrg for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 39601e04c3fSmrg add_write_dep(state, &state->last_r[i], n); 39701e04c3fSmrg add_write_dep(state, &state->last_sf, n); 39801e04c3fSmrg add_write_dep(state, &state->last_rtop, n); 39901e04c3fSmrg 40001e04c3fSmrg /* Scoreboard-locking operations have to stay after the last 40101e04c3fSmrg * thread switch. 
40201e04c3fSmrg */ 40301e04c3fSmrg add_write_dep(state, &state->last_tlb, n); 40401e04c3fSmrg 40501e04c3fSmrg add_write_dep(state, &state->last_tmu_write, n); 40601e04c3fSmrg add_write_dep(state, &state->last_tmu_config, n); 40701e04c3fSmrg } 40801e04c3fSmrg 40901e04c3fSmrg if (v3d_qpu_waits_on_tmu(inst)) { 41001e04c3fSmrg /* TMU loads are coming from a FIFO, so ordering is important. 41101e04c3fSmrg */ 4127ec681f3Smrg add_write_dep(state, &state->last_tmu_read, n); 4137ec681f3Smrg /* Keep TMU loads after their TMU lookup terminator */ 4147ec681f3Smrg add_read_dep(state, state->last_tmu_config, n); 41501e04c3fSmrg } 41601e04c3fSmrg 4177ec681f3Smrg /* Allow wrtmuc to be reordered with other instructions in the 4187ec681f3Smrg * same TMU sequence by using a read dependency on the last TMU 4197ec681f3Smrg * sequence terminator. 4207ec681f3Smrg */ 42101e04c3fSmrg if (inst->sig.wrtmuc) 4227ec681f3Smrg add_read_dep(state, state->last_tmu_config, n); 42301e04c3fSmrg 42401e04c3fSmrg if (inst->sig.ldtlb | inst->sig.ldtlbu) 4257ec681f3Smrg add_write_dep(state, &state->last_tlb, n); 42601e04c3fSmrg 427ed98bd31Smaya if (inst->sig.ldvpm) { 42801e04c3fSmrg add_write_dep(state, &state->last_vpm_read, n); 42901e04c3fSmrg 430ed98bd31Smaya /* At least for now, we're doing shared I/O segments, so queue 431ed98bd31Smaya * all writes after all reads. 
432ed98bd31Smaya */ 433ed98bd31Smaya if (!separate_vpm_segment) 434ed98bd31Smaya add_write_dep(state, &state->last_vpm, n); 435ed98bd31Smaya } 436ed98bd31Smaya 43701e04c3fSmrg /* inst->sig.ldunif or sideband uniform read */ 438ed98bd31Smaya if (vir_has_uniform(qinst)) 43901e04c3fSmrg add_write_dep(state, &state->last_unif, n); 44001e04c3fSmrg 4417ec681f3Smrg /* Both unifa and ldunifa must preserve ordering */ 4427ec681f3Smrg if (inst->sig.ldunifa || inst->sig.ldunifarf) 4437ec681f3Smrg add_write_dep(state, &state->last_unifa, n); 4447ec681f3Smrg 445ed98bd31Smaya if (v3d_qpu_reads_flags(inst)) 446ed98bd31Smaya add_read_dep(state, state->last_sf, n); 447ed98bd31Smaya if (v3d_qpu_writes_flags(inst)) 448ed98bd31Smaya add_write_dep(state, &state->last_sf, n); 44901e04c3fSmrg} 45001e04c3fSmrg 45101e04c3fSmrgstatic void 452ed98bd31Smayacalculate_forward_deps(struct v3d_compile *c, struct dag *dag, 453ed98bd31Smaya struct list_head *schedule_list) 45401e04c3fSmrg{ 45501e04c3fSmrg struct schedule_state state; 45601e04c3fSmrg 45701e04c3fSmrg memset(&state, 0, sizeof(state)); 458ed98bd31Smaya state.dag = dag; 45901e04c3fSmrg state.devinfo = c->devinfo; 46001e04c3fSmrg state.dir = F; 46101e04c3fSmrg 46201e04c3fSmrg list_for_each_entry(struct schedule_node, node, schedule_list, link) 46301e04c3fSmrg calculate_deps(&state, node); 46401e04c3fSmrg} 46501e04c3fSmrg 46601e04c3fSmrgstatic void 467ed98bd31Smayacalculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 468ed98bd31Smaya struct list_head *schedule_list) 46901e04c3fSmrg{ 47001e04c3fSmrg struct schedule_state state; 47101e04c3fSmrg 47201e04c3fSmrg memset(&state, 0, sizeof(state)); 473ed98bd31Smaya state.dag = dag; 47401e04c3fSmrg state.devinfo = c->devinfo; 47501e04c3fSmrg state.dir = R; 47601e04c3fSmrg 477ed98bd31Smaya list_for_each_entry_rev(struct schedule_node, node, schedule_list, 478ed98bd31Smaya link) { 47901e04c3fSmrg calculate_deps(&state, (struct schedule_node *)node); 48001e04c3fSmrg } 48101e04c3fSmrg} 
48201e04c3fSmrg 48301e04c3fSmrgstruct choose_scoreboard { 484ed98bd31Smaya struct dag *dag; 48501e04c3fSmrg int tick; 48601e04c3fSmrg int last_magic_sfu_write_tick; 4877ec681f3Smrg int last_stallable_sfu_reg; 4887ec681f3Smrg int last_stallable_sfu_tick; 48901e04c3fSmrg int last_ldvary_tick; 4907ec681f3Smrg int last_unifa_write_tick; 49101e04c3fSmrg int last_uniforms_reset_tick; 49201e04c3fSmrg int last_thrsw_tick; 4937ec681f3Smrg int last_branch_tick; 4947ec681f3Smrg int last_setmsf_tick; 4957ec681f3Smrg bool first_thrsw_emitted; 4967ec681f3Smrg bool last_thrsw_emitted; 4977ec681f3Smrg bool fixup_ldvary; 4987ec681f3Smrg int ldvary_count; 49901e04c3fSmrg}; 50001e04c3fSmrg 50101e04c3fSmrgstatic bool 50201e04c3fSmrgmux_reads_too_soon(struct choose_scoreboard *scoreboard, 50301e04c3fSmrg const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 50401e04c3fSmrg{ 50501e04c3fSmrg switch (mux) { 50601e04c3fSmrg case V3D_QPU_MUX_R4: 50701e04c3fSmrg if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 50801e04c3fSmrg return true; 50901e04c3fSmrg break; 51001e04c3fSmrg 51101e04c3fSmrg case V3D_QPU_MUX_R5: 51201e04c3fSmrg if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 51301e04c3fSmrg return true; 51401e04c3fSmrg break; 51501e04c3fSmrg default: 51601e04c3fSmrg break; 51701e04c3fSmrg } 51801e04c3fSmrg 51901e04c3fSmrg return false; 52001e04c3fSmrg} 52101e04c3fSmrg 52201e04c3fSmrgstatic bool 52301e04c3fSmrgreads_too_soon_after_write(struct choose_scoreboard *scoreboard, 52401e04c3fSmrg struct qinst *qinst) 52501e04c3fSmrg{ 52601e04c3fSmrg const struct v3d_qpu_instr *inst = &qinst->qpu; 52701e04c3fSmrg 52801e04c3fSmrg /* XXX: Branching off of raddr. 
*/ 52901e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 53001e04c3fSmrg return false; 53101e04c3fSmrg 53201e04c3fSmrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 53301e04c3fSmrg 53401e04c3fSmrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 53501e04c3fSmrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 53601e04c3fSmrg mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 53701e04c3fSmrg return true; 53801e04c3fSmrg } 53901e04c3fSmrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 54001e04c3fSmrg mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 54101e04c3fSmrg return true; 54201e04c3fSmrg } 54301e04c3fSmrg } 54401e04c3fSmrg 54501e04c3fSmrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 54601e04c3fSmrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 54701e04c3fSmrg mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 54801e04c3fSmrg return true; 54901e04c3fSmrg } 55001e04c3fSmrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 55101e04c3fSmrg mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 55201e04c3fSmrg return true; 55301e04c3fSmrg } 55401e04c3fSmrg } 55501e04c3fSmrg 55601e04c3fSmrg /* XXX: imm */ 55701e04c3fSmrg 55801e04c3fSmrg return false; 55901e04c3fSmrg} 56001e04c3fSmrg 56101e04c3fSmrgstatic bool 56201e04c3fSmrgwrites_too_soon_after_write(const struct v3d_device_info *devinfo, 56301e04c3fSmrg struct choose_scoreboard *scoreboard, 56401e04c3fSmrg struct qinst *qinst) 56501e04c3fSmrg{ 56601e04c3fSmrg const struct v3d_qpu_instr *inst = &qinst->qpu; 56701e04c3fSmrg 56801e04c3fSmrg /* Don't schedule any other r4 write too soon after an SFU write. 56901e04c3fSmrg * This would normally be prevented by dependency tracking, but might 57001e04c3fSmrg * occur if a dead SFU computation makes it to scheduling. 
57101e04c3fSmrg */ 57201e04c3fSmrg if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 57301e04c3fSmrg v3d_qpu_writes_r4(devinfo, inst)) 57401e04c3fSmrg return true; 57501e04c3fSmrg 57601e04c3fSmrg return false; 57701e04c3fSmrg} 57801e04c3fSmrg 57901e04c3fSmrgstatic bool 5807ec681f3Smrgscoreboard_is_locked(struct choose_scoreboard *scoreboard, 5817ec681f3Smrg bool lock_scoreboard_on_first_thrsw) 5827ec681f3Smrg{ 5837ec681f3Smrg if (lock_scoreboard_on_first_thrsw) { 5847ec681f3Smrg return scoreboard->first_thrsw_emitted && 5857ec681f3Smrg scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 5867ec681f3Smrg } 5877ec681f3Smrg 5887ec681f3Smrg return scoreboard->last_thrsw_emitted && 5897ec681f3Smrg scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 5907ec681f3Smrg} 5917ec681f3Smrg 5927ec681f3Smrgstatic bool 5937ec681f3Smrgpixel_scoreboard_too_soon(struct v3d_compile *c, 5947ec681f3Smrg struct choose_scoreboard *scoreboard, 59501e04c3fSmrg const struct v3d_qpu_instr *inst) 59601e04c3fSmrg{ 5977ec681f3Smrg return qpu_inst_is_tlb(inst) && 5987ec681f3Smrg !scoreboard_is_locked(scoreboard, 5997ec681f3Smrg c->lock_scoreboard_on_first_thrsw); 6007ec681f3Smrg} 6017ec681f3Smrg 6027ec681f3Smrgstatic bool 6037ec681f3Smrgqpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, 6047ec681f3Smrg uint32_t waddr) { 6057ec681f3Smrg 6067ec681f3Smrg if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 6077ec681f3Smrg return false; 6087ec681f3Smrg 6097ec681f3Smrg if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && 6107ec681f3Smrg inst->raddr_a == waddr) 6117ec681f3Smrg return true; 6127ec681f3Smrg 6137ec681f3Smrg if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && 6147ec681f3Smrg !inst->sig.small_imm && (inst->raddr_b == waddr)) 6157ec681f3Smrg return true; 6167ec681f3Smrg 6177ec681f3Smrg return false; 6187ec681f3Smrg} 6197ec681f3Smrg 6207ec681f3Smrgstatic bool 6217ec681f3Smrgmux_read_stalls(struct choose_scoreboard *scoreboard, 6227ec681f3Smrg const struct v3d_qpu_instr *inst) 6237ec681f3Smrg{ 
6247ec681f3Smrg return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && 6257ec681f3Smrg qpu_instruction_uses_rf(inst, 6267ec681f3Smrg scoreboard->last_stallable_sfu_reg); 62701e04c3fSmrg} 62801e04c3fSmrg 6297ec681f3Smrg/* We define a max schedule priority to allow negative priorities as result of 6307ec681f3Smrg * substracting this max when an instruction stalls. So instructions that 6317ec681f3Smrg * stall have lower priority than regular instructions. */ 6327ec681f3Smrg#define MAX_SCHEDULE_PRIORITY 16 6337ec681f3Smrg 63401e04c3fSmrgstatic int 6357ec681f3Smrgget_instruction_priority(const struct v3d_device_info *devinfo, 6367ec681f3Smrg const struct v3d_qpu_instr *inst) 63701e04c3fSmrg{ 63801e04c3fSmrg uint32_t baseline_score; 63901e04c3fSmrg uint32_t next_score = 0; 64001e04c3fSmrg 64101e04c3fSmrg /* Schedule TLB operations as late as possible, to get more 64201e04c3fSmrg * parallelism between shaders. 64301e04c3fSmrg */ 64401e04c3fSmrg if (qpu_inst_is_tlb(inst)) 64501e04c3fSmrg return next_score; 64601e04c3fSmrg next_score++; 64701e04c3fSmrg 64801e04c3fSmrg /* Schedule texture read results collection late to hide latency. */ 64901e04c3fSmrg if (v3d_qpu_waits_on_tmu(inst)) 65001e04c3fSmrg return next_score; 65101e04c3fSmrg next_score++; 65201e04c3fSmrg 65301e04c3fSmrg /* Default score for things that aren't otherwise special. */ 65401e04c3fSmrg baseline_score = next_score; 65501e04c3fSmrg next_score++; 65601e04c3fSmrg 65701e04c3fSmrg /* Schedule texture read setup early to hide their latency better. 
*/ 6587ec681f3Smrg if (v3d_qpu_writes_tmu(devinfo, inst)) 65901e04c3fSmrg return next_score; 66001e04c3fSmrg next_score++; 66101e04c3fSmrg 6627ec681f3Smrg /* We should increase the maximum if we assert here */ 6637ec681f3Smrg assert(next_score < MAX_SCHEDULE_PRIORITY); 6647ec681f3Smrg 66501e04c3fSmrg return baseline_score; 66601e04c3fSmrg} 66701e04c3fSmrg 66801e04c3fSmrgstatic bool 6697ec681f3Smrgqpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, 6707ec681f3Smrg enum v3d_qpu_waddr waddr) 67101e04c3fSmrg{ 6727ec681f3Smrg return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || 67301e04c3fSmrg v3d_qpu_magic_waddr_is_sfu(waddr) || 67401e04c3fSmrg v3d_qpu_magic_waddr_is_tlb(waddr) || 67501e04c3fSmrg v3d_qpu_magic_waddr_is_vpm(waddr) || 67601e04c3fSmrg v3d_qpu_magic_waddr_is_tsy(waddr)); 67701e04c3fSmrg} 67801e04c3fSmrg 67901e04c3fSmrgstatic bool 6807ec681f3Smrgqpu_accesses_peripheral(const struct v3d_device_info *devinfo, 6817ec681f3Smrg const struct v3d_qpu_instr *inst) 68201e04c3fSmrg{ 68301e04c3fSmrg if (v3d_qpu_uses_vpm(inst)) 68401e04c3fSmrg return true; 68501e04c3fSmrg if (v3d_qpu_uses_sfu(inst)) 68601e04c3fSmrg return true; 68701e04c3fSmrg 68801e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 68901e04c3fSmrg if (inst->alu.add.op != V3D_QPU_A_NOP && 69001e04c3fSmrg inst->alu.add.magic_write && 6917ec681f3Smrg qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { 69201e04c3fSmrg return true; 69301e04c3fSmrg } 69401e04c3fSmrg 69501e04c3fSmrg if (inst->alu.add.op == V3D_QPU_A_TMUWT) 69601e04c3fSmrg return true; 69701e04c3fSmrg 69801e04c3fSmrg if (inst->alu.mul.op != V3D_QPU_M_NOP && 69901e04c3fSmrg inst->alu.mul.magic_write && 7007ec681f3Smrg qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { 70101e04c3fSmrg return true; 70201e04c3fSmrg } 70301e04c3fSmrg } 70401e04c3fSmrg 70501e04c3fSmrg return (inst->sig.ldvpm || 70601e04c3fSmrg inst->sig.ldtmu || 70701e04c3fSmrg inst->sig.ldtlb || 70801e04c3fSmrg inst->sig.ldtlbu || 70901e04c3fSmrg 
inst->sig.wrtmuc); 71001e04c3fSmrg} 71101e04c3fSmrg 7127ec681f3Smrgstatic bool 7137ec681f3Smrgqpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, 7147ec681f3Smrg const struct v3d_qpu_instr *a, 7157ec681f3Smrg const struct v3d_qpu_instr *b) 7167ec681f3Smrg{ 7177ec681f3Smrg const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); 7187ec681f3Smrg const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); 7197ec681f3Smrg 7207ec681f3Smrg /* We can always do one peripheral access per instruction. */ 7217ec681f3Smrg if (!a_uses_peripheral || !b_uses_peripheral) 7227ec681f3Smrg return true; 7237ec681f3Smrg 7247ec681f3Smrg if (devinfo->ver < 41) 7257ec681f3Smrg return false; 7267ec681f3Smrg 7277ec681f3Smrg /* V3D 4.1 and later allow TMU read along with a VPM read or write, and 7287ec681f3Smrg * WRTMUC with a TMU magic register write (other than tmuc). 7297ec681f3Smrg */ 7307ec681f3Smrg if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || 7317ec681f3Smrg (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { 7327ec681f3Smrg return true; 7337ec681f3Smrg } 7347ec681f3Smrg 7357ec681f3Smrg if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || 7367ec681f3Smrg (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { 7377ec681f3Smrg return true; 7387ec681f3Smrg } 7397ec681f3Smrg 7407ec681f3Smrg return false; 7417ec681f3Smrg} 7427ec681f3Smrg 7437ec681f3Smrg/* Compute a bitmask of which rf registers are used between 7447ec681f3Smrg * the two instructions. 
7457ec681f3Smrg */ 7467ec681f3Smrgstatic uint64_t 7477ec681f3Smrgqpu_raddrs_used(const struct v3d_qpu_instr *a, 7487ec681f3Smrg const struct v3d_qpu_instr *b) 7497ec681f3Smrg{ 7507ec681f3Smrg assert(a->type == V3D_QPU_INSTR_TYPE_ALU); 7517ec681f3Smrg assert(b->type == V3D_QPU_INSTR_TYPE_ALU); 7527ec681f3Smrg 7537ec681f3Smrg uint64_t raddrs_used = 0; 7547ec681f3Smrg if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) 7557ec681f3Smrg raddrs_used |= (1ll << a->raddr_a); 7567ec681f3Smrg if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) 7577ec681f3Smrg raddrs_used |= (1ll << a->raddr_b); 7587ec681f3Smrg if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) 7597ec681f3Smrg raddrs_used |= (1ll << b->raddr_a); 7607ec681f3Smrg if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) 7617ec681f3Smrg raddrs_used |= (1ll << b->raddr_b); 7627ec681f3Smrg 7637ec681f3Smrg return raddrs_used; 7647ec681f3Smrg} 7657ec681f3Smrg 7667ec681f3Smrg/* Take two instructions and attempt to merge their raddr fields 7677ec681f3Smrg * into one merged instruction. Returns false if the two instructions 7687ec681f3Smrg * access more than two different rf registers between them, or more 7697ec681f3Smrg * than one rf register and one small immediate. 
 */
/* Assigns the raddr_a/raddr_b read slots for a merged add+mul instruction
 * pair, remapping each op's A/B muxes in *result to match.  Returns false
 * if the pair reads more registers than the two raddr slots (plus an
 * optional shared small immediate in raddr_b) can provide.
 */
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr)
{
        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        /* Only two physical raddr slots exist per instruction. */
        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
                /* A small immediate occupies raddr_b, leaving one slot. */
                if (naddrs > 1)
                        return false;

                /* Both ops may share a small immediate only if it is the
                 * same value.
                 */
                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm = true;
                result->raddr_b = add_instr->sig.small_imm ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        /* First register read goes in raddr_a; retarget any mux that was
         * reading it through raddr_b.
         */
        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(1ll << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
                                result->alu.add.a = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
                                result->alu.mul.a = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        /* Second register read goes in raddr_b; retarget any mux that was
         * reading it through raddr_a.
         */
        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
                        result->alu.add.a = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
                        result->alu.mul.a = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b = V3D_QPU_MUX_B;
                }
        }

        return true;
}

/* Returns true if the add-ALU opcode has an equivalent mul-ALU opcode
 * (see add_op_as_mul_op).
 */
static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

/* Maps an add-ALU opcode to the mul-ALU opcode with the same semantics.
 * Only valid for opcodes accepted by can_do_add_as_mul().
 */
static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

/* Moves an instruction's add-ALU op (and its condition/flag state) over to
 * the mul-ALU, leaving the add-ALU as a NOP.  The add op must be convertible
 * per can_do_add_as_mul().
 */
static void
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        /* Carry the condition/pack/unpack flag state over to the mul side
         * and clear it on the add side.
         */
        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;
}

/* Tries to combine ALU instructions a and b into a single QPU instruction
 * (one add op + one mul op), writing the combination to *result.  Returns
 * false if they can't be packed together.
 */
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        if (!qpu_compatible_peripheral_access(devinfo, a, b))
                return false;

        struct v3d_qpu_instr merge = *a;
        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;

        struct v3d_qpu_instr mul_inst;
        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op == V3D_QPU_A_NOP) {
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = a;
                }
                /* If a's add op is used but its mul op is not, then see if we
                 * can convert either a's add op or b's add op to a mul op
                 * so we can merge.
                 */
                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                         can_do_add_as_mul(b->alu.add.op)) {
                        mul_inst = *b;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge.alu.mul = mul_inst.alu.mul;

                        merge.flags.mc = b->flags.ac;
                        merge.flags.mpf = b->flags.apf;
                        merge.flags.muf = b->flags.auf;

                        add_instr = a;
                        mul_instr = &mul_inst;
                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                           can_do_add_as_mul(a->alu.add.op)) {
                        mul_inst = *a;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge = mul_inst;
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = &mul_inst;
                } else {
                        return false;
                }
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                /* Both instructions need the mul ALU: can't merge. */
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;

                mul_instr = b;
                add_instr = a;
        }

        if (add_instr && mul_instr &&
            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
                return false;
        }

        /* Union the signal bits of both instructions; conflicting
         * combinations are rejected by v3d_qpu_instr_pack() below.
         */
        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        /* Only one signal may write the signal address. */
        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        /* Final validation: the merged instruction must actually encode. */
        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure.
         */
        assert(ok || (a != result && b != result));

        return ok;
}

/* Candidates skipped for ldvary pipelining: uniform loads, which we prefer
 * to hold back while fragment-shader ldvary sequences are being formed.
 */
static inline bool
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
{
        return inst->sig.ldunif || inst->sig.ldunifrf;
}

static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst);

/* Heuristically picks the next DAG head to schedule this tick.  When
 * prev_inst is non-NULL, only instructions that can merge with it are
 * considered.  Returns NULL when nothing can be scheduled.
 */
static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        /* In fragment shaders, prioritize forming ldvary sequences until all
         * inputs have been loaded.
         */
        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "Before doing a TLB access a scoreboard wait must have been
                 *  done. This happens either on the first or last thread
                 *  switch, depending on a setting (scb_wait_on_first_thrsw) in
                 *  the shader state."
                 */
                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of another
                         * branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded in
                         * the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining.  Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                /* Tie-break equal priorities by critical-path delay. */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}

/* Records the tick of the latest SFU or (v4.x+) unifa magic-register write,
 * used later to enforce minimum spacing rules.
 */
static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr,
                                  const struct v3d_device_info *devinfo)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
                scoreboard->last_unifa_write_tick = scoreboard->tick;
}

/* Records the destination register and tick of an SFU instruction so later
 * reads can detect a potential stall.
 */
static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
                                      const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_instr_is_sfu(inst)) {
                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
        }
}

/* Updates all scoreboard hazard-tracking state after committing inst to the
 * schedule at the current tick.
 */
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst,
                             const struct v3d_device_info *devinfo)
{
        if (inst->type ==
            V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr,
                                                          devinfo);
                } else {
                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                              inst);
                }

                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
                        scoreboard->last_setmsf_tick = scoreboard->tick;
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr,
                                                          devinfo);
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;
}

/* Debug dump of the current DAG heads and their outgoing edges
 * ('w' = write dependency, 'r' = read dependency).
 */
static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

/* Estimated cycles between a write to a magic register and the consuming
 * instruction "after" being able to use the result.
 */
static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
                                    enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
            v3d_qpu_waits_on_tmu(after)) {
                return 100;
        }

        /* Assume that anything depending on us is consuming the SFU result.
         */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}

/* Estimated cycles between scheduling "before" and "after" being able to
 * consume its result.
 */
static uint32_t
instruction_latency(const struct v3d_device_info *devinfo,
                    struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        if (v3d_qpu_instr_is_sfu(before_inst))
                return 2;

        return latency;
}

/** Recursive computation of the delay member of a node.
 */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;
        struct v3d_compile *c = (struct v3d_compile *) state;

        n->delay = 1;

        /* delay = longest latency chain from this node to the end of the
         * program through any child.
         */
        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(c->devinfo, n,
                                                               child)));
        }
}

/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
 * should be called on it later to finish pruning the other edges).
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                /* edge->data flags a write (WAR) dependency. */
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

/* Commits node at the given schedule time: propagates unblocked_time to its
 * children and prunes it from the DAG so the children can become heads.
 * A NULL node (no instruction picked) is a no-op.
 */
static void
mark_instruction_scheduled(const struct v3d_device_info *devinfo,
                           struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(devinfo, node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/* Appends inst to the block and advances the scoreboard by one tick. */
static void
insert_scheduled_instruction(struct v3d_compile *c,
                             struct qblock *block,
                             struct choose_scoreboard *scoreboard,
                             struct qinst *inst)
{
        list_addtail(&inst->link, &block->instructions);

        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
        c->qpu_inst_count++;
        scoreboard->tick++;
}

/* Builds a fresh NOP instruction (add-ALU NOP with undef operands). */
static struct qinst *
vir_nop()
{
        struct qreg undef = vir_nop_reg();
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

/* Checks whether qinst may occupy the given delay slot (0..2) of a
 * thread-end sequence.
 */
static bool
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
                              const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* GFXH-1625: TMUWT not allowed in the final instruction. */
                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return false;

                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
                    !inst->sig_magic) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

/**
 * This is called when trying to merge a thrsw back into the instruction stream
 * of instructions that were scheduled *before* the thrsw signal to fill its
 * delay slots. Because the actual execution of the thrsw happens after the
 * delay slots, it is usually safe to do this, but there are some cases that
 * need special care.
 */
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                          const struct qinst *qinst,
                                          uint32_t slot)
{
        /* No scheduling SFU when the result would land in the other
         * thread.  The simulator complains for safety, though it
         * would only occur for dead code in our case.
         */
        if (slot > 0 &&
            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                return false;
        }

        if (slot > 0 && qinst->qpu.sig.ldvary)
                return false;

        /* unifa and the following 3 instructions can't overlap a
         * thread switch/end. The docs further clarify that this means
         * the cycle at which the actual thread switch/end happens
         * and not when the thrsw instruction is processed, which would
         * be after the 2 delay slots following the thrsw instruction.
         * This means that we can move up a thrsw up to the instruction
         * right after unifa:
         *
         * unifa, r5
         * thrsw
         * delay slot 1
         * delay slot 2
         * Thread switch happens here, 4 instructions away from unifa
         */
        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
                return false;

        return true;
}

/**
 * This is called for instructions scheduled *after* a thrsw signal that may
 * land in the delay slots of the thrsw. Because these instructions were
 * scheduled after the thrsw, we need to be careful when placing them into
 * the delay slots, since that means that we are moving them ahead of the
 * thread switch and we need to ensure that is not a problem.
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until scoreboard wait is executed, which
         * we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread switch
         * happens since that would make the current thread stall before the
         * switch, which is exactly what we want to avoid with the thrsw
         * instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        return true;
}

/* Validates that the instructions_in_sequence instructions starting at qinst
 * may all live in the delay slots of a thrsw (and, for thrend, of the
 * thread-end sequence).
 */
static bool
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                        return false;

                if (is_thrend &&
                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
171501e04c3fSmrg */ 171601e04c3fSmrg assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 171701e04c3fSmrg assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 171801e04c3fSmrg assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 171901e04c3fSmrg 17207ec681f3Smrg /* Don't try to emit a thrsw in the delay slots of a previous thrsw 17217ec681f3Smrg * or branch. 17227ec681f3Smrg */ 17237ec681f3Smrg while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) { 17247ec681f3Smrg emit_nop(c, block, scoreboard); 17257ec681f3Smrg time++; 17267ec681f3Smrg } 17277ec681f3Smrg while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) { 17287ec681f3Smrg emit_nop(c, block, scoreboard); 17297ec681f3Smrg time++; 17307ec681f3Smrg } 17317ec681f3Smrg 173201e04c3fSmrg /* Find how far back into previous instructions we can put the THRSW. */ 173301e04c3fSmrg int slots_filled = 0; 173401e04c3fSmrg struct qinst *merge_inst = NULL; 173501e04c3fSmrg vir_for_each_inst_rev(prev_inst, block) { 173601e04c3fSmrg struct v3d_qpu_sig sig = prev_inst->qpu.sig; 173701e04c3fSmrg sig.thrsw = true; 173801e04c3fSmrg uint32_t packed_sig; 173901e04c3fSmrg 174001e04c3fSmrg if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) 174101e04c3fSmrg break; 174201e04c3fSmrg 174301e04c3fSmrg if (!valid_thrsw_sequence(c, scoreboard, 174401e04c3fSmrg prev_inst, slots_filled + 1, 174501e04c3fSmrg is_thrend)) { 174601e04c3fSmrg break; 174701e04c3fSmrg } 174801e04c3fSmrg 174901e04c3fSmrg merge_inst = prev_inst; 175001e04c3fSmrg if (++slots_filled == 3) 175101e04c3fSmrg break; 175201e04c3fSmrg } 175301e04c3fSmrg 175401e04c3fSmrg bool needs_free = false; 175501e04c3fSmrg if (merge_inst) { 175601e04c3fSmrg merge_inst->qpu.sig.thrsw = true; 175701e04c3fSmrg needs_free = true; 175801e04c3fSmrg scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 175901e04c3fSmrg } else { 176001e04c3fSmrg scoreboard->last_thrsw_tick = scoreboard->tick; 176101e04c3fSmrg insert_scheduled_instruction(c, block, scoreboard, inst); 176201e04c3fSmrg 
time++; 176301e04c3fSmrg slots_filled++; 176401e04c3fSmrg merge_inst = inst; 176501e04c3fSmrg } 176601e04c3fSmrg 17677ec681f3Smrg scoreboard->first_thrsw_emitted = true; 176801e04c3fSmrg 176901e04c3fSmrg /* If we're emitting the last THRSW (other than program end), then 177001e04c3fSmrg * signal that to the HW by emitting two THRSWs in a row. 177101e04c3fSmrg */ 177201e04c3fSmrg if (inst->is_last_thrsw) { 17737ec681f3Smrg if (slots_filled <= 1) { 17747ec681f3Smrg emit_nop(c, block, scoreboard); 17757ec681f3Smrg time++; 17767ec681f3Smrg } 177701e04c3fSmrg struct qinst *second_inst = 177801e04c3fSmrg (struct qinst *)merge_inst->link.next; 177901e04c3fSmrg second_inst->qpu.sig.thrsw = true; 17807ec681f3Smrg scoreboard->last_thrsw_emitted = true; 17817ec681f3Smrg } 17827ec681f3Smrg 17837ec681f3Smrg /* Make sure the thread end executes within the program lifespan */ 17847ec681f3Smrg if (is_thrend) { 17857ec681f3Smrg for (int i = 0; i < 3 - slots_filled; i++) { 17867ec681f3Smrg emit_nop(c, block, scoreboard); 17877ec681f3Smrg time++; 17887ec681f3Smrg } 178901e04c3fSmrg } 179001e04c3fSmrg 179101e04c3fSmrg /* If we put our THRSW into another instruction, free up the 179201e04c3fSmrg * instruction that didn't end up scheduled into the list. 
179301e04c3fSmrg */ 179401e04c3fSmrg if (needs_free) 179501e04c3fSmrg free(inst); 179601e04c3fSmrg 179701e04c3fSmrg return time; 179801e04c3fSmrg} 179901e04c3fSmrg 18007ec681f3Smrgstatic bool 18017ec681f3Smrgqpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst) 18027ec681f3Smrg{ 18037ec681f3Smrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) 18047ec681f3Smrg return false; 18057ec681f3Smrg 18067ec681f3Smrg if (inst->qpu.sig.thrsw) 18077ec681f3Smrg return false; 18087ec681f3Smrg 18097ec681f3Smrg if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu)) 18107ec681f3Smrg return false; 18117ec681f3Smrg 18127ec681f3Smrg if (vir_has_uniform(inst)) 18137ec681f3Smrg return false; 18147ec681f3Smrg 18157ec681f3Smrg return true; 18167ec681f3Smrg} 18177ec681f3Smrg 18187ec681f3Smrgstatic void 18197ec681f3Smrgemit_branch(struct v3d_compile *c, 18207ec681f3Smrg struct qblock *block, 18217ec681f3Smrg struct choose_scoreboard *scoreboard, 18227ec681f3Smrg struct qinst *inst) 18237ec681f3Smrg{ 18247ec681f3Smrg assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 18257ec681f3Smrg 18267ec681f3Smrg /* We should've not picked up a branch for the delay slots of a previous 18277ec681f3Smrg * thrsw, branch or unifa write instruction. 18287ec681f3Smrg */ 18297ec681f3Smrg int branch_tick = scoreboard->tick; 18307ec681f3Smrg assert(scoreboard->last_thrsw_tick + 2 < branch_tick); 18317ec681f3Smrg assert(scoreboard->last_branch_tick + 3 < branch_tick); 18327ec681f3Smrg assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); 18337ec681f3Smrg 18347ec681f3Smrg /* Can't place a branch with msfign != 0 and cond != 0,2,3 after 18357ec681f3Smrg * setmsf. 
18367ec681f3Smrg */ 18377ec681f3Smrg bool is_safe_msf_branch = 18387ec681f3Smrg inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || 18397ec681f3Smrg inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || 18407ec681f3Smrg inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || 18417ec681f3Smrg inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0; 18427ec681f3Smrg assert(scoreboard->last_setmsf_tick != branch_tick - 1 || 18437ec681f3Smrg is_safe_msf_branch); 18447ec681f3Smrg 18457ec681f3Smrg /* Insert the branch instruction */ 18467ec681f3Smrg insert_scheduled_instruction(c, block, scoreboard, inst); 18477ec681f3Smrg 18487ec681f3Smrg /* Now see if we can move the branch instruction back into the 18497ec681f3Smrg * instruction stream to fill its delay slots 18507ec681f3Smrg */ 18517ec681f3Smrg int slots_filled = 0; 18527ec681f3Smrg while (slots_filled < 3 && block->instructions.next != &inst->link) { 18537ec681f3Smrg struct qinst *prev_inst = (struct qinst *) inst->link.prev; 18547ec681f3Smrg assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH); 18557ec681f3Smrg 18567ec681f3Smrg /* Can't move the branch instruction if that would place it 18577ec681f3Smrg * in the delay slots of other instructions. 18587ec681f3Smrg */ 18597ec681f3Smrg if (scoreboard->last_branch_tick + 3 >= 18607ec681f3Smrg branch_tick - slots_filled - 1) { 18617ec681f3Smrg break; 18627ec681f3Smrg } 18637ec681f3Smrg 18647ec681f3Smrg if (scoreboard->last_thrsw_tick + 2 >= 18657ec681f3Smrg branch_tick - slots_filled - 1) { 18667ec681f3Smrg break; 18677ec681f3Smrg } 18687ec681f3Smrg 18697ec681f3Smrg if (scoreboard->last_unifa_write_tick + 3 >= 18707ec681f3Smrg branch_tick - slots_filled - 1) { 18717ec681f3Smrg break; 18727ec681f3Smrg } 18737ec681f3Smrg 18747ec681f3Smrg /* Can't move a conditional branch before the instruction 18757ec681f3Smrg * that writes the flags for its condition. 
18767ec681f3Smrg */ 18777ec681f3Smrg if (v3d_qpu_writes_flags(&prev_inst->qpu) && 18787ec681f3Smrg inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) { 18797ec681f3Smrg break; 18807ec681f3Smrg } 18817ec681f3Smrg 18827ec681f3Smrg if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst)) 18837ec681f3Smrg break; 18847ec681f3Smrg 18857ec681f3Smrg if (!is_safe_msf_branch) { 18867ec681f3Smrg struct qinst *prev_prev_inst = 18877ec681f3Smrg (struct qinst *) prev_inst->link.prev; 18887ec681f3Smrg if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 18897ec681f3Smrg prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) { 18907ec681f3Smrg break; 18917ec681f3Smrg } 18927ec681f3Smrg } 18937ec681f3Smrg 18947ec681f3Smrg list_del(&prev_inst->link); 18957ec681f3Smrg list_add(&prev_inst->link, &inst->link); 18967ec681f3Smrg slots_filled++; 18977ec681f3Smrg } 18987ec681f3Smrg 18997ec681f3Smrg block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled; 19007ec681f3Smrg scoreboard->last_branch_tick = branch_tick - slots_filled; 19017ec681f3Smrg 19027ec681f3Smrg /* Fill any remaining delay slots. 19037ec681f3Smrg * 19047ec681f3Smrg * For unconditional branches we'll try to fill these with the 19057ec681f3Smrg * first instructions in the successor block after scheduling 19067ec681f3Smrg * all blocks when setting up branch targets. 
19077ec681f3Smrg */ 19087ec681f3Smrg for (int i = 0; i < 3 - slots_filled; i++) 19097ec681f3Smrg emit_nop(c, block, scoreboard); 19107ec681f3Smrg} 19117ec681f3Smrg 19127ec681f3Smrgstatic bool 19137ec681f3Smrgalu_reads_register(struct v3d_qpu_instr *inst, 19147ec681f3Smrg bool add, bool magic, uint32_t index) 19157ec681f3Smrg{ 19167ec681f3Smrg uint32_t num_src; 19177ec681f3Smrg enum v3d_qpu_mux mux_a, mux_b; 19187ec681f3Smrg 19197ec681f3Smrg if (add) { 19207ec681f3Smrg num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); 19217ec681f3Smrg mux_a = inst->alu.add.a; 19227ec681f3Smrg mux_b = inst->alu.add.b; 19237ec681f3Smrg } else { 19247ec681f3Smrg num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); 19257ec681f3Smrg mux_a = inst->alu.mul.a; 19267ec681f3Smrg mux_b = inst->alu.mul.b; 19277ec681f3Smrg } 19287ec681f3Smrg 19297ec681f3Smrg for (int i = 0; i < num_src; i++) { 19307ec681f3Smrg if (magic) { 19317ec681f3Smrg if (i == 0 && mux_a == index) 19327ec681f3Smrg return true; 19337ec681f3Smrg if (i == 1 && mux_b == index) 19347ec681f3Smrg return true; 19357ec681f3Smrg } else { 19367ec681f3Smrg if (i == 0 && mux_a == V3D_QPU_MUX_A && 19377ec681f3Smrg inst->raddr_a == index) { 19387ec681f3Smrg return true; 19397ec681f3Smrg } 19407ec681f3Smrg if (i == 0 && mux_a == V3D_QPU_MUX_B && 19417ec681f3Smrg inst->raddr_b == index) { 19427ec681f3Smrg return true; 19437ec681f3Smrg } 19447ec681f3Smrg if (i == 1 && mux_b == V3D_QPU_MUX_A && 19457ec681f3Smrg inst->raddr_a == index) { 19467ec681f3Smrg return true; 19477ec681f3Smrg } 19487ec681f3Smrg if (i == 1 && mux_b == V3D_QPU_MUX_B && 19497ec681f3Smrg inst->raddr_b == index) { 19507ec681f3Smrg return true; 19517ec681f3Smrg } 19527ec681f3Smrg } 19537ec681f3Smrg } 19547ec681f3Smrg 19557ec681f3Smrg return false; 19567ec681f3Smrg} 19577ec681f3Smrg 19587ec681f3Smrg/** 19597ec681f3Smrg * This takes and ldvary signal merged into 'inst' and tries to move it up to 19607ec681f3Smrg * the previous instruction to get good pipelining of ldvary 
sequences, 19617ec681f3Smrg * transforming this: 19627ec681f3Smrg * 19637ec681f3Smrg * nop ; nop ; ldvary.r4 19647ec681f3Smrg * nop ; fmul r0, r4, rf0 ; 19657ec681f3Smrg * fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst 19667ec681f3Smrg * 19677ec681f3Smrg * into: 19687ec681f3Smrg * 19697ec681f3Smrg * nop ; nop ; ldvary.r4 19707ec681f3Smrg * nop ; fmul r0, r4, rf0 ; ldvary.r1 19717ec681f3Smrg * fadd rf13, r0, r5 ; nop; ; <-- inst 19727ec681f3Smrg * 19737ec681f3Smrg * If we manage to do this successfully (we return true here), then flagging 19747ec681f3Smrg * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that 19757ec681f3Smrg * we will be able to pick up to merge into 'inst', leading to code like this: 19767ec681f3Smrg * 19777ec681f3Smrg * nop ; nop ; ldvary.r4 19787ec681f3Smrg * nop ; fmul r0, r4, rf0 ; ldvary.r1 19797ec681f3Smrg * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst 19807ec681f3Smrg */ 19817ec681f3Smrgstatic bool 19827ec681f3Smrgfixup_pipelined_ldvary(struct v3d_compile *c, 19837ec681f3Smrg struct choose_scoreboard *scoreboard, 19847ec681f3Smrg struct qblock *block, 19857ec681f3Smrg struct v3d_qpu_instr *inst) 19867ec681f3Smrg{ 19877ec681f3Smrg /* We only call this if we have successfuly merged an ldvary into a 19887ec681f3Smrg * previous instruction. 19897ec681f3Smrg */ 19907ec681f3Smrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 19917ec681f3Smrg assert(inst->sig.ldvary); 19927ec681f3Smrg uint32_t ldvary_magic = inst->sig_magic; 19937ec681f3Smrg uint32_t ldvary_index = inst->sig_addr; 19947ec681f3Smrg 19957ec681f3Smrg /* The instruction in which we merged the ldvary cannot read 19967ec681f3Smrg * the ldvary destination, if it does, then moving the ldvary before 19977ec681f3Smrg * it would overwrite it. 
19987ec681f3Smrg */ 19997ec681f3Smrg if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) 20007ec681f3Smrg return false; 20017ec681f3Smrg if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) 20027ec681f3Smrg return false; 20037ec681f3Smrg 20047ec681f3Smrg /* The implicit ldvary destination may not be written to by a signal 20057ec681f3Smrg * in the instruction following ldvary. Since we are planning to move 20067ec681f3Smrg * ldvary to the previous instruction, this means we need to check if 20077ec681f3Smrg * the current instruction has any other signal that could create this 20087ec681f3Smrg * conflict. The only other signal that can write to the implicit 20097ec681f3Smrg * ldvary destination that is compatible with ldvary in the same 20107ec681f3Smrg * instruction is ldunif. 20117ec681f3Smrg */ 20127ec681f3Smrg if (inst->sig.ldunif) 20137ec681f3Smrg return false; 20147ec681f3Smrg 20157ec681f3Smrg /* The previous instruction can't write to the same destination as the 20167ec681f3Smrg * ldvary. 
20177ec681f3Smrg */ 20187ec681f3Smrg struct qinst *prev = (struct qinst *) block->instructions.prev; 20197ec681f3Smrg if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU) 20207ec681f3Smrg return false; 20217ec681f3Smrg 20227ec681f3Smrg if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) { 20237ec681f3Smrg if (prev->qpu.alu.add.magic_write == ldvary_magic && 20247ec681f3Smrg prev->qpu.alu.add.waddr == ldvary_index) { 20257ec681f3Smrg return false; 20267ec681f3Smrg } 20277ec681f3Smrg } 20287ec681f3Smrg 20297ec681f3Smrg if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) { 20307ec681f3Smrg if (prev->qpu.alu.mul.magic_write == ldvary_magic && 20317ec681f3Smrg prev->qpu.alu.mul.waddr == ldvary_index) { 20327ec681f3Smrg return false; 20337ec681f3Smrg } 20347ec681f3Smrg } 20357ec681f3Smrg 20367ec681f3Smrg /* The previous instruction cannot have a conflicting signal */ 20377ec681f3Smrg if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) 20387ec681f3Smrg return false; 20397ec681f3Smrg 20407ec681f3Smrg /* The previous instruction cannot use flags since ldvary uses the 20417ec681f3Smrg * 'cond' instruction field to store the destination. 20427ec681f3Smrg */ 20437ec681f3Smrg if (v3d_qpu_writes_flags(&prev->qpu)) 20447ec681f3Smrg return false; 20457ec681f3Smrg if (v3d_qpu_reads_flags(&prev->qpu)) 20467ec681f3Smrg return false; 20477ec681f3Smrg 20487ec681f3Smrg /* We can't put an ldvary in the delay slots of a thrsw. We should've 20497ec681f3Smrg * prevented this when pairing up the ldvary with another instruction 20507ec681f3Smrg * and flagging it for a fixup. 20517ec681f3Smrg */ 20527ec681f3Smrg assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); 20537ec681f3Smrg 20547ec681f3Smrg /* Move the ldvary to the previous instruction and remove it from the 20557ec681f3Smrg * current one. 
20567ec681f3Smrg */ 20577ec681f3Smrg prev->qpu.sig.ldvary = true; 20587ec681f3Smrg prev->qpu.sig_magic = ldvary_magic; 20597ec681f3Smrg prev->qpu.sig_addr = ldvary_index; 20607ec681f3Smrg scoreboard->last_ldvary_tick = scoreboard->tick - 1; 20617ec681f3Smrg 20627ec681f3Smrg inst->sig.ldvary = false; 20637ec681f3Smrg inst->sig_magic = false; 20647ec681f3Smrg inst->sig_addr = 0; 20657ec681f3Smrg 20667ec681f3Smrg /* By moving ldvary to the previous instruction we make it update 20677ec681f3Smrg * r5 in the current one, so nothing else in it should write r5. 20687ec681f3Smrg * This should've been prevented by our depedency tracking, which 20697ec681f3Smrg * would not allow ldvary to be paired up with an instruction that 20707ec681f3Smrg * writes r5 (since our dependency tracking doesn't know that the 20717ec681f3Smrg * ldvary write r5 happens in the next instruction). 20727ec681f3Smrg */ 20737ec681f3Smrg assert(!v3d_qpu_writes_r5(c->devinfo, inst)); 20747ec681f3Smrg 20757ec681f3Smrg return true; 20767ec681f3Smrg} 20777ec681f3Smrg 207801e04c3fSmrgstatic uint32_t 207901e04c3fSmrgschedule_instructions(struct v3d_compile *c, 208001e04c3fSmrg struct choose_scoreboard *scoreboard, 208101e04c3fSmrg struct qblock *block, 208201e04c3fSmrg enum quniform_contents *orig_uniform_contents, 208301e04c3fSmrg uint32_t *orig_uniform_data, 208401e04c3fSmrg uint32_t *next_uniform) 208501e04c3fSmrg{ 208601e04c3fSmrg const struct v3d_device_info *devinfo = c->devinfo; 208701e04c3fSmrg uint32_t time = 0; 208801e04c3fSmrg 20897ec681f3Smrg while (!list_is_empty(&scoreboard->dag->heads)) { 209001e04c3fSmrg struct schedule_node *chosen = 20917ec681f3Smrg choose_instruction_to_schedule(c, scoreboard, NULL); 209201e04c3fSmrg struct schedule_node *merge = NULL; 209301e04c3fSmrg 209401e04c3fSmrg /* If there are no valid instructions to schedule, drop a NOP 209501e04c3fSmrg * in. 209601e04c3fSmrg */ 209701e04c3fSmrg struct qinst *qinst = chosen ? 
chosen->inst : vir_nop(); 209801e04c3fSmrg struct v3d_qpu_instr *inst = &qinst->qpu; 209901e04c3fSmrg 210001e04c3fSmrg if (debug) { 210101e04c3fSmrg fprintf(stderr, "t=%4d: current list:\n", 210201e04c3fSmrg time); 2103ed98bd31Smaya dump_state(devinfo, scoreboard->dag); 210401e04c3fSmrg fprintf(stderr, "t=%4d: chose: ", time); 210501e04c3fSmrg v3d_qpu_dump(devinfo, inst); 210601e04c3fSmrg fprintf(stderr, "\n"); 210701e04c3fSmrg } 210801e04c3fSmrg 210901e04c3fSmrg /* We can't mark_instruction_scheduled() the chosen inst until 211001e04c3fSmrg * we're done identifying instructions to merge, so put the 211101e04c3fSmrg * merged instructions on a list for a moment. 211201e04c3fSmrg */ 211301e04c3fSmrg struct list_head merged_list; 211401e04c3fSmrg list_inithead(&merged_list); 211501e04c3fSmrg 211601e04c3fSmrg /* Schedule this instruction onto the QPU list. Also try to 211701e04c3fSmrg * find an instruction to pair with it. 211801e04c3fSmrg */ 211901e04c3fSmrg if (chosen) { 212001e04c3fSmrg time = MAX2(chosen->unblocked_time, time); 2121ed98bd31Smaya pre_remove_head(scoreboard->dag, chosen); 212201e04c3fSmrg 212301e04c3fSmrg while ((merge = 21247ec681f3Smrg choose_instruction_to_schedule(c, scoreboard, 212501e04c3fSmrg chosen))) { 212601e04c3fSmrg time = MAX2(merge->unblocked_time, time); 21277ec681f3Smrg pre_remove_head(scoreboard->dag, merge); 212801e04c3fSmrg list_addtail(&merge->link, &merged_list); 212901e04c3fSmrg (void)qpu_merge_inst(devinfo, inst, 213001e04c3fSmrg inst, &merge->inst->qpu); 213101e04c3fSmrg if (merge->inst->uniform != -1) { 213201e04c3fSmrg chosen->inst->uniform = 213301e04c3fSmrg merge->inst->uniform; 213401e04c3fSmrg } 213501e04c3fSmrg 213601e04c3fSmrg if (debug) { 213701e04c3fSmrg fprintf(stderr, "t=%4d: merging: ", 213801e04c3fSmrg time); 213901e04c3fSmrg v3d_qpu_dump(devinfo, &merge->inst->qpu); 214001e04c3fSmrg fprintf(stderr, "\n"); 214101e04c3fSmrg fprintf(stderr, " result: "); 214201e04c3fSmrg v3d_qpu_dump(devinfo, inst); 
214301e04c3fSmrg fprintf(stderr, "\n"); 214401e04c3fSmrg } 21457ec681f3Smrg 21467ec681f3Smrg if (scoreboard->fixup_ldvary) { 21477ec681f3Smrg scoreboard->fixup_ldvary = false; 21487ec681f3Smrg if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) { 21497ec681f3Smrg /* Flag the ldvary as scheduled 21507ec681f3Smrg * now so we can try to merge the 21517ec681f3Smrg * follow-up instruction in the 21527ec681f3Smrg * the ldvary sequence into the 21537ec681f3Smrg * current instruction. 21547ec681f3Smrg */ 21557ec681f3Smrg mark_instruction_scheduled( 21567ec681f3Smrg devinfo, scoreboard->dag, 21577ec681f3Smrg time, merge); 21587ec681f3Smrg } 21597ec681f3Smrg } 216001e04c3fSmrg } 21617ec681f3Smrg if (mux_read_stalls(scoreboard, inst)) 21627ec681f3Smrg c->qpu_inst_stalled_count++; 216301e04c3fSmrg } 216401e04c3fSmrg 216501e04c3fSmrg /* Update the uniform index for the rewritten location -- 216601e04c3fSmrg * branch target updating will still need to change 216701e04c3fSmrg * c->uniform_data[] using this index. 216801e04c3fSmrg */ 216901e04c3fSmrg if (qinst->uniform != -1) { 217001e04c3fSmrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 217101e04c3fSmrg block->branch_uniform = *next_uniform; 217201e04c3fSmrg 217301e04c3fSmrg c->uniform_data[*next_uniform] = 217401e04c3fSmrg orig_uniform_data[qinst->uniform]; 217501e04c3fSmrg c->uniform_contents[*next_uniform] = 217601e04c3fSmrg orig_uniform_contents[qinst->uniform]; 217701e04c3fSmrg qinst->uniform = *next_uniform; 217801e04c3fSmrg (*next_uniform)++; 217901e04c3fSmrg } 218001e04c3fSmrg 218101e04c3fSmrg if (debug) { 218201e04c3fSmrg fprintf(stderr, "\n"); 218301e04c3fSmrg } 218401e04c3fSmrg 218501e04c3fSmrg /* Now that we've scheduled a new instruction, some of its 218601e04c3fSmrg * children can be promoted to the list of instructions ready to 218701e04c3fSmrg * be scheduled. Update the children's unblocked time for this 218801e04c3fSmrg * DAG edge as we do so. 
218901e04c3fSmrg */ 21907ec681f3Smrg mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen); 219101e04c3fSmrg list_for_each_entry(struct schedule_node, merge, &merged_list, 219201e04c3fSmrg link) { 21937ec681f3Smrg mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge); 219401e04c3fSmrg 219501e04c3fSmrg /* The merged VIR instruction doesn't get re-added to the 219601e04c3fSmrg * block, so free it now. 219701e04c3fSmrg */ 219801e04c3fSmrg free(merge->inst); 219901e04c3fSmrg } 220001e04c3fSmrg 220101e04c3fSmrg if (inst->sig.thrsw) { 220201e04c3fSmrg time += emit_thrsw(c, block, scoreboard, qinst, false); 22037ec681f3Smrg } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 22047ec681f3Smrg emit_branch(c, block, scoreboard, qinst); 220501e04c3fSmrg } else { 220601e04c3fSmrg insert_scheduled_instruction(c, block, 220701e04c3fSmrg scoreboard, qinst); 220801e04c3fSmrg } 220901e04c3fSmrg } 221001e04c3fSmrg 221101e04c3fSmrg return time; 221201e04c3fSmrg} 221301e04c3fSmrg 221401e04c3fSmrgstatic uint32_t 221501e04c3fSmrgqpu_schedule_instructions_block(struct v3d_compile *c, 221601e04c3fSmrg struct choose_scoreboard *scoreboard, 221701e04c3fSmrg struct qblock *block, 221801e04c3fSmrg enum quniform_contents *orig_uniform_contents, 221901e04c3fSmrg uint32_t *orig_uniform_data, 222001e04c3fSmrg uint32_t *next_uniform) 222101e04c3fSmrg{ 222201e04c3fSmrg void *mem_ctx = ralloc_context(NULL); 2223ed98bd31Smaya scoreboard->dag = dag_create(mem_ctx); 2224ed98bd31Smaya struct list_head setup_list; 222501e04c3fSmrg 2226ed98bd31Smaya list_inithead(&setup_list); 222701e04c3fSmrg 222801e04c3fSmrg /* Wrap each instruction in a scheduler structure. 
*/ 22297ec681f3Smrg while (!list_is_empty(&block->instructions)) { 223001e04c3fSmrg struct qinst *qinst = (struct qinst *)block->instructions.next; 223101e04c3fSmrg struct schedule_node *n = 223201e04c3fSmrg rzalloc(mem_ctx, struct schedule_node); 223301e04c3fSmrg 2234ed98bd31Smaya dag_init_node(scoreboard->dag, &n->dag); 223501e04c3fSmrg n->inst = qinst; 223601e04c3fSmrg 223701e04c3fSmrg list_del(&qinst->link); 2238ed98bd31Smaya list_addtail(&n->link, &setup_list); 223901e04c3fSmrg } 224001e04c3fSmrg 2241ed98bd31Smaya calculate_forward_deps(c, scoreboard->dag, &setup_list); 2242ed98bd31Smaya calculate_reverse_deps(c, scoreboard->dag, &setup_list); 224301e04c3fSmrg 22447ec681f3Smrg dag_traverse_bottom_up(scoreboard->dag, compute_delay, c); 224501e04c3fSmrg 224601e04c3fSmrg uint32_t cycles = schedule_instructions(c, scoreboard, block, 224701e04c3fSmrg orig_uniform_contents, 224801e04c3fSmrg orig_uniform_data, 224901e04c3fSmrg next_uniform); 225001e04c3fSmrg 225101e04c3fSmrg ralloc_free(mem_ctx); 2252ed98bd31Smaya scoreboard->dag = NULL; 225301e04c3fSmrg 225401e04c3fSmrg return cycles; 225501e04c3fSmrg} 225601e04c3fSmrg 225701e04c3fSmrgstatic void 225801e04c3fSmrgqpu_set_branch_targets(struct v3d_compile *c) 225901e04c3fSmrg{ 226001e04c3fSmrg vir_for_each_block(block, c) { 226101e04c3fSmrg /* The end block of the program has no branch. */ 226201e04c3fSmrg if (!block->successors[0]) 226301e04c3fSmrg continue; 226401e04c3fSmrg 226501e04c3fSmrg /* If there was no branch instruction, then the successor 226601e04c3fSmrg * block must follow immediately after this one. 226701e04c3fSmrg */ 226801e04c3fSmrg if (block->branch_qpu_ip == ~0) { 226901e04c3fSmrg assert(block->end_qpu_ip + 1 == 227001e04c3fSmrg block->successors[0]->start_qpu_ip); 227101e04c3fSmrg continue; 227201e04c3fSmrg } 227301e04c3fSmrg 227401e04c3fSmrg /* Walk back through the delay slots to find the branch 227501e04c3fSmrg * instr. 
227601e04c3fSmrg */ 22777ec681f3Smrg struct qinst *branch = NULL; 227801e04c3fSmrg struct list_head *entry = block->instructions.prev; 22797ec681f3Smrg int32_t delay_slot_count = -1; 22807ec681f3Smrg struct qinst *delay_slots_start = NULL; 22817ec681f3Smrg for (int i = 0; i < 3; i++) { 228201e04c3fSmrg entry = entry->prev; 22837ec681f3Smrg struct qinst *inst = 22847ec681f3Smrg container_of(entry, struct qinst, link); 22857ec681f3Smrg 22867ec681f3Smrg if (delay_slot_count == -1) { 22877ec681f3Smrg if (!v3d_qpu_is_nop(&inst->qpu)) 22887ec681f3Smrg delay_slot_count = i; 22897ec681f3Smrg else 22907ec681f3Smrg delay_slots_start = inst; 22917ec681f3Smrg } 22927ec681f3Smrg 22937ec681f3Smrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) { 22947ec681f3Smrg branch = inst; 22957ec681f3Smrg break; 22967ec681f3Smrg } 22977ec681f3Smrg } 22987ec681f3Smrg assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 22997ec681f3Smrg assert(delay_slot_count >= 0 && delay_slot_count <= 3); 23007ec681f3Smrg assert(delay_slot_count == 0 || delay_slots_start != NULL); 230101e04c3fSmrg 230201e04c3fSmrg /* Make sure that the if-we-don't-jump 230301e04c3fSmrg * successor was scheduled just after the 230401e04c3fSmrg * delay slots. 230501e04c3fSmrg */ 230601e04c3fSmrg assert(!block->successors[1] || 230701e04c3fSmrg block->successors[1]->start_qpu_ip == 230801e04c3fSmrg block->branch_qpu_ip + 4); 230901e04c3fSmrg 231001e04c3fSmrg branch->qpu.branch.offset = 231101e04c3fSmrg ((block->successors[0]->start_qpu_ip - 231201e04c3fSmrg (block->branch_qpu_ip + 4)) * 231301e04c3fSmrg sizeof(uint64_t)); 231401e04c3fSmrg 231501e04c3fSmrg /* Set up the relative offset to jump in the 231601e04c3fSmrg * uniform stream. 231701e04c3fSmrg * 231801e04c3fSmrg * Use a temporary here, because 231901e04c3fSmrg * uniform_data[inst->uniform] may be shared 232001e04c3fSmrg * between multiple instructions. 
232101e04c3fSmrg */ 232201e04c3fSmrg assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); 232301e04c3fSmrg c->uniform_data[branch->uniform] = 232401e04c3fSmrg (block->successors[0]->start_uniform - 232501e04c3fSmrg (block->branch_uniform + 1)) * 4; 23267ec681f3Smrg 23277ec681f3Smrg /* If this is an unconditional branch, try to fill any remaining 23287ec681f3Smrg * delay slots with the initial instructions of the successor 23297ec681f3Smrg * block. 23307ec681f3Smrg * 23317ec681f3Smrg * FIXME: we can do the same for conditional branches if we 23327ec681f3Smrg * predicate the instructions to match the branch condition. 23337ec681f3Smrg */ 23347ec681f3Smrg if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) { 23357ec681f3Smrg struct list_head *successor_insts = 23367ec681f3Smrg &block->successors[0]->instructions; 23377ec681f3Smrg delay_slot_count = MIN2(delay_slot_count, 23387ec681f3Smrg list_length(successor_insts)); 23397ec681f3Smrg struct qinst *s_inst = 23407ec681f3Smrg (struct qinst *) successor_insts->next; 23417ec681f3Smrg struct qinst *slot = delay_slots_start; 23427ec681f3Smrg int slots_filled = 0; 23437ec681f3Smrg while (slots_filled < delay_slot_count && 23447ec681f3Smrg qpu_inst_valid_in_branch_delay_slot(c, s_inst)) { 23457ec681f3Smrg memcpy(&slot->qpu, &s_inst->qpu, 23467ec681f3Smrg sizeof(slot->qpu)); 23477ec681f3Smrg s_inst = (struct qinst *) s_inst->link.next; 23487ec681f3Smrg slot = (struct qinst *) slot->link.next; 23497ec681f3Smrg slots_filled++; 23507ec681f3Smrg } 23517ec681f3Smrg branch->qpu.branch.offset += 23527ec681f3Smrg slots_filled * sizeof(uint64_t); 23537ec681f3Smrg } 235401e04c3fSmrg } 235501e04c3fSmrg} 235601e04c3fSmrg 235701e04c3fSmrguint32_t 235801e04c3fSmrgv3d_qpu_schedule_instructions(struct v3d_compile *c) 235901e04c3fSmrg{ 236001e04c3fSmrg const struct v3d_device_info *devinfo = c->devinfo; 236101e04c3fSmrg struct qblock *end_block = list_last_entry(&c->blocks, 236201e04c3fSmrg struct qblock, link); 
236301e04c3fSmrg 236401e04c3fSmrg /* We reorder the uniforms as we schedule instructions, so save the 236501e04c3fSmrg * old data off and replace it. 236601e04c3fSmrg */ 236701e04c3fSmrg uint32_t *uniform_data = c->uniform_data; 236801e04c3fSmrg enum quniform_contents *uniform_contents = c->uniform_contents; 236901e04c3fSmrg c->uniform_contents = ralloc_array(c, enum quniform_contents, 237001e04c3fSmrg c->num_uniforms); 237101e04c3fSmrg c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 237201e04c3fSmrg c->uniform_array_size = c->num_uniforms; 237301e04c3fSmrg uint32_t next_uniform = 0; 237401e04c3fSmrg 237501e04c3fSmrg struct choose_scoreboard scoreboard; 237601e04c3fSmrg memset(&scoreboard, 0, sizeof(scoreboard)); 237701e04c3fSmrg scoreboard.last_ldvary_tick = -10; 23787ec681f3Smrg scoreboard.last_unifa_write_tick = -10; 237901e04c3fSmrg scoreboard.last_magic_sfu_write_tick = -10; 238001e04c3fSmrg scoreboard.last_uniforms_reset_tick = -10; 238101e04c3fSmrg scoreboard.last_thrsw_tick = -10; 23827ec681f3Smrg scoreboard.last_branch_tick = -10; 23837ec681f3Smrg scoreboard.last_setmsf_tick = -10; 23847ec681f3Smrg scoreboard.last_stallable_sfu_tick = -10; 238501e04c3fSmrg 238601e04c3fSmrg if (debug) { 238701e04c3fSmrg fprintf(stderr, "Pre-schedule instructions\n"); 238801e04c3fSmrg vir_for_each_block(block, c) { 238901e04c3fSmrg fprintf(stderr, "BLOCK %d\n", block->index); 239001e04c3fSmrg list_for_each_entry(struct qinst, qinst, 239101e04c3fSmrg &block->instructions, link) { 239201e04c3fSmrg v3d_qpu_dump(devinfo, &qinst->qpu); 239301e04c3fSmrg fprintf(stderr, "\n"); 239401e04c3fSmrg } 239501e04c3fSmrg } 239601e04c3fSmrg fprintf(stderr, "\n"); 239701e04c3fSmrg } 239801e04c3fSmrg 239901e04c3fSmrg uint32_t cycles = 0; 240001e04c3fSmrg vir_for_each_block(block, c) { 240101e04c3fSmrg block->start_qpu_ip = c->qpu_inst_count; 240201e04c3fSmrg block->branch_qpu_ip = ~0; 240301e04c3fSmrg block->start_uniform = next_uniform; 240401e04c3fSmrg 240501e04c3fSmrg cycles 
+= qpu_schedule_instructions_block(c, 240601e04c3fSmrg &scoreboard, 240701e04c3fSmrg block, 240801e04c3fSmrg uniform_contents, 240901e04c3fSmrg uniform_data, 241001e04c3fSmrg &next_uniform); 241101e04c3fSmrg 241201e04c3fSmrg block->end_qpu_ip = c->qpu_inst_count - 1; 241301e04c3fSmrg } 241401e04c3fSmrg 241501e04c3fSmrg /* Emit the program-end THRSW instruction. */; 241601e04c3fSmrg struct qinst *thrsw = vir_nop(); 241701e04c3fSmrg thrsw->qpu.sig.thrsw = true; 241801e04c3fSmrg emit_thrsw(c, end_block, &scoreboard, thrsw, true); 241901e04c3fSmrg 242001e04c3fSmrg qpu_set_branch_targets(c); 242101e04c3fSmrg 242201e04c3fSmrg assert(next_uniform == c->num_uniforms); 242301e04c3fSmrg 242401e04c3fSmrg return cycles; 242501e04c3fSmrg} 2426