1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2010 Intel Corporation 3b8e80941Smrg * Copyright © 2014-2017 Broadcom 4b8e80941Smrg * 5b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 6b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 7b8e80941Smrg * to deal in the Software without restriction, including without limitation 8b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 10b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 11b8e80941Smrg * 12b8e80941Smrg * The above copyright notice and this permission notice (including the next 13b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 14b8e80941Smrg * Software. 15b8e80941Smrg * 16b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22b8e80941Smrg * IN THE SOFTWARE. 23b8e80941Smrg */ 24b8e80941Smrg 25b8e80941Smrg/** 26b8e80941Smrg * @file 27b8e80941Smrg * 28b8e80941Smrg * The basic model of the list scheduler is to take a basic block, compute a 29b8e80941Smrg * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30b8e80941Smrg * pick a DAG head, then put all the children that are now DAG heads into the 31b8e80941Smrg * list of things to schedule. 32b8e80941Smrg * 33b8e80941Smrg * The goal of scheduling here is to pack pairs of operations together in a 34b8e80941Smrg * single QPU instruction. 35b8e80941Smrg */ 36b8e80941Smrg 37b8e80941Smrg#include "qpu/qpu_disasm.h" 38b8e80941Smrg#include "v3d_compiler.h" 39b8e80941Smrg#include "util/ralloc.h" 40b8e80941Smrg#include "util/dag.h" 41b8e80941Smrg 42b8e80941Smrgstatic bool debug; 43b8e80941Smrg 44b8e80941Smrgstruct schedule_node_child; 45b8e80941Smrg 46b8e80941Smrgstruct schedule_node { 47b8e80941Smrg struct dag_node dag; 48b8e80941Smrg struct list_head link; 49b8e80941Smrg struct qinst *inst; 50b8e80941Smrg 51b8e80941Smrg /* Longest cycles + instruction_latency() of any parent of this node. */ 52b8e80941Smrg uint32_t unblocked_time; 53b8e80941Smrg 54b8e80941Smrg /** 55b8e80941Smrg * Minimum number of cycles from scheduling this instruction until the 56b8e80941Smrg * end of the program, based on the slowest dependency chain through 57b8e80941Smrg * the children. 58b8e80941Smrg */ 59b8e80941Smrg uint32_t delay; 60b8e80941Smrg 61b8e80941Smrg /** 62b8e80941Smrg * cycles between this instruction being scheduled and when its result 63b8e80941Smrg * can be consumed. 64b8e80941Smrg */ 65b8e80941Smrg uint32_t latency; 66b8e80941Smrg}; 67b8e80941Smrg 68b8e80941Smrg/* When walking the instructions in reverse, we need to swap before/after in 69b8e80941Smrg * add_dep(). 70b8e80941Smrg */ 71b8e80941Smrgenum direction { F, R }; 72b8e80941Smrg 73b8e80941Smrgstruct schedule_state { 74b8e80941Smrg const struct v3d_device_info *devinfo; 75b8e80941Smrg struct dag *dag; 76b8e80941Smrg struct schedule_node *last_r[6]; 77b8e80941Smrg struct schedule_node *last_rf[64]; 78b8e80941Smrg struct schedule_node *last_sf; 79b8e80941Smrg struct schedule_node *last_vpm_read; 80b8e80941Smrg struct schedule_node *last_tmu_write; 81b8e80941Smrg struct schedule_node *last_tmu_config; 82b8e80941Smrg struct schedule_node *last_tlb; 83b8e80941Smrg struct schedule_node *last_vpm; 84b8e80941Smrg struct schedule_node *last_unif; 85b8e80941Smrg struct schedule_node *last_rtop; 86b8e80941Smrg enum direction dir; 87b8e80941Smrg /* Estimated cycle when the current instruction would start. */ 88b8e80941Smrg uint32_t time; 89b8e80941Smrg}; 90b8e80941Smrg 91b8e80941Smrgstatic void 92b8e80941Smrgadd_dep(struct schedule_state *state, 93b8e80941Smrg struct schedule_node *before, 94b8e80941Smrg struct schedule_node *after, 95b8e80941Smrg bool write) 96b8e80941Smrg{ 97b8e80941Smrg bool write_after_read = !write && state->dir == R; 98b8e80941Smrg void *edge_data = (void *)(uintptr_t)write_after_read; 99b8e80941Smrg 100b8e80941Smrg if (!before || !after) 101b8e80941Smrg return; 102b8e80941Smrg 103b8e80941Smrg assert(before != after); 104b8e80941Smrg 105b8e80941Smrg if (state->dir == F) 106b8e80941Smrg dag_add_edge(&before->dag, &after->dag, edge_data); 107b8e80941Smrg else 108b8e80941Smrg dag_add_edge(&after->dag, &before->dag, edge_data); 109b8e80941Smrg} 110b8e80941Smrg 111b8e80941Smrgstatic void 112b8e80941Smrgadd_read_dep(struct schedule_state *state, 113b8e80941Smrg struct schedule_node *before, 114b8e80941Smrg struct schedule_node *after) 115b8e80941Smrg{ 116b8e80941Smrg add_dep(state, before, after, false); 117b8e80941Smrg} 118b8e80941Smrg 119b8e80941Smrgstatic void 120b8e80941Smrgadd_write_dep(struct schedule_state *state, 121b8e80941Smrg struct schedule_node **before, 122b8e80941Smrg struct schedule_node *after) 123b8e80941Smrg{ 124b8e80941Smrg add_dep(state, *before, after, true); 125b8e80941Smrg *before = after; 126b8e80941Smrg} 127b8e80941Smrg 128b8e80941Smrgstatic bool 129b8e80941Smrgqpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 130b8e80941Smrg{ 131b8e80941Smrg if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 132b8e80941Smrg return false; 133b8e80941Smrg 134b8e80941Smrg if (inst->alu.add.magic_write && 135b8e80941Smrg (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 136b8e80941Smrg inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 137b8e80941Smrg return true; 138b8e80941Smrg 139b8e80941Smrg if (inst->alu.mul.magic_write && 140b8e80941Smrg (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 141b8e80941Smrg inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 142b8e80941Smrg return true; 143b8e80941Smrg 144b8e80941Smrg return false; 145b8e80941Smrg} 146b8e80941Smrg 147b8e80941Smrgstatic void 148b8e80941Smrgprocess_mux_deps(struct schedule_state *state, struct schedule_node *n, 149b8e80941Smrg enum v3d_qpu_mux mux) 150b8e80941Smrg{ 151b8e80941Smrg switch (mux) { 152b8e80941Smrg case V3D_QPU_MUX_A: 153b8e80941Smrg add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 154b8e80941Smrg break; 155b8e80941Smrg case V3D_QPU_MUX_B: 156b8e80941Smrg add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); 157b8e80941Smrg break; 158b8e80941Smrg default: 159b8e80941Smrg add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 160b8e80941Smrg break; 161b8e80941Smrg } 162b8e80941Smrg} 163b8e80941Smrg 164b8e80941Smrg 165b8e80941Smrgstatic void 166b8e80941Smrgprocess_waddr_deps(struct schedule_state *state, struct schedule_node *n, 167b8e80941Smrg uint32_t waddr, bool magic) 168b8e80941Smrg{ 169b8e80941Smrg if (!magic) { 170b8e80941Smrg add_write_dep(state, &state->last_rf[waddr], n); 171b8e80941Smrg } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { 172b8e80941Smrg /* XXX perf: For V3D 4.x, we could reorder TMU writes other 173b8e80941Smrg * than the TMUS/TMUD/TMUA to improve scheduling flexibility. 174b8e80941Smrg */ 175b8e80941Smrg add_write_dep(state, &state->last_tmu_write, n); 176b8e80941Smrg switch (waddr) { 177b8e80941Smrg case V3D_QPU_WADDR_TMUS: 178b8e80941Smrg case V3D_QPU_WADDR_TMUSCM: 179b8e80941Smrg case V3D_QPU_WADDR_TMUSF: 180b8e80941Smrg case V3D_QPU_WADDR_TMUSLOD: 181b8e80941Smrg add_write_dep(state, &state->last_tmu_config, n); 182b8e80941Smrg break; 183b8e80941Smrg default: 184b8e80941Smrg break; 185b8e80941Smrg } 186b8e80941Smrg } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 187b8e80941Smrg /* Handled by v3d_qpu_writes_r4() check. */ 188b8e80941Smrg } else { 189b8e80941Smrg switch (waddr) { 190b8e80941Smrg case V3D_QPU_WADDR_R0: 191b8e80941Smrg case V3D_QPU_WADDR_R1: 192b8e80941Smrg case V3D_QPU_WADDR_R2: 193b8e80941Smrg add_write_dep(state, 194b8e80941Smrg &state->last_r[waddr - V3D_QPU_WADDR_R0], 195b8e80941Smrg n); 196b8e80941Smrg break; 197b8e80941Smrg case V3D_QPU_WADDR_R3: 198b8e80941Smrg case V3D_QPU_WADDR_R4: 199b8e80941Smrg case V3D_QPU_WADDR_R5: 200b8e80941Smrg /* Handled by v3d_qpu_writes_r*() checks below. */ 201b8e80941Smrg break; 202b8e80941Smrg 203b8e80941Smrg case V3D_QPU_WADDR_VPM: 204b8e80941Smrg case V3D_QPU_WADDR_VPMU: 205b8e80941Smrg add_write_dep(state, &state->last_vpm, n); 206b8e80941Smrg break; 207b8e80941Smrg 208b8e80941Smrg case V3D_QPU_WADDR_TLB: 209b8e80941Smrg case V3D_QPU_WADDR_TLBU: 210b8e80941Smrg add_write_dep(state, &state->last_tlb, n); 211b8e80941Smrg break; 212b8e80941Smrg 213b8e80941Smrg case V3D_QPU_WADDR_SYNC: 214b8e80941Smrg case V3D_QPU_WADDR_SYNCB: 215b8e80941Smrg case V3D_QPU_WADDR_SYNCU: 216b8e80941Smrg /* For CS barrier(): Sync against any other memory 217b8e80941Smrg * accesses. There doesn't appear to be any need for 218b8e80941Smrg * barriers to affect ALU operations. 219b8e80941Smrg */ 220b8e80941Smrg add_write_dep(state, &state->last_tmu_write, n); 221b8e80941Smrg break; 222b8e80941Smrg 223b8e80941Smrg case V3D_QPU_WADDR_NOP: 224b8e80941Smrg break; 225b8e80941Smrg 226b8e80941Smrg default: 227b8e80941Smrg fprintf(stderr, "Unknown waddr %d\n", waddr); 228b8e80941Smrg abort(); 229b8e80941Smrg } 230b8e80941Smrg } 231b8e80941Smrg} 232b8e80941Smrg 233b8e80941Smrg/** 234b8e80941Smrg * Common code for dependencies that need to be tracked both forward and 235b8e80941Smrg * backward. 236b8e80941Smrg * 237b8e80941Smrg * This is for things like "all reads of r4 have to happen between the r4 238b8e80941Smrg * writes that surround them". 239b8e80941Smrg */ 240b8e80941Smrgstatic void 241b8e80941Smrgcalculate_deps(struct schedule_state *state, struct schedule_node *n) 242b8e80941Smrg{ 243b8e80941Smrg const struct v3d_device_info *devinfo = state->devinfo; 244b8e80941Smrg struct qinst *qinst = n->inst; 245b8e80941Smrg struct v3d_qpu_instr *inst = &qinst->qpu; 246b8e80941Smrg /* If the input and output segments are shared, then all VPM reads to 247b8e80941Smrg * a location need to happen before all writes. We handle this by 248b8e80941Smrg * serializing all VPM operations for now. 249b8e80941Smrg */ 250b8e80941Smrg bool separate_vpm_segment = false; 251b8e80941Smrg 252b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 253b8e80941Smrg if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 254b8e80941Smrg add_read_dep(state, state->last_sf, n); 255b8e80941Smrg 256b8e80941Smrg /* XXX: BDI */ 257b8e80941Smrg /* XXX: BDU */ 258b8e80941Smrg /* XXX: ub */ 259b8e80941Smrg /* XXX: raddr_a */ 260b8e80941Smrg 261b8e80941Smrg add_write_dep(state, &state->last_unif, n); 262b8e80941Smrg return; 263b8e80941Smrg } 264b8e80941Smrg 265b8e80941Smrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 266b8e80941Smrg 267b8e80941Smrg /* XXX: LOAD_IMM */ 268b8e80941Smrg 269b8e80941Smrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 270b8e80941Smrg process_mux_deps(state, n, inst->alu.add.a); 271b8e80941Smrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 272b8e80941Smrg process_mux_deps(state, n, inst->alu.add.b); 273b8e80941Smrg 274b8e80941Smrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 275b8e80941Smrg process_mux_deps(state, n, inst->alu.mul.a); 276b8e80941Smrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 277b8e80941Smrg process_mux_deps(state, n, inst->alu.mul.b); 278b8e80941Smrg 279b8e80941Smrg switch (inst->alu.add.op) { 280b8e80941Smrg case V3D_QPU_A_VPMSETUP: 281b8e80941Smrg /* Could distinguish read/write by unpacking the uniform. */ 282b8e80941Smrg add_write_dep(state, &state->last_vpm, n); 283b8e80941Smrg add_write_dep(state, &state->last_vpm_read, n); 284b8e80941Smrg break; 285b8e80941Smrg 286b8e80941Smrg case V3D_QPU_A_STVPMV: 287b8e80941Smrg case V3D_QPU_A_STVPMD: 288b8e80941Smrg case V3D_QPU_A_STVPMP: 289b8e80941Smrg add_write_dep(state, &state->last_vpm, n); 290b8e80941Smrg break; 291b8e80941Smrg 292b8e80941Smrg case V3D_QPU_A_LDVPMV_IN: 293b8e80941Smrg case V3D_QPU_A_LDVPMD_IN: 294b8e80941Smrg case V3D_QPU_A_LDVPMG_IN: 295b8e80941Smrg case V3D_QPU_A_LDVPMP: 296b8e80941Smrg if (!separate_vpm_segment) 297b8e80941Smrg add_write_dep(state, &state->last_vpm, n); 298b8e80941Smrg break; 299b8e80941Smrg 300b8e80941Smrg case V3D_QPU_A_VPMWT: 301b8e80941Smrg add_read_dep(state, state->last_vpm, n); 302b8e80941Smrg break; 303b8e80941Smrg 304b8e80941Smrg case V3D_QPU_A_MSF: 305b8e80941Smrg add_read_dep(state, state->last_tlb, n); 306b8e80941Smrg break; 307b8e80941Smrg 308b8e80941Smrg case V3D_QPU_A_SETMSF: 309b8e80941Smrg case V3D_QPU_A_SETREVF: 310b8e80941Smrg add_write_dep(state, &state->last_tlb, n); 311b8e80941Smrg break; 312b8e80941Smrg 313b8e80941Smrg default: 314b8e80941Smrg break; 315b8e80941Smrg } 316b8e80941Smrg 317b8e80941Smrg switch (inst->alu.mul.op) { 318b8e80941Smrg case V3D_QPU_M_MULTOP: 319b8e80941Smrg case V3D_QPU_M_UMUL24: 320b8e80941Smrg /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 321b8e80941Smrg * resets it to 0. We could possibly reorder umul24s relative 322b8e80941Smrg * to each other, but for now just keep all the MUL parts in 323b8e80941Smrg * order. 324b8e80941Smrg */ 325b8e80941Smrg add_write_dep(state, &state->last_rtop, n); 326b8e80941Smrg break; 327b8e80941Smrg default: 328b8e80941Smrg break; 329b8e80941Smrg } 330b8e80941Smrg 331b8e80941Smrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 332b8e80941Smrg process_waddr_deps(state, n, inst->alu.add.waddr, 333b8e80941Smrg inst->alu.add.magic_write); 334b8e80941Smrg } 335b8e80941Smrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 336b8e80941Smrg process_waddr_deps(state, n, inst->alu.mul.waddr, 337b8e80941Smrg inst->alu.mul.magic_write); 338b8e80941Smrg } 339b8e80941Smrg if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 340b8e80941Smrg process_waddr_deps(state, n, inst->sig_addr, 341b8e80941Smrg inst->sig_magic); 342b8e80941Smrg } 343b8e80941Smrg 344b8e80941Smrg if (v3d_qpu_writes_r3(devinfo, inst)) 345b8e80941Smrg add_write_dep(state, &state->last_r[3], n); 346b8e80941Smrg if (v3d_qpu_writes_r4(devinfo, inst)) 347b8e80941Smrg add_write_dep(state, &state->last_r[4], n); 348b8e80941Smrg if (v3d_qpu_writes_r5(devinfo, inst)) 349b8e80941Smrg add_write_dep(state, &state->last_r[5], n); 350b8e80941Smrg 351b8e80941Smrg if (inst->sig.thrsw) { 352b8e80941Smrg /* All accumulator contents and flags are undefined after the 353b8e80941Smrg * switch. 354b8e80941Smrg */ 355b8e80941Smrg for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 356b8e80941Smrg add_write_dep(state, &state->last_r[i], n); 357b8e80941Smrg add_write_dep(state, &state->last_sf, n); 358b8e80941Smrg add_write_dep(state, &state->last_rtop, n); 359b8e80941Smrg 360b8e80941Smrg /* Scoreboard-locking operations have to stay after the last 361b8e80941Smrg * thread switch. 362b8e80941Smrg */ 363b8e80941Smrg add_write_dep(state, &state->last_tlb, n); 364b8e80941Smrg 365b8e80941Smrg add_write_dep(state, &state->last_tmu_write, n); 366b8e80941Smrg add_write_dep(state, &state->last_tmu_config, n); 367b8e80941Smrg } 368b8e80941Smrg 369b8e80941Smrg if (v3d_qpu_waits_on_tmu(inst)) { 370b8e80941Smrg /* TMU loads are coming from a FIFO, so ordering is important. 371b8e80941Smrg */ 372b8e80941Smrg add_write_dep(state, &state->last_tmu_write, n); 373b8e80941Smrg } 374b8e80941Smrg 375b8e80941Smrg if (inst->sig.wrtmuc) 376b8e80941Smrg add_write_dep(state, &state->last_tmu_config, n); 377b8e80941Smrg 378b8e80941Smrg if (inst->sig.ldtlb | inst->sig.ldtlbu) 379b8e80941Smrg add_read_dep(state, state->last_tlb, n); 380b8e80941Smrg 381b8e80941Smrg if (inst->sig.ldvpm) { 382b8e80941Smrg add_write_dep(state, &state->last_vpm_read, n); 383b8e80941Smrg 384b8e80941Smrg /* At least for now, we're doing shared I/O segments, so queue 385b8e80941Smrg * all writes after all reads. 386b8e80941Smrg */ 387b8e80941Smrg if (!separate_vpm_segment) 388b8e80941Smrg add_write_dep(state, &state->last_vpm, n); 389b8e80941Smrg } 390b8e80941Smrg 391b8e80941Smrg /* inst->sig.ldunif or sideband uniform read */ 392b8e80941Smrg if (vir_has_uniform(qinst)) 393b8e80941Smrg add_write_dep(state, &state->last_unif, n); 394b8e80941Smrg 395b8e80941Smrg if (v3d_qpu_reads_flags(inst)) 396b8e80941Smrg add_read_dep(state, state->last_sf, n); 397b8e80941Smrg if (v3d_qpu_writes_flags(inst)) 398b8e80941Smrg add_write_dep(state, &state->last_sf, n); 399b8e80941Smrg} 400b8e80941Smrg 401b8e80941Smrgstatic void 402b8e80941Smrgcalculate_forward_deps(struct v3d_compile *c, struct dag *dag, 403b8e80941Smrg struct list_head *schedule_list) 404b8e80941Smrg{ 405b8e80941Smrg struct schedule_state state; 406b8e80941Smrg 407b8e80941Smrg memset(&state, 0, sizeof(state)); 408b8e80941Smrg state.dag = dag; 409b8e80941Smrg state.devinfo = c->devinfo; 410b8e80941Smrg state.dir = F; 411b8e80941Smrg 412b8e80941Smrg list_for_each_entry(struct schedule_node, node, schedule_list, link) 413b8e80941Smrg calculate_deps(&state, node); 414b8e80941Smrg} 415b8e80941Smrg 416b8e80941Smrgstatic void 417b8e80941Smrgcalculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 418b8e80941Smrg struct list_head *schedule_list) 419b8e80941Smrg{ 420b8e80941Smrg struct schedule_state state; 421b8e80941Smrg 422b8e80941Smrg memset(&state, 0, sizeof(state)); 423b8e80941Smrg state.dag = dag; 424b8e80941Smrg state.devinfo = c->devinfo; 425b8e80941Smrg state.dir = R; 426b8e80941Smrg 427b8e80941Smrg list_for_each_entry_rev(struct schedule_node, node, schedule_list, 428b8e80941Smrg link) { 429b8e80941Smrg calculate_deps(&state, (struct schedule_node *)node); 430b8e80941Smrg } 431b8e80941Smrg} 432b8e80941Smrg 433b8e80941Smrgstruct choose_scoreboard { 434b8e80941Smrg struct dag *dag; 435b8e80941Smrg int tick; 436b8e80941Smrg int last_magic_sfu_write_tick; 437b8e80941Smrg int last_ldvary_tick; 438b8e80941Smrg int last_uniforms_reset_tick; 439b8e80941Smrg int last_thrsw_tick; 440b8e80941Smrg bool tlb_locked; 441b8e80941Smrg}; 442b8e80941Smrg 443b8e80941Smrgstatic bool 444b8e80941Smrgmux_reads_too_soon(struct choose_scoreboard *scoreboard, 445b8e80941Smrg const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 446b8e80941Smrg{ 447b8e80941Smrg switch (mux) { 448b8e80941Smrg case V3D_QPU_MUX_R4: 449b8e80941Smrg if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 450b8e80941Smrg return true; 451b8e80941Smrg break; 452b8e80941Smrg 453b8e80941Smrg case V3D_QPU_MUX_R5: 454b8e80941Smrg if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 455b8e80941Smrg return true; 456b8e80941Smrg break; 457b8e80941Smrg default: 458b8e80941Smrg break; 459b8e80941Smrg } 460b8e80941Smrg 461b8e80941Smrg return false; 462b8e80941Smrg} 463b8e80941Smrg 464b8e80941Smrgstatic bool 465b8e80941Smrgreads_too_soon_after_write(struct choose_scoreboard *scoreboard, 466b8e80941Smrg struct qinst *qinst) 467b8e80941Smrg{ 468b8e80941Smrg const struct v3d_qpu_instr *inst = &qinst->qpu; 469b8e80941Smrg 470b8e80941Smrg /* XXX: Branching off of raddr. */ 471b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 472b8e80941Smrg return false; 473b8e80941Smrg 474b8e80941Smrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 475b8e80941Smrg 476b8e80941Smrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 477b8e80941Smrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 478b8e80941Smrg mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 479b8e80941Smrg return true; 480b8e80941Smrg } 481b8e80941Smrg if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 482b8e80941Smrg mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 483b8e80941Smrg return true; 484b8e80941Smrg } 485b8e80941Smrg } 486b8e80941Smrg 487b8e80941Smrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 488b8e80941Smrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 489b8e80941Smrg mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 490b8e80941Smrg return true; 491b8e80941Smrg } 492b8e80941Smrg if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 493b8e80941Smrg mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 494b8e80941Smrg return true; 495b8e80941Smrg } 496b8e80941Smrg } 497b8e80941Smrg 498b8e80941Smrg /* XXX: imm */ 499b8e80941Smrg 500b8e80941Smrg return false; 501b8e80941Smrg} 502b8e80941Smrg 503b8e80941Smrgstatic bool 504b8e80941Smrgwrites_too_soon_after_write(const struct v3d_device_info *devinfo, 505b8e80941Smrg struct choose_scoreboard *scoreboard, 506b8e80941Smrg struct qinst *qinst) 507b8e80941Smrg{ 508b8e80941Smrg const struct v3d_qpu_instr *inst = &qinst->qpu; 509b8e80941Smrg 510b8e80941Smrg /* Don't schedule any other r4 write too soon after an SFU write. 511b8e80941Smrg * This would normally be prevented by dependency tracking, but might 512b8e80941Smrg * occur if a dead SFU computation makes it to scheduling. 513b8e80941Smrg */ 514b8e80941Smrg if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 515b8e80941Smrg v3d_qpu_writes_r4(devinfo, inst)) 516b8e80941Smrg return true; 517b8e80941Smrg 518b8e80941Smrg return false; 519b8e80941Smrg} 520b8e80941Smrg 521b8e80941Smrgstatic bool 522b8e80941Smrgpixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, 523b8e80941Smrg const struct v3d_qpu_instr *inst) 524b8e80941Smrg{ 525b8e80941Smrg return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); 526b8e80941Smrg} 527b8e80941Smrg 528b8e80941Smrgstatic int 529b8e80941Smrgget_instruction_priority(const struct v3d_qpu_instr *inst) 530b8e80941Smrg{ 531b8e80941Smrg uint32_t baseline_score; 532b8e80941Smrg uint32_t next_score = 0; 533b8e80941Smrg 534b8e80941Smrg /* Schedule TLB operations as late as possible, to get more 535b8e80941Smrg * parallelism between shaders. 536b8e80941Smrg */ 537b8e80941Smrg if (qpu_inst_is_tlb(inst)) 538b8e80941Smrg return next_score; 539b8e80941Smrg next_score++; 540b8e80941Smrg 541b8e80941Smrg /* Schedule texture read results collection late to hide latency. */ 542b8e80941Smrg if (v3d_qpu_waits_on_tmu(inst)) 543b8e80941Smrg return next_score; 544b8e80941Smrg next_score++; 545b8e80941Smrg 546b8e80941Smrg /* XXX perf: We should schedule SFU ALU ops so that the reader is 2 547b8e80941Smrg * instructions after the producer if possible, not just 1. 548b8e80941Smrg */ 549b8e80941Smrg 550b8e80941Smrg /* Default score for things that aren't otherwise special. */ 551b8e80941Smrg baseline_score = next_score; 552b8e80941Smrg next_score++; 553b8e80941Smrg 554b8e80941Smrg /* Schedule texture read setup early to hide their latency better. */ 555b8e80941Smrg if (v3d_qpu_writes_tmu(inst)) 556b8e80941Smrg return next_score; 557b8e80941Smrg next_score++; 558b8e80941Smrg 559b8e80941Smrg return baseline_score; 560b8e80941Smrg} 561b8e80941Smrg 562b8e80941Smrgstatic bool 563b8e80941Smrgqpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) 564b8e80941Smrg{ 565b8e80941Smrg return (v3d_qpu_magic_waddr_is_tmu(waddr) || 566b8e80941Smrg v3d_qpu_magic_waddr_is_sfu(waddr) || 567b8e80941Smrg v3d_qpu_magic_waddr_is_tlb(waddr) || 568b8e80941Smrg v3d_qpu_magic_waddr_is_vpm(waddr) || 569b8e80941Smrg v3d_qpu_magic_waddr_is_tsy(waddr)); 570b8e80941Smrg} 571b8e80941Smrg 572b8e80941Smrgstatic bool 573b8e80941Smrgqpu_accesses_peripheral(const struct v3d_qpu_instr *inst) 574b8e80941Smrg{ 575b8e80941Smrg if (v3d_qpu_uses_vpm(inst)) 576b8e80941Smrg return true; 577b8e80941Smrg if (v3d_qpu_uses_sfu(inst)) 578b8e80941Smrg return true; 579b8e80941Smrg 580b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 581b8e80941Smrg if (inst->alu.add.op != V3D_QPU_A_NOP && 582b8e80941Smrg inst->alu.add.magic_write && 583b8e80941Smrg qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { 584b8e80941Smrg return true; 585b8e80941Smrg } 586b8e80941Smrg 587b8e80941Smrg if (inst->alu.add.op == V3D_QPU_A_TMUWT) 588b8e80941Smrg return true; 589b8e80941Smrg 590b8e80941Smrg if (inst->alu.mul.op != V3D_QPU_M_NOP && 591b8e80941Smrg inst->alu.mul.magic_write && 592b8e80941Smrg qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { 593b8e80941Smrg return true; 594b8e80941Smrg } 595b8e80941Smrg } 596b8e80941Smrg 597b8e80941Smrg return (inst->sig.ldvpm || 598b8e80941Smrg inst->sig.ldtmu || 599b8e80941Smrg inst->sig.ldtlb || 600b8e80941Smrg inst->sig.ldtlbu || 601b8e80941Smrg inst->sig.wrtmuc); 602b8e80941Smrg} 603b8e80941Smrg 604b8e80941Smrgstatic bool 605b8e80941Smrgqpu_merge_inst(const struct v3d_device_info *devinfo, 606b8e80941Smrg struct v3d_qpu_instr *result, 607b8e80941Smrg const struct v3d_qpu_instr *a, 608b8e80941Smrg const struct v3d_qpu_instr *b) 609b8e80941Smrg{ 610b8e80941Smrg if (a->type != V3D_QPU_INSTR_TYPE_ALU || 611b8e80941Smrg b->type != V3D_QPU_INSTR_TYPE_ALU) { 612b8e80941Smrg return false; 613b8e80941Smrg } 614b8e80941Smrg 615b8e80941Smrg /* Can't do more than one peripheral access in an instruction. 616b8e80941Smrg * 617b8e80941Smrg * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and 618b8e80941Smrg * WRTMUC with a TMU magic register write (other than tmuc). 619b8e80941Smrg */ 620b8e80941Smrg if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) 621b8e80941Smrg return false; 622b8e80941Smrg 623b8e80941Smrg struct v3d_qpu_instr merge = *a; 624b8e80941Smrg 625b8e80941Smrg if (b->alu.add.op != V3D_QPU_A_NOP) { 626b8e80941Smrg if (a->alu.add.op != V3D_QPU_A_NOP) 627b8e80941Smrg return false; 628b8e80941Smrg merge.alu.add = b->alu.add; 629b8e80941Smrg 630b8e80941Smrg merge.flags.ac = b->flags.ac; 631b8e80941Smrg merge.flags.apf = b->flags.apf; 632b8e80941Smrg merge.flags.auf = b->flags.auf; 633b8e80941Smrg } 634b8e80941Smrg 635b8e80941Smrg if (b->alu.mul.op != V3D_QPU_M_NOP) { 636b8e80941Smrg if (a->alu.mul.op != V3D_QPU_M_NOP) 637b8e80941Smrg return false; 638b8e80941Smrg merge.alu.mul = b->alu.mul; 639b8e80941Smrg 640b8e80941Smrg merge.flags.mc = b->flags.mc; 641b8e80941Smrg merge.flags.mpf = b->flags.mpf; 642b8e80941Smrg merge.flags.muf = b->flags.muf; 643b8e80941Smrg } 644b8e80941Smrg 645b8e80941Smrg if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) { 646b8e80941Smrg if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) && 647b8e80941Smrg a->raddr_a != b->raddr_a) { 648b8e80941Smrg return false; 649b8e80941Smrg } 650b8e80941Smrg merge.raddr_a = b->raddr_a; 651b8e80941Smrg } 652b8e80941Smrg 653b8e80941Smrg if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { 654b8e80941Smrg if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && 655b8e80941Smrg (a->raddr_b != b->raddr_b || 656b8e80941Smrg a->sig.small_imm != b->sig.small_imm)) { 657b8e80941Smrg return false; 658b8e80941Smrg } 659b8e80941Smrg merge.raddr_b = b->raddr_b; 660b8e80941Smrg } 661b8e80941Smrg 662b8e80941Smrg merge.sig.thrsw |= b->sig.thrsw; 663b8e80941Smrg merge.sig.ldunif |= b->sig.ldunif; 664b8e80941Smrg merge.sig.ldunifrf |= b->sig.ldunifrf; 665b8e80941Smrg merge.sig.ldunifa |= b->sig.ldunifa; 666b8e80941Smrg merge.sig.ldunifarf |= b->sig.ldunifarf; 667b8e80941Smrg merge.sig.ldtmu |= b->sig.ldtmu; 668b8e80941Smrg merge.sig.ldvary |= b->sig.ldvary; 669b8e80941Smrg merge.sig.ldvpm |= b->sig.ldvpm; 670b8e80941Smrg merge.sig.small_imm |= b->sig.small_imm; 671b8e80941Smrg merge.sig.ldtlb |= b->sig.ldtlb; 672b8e80941Smrg merge.sig.ldtlbu |= b->sig.ldtlbu; 673b8e80941Smrg merge.sig.ucb |= b->sig.ucb; 674b8e80941Smrg merge.sig.rotate |= b->sig.rotate; 675b8e80941Smrg merge.sig.wrtmuc |= b->sig.wrtmuc; 676b8e80941Smrg 677b8e80941Smrg if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 678b8e80941Smrg v3d_qpu_sig_writes_address(devinfo, &b->sig)) 679b8e80941Smrg return false; 680b8e80941Smrg merge.sig_addr |= b->sig_addr; 681b8e80941Smrg merge.sig_magic |= b->sig_magic; 682b8e80941Smrg 683b8e80941Smrg uint64_t packed; 684b8e80941Smrg bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); 685b8e80941Smrg 686b8e80941Smrg *result = merge; 687b8e80941Smrg /* No modifying the real instructions on failure. */ 688b8e80941Smrg assert(ok || (a != result && b != result)); 689b8e80941Smrg 690b8e80941Smrg return ok; 691b8e80941Smrg} 692b8e80941Smrg 693b8e80941Smrgstatic struct schedule_node * 694b8e80941Smrgchoose_instruction_to_schedule(const struct v3d_device_info *devinfo, 695b8e80941Smrg struct choose_scoreboard *scoreboard, 696b8e80941Smrg struct schedule_node *prev_inst) 697b8e80941Smrg{ 698b8e80941Smrg struct schedule_node *chosen = NULL; 699b8e80941Smrg int chosen_prio = 0; 700b8e80941Smrg 701b8e80941Smrg /* Don't pair up anything with a thread switch signal -- emit_thrsw() 702b8e80941Smrg * will handle pairing it along with filling the delay slots. 703b8e80941Smrg */ 704b8e80941Smrg if (prev_inst) { 705b8e80941Smrg if (prev_inst->inst->qpu.sig.thrsw) 706b8e80941Smrg return NULL; 707b8e80941Smrg } 708b8e80941Smrg 709b8e80941Smrg list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 710b8e80941Smrg dag.link) { 711b8e80941Smrg const struct v3d_qpu_instr *inst = &n->inst->qpu; 712b8e80941Smrg 713b8e80941Smrg /* Don't choose the branch instruction until it's the last one 714b8e80941Smrg * left. We'll move it up to fit its delay slots after we 715b8e80941Smrg * choose it. 716b8e80941Smrg */ 717b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 718b8e80941Smrg !list_is_singular(&scoreboard->dag->heads)) { 719b8e80941Smrg continue; 720b8e80941Smrg } 721b8e80941Smrg 722b8e80941Smrg /* "An instruction must not read from a location in physical 723b8e80941Smrg * regfile A or B that was written to by the previous 724b8e80941Smrg * instruction." 725b8e80941Smrg */ 726b8e80941Smrg if (reads_too_soon_after_write(scoreboard, n->inst)) 727b8e80941Smrg continue; 728b8e80941Smrg 729b8e80941Smrg if (writes_too_soon_after_write(devinfo, scoreboard, n->inst)) 730b8e80941Smrg continue; 731b8e80941Smrg 732b8e80941Smrg /* "A scoreboard wait must not occur in the first two 733b8e80941Smrg * instructions of a fragment shader. This is either the 734b8e80941Smrg * explicit Wait for Scoreboard signal or an implicit wait 735b8e80941Smrg * with the first tile-buffer read or write instruction." 736b8e80941Smrg */ 737b8e80941Smrg if (pixel_scoreboard_too_soon(scoreboard, inst)) 738b8e80941Smrg continue; 739b8e80941Smrg 740b8e80941Smrg /* ldunif and ldvary both write r5, but ldunif does so a tick 741b8e80941Smrg * sooner. If the ldvary's r5 wasn't used, then ldunif might 742b8e80941Smrg * otherwise get scheduled so ldunif and ldvary try to update 743b8e80941Smrg * r5 in the same tick. 744b8e80941Smrg * 745b8e80941Smrg * XXX perf: To get good pipelining of a sequence of varying 746b8e80941Smrg * loads, we need to figure out how to pair the ldvary signal 747b8e80941Smrg * up to the instruction before the last r5 user in the 748b8e80941Smrg * previous ldvary sequence. Currently, it usually pairs with 749b8e80941Smrg * the last r5 user. 750b8e80941Smrg */ 751b8e80941Smrg if ((inst->sig.ldunif || inst->sig.ldunifa) && 752b8e80941Smrg scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 753b8e80941Smrg continue; 754b8e80941Smrg } 755b8e80941Smrg 756b8e80941Smrg /* If we're trying to pair with another instruction, check 757b8e80941Smrg * that they're compatible. 758b8e80941Smrg */ 759b8e80941Smrg if (prev_inst) { 760b8e80941Smrg /* Don't pair up a thread switch signal -- we'll 761b8e80941Smrg * handle pairing it when we pick it on its own. 762b8e80941Smrg */ 763b8e80941Smrg if (inst->sig.thrsw) 764b8e80941Smrg continue; 765b8e80941Smrg 766b8e80941Smrg if (prev_inst->inst->uniform != -1 && 767b8e80941Smrg n->inst->uniform != -1) 768b8e80941Smrg continue; 769b8e80941Smrg 770b8e80941Smrg /* Don't merge in something that will lock the TLB. 771b8e80941Smrg * Hopwefully what we have in inst will release some 772b8e80941Smrg * other instructions, allowing us to delay the 773b8e80941Smrg * TLB-locking instruction until later. 774b8e80941Smrg */ 775b8e80941Smrg if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) 776b8e80941Smrg continue; 777b8e80941Smrg 778b8e80941Smrg struct v3d_qpu_instr merged_inst; 779b8e80941Smrg if (!qpu_merge_inst(devinfo, &merged_inst, 780b8e80941Smrg &prev_inst->inst->qpu, inst)) { 781b8e80941Smrg continue; 782b8e80941Smrg } 783b8e80941Smrg } 784b8e80941Smrg 785b8e80941Smrg int prio = get_instruction_priority(inst); 786b8e80941Smrg 787b8e80941Smrg /* Found a valid instruction. If nothing better comes along, 788b8e80941Smrg * this one works. 789b8e80941Smrg */ 790b8e80941Smrg if (!chosen) { 791b8e80941Smrg chosen = n; 792b8e80941Smrg chosen_prio = prio; 793b8e80941Smrg continue; 794b8e80941Smrg } 795b8e80941Smrg 796b8e80941Smrg if (prio > chosen_prio) { 797b8e80941Smrg chosen = n; 798b8e80941Smrg chosen_prio = prio; 799b8e80941Smrg } else if (prio < chosen_prio) { 800b8e80941Smrg continue; 801b8e80941Smrg } 802b8e80941Smrg 803b8e80941Smrg if (n->delay > chosen->delay) { 804b8e80941Smrg chosen = n; 805b8e80941Smrg chosen_prio = prio; 806b8e80941Smrg } else if (n->delay < chosen->delay) { 807b8e80941Smrg continue; 808b8e80941Smrg } 809b8e80941Smrg } 810b8e80941Smrg 811b8e80941Smrg return chosen; 812b8e80941Smrg} 813b8e80941Smrg 814b8e80941Smrgstatic void 815b8e80941Smrgupdate_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, 816b8e80941Smrg enum v3d_qpu_waddr waddr) 817b8e80941Smrg{ 818b8e80941Smrg if (v3d_qpu_magic_waddr_is_sfu(waddr)) 819b8e80941Smrg scoreboard->last_magic_sfu_write_tick = scoreboard->tick; 820b8e80941Smrg} 821b8e80941Smrg 822b8e80941Smrgstatic void 823b8e80941Smrgupdate_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 824b8e80941Smrg const struct v3d_qpu_instr *inst) 825b8e80941Smrg{ 826b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 827b8e80941Smrg return; 828b8e80941Smrg 829b8e80941Smrg assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 830b8e80941Smrg 831b8e80941Smrg if (inst->alu.add.op != V3D_QPU_A_NOP) { 832b8e80941Smrg if (inst->alu.add.magic_write) { 833b8e80941Smrg update_scoreboard_for_magic_waddr(scoreboard, 834b8e80941Smrg inst->alu.add.waddr); 835b8e80941Smrg } 836b8e80941Smrg } 837b8e80941Smrg 838b8e80941Smrg if (inst->alu.mul.op != V3D_QPU_M_NOP) { 839b8e80941Smrg if (inst->alu.mul.magic_write) { 840b8e80941Smrg update_scoreboard_for_magic_waddr(scoreboard, 841b8e80941Smrg inst->alu.mul.waddr); 842b8e80941Smrg } 843b8e80941Smrg } 844b8e80941Smrg 845b8e80941Smrg if (inst->sig.ldvary) 846b8e80941Smrg scoreboard->last_ldvary_tick = scoreboard->tick; 847b8e80941Smrg 848b8e80941Smrg if (qpu_inst_is_tlb(inst)) 849b8e80941Smrg scoreboard->tlb_locked = true; 850b8e80941Smrg} 851b8e80941Smrg 852b8e80941Smrgstatic void 853b8e80941Smrgdump_state(const struct v3d_device_info *devinfo, struct dag *dag) 854b8e80941Smrg{ 855b8e80941Smrg list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { 856b8e80941Smrg fprintf(stderr, " t=%4d: ", n->unblocked_time); 857b8e80941Smrg v3d_qpu_dump(devinfo, &n->inst->qpu); 858b8e80941Smrg fprintf(stderr, "\n"); 859b8e80941Smrg 860b8e80941Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 861b8e80941Smrg struct schedule_node *child = 862b8e80941Smrg (struct schedule_node *)edge->child; 863b8e80941Smrg if (!child) 864b8e80941Smrg continue; 865b8e80941Smrg 866b8e80941Smrg fprintf(stderr, " - "); 867b8e80941Smrg v3d_qpu_dump(devinfo, &child->inst->qpu); 868b8e80941Smrg fprintf(stderr, " (%d parents, %c)\n", 869b8e80941Smrg child->dag.parent_count, 870b8e80941Smrg edge->data ? 'w' : 'r'); 871b8e80941Smrg } 872b8e80941Smrg } 873b8e80941Smrg} 874b8e80941Smrg 875b8e80941Smrgstatic uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, 876b8e80941Smrg const struct v3d_qpu_instr *after) 877b8e80941Smrg{ 878b8e80941Smrg /* Apply some huge latency between texture fetch requests and getting 879b8e80941Smrg * their results back. 880b8e80941Smrg * 881b8e80941Smrg * FIXME: This is actually pretty bogus. If we do: 882b8e80941Smrg * 883b8e80941Smrg * mov tmu0_s, a 884b8e80941Smrg * <a bit of math> 885b8e80941Smrg * mov tmu0_s, b 886b8e80941Smrg * load_tmu0 887b8e80941Smrg * <more math> 888b8e80941Smrg * load_tmu0 889b8e80941Smrg * 890b8e80941Smrg * we count that as worse than 891b8e80941Smrg * 892b8e80941Smrg * mov tmu0_s, a 893b8e80941Smrg * mov tmu0_s, b 894b8e80941Smrg * <lots of math> 895b8e80941Smrg * load_tmu0 896b8e80941Smrg * <more math> 897b8e80941Smrg * load_tmu0 898b8e80941Smrg * 899b8e80941Smrg * because we associate the first load_tmu0 with the *second* tmu0_s. 900b8e80941Smrg */ 901b8e80941Smrg if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after)) 902b8e80941Smrg return 100; 903b8e80941Smrg 904b8e80941Smrg /* Assume that anything depending on us is consuming the SFU result. */ 905b8e80941Smrg if (v3d_qpu_magic_waddr_is_sfu(waddr)) 906b8e80941Smrg return 3; 907b8e80941Smrg 908b8e80941Smrg return 1; 909b8e80941Smrg} 910b8e80941Smrg 911b8e80941Smrgstatic uint32_t 912b8e80941Smrginstruction_latency(struct schedule_node *before, struct schedule_node *after) 913b8e80941Smrg{ 914b8e80941Smrg const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 915b8e80941Smrg const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 916b8e80941Smrg uint32_t latency = 1; 917b8e80941Smrg 918b8e80941Smrg if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 919b8e80941Smrg after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 920b8e80941Smrg return latency; 921b8e80941Smrg 922b8e80941Smrg if (before_inst->alu.add.magic_write) { 923b8e80941Smrg latency = MAX2(latency, 924b8e80941Smrg magic_waddr_latency(before_inst->alu.add.waddr, 925b8e80941Smrg after_inst)); 926b8e80941Smrg } 927b8e80941Smrg 928b8e80941Smrg if (before_inst->alu.mul.magic_write) { 929b8e80941Smrg latency = MAX2(latency, 930b8e80941Smrg magic_waddr_latency(before_inst->alu.mul.waddr, 931b8e80941Smrg after_inst)); 932b8e80941Smrg } 933b8e80941Smrg 934b8e80941Smrg return latency; 935b8e80941Smrg} 936b8e80941Smrg 937b8e80941Smrg/** Recursive computation of the delay member of a node. */ 938b8e80941Smrgstatic void 939b8e80941Smrgcompute_delay(struct dag_node *node, void *state) 940b8e80941Smrg{ 941b8e80941Smrg struct schedule_node *n = (struct schedule_node *)node; 942b8e80941Smrg 943b8e80941Smrg n->delay = 1; 944b8e80941Smrg 945b8e80941Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 946b8e80941Smrg struct schedule_node *child = 947b8e80941Smrg (struct schedule_node *)edge->child; 948b8e80941Smrg 949b8e80941Smrg n->delay = MAX2(n->delay, (child->delay + 950b8e80941Smrg instruction_latency(n, child))); 951b8e80941Smrg } 952b8e80941Smrg} 953b8e80941Smrg 954b8e80941Smrg/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 955b8e80941Smrg * should be called on it later to finish pruning the other edges). 956b8e80941Smrg */ 957b8e80941Smrgstatic void 958b8e80941Smrgpre_remove_head(struct dag *dag, struct schedule_node *n) 959b8e80941Smrg{ 960b8e80941Smrg list_delinit(&n->dag.link); 961b8e80941Smrg 962b8e80941Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 963b8e80941Smrg if (edge->data) 964b8e80941Smrg dag_remove_edge(dag, edge); 965b8e80941Smrg } 966b8e80941Smrg} 967b8e80941Smrg 968b8e80941Smrgstatic void 969b8e80941Smrgmark_instruction_scheduled(struct dag *dag, 970b8e80941Smrg uint32_t time, 971b8e80941Smrg struct schedule_node *node) 972b8e80941Smrg{ 973b8e80941Smrg if (!node) 974b8e80941Smrg return; 975b8e80941Smrg 976b8e80941Smrg util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 977b8e80941Smrg struct schedule_node *child = 978b8e80941Smrg (struct schedule_node *)edge->child; 979b8e80941Smrg 980b8e80941Smrg if (!child) 981b8e80941Smrg continue; 982b8e80941Smrg 983b8e80941Smrg uint32_t latency = instruction_latency(node, child); 984b8e80941Smrg 985b8e80941Smrg child->unblocked_time = MAX2(child->unblocked_time, 986b8e80941Smrg time + latency); 987b8e80941Smrg } 988b8e80941Smrg dag_prune_head(dag, &node->dag); 989b8e80941Smrg} 990b8e80941Smrg 991b8e80941Smrgstatic void 992b8e80941Smrginsert_scheduled_instruction(struct v3d_compile *c, 993b8e80941Smrg struct qblock *block, 994b8e80941Smrg struct choose_scoreboard *scoreboard, 995b8e80941Smrg struct qinst *inst) 996b8e80941Smrg{ 997b8e80941Smrg list_addtail(&inst->link, &block->instructions); 998b8e80941Smrg 999b8e80941Smrg update_scoreboard_for_chosen(scoreboard, &inst->qpu); 1000b8e80941Smrg c->qpu_inst_count++; 1001b8e80941Smrg scoreboard->tick++; 1002b8e80941Smrg} 1003b8e80941Smrg 1004b8e80941Smrgstatic struct qinst * 1005b8e80941Smrgvir_nop() 1006b8e80941Smrg{ 1007b8e80941Smrg struct qreg undef = vir_nop_reg(); 1008b8e80941Smrg struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); 1009b8e80941Smrg 1010b8e80941Smrg return qinst; 1011b8e80941Smrg} 1012b8e80941Smrg 1013b8e80941Smrgstatic void 1014b8e80941Smrgemit_nop(struct v3d_compile *c, struct qblock *block, 1015b8e80941Smrg struct choose_scoreboard *scoreboard) 1016b8e80941Smrg{ 1017b8e80941Smrg insert_scheduled_instruction(c, block, scoreboard, vir_nop()); 1018b8e80941Smrg} 1019b8e80941Smrg 1020b8e80941Smrgstatic bool 1021b8e80941Smrgqpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, 1022b8e80941Smrg const struct qinst *qinst, int slot) 1023b8e80941Smrg{ 1024b8e80941Smrg const struct v3d_qpu_instr *inst = &qinst->qpu; 1025b8e80941Smrg 1026b8e80941Smrg /* Only TLB Z writes are prohibited in the last slot, but we don't 1027b8e80941Smrg * have those flagged so prohibit all TLB ops for now. 1028b8e80941Smrg */ 1029b8e80941Smrg if (slot == 2 && qpu_inst_is_tlb(inst)) 1030b8e80941Smrg return false; 1031b8e80941Smrg 1032b8e80941Smrg if (slot > 0 && qinst->uniform != ~0) 1033b8e80941Smrg return false; 1034b8e80941Smrg 1035b8e80941Smrg if (v3d_qpu_uses_vpm(inst)) 1036b8e80941Smrg return false; 1037b8e80941Smrg 1038b8e80941Smrg if (inst->sig.ldvary) 1039b8e80941Smrg return false; 1040b8e80941Smrg 1041b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 1042b8e80941Smrg /* GFXH-1625: TMUWT not allowed in the final instruction. */ 1043b8e80941Smrg if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) 1044b8e80941Smrg return false; 1045b8e80941Smrg 1046b8e80941Smrg /* No writing physical registers at the end. */ 1047b8e80941Smrg if (!inst->alu.add.magic_write || 1048b8e80941Smrg !inst->alu.mul.magic_write) { 1049b8e80941Smrg return false; 1050b8e80941Smrg } 1051b8e80941Smrg 1052b8e80941Smrg if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) 1053b8e80941Smrg return false; 1054b8e80941Smrg 1055b8e80941Smrg /* RF0-2 might be overwritten during the delay slots by 1056b8e80941Smrg * fragment shader setup. 1057b8e80941Smrg */ 1058b8e80941Smrg if (inst->raddr_a < 3 && 1059b8e80941Smrg (inst->alu.add.a == V3D_QPU_MUX_A || 1060b8e80941Smrg inst->alu.add.b == V3D_QPU_MUX_A || 1061b8e80941Smrg inst->alu.mul.a == V3D_QPU_MUX_A || 1062b8e80941Smrg inst->alu.mul.b == V3D_QPU_MUX_A)) { 1063b8e80941Smrg return false; 1064b8e80941Smrg } 1065b8e80941Smrg 1066b8e80941Smrg if (inst->raddr_b < 3 && 1067b8e80941Smrg !inst->sig.small_imm && 1068b8e80941Smrg (inst->alu.add.a == V3D_QPU_MUX_B || 1069b8e80941Smrg inst->alu.add.b == V3D_QPU_MUX_B || 1070b8e80941Smrg inst->alu.mul.a == V3D_QPU_MUX_B || 1071b8e80941Smrg inst->alu.mul.b == V3D_QPU_MUX_B)) { 1072b8e80941Smrg return false; 1073b8e80941Smrg } 1074b8e80941Smrg } 1075b8e80941Smrg 1076b8e80941Smrg return true; 1077b8e80941Smrg} 1078b8e80941Smrg 1079b8e80941Smrgstatic bool 1080b8e80941Smrgvalid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, 1081b8e80941Smrg struct qinst *qinst, int instructions_in_sequence, 1082b8e80941Smrg bool is_thrend) 1083b8e80941Smrg{ 1084b8e80941Smrg /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ 1085b8e80941Smrg if (scoreboard->last_thrsw_tick + 3 > 1086b8e80941Smrg scoreboard->tick - instructions_in_sequence) { 1087b8e80941Smrg return false; 1088b8e80941Smrg } 1089b8e80941Smrg 1090b8e80941Smrg for (int slot = 0; slot < instructions_in_sequence; slot++) { 1091b8e80941Smrg /* No scheduling SFU when the result would land in the other 1092b8e80941Smrg * thread. The simulator complains for safety, though it 1093b8e80941Smrg * would only occur for dead code in our case. 1094b8e80941Smrg */ 1095b8e80941Smrg if (slot > 0 && 1096b8e80941Smrg qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1097b8e80941Smrg (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || 1098b8e80941Smrg v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { 1099b8e80941Smrg return false; 1100b8e80941Smrg } 1101b8e80941Smrg 1102b8e80941Smrg if (slot > 0 && qinst->qpu.sig.ldvary) 1103b8e80941Smrg return false; 1104b8e80941Smrg 1105b8e80941Smrg if (is_thrend && 1106b8e80941Smrg !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) { 1107b8e80941Smrg return false; 1108b8e80941Smrg } 1109b8e80941Smrg 1110b8e80941Smrg /* Note that the list is circular, so we can only do this up 1111b8e80941Smrg * to instructions_in_sequence. 1112b8e80941Smrg */ 1113b8e80941Smrg qinst = (struct qinst *)qinst->link.next; 1114b8e80941Smrg } 1115b8e80941Smrg 1116b8e80941Smrg return true; 1117b8e80941Smrg} 1118b8e80941Smrg 1119b8e80941Smrg/** 1120b8e80941Smrg * Emits a THRSW signal in the stream, trying to move it up to pair with 1121b8e80941Smrg * another instruction. 1122b8e80941Smrg */ 1123b8e80941Smrgstatic int 1124b8e80941Smrgemit_thrsw(struct v3d_compile *c, 1125b8e80941Smrg struct qblock *block, 1126b8e80941Smrg struct choose_scoreboard *scoreboard, 1127b8e80941Smrg struct qinst *inst, 1128b8e80941Smrg bool is_thrend) 1129b8e80941Smrg{ 1130b8e80941Smrg int time = 0; 1131b8e80941Smrg 1132b8e80941Smrg /* There should be nothing in a thrsw inst being scheduled other than 1133b8e80941Smrg * the signal bits. 1134b8e80941Smrg */ 1135b8e80941Smrg assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 1136b8e80941Smrg assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 1137b8e80941Smrg assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 1138b8e80941Smrg 1139b8e80941Smrg /* Find how far back into previous instructions we can put the THRSW. */ 1140b8e80941Smrg int slots_filled = 0; 1141b8e80941Smrg struct qinst *merge_inst = NULL; 1142b8e80941Smrg vir_for_each_inst_rev(prev_inst, block) { 1143b8e80941Smrg struct v3d_qpu_sig sig = prev_inst->qpu.sig; 1144b8e80941Smrg sig.thrsw = true; 1145b8e80941Smrg uint32_t packed_sig; 1146b8e80941Smrg 1147b8e80941Smrg if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) 1148b8e80941Smrg break; 1149b8e80941Smrg 1150b8e80941Smrg if (!valid_thrsw_sequence(c, scoreboard, 1151b8e80941Smrg prev_inst, slots_filled + 1, 1152b8e80941Smrg is_thrend)) { 1153b8e80941Smrg break; 1154b8e80941Smrg } 1155b8e80941Smrg 1156b8e80941Smrg merge_inst = prev_inst; 1157b8e80941Smrg if (++slots_filled == 3) 1158b8e80941Smrg break; 1159b8e80941Smrg } 1160b8e80941Smrg 1161b8e80941Smrg bool needs_free = false; 1162b8e80941Smrg if (merge_inst) { 1163b8e80941Smrg merge_inst->qpu.sig.thrsw = true; 1164b8e80941Smrg needs_free = true; 1165b8e80941Smrg scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 1166b8e80941Smrg } else { 1167b8e80941Smrg scoreboard->last_thrsw_tick = scoreboard->tick; 1168b8e80941Smrg insert_scheduled_instruction(c, block, scoreboard, inst); 1169b8e80941Smrg time++; 1170b8e80941Smrg slots_filled++; 1171b8e80941Smrg merge_inst = inst; 1172b8e80941Smrg } 1173b8e80941Smrg 1174b8e80941Smrg /* Insert any extra delay slot NOPs we need. */ 1175b8e80941Smrg for (int i = 0; i < 3 - slots_filled; i++) { 1176b8e80941Smrg emit_nop(c, block, scoreboard); 1177b8e80941Smrg time++; 1178b8e80941Smrg } 1179b8e80941Smrg 1180b8e80941Smrg /* If we're emitting the last THRSW (other than program end), then 1181b8e80941Smrg * signal that to the HW by emitting two THRSWs in a row. 1182b8e80941Smrg */ 1183b8e80941Smrg if (inst->is_last_thrsw) { 1184b8e80941Smrg struct qinst *second_inst = 1185b8e80941Smrg (struct qinst *)merge_inst->link.next; 1186b8e80941Smrg second_inst->qpu.sig.thrsw = true; 1187b8e80941Smrg } 1188b8e80941Smrg 1189b8e80941Smrg /* If we put our THRSW into another instruction, free up the 1190b8e80941Smrg * instruction that didn't end up scheduled into the list. 1191b8e80941Smrg */ 1192b8e80941Smrg if (needs_free) 1193b8e80941Smrg free(inst); 1194b8e80941Smrg 1195b8e80941Smrg return time; 1196b8e80941Smrg} 1197b8e80941Smrg 1198b8e80941Smrgstatic uint32_t 1199b8e80941Smrgschedule_instructions(struct v3d_compile *c, 1200b8e80941Smrg struct choose_scoreboard *scoreboard, 1201b8e80941Smrg struct qblock *block, 1202b8e80941Smrg enum quniform_contents *orig_uniform_contents, 1203b8e80941Smrg uint32_t *orig_uniform_data, 1204b8e80941Smrg uint32_t *next_uniform) 1205b8e80941Smrg{ 1206b8e80941Smrg const struct v3d_device_info *devinfo = c->devinfo; 1207b8e80941Smrg uint32_t time = 0; 1208b8e80941Smrg 1209b8e80941Smrg while (!list_empty(&scoreboard->dag->heads)) { 1210b8e80941Smrg struct schedule_node *chosen = 1211b8e80941Smrg choose_instruction_to_schedule(devinfo, 1212b8e80941Smrg scoreboard, 1213b8e80941Smrg NULL); 1214b8e80941Smrg struct schedule_node *merge = NULL; 1215b8e80941Smrg 1216b8e80941Smrg /* If there are no valid instructions to schedule, drop a NOP 1217b8e80941Smrg * in. 1218b8e80941Smrg */ 1219b8e80941Smrg struct qinst *qinst = chosen ? chosen->inst : vir_nop(); 1220b8e80941Smrg struct v3d_qpu_instr *inst = &qinst->qpu; 1221b8e80941Smrg 1222b8e80941Smrg if (debug) { 1223b8e80941Smrg fprintf(stderr, "t=%4d: current list:\n", 1224b8e80941Smrg time); 1225b8e80941Smrg dump_state(devinfo, scoreboard->dag); 1226b8e80941Smrg fprintf(stderr, "t=%4d: chose: ", time); 1227b8e80941Smrg v3d_qpu_dump(devinfo, inst); 1228b8e80941Smrg fprintf(stderr, "\n"); 1229b8e80941Smrg } 1230b8e80941Smrg 1231b8e80941Smrg /* We can't mark_instruction_scheduled() the chosen inst until 1232b8e80941Smrg * we're done identifying instructions to merge, so put the 1233b8e80941Smrg * merged instructions on a list for a moment. 1234b8e80941Smrg */ 1235b8e80941Smrg struct list_head merged_list; 1236b8e80941Smrg list_inithead(&merged_list); 1237b8e80941Smrg 1238b8e80941Smrg /* Schedule this instruction onto the QPU list. Also try to 1239b8e80941Smrg * find an instruction to pair with it. 1240b8e80941Smrg */ 1241b8e80941Smrg if (chosen) { 1242b8e80941Smrg time = MAX2(chosen->unblocked_time, time); 1243b8e80941Smrg pre_remove_head(scoreboard->dag, chosen); 1244b8e80941Smrg 1245b8e80941Smrg while ((merge = 1246b8e80941Smrg choose_instruction_to_schedule(devinfo, 1247b8e80941Smrg scoreboard, 1248b8e80941Smrg chosen))) { 1249b8e80941Smrg time = MAX2(merge->unblocked_time, time); 1250b8e80941Smrg pre_remove_head(scoreboard->dag, chosen); 1251b8e80941Smrg list_addtail(&merge->link, &merged_list); 1252b8e80941Smrg (void)qpu_merge_inst(devinfo, inst, 1253b8e80941Smrg inst, &merge->inst->qpu); 1254b8e80941Smrg if (merge->inst->uniform != -1) { 1255b8e80941Smrg chosen->inst->uniform = 1256b8e80941Smrg merge->inst->uniform; 1257b8e80941Smrg } 1258b8e80941Smrg 1259b8e80941Smrg if (debug) { 1260b8e80941Smrg fprintf(stderr, "t=%4d: merging: ", 1261b8e80941Smrg time); 1262b8e80941Smrg v3d_qpu_dump(devinfo, &merge->inst->qpu); 1263b8e80941Smrg fprintf(stderr, "\n"); 1264b8e80941Smrg fprintf(stderr, " result: "); 1265b8e80941Smrg v3d_qpu_dump(devinfo, inst); 1266b8e80941Smrg fprintf(stderr, "\n"); 1267b8e80941Smrg } 1268b8e80941Smrg } 1269b8e80941Smrg } 1270b8e80941Smrg 1271b8e80941Smrg /* Update the uniform index for the rewritten location -- 1272b8e80941Smrg * branch target updating will still need to change 1273b8e80941Smrg * c->uniform_data[] using this index. 1274b8e80941Smrg */ 1275b8e80941Smrg if (qinst->uniform != -1) { 1276b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 1277b8e80941Smrg block->branch_uniform = *next_uniform; 1278b8e80941Smrg 1279b8e80941Smrg c->uniform_data[*next_uniform] = 1280b8e80941Smrg orig_uniform_data[qinst->uniform]; 1281b8e80941Smrg c->uniform_contents[*next_uniform] = 1282b8e80941Smrg orig_uniform_contents[qinst->uniform]; 1283b8e80941Smrg qinst->uniform = *next_uniform; 1284b8e80941Smrg (*next_uniform)++; 1285b8e80941Smrg } 1286b8e80941Smrg 1287b8e80941Smrg if (debug) { 1288b8e80941Smrg fprintf(stderr, "\n"); 1289b8e80941Smrg } 1290b8e80941Smrg 1291b8e80941Smrg /* Now that we've scheduled a new instruction, some of its 1292b8e80941Smrg * children can be promoted to the list of instructions ready to 1293b8e80941Smrg * be scheduled. Update the children's unblocked time for this 1294b8e80941Smrg * DAG edge as we do so. 1295b8e80941Smrg */ 1296b8e80941Smrg mark_instruction_scheduled(scoreboard->dag, time, chosen); 1297b8e80941Smrg list_for_each_entry(struct schedule_node, merge, &merged_list, 1298b8e80941Smrg link) { 1299b8e80941Smrg mark_instruction_scheduled(scoreboard->dag, time, merge); 1300b8e80941Smrg 1301b8e80941Smrg /* The merged VIR instruction doesn't get re-added to the 1302b8e80941Smrg * block, so free it now. 1303b8e80941Smrg */ 1304b8e80941Smrg free(merge->inst); 1305b8e80941Smrg } 1306b8e80941Smrg 1307b8e80941Smrg if (inst->sig.thrsw) { 1308b8e80941Smrg time += emit_thrsw(c, block, scoreboard, qinst, false); 1309b8e80941Smrg } else { 1310b8e80941Smrg insert_scheduled_instruction(c, block, 1311b8e80941Smrg scoreboard, qinst); 1312b8e80941Smrg 1313b8e80941Smrg if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1314b8e80941Smrg block->branch_qpu_ip = c->qpu_inst_count - 1; 1315b8e80941Smrg /* Fill the delay slots. 1316b8e80941Smrg * 1317b8e80941Smrg * We should fill these with actual instructions, 1318b8e80941Smrg * instead, but that will probably need to be done 1319b8e80941Smrg * after this, once we know what the leading 1320b8e80941Smrg * instructions of the successors are (so we can 1321b8e80941Smrg * handle A/B register file write latency) 1322b8e80941Smrg */ 1323b8e80941Smrg for (int i = 0; i < 3; i++) 1324b8e80941Smrg emit_nop(c, block, scoreboard); 1325b8e80941Smrg } 1326b8e80941Smrg } 1327b8e80941Smrg } 1328b8e80941Smrg 1329b8e80941Smrg return time; 1330b8e80941Smrg} 1331b8e80941Smrg 1332b8e80941Smrgstatic uint32_t 1333b8e80941Smrgqpu_schedule_instructions_block(struct v3d_compile *c, 1334b8e80941Smrg struct choose_scoreboard *scoreboard, 1335b8e80941Smrg struct qblock *block, 1336b8e80941Smrg enum quniform_contents *orig_uniform_contents, 1337b8e80941Smrg uint32_t *orig_uniform_data, 1338b8e80941Smrg uint32_t *next_uniform) 1339b8e80941Smrg{ 1340b8e80941Smrg void *mem_ctx = ralloc_context(NULL); 1341b8e80941Smrg scoreboard->dag = dag_create(mem_ctx); 1342b8e80941Smrg struct list_head setup_list; 1343b8e80941Smrg 1344b8e80941Smrg list_inithead(&setup_list); 1345b8e80941Smrg 1346b8e80941Smrg /* Wrap each instruction in a scheduler structure. */ 1347b8e80941Smrg while (!list_empty(&block->instructions)) { 1348b8e80941Smrg struct qinst *qinst = (struct qinst *)block->instructions.next; 1349b8e80941Smrg struct schedule_node *n = 1350b8e80941Smrg rzalloc(mem_ctx, struct schedule_node); 1351b8e80941Smrg 1352b8e80941Smrg dag_init_node(scoreboard->dag, &n->dag); 1353b8e80941Smrg n->inst = qinst; 1354b8e80941Smrg 1355b8e80941Smrg list_del(&qinst->link); 1356b8e80941Smrg list_addtail(&n->link, &setup_list); 1357b8e80941Smrg } 1358b8e80941Smrg 1359b8e80941Smrg calculate_forward_deps(c, scoreboard->dag, &setup_list); 1360b8e80941Smrg calculate_reverse_deps(c, scoreboard->dag, &setup_list); 1361b8e80941Smrg 1362b8e80941Smrg dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL); 1363b8e80941Smrg 1364b8e80941Smrg uint32_t cycles = schedule_instructions(c, scoreboard, block, 1365b8e80941Smrg orig_uniform_contents, 1366b8e80941Smrg orig_uniform_data, 1367b8e80941Smrg next_uniform); 1368b8e80941Smrg 1369b8e80941Smrg ralloc_free(mem_ctx); 1370b8e80941Smrg scoreboard->dag = NULL; 1371b8e80941Smrg 1372b8e80941Smrg return cycles; 1373b8e80941Smrg} 1374b8e80941Smrg 1375b8e80941Smrgstatic void 1376b8e80941Smrgqpu_set_branch_targets(struct v3d_compile *c) 1377b8e80941Smrg{ 1378b8e80941Smrg vir_for_each_block(block, c) { 1379b8e80941Smrg /* The end block of the program has no branch. */ 1380b8e80941Smrg if (!block->successors[0]) 1381b8e80941Smrg continue; 1382b8e80941Smrg 1383b8e80941Smrg /* If there was no branch instruction, then the successor 1384b8e80941Smrg * block must follow immediately after this one. 1385b8e80941Smrg */ 1386b8e80941Smrg if (block->branch_qpu_ip == ~0) { 1387b8e80941Smrg assert(block->end_qpu_ip + 1 == 1388b8e80941Smrg block->successors[0]->start_qpu_ip); 1389b8e80941Smrg continue; 1390b8e80941Smrg } 1391b8e80941Smrg 1392b8e80941Smrg /* Walk back through the delay slots to find the branch 1393b8e80941Smrg * instr. 1394b8e80941Smrg */ 1395b8e80941Smrg struct list_head *entry = block->instructions.prev; 1396b8e80941Smrg for (int i = 0; i < 3; i++) 1397b8e80941Smrg entry = entry->prev; 1398b8e80941Smrg struct qinst *branch = container_of(entry, branch, link); 1399b8e80941Smrg assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 1400b8e80941Smrg 1401b8e80941Smrg /* Make sure that the if-we-don't-jump 1402b8e80941Smrg * successor was scheduled just after the 1403b8e80941Smrg * delay slots. 1404b8e80941Smrg */ 1405b8e80941Smrg assert(!block->successors[1] || 1406b8e80941Smrg block->successors[1]->start_qpu_ip == 1407b8e80941Smrg block->branch_qpu_ip + 4); 1408b8e80941Smrg 1409b8e80941Smrg branch->qpu.branch.offset = 1410b8e80941Smrg ((block->successors[0]->start_qpu_ip - 1411b8e80941Smrg (block->branch_qpu_ip + 4)) * 1412b8e80941Smrg sizeof(uint64_t)); 1413b8e80941Smrg 1414b8e80941Smrg /* Set up the relative offset to jump in the 1415b8e80941Smrg * uniform stream. 1416b8e80941Smrg * 1417b8e80941Smrg * Use a temporary here, because 1418b8e80941Smrg * uniform_data[inst->uniform] may be shared 1419b8e80941Smrg * between multiple instructions. 1420b8e80941Smrg */ 1421b8e80941Smrg assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); 1422b8e80941Smrg c->uniform_data[branch->uniform] = 1423b8e80941Smrg (block->successors[0]->start_uniform - 1424b8e80941Smrg (block->branch_uniform + 1)) * 4; 1425b8e80941Smrg } 1426b8e80941Smrg} 1427b8e80941Smrg 1428b8e80941Smrguint32_t 1429b8e80941Smrgv3d_qpu_schedule_instructions(struct v3d_compile *c) 1430b8e80941Smrg{ 1431b8e80941Smrg const struct v3d_device_info *devinfo = c->devinfo; 1432b8e80941Smrg struct qblock *end_block = list_last_entry(&c->blocks, 1433b8e80941Smrg struct qblock, link); 1434b8e80941Smrg 1435b8e80941Smrg /* We reorder the uniforms as we schedule instructions, so save the 1436b8e80941Smrg * old data off and replace it. 1437b8e80941Smrg */ 1438b8e80941Smrg uint32_t *uniform_data = c->uniform_data; 1439b8e80941Smrg enum quniform_contents *uniform_contents = c->uniform_contents; 1440b8e80941Smrg c->uniform_contents = ralloc_array(c, enum quniform_contents, 1441b8e80941Smrg c->num_uniforms); 1442b8e80941Smrg c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 1443b8e80941Smrg c->uniform_array_size = c->num_uniforms; 1444b8e80941Smrg uint32_t next_uniform = 0; 1445b8e80941Smrg 1446b8e80941Smrg struct choose_scoreboard scoreboard; 1447b8e80941Smrg memset(&scoreboard, 0, sizeof(scoreboard)); 1448b8e80941Smrg scoreboard.last_ldvary_tick = -10; 1449b8e80941Smrg scoreboard.last_magic_sfu_write_tick = -10; 1450b8e80941Smrg scoreboard.last_uniforms_reset_tick = -10; 1451b8e80941Smrg scoreboard.last_thrsw_tick = -10; 1452b8e80941Smrg 1453b8e80941Smrg if (debug) { 1454b8e80941Smrg fprintf(stderr, "Pre-schedule instructions\n"); 1455b8e80941Smrg vir_for_each_block(block, c) { 1456b8e80941Smrg fprintf(stderr, "BLOCK %d\n", block->index); 1457b8e80941Smrg list_for_each_entry(struct qinst, qinst, 1458b8e80941Smrg &block->instructions, link) { 1459b8e80941Smrg v3d_qpu_dump(devinfo, &qinst->qpu); 1460b8e80941Smrg fprintf(stderr, "\n"); 1461b8e80941Smrg } 1462b8e80941Smrg } 1463b8e80941Smrg fprintf(stderr, "\n"); 1464b8e80941Smrg } 1465b8e80941Smrg 1466b8e80941Smrg uint32_t cycles = 0; 1467b8e80941Smrg vir_for_each_block(block, c) { 1468b8e80941Smrg block->start_qpu_ip = c->qpu_inst_count; 1469b8e80941Smrg block->branch_qpu_ip = ~0; 1470b8e80941Smrg block->start_uniform = next_uniform; 1471b8e80941Smrg 1472b8e80941Smrg cycles += qpu_schedule_instructions_block(c, 1473b8e80941Smrg &scoreboard, 1474b8e80941Smrg block, 1475b8e80941Smrg uniform_contents, 1476b8e80941Smrg uniform_data, 1477b8e80941Smrg &next_uniform); 1478b8e80941Smrg 1479b8e80941Smrg block->end_qpu_ip = c->qpu_inst_count - 1; 1480b8e80941Smrg } 1481b8e80941Smrg 1482b8e80941Smrg /* Emit the program-end THRSW instruction. */; 1483b8e80941Smrg struct qinst *thrsw = vir_nop(); 1484b8e80941Smrg thrsw->qpu.sig.thrsw = true; 1485b8e80941Smrg emit_thrsw(c, end_block, &scoreboard, thrsw, true); 1486b8e80941Smrg 1487b8e80941Smrg qpu_set_branch_targets(c); 1488b8e80941Smrg 1489b8e80941Smrg assert(next_uniform == c->num_uniforms); 1490b8e80941Smrg 1491b8e80941Smrg return cycles; 1492b8e80941Smrg} 1493