qpu_schedule.c revision b8e80941
1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014-2017 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30 * pick a DAG head, then put all the children that are now DAG heads into the 31 * list of things to schedule. 32 * 33 * The goal of scheduling here is to pack pairs of operations together in a 34 * single QPU instruction. 35 */ 36 37#include "qpu/qpu_disasm.h" 38#include "v3d_compiler.h" 39#include "util/ralloc.h" 40#include "util/dag.h" 41 42static bool debug; 43 44struct schedule_node_child; 45 46struct schedule_node { 47 struct dag_node dag; 48 struct list_head link; 49 struct qinst *inst; 50 51 /* Longest cycles + instruction_latency() of any parent of this node. */ 52 uint32_t unblocked_time; 53 54 /** 55 * Minimum number of cycles from scheduling this instruction until the 56 * end of the program, based on the slowest dependency chain through 57 * the children. 58 */ 59 uint32_t delay; 60 61 /** 62 * cycles between this instruction being scheduled and when its result 63 * can be consumed. 64 */ 65 uint32_t latency; 66}; 67 68/* When walking the instructions in reverse, we need to swap before/after in 69 * add_dep(). 70 */ 71enum direction { F, R }; 72 73struct schedule_state { 74 const struct v3d_device_info *devinfo; 75 struct dag *dag; 76 struct schedule_node *last_r[6]; 77 struct schedule_node *last_rf[64]; 78 struct schedule_node *last_sf; 79 struct schedule_node *last_vpm_read; 80 struct schedule_node *last_tmu_write; 81 struct schedule_node *last_tmu_config; 82 struct schedule_node *last_tlb; 83 struct schedule_node *last_vpm; 84 struct schedule_node *last_unif; 85 struct schedule_node *last_rtop; 86 enum direction dir; 87 /* Estimated cycle when the current instruction would start. */ 88 uint32_t time; 89}; 90 91static void 92add_dep(struct schedule_state *state, 93 struct schedule_node *before, 94 struct schedule_node *after, 95 bool write) 96{ 97 bool write_after_read = !write && state->dir == R; 98 void *edge_data = (void *)(uintptr_t)write_after_read; 99 100 if (!before || !after) 101 return; 102 103 assert(before != after); 104 105 if (state->dir == F) 106 dag_add_edge(&before->dag, &after->dag, edge_data); 107 else 108 dag_add_edge(&after->dag, &before->dag, edge_data); 109} 110 111static void 112add_read_dep(struct schedule_state *state, 113 struct schedule_node *before, 114 struct schedule_node *after) 115{ 116 add_dep(state, before, after, false); 117} 118 119static void 120add_write_dep(struct schedule_state *state, 121 struct schedule_node **before, 122 struct schedule_node *after) 123{ 124 add_dep(state, *before, after, true); 125 *before = after; 126} 127 128static bool 129qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 130{ 131 if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 132 return false; 133 134 if (inst->alu.add.magic_write && 135 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 136 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 137 return true; 138 139 if (inst->alu.mul.magic_write && 140 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 141 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 142 return true; 143 144 return false; 145} 146 147static void 148process_mux_deps(struct schedule_state *state, struct schedule_node *n, 149 enum v3d_qpu_mux mux) 150{ 151 switch (mux) { 152 case V3D_QPU_MUX_A: 153 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 154 break; 155 case V3D_QPU_MUX_B: 156 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); 157 break; 158 default: 159 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 160 break; 161 } 162} 163 164 165static void 166process_waddr_deps(struct schedule_state *state, struct schedule_node *n, 167 uint32_t waddr, bool magic) 168{ 169 if (!magic) { 170 add_write_dep(state, &state->last_rf[waddr], n); 171 } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { 172 /* XXX perf: For V3D 4.x, we could reorder TMU writes other 173 * than the TMUS/TMUD/TMUA to improve scheduling flexibility. 174 */ 175 add_write_dep(state, &state->last_tmu_write, n); 176 switch (waddr) { 177 case V3D_QPU_WADDR_TMUS: 178 case V3D_QPU_WADDR_TMUSCM: 179 case V3D_QPU_WADDR_TMUSF: 180 case V3D_QPU_WADDR_TMUSLOD: 181 add_write_dep(state, &state->last_tmu_config, n); 182 break; 183 default: 184 break; 185 } 186 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 187 /* Handled by v3d_qpu_writes_r4() check. */ 188 } else { 189 switch (waddr) { 190 case V3D_QPU_WADDR_R0: 191 case V3D_QPU_WADDR_R1: 192 case V3D_QPU_WADDR_R2: 193 add_write_dep(state, 194 &state->last_r[waddr - V3D_QPU_WADDR_R0], 195 n); 196 break; 197 case V3D_QPU_WADDR_R3: 198 case V3D_QPU_WADDR_R4: 199 case V3D_QPU_WADDR_R5: 200 /* Handled by v3d_qpu_writes_r*() checks below. */ 201 break; 202 203 case V3D_QPU_WADDR_VPM: 204 case V3D_QPU_WADDR_VPMU: 205 add_write_dep(state, &state->last_vpm, n); 206 break; 207 208 case V3D_QPU_WADDR_TLB: 209 case V3D_QPU_WADDR_TLBU: 210 add_write_dep(state, &state->last_tlb, n); 211 break; 212 213 case V3D_QPU_WADDR_SYNC: 214 case V3D_QPU_WADDR_SYNCB: 215 case V3D_QPU_WADDR_SYNCU: 216 /* For CS barrier(): Sync against any other memory 217 * accesses. There doesn't appear to be any need for 218 * barriers to affect ALU operations. 219 */ 220 add_write_dep(state, &state->last_tmu_write, n); 221 break; 222 223 case V3D_QPU_WADDR_NOP: 224 break; 225 226 default: 227 fprintf(stderr, "Unknown waddr %d\n", waddr); 228 abort(); 229 } 230 } 231} 232 233/** 234 * Common code for dependencies that need to be tracked both forward and 235 * backward. 236 * 237 * This is for things like "all reads of r4 have to happen between the r4 238 * writes that surround them". 239 */ 240static void 241calculate_deps(struct schedule_state *state, struct schedule_node *n) 242{ 243 const struct v3d_device_info *devinfo = state->devinfo; 244 struct qinst *qinst = n->inst; 245 struct v3d_qpu_instr *inst = &qinst->qpu; 246 /* If the input and output segments are shared, then all VPM reads to 247 * a location need to happen before all writes. We handle this by 248 * serializing all VPM operations for now. 249 */ 250 bool separate_vpm_segment = false; 251 252 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 253 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 254 add_read_dep(state, state->last_sf, n); 255 256 /* XXX: BDI */ 257 /* XXX: BDU */ 258 /* XXX: ub */ 259 /* XXX: raddr_a */ 260 261 add_write_dep(state, &state->last_unif, n); 262 return; 263 } 264 265 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 266 267 /* XXX: LOAD_IMM */ 268 269 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 270 process_mux_deps(state, n, inst->alu.add.a); 271 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 272 process_mux_deps(state, n, inst->alu.add.b); 273 274 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 275 process_mux_deps(state, n, inst->alu.mul.a); 276 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 277 process_mux_deps(state, n, inst->alu.mul.b); 278 279 switch (inst->alu.add.op) { 280 case V3D_QPU_A_VPMSETUP: 281 /* Could distinguish read/write by unpacking the uniform. */ 282 add_write_dep(state, &state->last_vpm, n); 283 add_write_dep(state, &state->last_vpm_read, n); 284 break; 285 286 case V3D_QPU_A_STVPMV: 287 case V3D_QPU_A_STVPMD: 288 case V3D_QPU_A_STVPMP: 289 add_write_dep(state, &state->last_vpm, n); 290 break; 291 292 case V3D_QPU_A_LDVPMV_IN: 293 case V3D_QPU_A_LDVPMD_IN: 294 case V3D_QPU_A_LDVPMG_IN: 295 case V3D_QPU_A_LDVPMP: 296 if (!separate_vpm_segment) 297 add_write_dep(state, &state->last_vpm, n); 298 break; 299 300 case V3D_QPU_A_VPMWT: 301 add_read_dep(state, state->last_vpm, n); 302 break; 303 304 case V3D_QPU_A_MSF: 305 add_read_dep(state, state->last_tlb, n); 306 break; 307 308 case V3D_QPU_A_SETMSF: 309 case V3D_QPU_A_SETREVF: 310 add_write_dep(state, &state->last_tlb, n); 311 break; 312 313 default: 314 break; 315 } 316 317 switch (inst->alu.mul.op) { 318 case V3D_QPU_M_MULTOP: 319 case V3D_QPU_M_UMUL24: 320 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 321 * resets it to 0. We could possibly reorder umul24s relative 322 * to each other, but for now just keep all the MUL parts in 323 * order. 324 */ 325 add_write_dep(state, &state->last_rtop, n); 326 break; 327 default: 328 break; 329 } 330 331 if (inst->alu.add.op != V3D_QPU_A_NOP) { 332 process_waddr_deps(state, n, inst->alu.add.waddr, 333 inst->alu.add.magic_write); 334 } 335 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 336 process_waddr_deps(state, n, inst->alu.mul.waddr, 337 inst->alu.mul.magic_write); 338 } 339 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 340 process_waddr_deps(state, n, inst->sig_addr, 341 inst->sig_magic); 342 } 343 344 if (v3d_qpu_writes_r3(devinfo, inst)) 345 add_write_dep(state, &state->last_r[3], n); 346 if (v3d_qpu_writes_r4(devinfo, inst)) 347 add_write_dep(state, &state->last_r[4], n); 348 if (v3d_qpu_writes_r5(devinfo, inst)) 349 add_write_dep(state, &state->last_r[5], n); 350 351 if (inst->sig.thrsw) { 352 /* All accumulator contents and flags are undefined after the 353 * switch. 354 */ 355 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 356 add_write_dep(state, &state->last_r[i], n); 357 add_write_dep(state, &state->last_sf, n); 358 add_write_dep(state, &state->last_rtop, n); 359 360 /* Scoreboard-locking operations have to stay after the last 361 * thread switch. 362 */ 363 add_write_dep(state, &state->last_tlb, n); 364 365 add_write_dep(state, &state->last_tmu_write, n); 366 add_write_dep(state, &state->last_tmu_config, n); 367 } 368 369 if (v3d_qpu_waits_on_tmu(inst)) { 370 /* TMU loads are coming from a FIFO, so ordering is important. 371 */ 372 add_write_dep(state, &state->last_tmu_write, n); 373 } 374 375 if (inst->sig.wrtmuc) 376 add_write_dep(state, &state->last_tmu_config, n); 377 378 if (inst->sig.ldtlb | inst->sig.ldtlbu) 379 add_read_dep(state, state->last_tlb, n); 380 381 if (inst->sig.ldvpm) { 382 add_write_dep(state, &state->last_vpm_read, n); 383 384 /* At least for now, we're doing shared I/O segments, so queue 385 * all writes after all reads. 386 */ 387 if (!separate_vpm_segment) 388 add_write_dep(state, &state->last_vpm, n); 389 } 390 391 /* inst->sig.ldunif or sideband uniform read */ 392 if (vir_has_uniform(qinst)) 393 add_write_dep(state, &state->last_unif, n); 394 395 if (v3d_qpu_reads_flags(inst)) 396 add_read_dep(state, state->last_sf, n); 397 if (v3d_qpu_writes_flags(inst)) 398 add_write_dep(state, &state->last_sf, n); 399} 400 401static void 402calculate_forward_deps(struct v3d_compile *c, struct dag *dag, 403 struct list_head *schedule_list) 404{ 405 struct schedule_state state; 406 407 memset(&state, 0, sizeof(state)); 408 state.dag = dag; 409 state.devinfo = c->devinfo; 410 state.dir = F; 411 412 list_for_each_entry(struct schedule_node, node, schedule_list, link) 413 calculate_deps(&state, node); 414} 415 416static void 417calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 418 struct list_head *schedule_list) 419{ 420 struct schedule_state state; 421 422 memset(&state, 0, sizeof(state)); 423 state.dag = dag; 424 state.devinfo = c->devinfo; 425 state.dir = R; 426 427 list_for_each_entry_rev(struct schedule_node, node, schedule_list, 428 link) { 429 calculate_deps(&state, (struct schedule_node *)node); 430 } 431} 432 433struct choose_scoreboard { 434 struct dag *dag; 435 int tick; 436 int last_magic_sfu_write_tick; 437 int last_ldvary_tick; 438 int last_uniforms_reset_tick; 439 int last_thrsw_tick; 440 bool tlb_locked; 441}; 442 443static bool 444mux_reads_too_soon(struct choose_scoreboard *scoreboard, 445 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 446{ 447 switch (mux) { 448 case V3D_QPU_MUX_R4: 449 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 450 return true; 451 break; 452 453 case V3D_QPU_MUX_R5: 454 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 455 return true; 456 break; 457 default: 458 break; 459 } 460 461 return false; 462} 463 464static bool 465reads_too_soon_after_write(struct choose_scoreboard *scoreboard, 466 struct qinst *qinst) 467{ 468 const struct v3d_qpu_instr *inst = &qinst->qpu; 469 470 /* XXX: Branching off of raddr. */ 471 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 472 return false; 473 474 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 475 476 if (inst->alu.add.op != V3D_QPU_A_NOP) { 477 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 478 mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 479 return true; 480 } 481 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 482 mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 483 return true; 484 } 485 } 486 487 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 488 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 489 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 490 return true; 491 } 492 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 493 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 494 return true; 495 } 496 } 497 498 /* XXX: imm */ 499 500 return false; 501} 502 503static bool 504writes_too_soon_after_write(const struct v3d_device_info *devinfo, 505 struct choose_scoreboard *scoreboard, 506 struct qinst *qinst) 507{ 508 const struct v3d_qpu_instr *inst = &qinst->qpu; 509 510 /* Don't schedule any other r4 write too soon after an SFU write. 511 * This would normally be prevented by dependency tracking, but might 512 * occur if a dead SFU computation makes it to scheduling. 513 */ 514 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 515 v3d_qpu_writes_r4(devinfo, inst)) 516 return true; 517 518 return false; 519} 520 521static bool 522pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, 523 const struct v3d_qpu_instr *inst) 524{ 525 return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); 526} 527 528static int 529get_instruction_priority(const struct v3d_qpu_instr *inst) 530{ 531 uint32_t baseline_score; 532 uint32_t next_score = 0; 533 534 /* Schedule TLB operations as late as possible, to get more 535 * parallelism between shaders. 536 */ 537 if (qpu_inst_is_tlb(inst)) 538 return next_score; 539 next_score++; 540 541 /* Schedule texture read results collection late to hide latency. */ 542 if (v3d_qpu_waits_on_tmu(inst)) 543 return next_score; 544 next_score++; 545 546 /* XXX perf: We should schedule SFU ALU ops so that the reader is 2 547 * instructions after the producer if possible, not just 1. 548 */ 549 550 /* Default score for things that aren't otherwise special. */ 551 baseline_score = next_score; 552 next_score++; 553 554 /* Schedule texture read setup early to hide their latency better. */ 555 if (v3d_qpu_writes_tmu(inst)) 556 return next_score; 557 next_score++; 558 559 return baseline_score; 560} 561 562static bool 563qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) 564{ 565 return (v3d_qpu_magic_waddr_is_tmu(waddr) || 566 v3d_qpu_magic_waddr_is_sfu(waddr) || 567 v3d_qpu_magic_waddr_is_tlb(waddr) || 568 v3d_qpu_magic_waddr_is_vpm(waddr) || 569 v3d_qpu_magic_waddr_is_tsy(waddr)); 570} 571 572static bool 573qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) 574{ 575 if (v3d_qpu_uses_vpm(inst)) 576 return true; 577 if (v3d_qpu_uses_sfu(inst)) 578 return true; 579 580 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 581 if (inst->alu.add.op != V3D_QPU_A_NOP && 582 inst->alu.add.magic_write && 583 qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { 584 return true; 585 } 586 587 if (inst->alu.add.op == V3D_QPU_A_TMUWT) 588 return true; 589 590 if (inst->alu.mul.op != V3D_QPU_M_NOP && 591 inst->alu.mul.magic_write && 592 qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { 593 return true; 594 } 595 } 596 597 return (inst->sig.ldvpm || 598 inst->sig.ldtmu || 599 inst->sig.ldtlb || 600 inst->sig.ldtlbu || 601 inst->sig.wrtmuc); 602} 603 604static bool 605qpu_merge_inst(const struct v3d_device_info *devinfo, 606 struct v3d_qpu_instr *result, 607 const struct v3d_qpu_instr *a, 608 const struct v3d_qpu_instr *b) 609{ 610 if (a->type != V3D_QPU_INSTR_TYPE_ALU || 611 b->type != V3D_QPU_INSTR_TYPE_ALU) { 612 return false; 613 } 614 615 /* Can't do more than one peripheral access in an instruction. 616 * 617 * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and 618 * WRTMUC with a TMU magic register write (other than tmuc). 619 */ 620 if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) 621 return false; 622 623 struct v3d_qpu_instr merge = *a; 624 625 if (b->alu.add.op != V3D_QPU_A_NOP) { 626 if (a->alu.add.op != V3D_QPU_A_NOP) 627 return false; 628 merge.alu.add = b->alu.add; 629 630 merge.flags.ac = b->flags.ac; 631 merge.flags.apf = b->flags.apf; 632 merge.flags.auf = b->flags.auf; 633 } 634 635 if (b->alu.mul.op != V3D_QPU_M_NOP) { 636 if (a->alu.mul.op != V3D_QPU_M_NOP) 637 return false; 638 merge.alu.mul = b->alu.mul; 639 640 merge.flags.mc = b->flags.mc; 641 merge.flags.mpf = b->flags.mpf; 642 merge.flags.muf = b->flags.muf; 643 } 644 645 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) { 646 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) && 647 a->raddr_a != b->raddr_a) { 648 return false; 649 } 650 merge.raddr_a = b->raddr_a; 651 } 652 653 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { 654 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && 655 (a->raddr_b != b->raddr_b || 656 a->sig.small_imm != b->sig.small_imm)) { 657 return false; 658 } 659 merge.raddr_b = b->raddr_b; 660 } 661 662 merge.sig.thrsw |= b->sig.thrsw; 663 merge.sig.ldunif |= b->sig.ldunif; 664 merge.sig.ldunifrf |= b->sig.ldunifrf; 665 merge.sig.ldunifa |= b->sig.ldunifa; 666 merge.sig.ldunifarf |= b->sig.ldunifarf; 667 merge.sig.ldtmu |= b->sig.ldtmu; 668 merge.sig.ldvary |= b->sig.ldvary; 669 merge.sig.ldvpm |= b->sig.ldvpm; 670 merge.sig.small_imm |= b->sig.small_imm; 671 merge.sig.ldtlb |= b->sig.ldtlb; 672 merge.sig.ldtlbu |= b->sig.ldtlbu; 673 merge.sig.ucb |= b->sig.ucb; 674 merge.sig.rotate |= b->sig.rotate; 675 merge.sig.wrtmuc |= b->sig.wrtmuc; 676 677 if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 678 v3d_qpu_sig_writes_address(devinfo, &b->sig)) 679 return false; 680 merge.sig_addr |= b->sig_addr; 681 merge.sig_magic |= b->sig_magic; 682 683 uint64_t packed; 684 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); 685 686 *result = merge; 687 /* No modifying the real instructions on failure. */ 688 assert(ok || (a != result && b != result)); 689 690 return ok; 691} 692 693static struct schedule_node * 694choose_instruction_to_schedule(const struct v3d_device_info *devinfo, 695 struct choose_scoreboard *scoreboard, 696 struct schedule_node *prev_inst) 697{ 698 struct schedule_node *chosen = NULL; 699 int chosen_prio = 0; 700 701 /* Don't pair up anything with a thread switch signal -- emit_thrsw() 702 * will handle pairing it along with filling the delay slots. 703 */ 704 if (prev_inst) { 705 if (prev_inst->inst->qpu.sig.thrsw) 706 return NULL; 707 } 708 709 list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 710 dag.link) { 711 const struct v3d_qpu_instr *inst = &n->inst->qpu; 712 713 /* Don't choose the branch instruction until it's the last one 714 * left. We'll move it up to fit its delay slots after we 715 * choose it. 716 */ 717 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 718 !list_is_singular(&scoreboard->dag->heads)) { 719 continue; 720 } 721 722 /* "An instruction must not read from a location in physical 723 * regfile A or B that was written to by the previous 724 * instruction." 725 */ 726 if (reads_too_soon_after_write(scoreboard, n->inst)) 727 continue; 728 729 if (writes_too_soon_after_write(devinfo, scoreboard, n->inst)) 730 continue; 731 732 /* "A scoreboard wait must not occur in the first two 733 * instructions of a fragment shader. This is either the 734 * explicit Wait for Scoreboard signal or an implicit wait 735 * with the first tile-buffer read or write instruction." 736 */ 737 if (pixel_scoreboard_too_soon(scoreboard, inst)) 738 continue; 739 740 /* ldunif and ldvary both write r5, but ldunif does so a tick 741 * sooner. If the ldvary's r5 wasn't used, then ldunif might 742 * otherwise get scheduled so ldunif and ldvary try to update 743 * r5 in the same tick. 744 * 745 * XXX perf: To get good pipelining of a sequence of varying 746 * loads, we need to figure out how to pair the ldvary signal 747 * up to the instruction before the last r5 user in the 748 * previous ldvary sequence. Currently, it usually pairs with 749 * the last r5 user. 750 */ 751 if ((inst->sig.ldunif || inst->sig.ldunifa) && 752 scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 753 continue; 754 } 755 756 /* If we're trying to pair with another instruction, check 757 * that they're compatible. 758 */ 759 if (prev_inst) { 760 /* Don't pair up a thread switch signal -- we'll 761 * handle pairing it when we pick it on its own. 762 */ 763 if (inst->sig.thrsw) 764 continue; 765 766 if (prev_inst->inst->uniform != -1 && 767 n->inst->uniform != -1) 768 continue; 769 770 /* Don't merge in something that will lock the TLB. 771 * Hopwefully what we have in inst will release some 772 * other instructions, allowing us to delay the 773 * TLB-locking instruction until later. 774 */ 775 if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) 776 continue; 777 778 struct v3d_qpu_instr merged_inst; 779 if (!qpu_merge_inst(devinfo, &merged_inst, 780 &prev_inst->inst->qpu, inst)) { 781 continue; 782 } 783 } 784 785 int prio = get_instruction_priority(inst); 786 787 /* Found a valid instruction. If nothing better comes along, 788 * this one works. 789 */ 790 if (!chosen) { 791 chosen = n; 792 chosen_prio = prio; 793 continue; 794 } 795 796 if (prio > chosen_prio) { 797 chosen = n; 798 chosen_prio = prio; 799 } else if (prio < chosen_prio) { 800 continue; 801 } 802 803 if (n->delay > chosen->delay) { 804 chosen = n; 805 chosen_prio = prio; 806 } else if (n->delay < chosen->delay) { 807 continue; 808 } 809 } 810 811 return chosen; 812} 813 814static void 815update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, 816 enum v3d_qpu_waddr waddr) 817{ 818 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 819 scoreboard->last_magic_sfu_write_tick = scoreboard->tick; 820} 821 822static void 823update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 824 const struct v3d_qpu_instr *inst) 825{ 826 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 827 return; 828 829 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 830 831 if (inst->alu.add.op != V3D_QPU_A_NOP) { 832 if (inst->alu.add.magic_write) { 833 update_scoreboard_for_magic_waddr(scoreboard, 834 inst->alu.add.waddr); 835 } 836 } 837 838 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 839 if (inst->alu.mul.magic_write) { 840 update_scoreboard_for_magic_waddr(scoreboard, 841 inst->alu.mul.waddr); 842 } 843 } 844 845 if (inst->sig.ldvary) 846 scoreboard->last_ldvary_tick = scoreboard->tick; 847 848 if (qpu_inst_is_tlb(inst)) 849 scoreboard->tlb_locked = true; 850} 851 852static void 853dump_state(const struct v3d_device_info *devinfo, struct dag *dag) 854{ 855 list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { 856 fprintf(stderr, " t=%4d: ", n->unblocked_time); 857 v3d_qpu_dump(devinfo, &n->inst->qpu); 858 fprintf(stderr, "\n"); 859 860 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 861 struct schedule_node *child = 862 (struct schedule_node *)edge->child; 863 if (!child) 864 continue; 865 866 fprintf(stderr, " - "); 867 v3d_qpu_dump(devinfo, &child->inst->qpu); 868 fprintf(stderr, " (%d parents, %c)\n", 869 child->dag.parent_count, 870 edge->data ? 'w' : 'r'); 871 } 872 } 873} 874 875static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, 876 const struct v3d_qpu_instr *after) 877{ 878 /* Apply some huge latency between texture fetch requests and getting 879 * their results back. 880 * 881 * FIXME: This is actually pretty bogus. If we do: 882 * 883 * mov tmu0_s, a 884 * <a bit of math> 885 * mov tmu0_s, b 886 * load_tmu0 887 * <more math> 888 * load_tmu0 889 * 890 * we count that as worse than 891 * 892 * mov tmu0_s, a 893 * mov tmu0_s, b 894 * <lots of math> 895 * load_tmu0 896 * <more math> 897 * load_tmu0 898 * 899 * because we associate the first load_tmu0 with the *second* tmu0_s. 900 */ 901 if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after)) 902 return 100; 903 904 /* Assume that anything depending on us is consuming the SFU result. */ 905 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 906 return 3; 907 908 return 1; 909} 910 911static uint32_t 912instruction_latency(struct schedule_node *before, struct schedule_node *after) 913{ 914 const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 915 const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 916 uint32_t latency = 1; 917 918 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 919 after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 920 return latency; 921 922 if (before_inst->alu.add.magic_write) { 923 latency = MAX2(latency, 924 magic_waddr_latency(before_inst->alu.add.waddr, 925 after_inst)); 926 } 927 928 if (before_inst->alu.mul.magic_write) { 929 latency = MAX2(latency, 930 magic_waddr_latency(before_inst->alu.mul.waddr, 931 after_inst)); 932 } 933 934 return latency; 935} 936 937/** Recursive computation of the delay member of a node. */ 938static void 939compute_delay(struct dag_node *node, void *state) 940{ 941 struct schedule_node *n = (struct schedule_node *)node; 942 943 n->delay = 1; 944 945 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 946 struct schedule_node *child = 947 (struct schedule_node *)edge->child; 948 949 n->delay = MAX2(n->delay, (child->delay + 950 instruction_latency(n, child))); 951 } 952} 953 954/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 955 * should be called on it later to finish pruning the other edges). 956 */ 957static void 958pre_remove_head(struct dag *dag, struct schedule_node *n) 959{ 960 list_delinit(&n->dag.link); 961 962 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 963 if (edge->data) 964 dag_remove_edge(dag, edge); 965 } 966} 967 968static void 969mark_instruction_scheduled(struct dag *dag, 970 uint32_t time, 971 struct schedule_node *node) 972{ 973 if (!node) 974 return; 975 976 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 977 struct schedule_node *child = 978 (struct schedule_node *)edge->child; 979 980 if (!child) 981 continue; 982 983 uint32_t latency = instruction_latency(node, child); 984 985 child->unblocked_time = MAX2(child->unblocked_time, 986 time + latency); 987 } 988 dag_prune_head(dag, &node->dag); 989} 990 991static void 992insert_scheduled_instruction(struct v3d_compile *c, 993 struct qblock *block, 994 struct choose_scoreboard *scoreboard, 995 struct qinst *inst) 996{ 997 list_addtail(&inst->link, &block->instructions); 998 999 update_scoreboard_for_chosen(scoreboard, &inst->qpu); 1000 c->qpu_inst_count++; 1001 scoreboard->tick++; 1002} 1003 1004static struct qinst * 1005vir_nop() 1006{ 1007 struct qreg undef = vir_nop_reg(); 1008 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); 1009 1010 return qinst; 1011} 1012 1013static void 1014emit_nop(struct v3d_compile *c, struct qblock *block, 1015 struct choose_scoreboard *scoreboard) 1016{ 1017 insert_scheduled_instruction(c, block, scoreboard, vir_nop()); 1018} 1019 1020static bool 1021qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, 1022 const struct qinst *qinst, int slot) 1023{ 1024 const struct v3d_qpu_instr *inst = &qinst->qpu; 1025 1026 /* Only TLB Z writes are prohibited in the last slot, but we don't 1027 * have those flagged so prohibit all TLB ops for now. 1028 */ 1029 if (slot == 2 && qpu_inst_is_tlb(inst)) 1030 return false; 1031 1032 if (slot > 0 && qinst->uniform != ~0) 1033 return false; 1034 1035 if (v3d_qpu_uses_vpm(inst)) 1036 return false; 1037 1038 if (inst->sig.ldvary) 1039 return false; 1040 1041 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 1042 /* GFXH-1625: TMUWT not allowed in the final instruction. */ 1043 if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) 1044 return false; 1045 1046 /* No writing physical registers at the end. */ 1047 if (!inst->alu.add.magic_write || 1048 !inst->alu.mul.magic_write) { 1049 return false; 1050 } 1051 1052 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) 1053 return false; 1054 1055 /* RF0-2 might be overwritten during the delay slots by 1056 * fragment shader setup. 1057 */ 1058 if (inst->raddr_a < 3 && 1059 (inst->alu.add.a == V3D_QPU_MUX_A || 1060 inst->alu.add.b == V3D_QPU_MUX_A || 1061 inst->alu.mul.a == V3D_QPU_MUX_A || 1062 inst->alu.mul.b == V3D_QPU_MUX_A)) { 1063 return false; 1064 } 1065 1066 if (inst->raddr_b < 3 && 1067 !inst->sig.small_imm && 1068 (inst->alu.add.a == V3D_QPU_MUX_B || 1069 inst->alu.add.b == V3D_QPU_MUX_B || 1070 inst->alu.mul.a == V3D_QPU_MUX_B || 1071 inst->alu.mul.b == V3D_QPU_MUX_B)) { 1072 return false; 1073 } 1074 } 1075 1076 return true; 1077} 1078 1079static bool 1080valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, 1081 struct qinst *qinst, int instructions_in_sequence, 1082 bool is_thrend) 1083{ 1084 /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ 1085 if (scoreboard->last_thrsw_tick + 3 > 1086 scoreboard->tick - instructions_in_sequence) { 1087 return false; 1088 } 1089 1090 for (int slot = 0; slot < instructions_in_sequence; slot++) { 1091 /* No scheduling SFU when the result would land in the other 1092 * thread. The simulator complains for safety, though it 1093 * would only occur for dead code in our case. 1094 */ 1095 if (slot > 0 && 1096 qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1097 (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || 1098 v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { 1099 return false; 1100 } 1101 1102 if (slot > 0 && qinst->qpu.sig.ldvary) 1103 return false; 1104 1105 if (is_thrend && 1106 !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) { 1107 return false; 1108 } 1109 1110 /* Note that the list is circular, so we can only do this up 1111 * to instructions_in_sequence. 1112 */ 1113 qinst = (struct qinst *)qinst->link.next; 1114 } 1115 1116 return true; 1117} 1118 1119/** 1120 * Emits a THRSW signal in the stream, trying to move it up to pair with 1121 * another instruction. 1122 */ 1123static int 1124emit_thrsw(struct v3d_compile *c, 1125 struct qblock *block, 1126 struct choose_scoreboard *scoreboard, 1127 struct qinst *inst, 1128 bool is_thrend) 1129{ 1130 int time = 0; 1131 1132 /* There should be nothing in a thrsw inst being scheduled other than 1133 * the signal bits. 1134 */ 1135 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 1136 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 1137 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 1138 1139 /* Find how far back into previous instructions we can put the THRSW. */ 1140 int slots_filled = 0; 1141 struct qinst *merge_inst = NULL; 1142 vir_for_each_inst_rev(prev_inst, block) { 1143 struct v3d_qpu_sig sig = prev_inst->qpu.sig; 1144 sig.thrsw = true; 1145 uint32_t packed_sig; 1146 1147 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) 1148 break; 1149 1150 if (!valid_thrsw_sequence(c, scoreboard, 1151 prev_inst, slots_filled + 1, 1152 is_thrend)) { 1153 break; 1154 } 1155 1156 merge_inst = prev_inst; 1157 if (++slots_filled == 3) 1158 break; 1159 } 1160 1161 bool needs_free = false; 1162 if (merge_inst) { 1163 merge_inst->qpu.sig.thrsw = true; 1164 needs_free = true; 1165 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 1166 } else { 1167 scoreboard->last_thrsw_tick = scoreboard->tick; 1168 insert_scheduled_instruction(c, block, scoreboard, inst); 1169 time++; 1170 slots_filled++; 1171 merge_inst = inst; 1172 } 1173 1174 /* Insert any extra delay slot NOPs we need. */ 1175 for (int i = 0; i < 3 - slots_filled; i++) { 1176 emit_nop(c, block, scoreboard); 1177 time++; 1178 } 1179 1180 /* If we're emitting the last THRSW (other than program end), then 1181 * signal that to the HW by emitting two THRSWs in a row. 1182 */ 1183 if (inst->is_last_thrsw) { 1184 struct qinst *second_inst = 1185 (struct qinst *)merge_inst->link.next; 1186 second_inst->qpu.sig.thrsw = true; 1187 } 1188 1189 /* If we put our THRSW into another instruction, free up the 1190 * instruction that didn't end up scheduled into the list. 1191 */ 1192 if (needs_free) 1193 free(inst); 1194 1195 return time; 1196} 1197 1198static uint32_t 1199schedule_instructions(struct v3d_compile *c, 1200 struct choose_scoreboard *scoreboard, 1201 struct qblock *block, 1202 enum quniform_contents *orig_uniform_contents, 1203 uint32_t *orig_uniform_data, 1204 uint32_t *next_uniform) 1205{ 1206 const struct v3d_device_info *devinfo = c->devinfo; 1207 uint32_t time = 0; 1208 1209 while (!list_empty(&scoreboard->dag->heads)) { 1210 struct schedule_node *chosen = 1211 choose_instruction_to_schedule(devinfo, 1212 scoreboard, 1213 NULL); 1214 struct schedule_node *merge = NULL; 1215 1216 /* If there are no valid instructions to schedule, drop a NOP 1217 * in. 1218 */ 1219 struct qinst *qinst = chosen ? chosen->inst : vir_nop(); 1220 struct v3d_qpu_instr *inst = &qinst->qpu; 1221 1222 if (debug) { 1223 fprintf(stderr, "t=%4d: current list:\n", 1224 time); 1225 dump_state(devinfo, scoreboard->dag); 1226 fprintf(stderr, "t=%4d: chose: ", time); 1227 v3d_qpu_dump(devinfo, inst); 1228 fprintf(stderr, "\n"); 1229 } 1230 1231 /* We can't mark_instruction_scheduled() the chosen inst until 1232 * we're done identifying instructions to merge, so put the 1233 * merged instructions on a list for a moment. 1234 */ 1235 struct list_head merged_list; 1236 list_inithead(&merged_list); 1237 1238 /* Schedule this instruction onto the QPU list. Also try to 1239 * find an instruction to pair with it. 1240 */ 1241 if (chosen) { 1242 time = MAX2(chosen->unblocked_time, time); 1243 pre_remove_head(scoreboard->dag, chosen); 1244 1245 while ((merge = 1246 choose_instruction_to_schedule(devinfo, 1247 scoreboard, 1248 chosen))) { 1249 time = MAX2(merge->unblocked_time, time); 1250 pre_remove_head(scoreboard->dag, chosen); 1251 list_addtail(&merge->link, &merged_list); 1252 (void)qpu_merge_inst(devinfo, inst, 1253 inst, &merge->inst->qpu); 1254 if (merge->inst->uniform != -1) { 1255 chosen->inst->uniform = 1256 merge->inst->uniform; 1257 } 1258 1259 if (debug) { 1260 fprintf(stderr, "t=%4d: merging: ", 1261 time); 1262 v3d_qpu_dump(devinfo, &merge->inst->qpu); 1263 fprintf(stderr, "\n"); 1264 fprintf(stderr, " result: "); 1265 v3d_qpu_dump(devinfo, inst); 1266 fprintf(stderr, "\n"); 1267 } 1268 } 1269 } 1270 1271 /* Update the uniform index for the rewritten location -- 1272 * branch target updating will still need to change 1273 * c->uniform_data[] using this index. 1274 */ 1275 if (qinst->uniform != -1) { 1276 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 1277 block->branch_uniform = *next_uniform; 1278 1279 c->uniform_data[*next_uniform] = 1280 orig_uniform_data[qinst->uniform]; 1281 c->uniform_contents[*next_uniform] = 1282 orig_uniform_contents[qinst->uniform]; 1283 qinst->uniform = *next_uniform; 1284 (*next_uniform)++; 1285 } 1286 1287 if (debug) { 1288 fprintf(stderr, "\n"); 1289 } 1290 1291 /* Now that we've scheduled a new instruction, some of its 1292 * children can be promoted to the list of instructions ready to 1293 * be scheduled. Update the children's unblocked time for this 1294 * DAG edge as we do so. 1295 */ 1296 mark_instruction_scheduled(scoreboard->dag, time, chosen); 1297 list_for_each_entry(struct schedule_node, merge, &merged_list, 1298 link) { 1299 mark_instruction_scheduled(scoreboard->dag, time, merge); 1300 1301 /* The merged VIR instruction doesn't get re-added to the 1302 * block, so free it now. 1303 */ 1304 free(merge->inst); 1305 } 1306 1307 if (inst->sig.thrsw) { 1308 time += emit_thrsw(c, block, scoreboard, qinst, false); 1309 } else { 1310 insert_scheduled_instruction(c, block, 1311 scoreboard, qinst); 1312 1313 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1314 block->branch_qpu_ip = c->qpu_inst_count - 1; 1315 /* Fill the delay slots. 1316 * 1317 * We should fill these with actual instructions, 1318 * instead, but that will probably need to be done 1319 * after this, once we know what the leading 1320 * instructions of the successors are (so we can 1321 * handle A/B register file write latency) 1322 */ 1323 for (int i = 0; i < 3; i++) 1324 emit_nop(c, block, scoreboard); 1325 } 1326 } 1327 } 1328 1329 return time; 1330} 1331 1332static uint32_t 1333qpu_schedule_instructions_block(struct v3d_compile *c, 1334 struct choose_scoreboard *scoreboard, 1335 struct qblock *block, 1336 enum quniform_contents *orig_uniform_contents, 1337 uint32_t *orig_uniform_data, 1338 uint32_t *next_uniform) 1339{ 1340 void *mem_ctx = ralloc_context(NULL); 1341 scoreboard->dag = dag_create(mem_ctx); 1342 struct list_head setup_list; 1343 1344 list_inithead(&setup_list); 1345 1346 /* Wrap each instruction in a scheduler structure. */ 1347 while (!list_empty(&block->instructions)) { 1348 struct qinst *qinst = (struct qinst *)block->instructions.next; 1349 struct schedule_node *n = 1350 rzalloc(mem_ctx, struct schedule_node); 1351 1352 dag_init_node(scoreboard->dag, &n->dag); 1353 n->inst = qinst; 1354 1355 list_del(&qinst->link); 1356 list_addtail(&n->link, &setup_list); 1357 } 1358 1359 calculate_forward_deps(c, scoreboard->dag, &setup_list); 1360 calculate_reverse_deps(c, scoreboard->dag, &setup_list); 1361 1362 dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL); 1363 1364 uint32_t cycles = schedule_instructions(c, scoreboard, block, 1365 orig_uniform_contents, 1366 orig_uniform_data, 1367 next_uniform); 1368 1369 ralloc_free(mem_ctx); 1370 scoreboard->dag = NULL; 1371 1372 return cycles; 1373} 1374 1375static void 1376qpu_set_branch_targets(struct v3d_compile *c) 1377{ 1378 vir_for_each_block(block, c) { 1379 /* The end block of the program has no branch. */ 1380 if (!block->successors[0]) 1381 continue; 1382 1383 /* If there was no branch instruction, then the successor 1384 * block must follow immediately after this one. 1385 */ 1386 if (block->branch_qpu_ip == ~0) { 1387 assert(block->end_qpu_ip + 1 == 1388 block->successors[0]->start_qpu_ip); 1389 continue; 1390 } 1391 1392 /* Walk back through the delay slots to find the branch 1393 * instr. 1394 */ 1395 struct list_head *entry = block->instructions.prev; 1396 for (int i = 0; i < 3; i++) 1397 entry = entry->prev; 1398 struct qinst *branch = container_of(entry, branch, link); 1399 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 1400 1401 /* Make sure that the if-we-don't-jump 1402 * successor was scheduled just after the 1403 * delay slots. 1404 */ 1405 assert(!block->successors[1] || 1406 block->successors[1]->start_qpu_ip == 1407 block->branch_qpu_ip + 4); 1408 1409 branch->qpu.branch.offset = 1410 ((block->successors[0]->start_qpu_ip - 1411 (block->branch_qpu_ip + 4)) * 1412 sizeof(uint64_t)); 1413 1414 /* Set up the relative offset to jump in the 1415 * uniform stream. 1416 * 1417 * Use a temporary here, because 1418 * uniform_data[inst->uniform] may be shared 1419 * between multiple instructions. 1420 */ 1421 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); 1422 c->uniform_data[branch->uniform] = 1423 (block->successors[0]->start_uniform - 1424 (block->branch_uniform + 1)) * 4; 1425 } 1426} 1427 1428uint32_t 1429v3d_qpu_schedule_instructions(struct v3d_compile *c) 1430{ 1431 const struct v3d_device_info *devinfo = c->devinfo; 1432 struct qblock *end_block = list_last_entry(&c->blocks, 1433 struct qblock, link); 1434 1435 /* We reorder the uniforms as we schedule instructions, so save the 1436 * old data off and replace it. 1437 */ 1438 uint32_t *uniform_data = c->uniform_data; 1439 enum quniform_contents *uniform_contents = c->uniform_contents; 1440 c->uniform_contents = ralloc_array(c, enum quniform_contents, 1441 c->num_uniforms); 1442 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 1443 c->uniform_array_size = c->num_uniforms; 1444 uint32_t next_uniform = 0; 1445 1446 struct choose_scoreboard scoreboard; 1447 memset(&scoreboard, 0, sizeof(scoreboard)); 1448 scoreboard.last_ldvary_tick = -10; 1449 scoreboard.last_magic_sfu_write_tick = -10; 1450 scoreboard.last_uniforms_reset_tick = -10; 1451 scoreboard.last_thrsw_tick = -10; 1452 1453 if (debug) { 1454 fprintf(stderr, "Pre-schedule instructions\n"); 1455 vir_for_each_block(block, c) { 1456 fprintf(stderr, "BLOCK %d\n", block->index); 1457 list_for_each_entry(struct qinst, qinst, 1458 &block->instructions, link) { 1459 v3d_qpu_dump(devinfo, &qinst->qpu); 1460 fprintf(stderr, "\n"); 1461 } 1462 } 1463 fprintf(stderr, "\n"); 1464 } 1465 1466 uint32_t cycles = 0; 1467 vir_for_each_block(block, c) { 1468 block->start_qpu_ip = c->qpu_inst_count; 1469 block->branch_qpu_ip = ~0; 1470 block->start_uniform = next_uniform; 1471 1472 cycles += qpu_schedule_instructions_block(c, 1473 &scoreboard, 1474 block, 1475 uniform_contents, 1476 uniform_data, 1477 &next_uniform); 1478 1479 block->end_qpu_ip = c->qpu_inst_count - 1; 1480 } 1481 1482 /* Emit the program-end THRSW instruction. */; 1483 struct qinst *thrsw = vir_nop(); 1484 thrsw->qpu.sig.thrsw = true; 1485 emit_thrsw(c, end_block, &scoreboard, thrsw, true); 1486 1487 qpu_set_branch_targets(c); 1488 1489 assert(next_uniform == c->num_uniforms); 1490 1491 return cycles; 1492} 1493