1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014-2017 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30 * pick a DAG head, then put all the children that are now DAG heads into the 31 * list of things to schedule. 32 * 33 * The goal of scheduling here is to pack pairs of operations together in a 34 * single QPU instruction. 35 */ 36 37#include "qpu/qpu_disasm.h" 38#include "v3d_compiler.h" 39#include "util/ralloc.h" 40#include "util/dag.h" 41 42static bool debug; 43 44struct schedule_node_child; 45 46struct schedule_node { 47 struct dag_node dag; 48 struct list_head link; 49 struct qinst *inst; 50 51 /* Longest cycles + instruction_latency() of any parent of this node. 
*/ 52 uint32_t unblocked_time; 53 54 /** 55 * Minimum number of cycles from scheduling this instruction until the 56 * end of the program, based on the slowest dependency chain through 57 * the children. 58 */ 59 uint32_t delay; 60 61 /** 62 * cycles between this instruction being scheduled and when its result 63 * can be consumed. 64 */ 65 uint32_t latency; 66}; 67 68/* When walking the instructions in reverse, we need to swap before/after in 69 * add_dep(). 70 */ 71enum direction { F, R }; 72 73struct schedule_state { 74 const struct v3d_device_info *devinfo; 75 struct dag *dag; 76 struct schedule_node *last_r[6]; 77 struct schedule_node *last_rf[64]; 78 struct schedule_node *last_sf; 79 struct schedule_node *last_vpm_read; 80 struct schedule_node *last_tmu_write; 81 struct schedule_node *last_tmu_config; 82 struct schedule_node *last_tmu_read; 83 struct schedule_node *last_tlb; 84 struct schedule_node *last_vpm; 85 struct schedule_node *last_unif; 86 struct schedule_node *last_rtop; 87 struct schedule_node *last_unifa; 88 enum direction dir; 89 /* Estimated cycle when the current instruction would start. 
*/ 90 uint32_t time; 91}; 92 93static void 94add_dep(struct schedule_state *state, 95 struct schedule_node *before, 96 struct schedule_node *after, 97 bool write) 98{ 99 bool write_after_read = !write && state->dir == R; 100 void *edge_data = (void *)(uintptr_t)write_after_read; 101 102 if (!before || !after) 103 return; 104 105 assert(before != after); 106 107 if (state->dir == F) 108 dag_add_edge(&before->dag, &after->dag, edge_data); 109 else 110 dag_add_edge(&after->dag, &before->dag, edge_data); 111} 112 113static void 114add_read_dep(struct schedule_state *state, 115 struct schedule_node *before, 116 struct schedule_node *after) 117{ 118 add_dep(state, before, after, false); 119} 120 121static void 122add_write_dep(struct schedule_state *state, 123 struct schedule_node **before, 124 struct schedule_node *after) 125{ 126 add_dep(state, *before, after, true); 127 *before = after; 128} 129 130static bool 131qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 132{ 133 if (inst->sig.ldtlb || inst->sig.ldtlbu) 134 return true; 135 136 if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 137 return false; 138 139 if (inst->alu.add.magic_write && 140 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 141 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 142 return true; 143 144 if (inst->alu.mul.magic_write && 145 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 146 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 147 return true; 148 149 return false; 150} 151 152static void 153process_mux_deps(struct schedule_state *state, struct schedule_node *n, 154 enum v3d_qpu_mux mux) 155{ 156 switch (mux) { 157 case V3D_QPU_MUX_A: 158 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 159 break; 160 case V3D_QPU_MUX_B: 161 if (!n->inst->qpu.sig.small_imm) { 162 add_read_dep(state, 163 state->last_rf[n->inst->qpu.raddr_b], n); 164 } 165 break; 166 default: 167 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 168 break; 169 } 170} 171 172static bool 
173tmu_write_is_sequence_terminator(uint32_t waddr) 174{ 175 switch (waddr) { 176 case V3D_QPU_WADDR_TMUS: 177 case V3D_QPU_WADDR_TMUSCM: 178 case V3D_QPU_WADDR_TMUSF: 179 case V3D_QPU_WADDR_TMUSLOD: 180 case V3D_QPU_WADDR_TMUA: 181 case V3D_QPU_WADDR_TMUAU: 182 return true; 183 default: 184 return false; 185 } 186} 187 188static bool 189can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) 190{ 191 if (devinfo->ver < 40) 192 return false; 193 194 if (tmu_write_is_sequence_terminator(waddr)) 195 return false; 196 197 if (waddr == V3D_QPU_WADDR_TMUD) 198 return false; 199 200 return true; 201} 202 203static void 204process_waddr_deps(struct schedule_state *state, struct schedule_node *n, 205 uint32_t waddr, bool magic) 206{ 207 if (!magic) { 208 add_write_dep(state, &state->last_rf[waddr], n); 209 } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { 210 if (can_reorder_tmu_write(state->devinfo, waddr)) 211 add_read_dep(state, state->last_tmu_write, n); 212 else 213 add_write_dep(state, &state->last_tmu_write, n); 214 215 if (tmu_write_is_sequence_terminator(waddr)) 216 add_write_dep(state, &state->last_tmu_config, n); 217 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 218 /* Handled by v3d_qpu_writes_r4() check. */ 219 } else { 220 switch (waddr) { 221 case V3D_QPU_WADDR_R0: 222 case V3D_QPU_WADDR_R1: 223 case V3D_QPU_WADDR_R2: 224 add_write_dep(state, 225 &state->last_r[waddr - V3D_QPU_WADDR_R0], 226 n); 227 break; 228 case V3D_QPU_WADDR_R3: 229 case V3D_QPU_WADDR_R4: 230 case V3D_QPU_WADDR_R5: 231 /* Handled by v3d_qpu_writes_r*() checks below. 
*/ 232 break; 233 234 case V3D_QPU_WADDR_VPM: 235 case V3D_QPU_WADDR_VPMU: 236 add_write_dep(state, &state->last_vpm, n); 237 break; 238 239 case V3D_QPU_WADDR_TLB: 240 case V3D_QPU_WADDR_TLBU: 241 add_write_dep(state, &state->last_tlb, n); 242 break; 243 244 case V3D_QPU_WADDR_SYNC: 245 case V3D_QPU_WADDR_SYNCB: 246 case V3D_QPU_WADDR_SYNCU: 247 /* For CS barrier(): Sync against any other memory 248 * accesses. There doesn't appear to be any need for 249 * barriers to affect ALU operations. 250 */ 251 add_write_dep(state, &state->last_tmu_write, n); 252 add_write_dep(state, &state->last_tmu_read, n); 253 break; 254 255 case V3D_QPU_WADDR_UNIFA: 256 if (state->devinfo->ver >= 40) 257 add_write_dep(state, &state->last_unifa, n); 258 break; 259 260 case V3D_QPU_WADDR_NOP: 261 break; 262 263 default: 264 fprintf(stderr, "Unknown waddr %d\n", waddr); 265 abort(); 266 } 267 } 268} 269 270/** 271 * Common code for dependencies that need to be tracked both forward and 272 * backward. 273 * 274 * This is for things like "all reads of r4 have to happen between the r4 275 * writes that surround them". 276 */ 277static void 278calculate_deps(struct schedule_state *state, struct schedule_node *n) 279{ 280 const struct v3d_device_info *devinfo = state->devinfo; 281 struct qinst *qinst = n->inst; 282 struct v3d_qpu_instr *inst = &qinst->qpu; 283 /* If the input and output segments are shared, then all VPM reads to 284 * a location need to happen before all writes. We handle this by 285 * serializing all VPM operations for now. 
286 */ 287 bool separate_vpm_segment = false; 288 289 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 290 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 291 add_read_dep(state, state->last_sf, n); 292 293 /* XXX: BDI */ 294 /* XXX: BDU */ 295 /* XXX: ub */ 296 /* XXX: raddr_a */ 297 298 add_write_dep(state, &state->last_unif, n); 299 return; 300 } 301 302 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 303 304 /* XXX: LOAD_IMM */ 305 306 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 307 process_mux_deps(state, n, inst->alu.add.a); 308 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 309 process_mux_deps(state, n, inst->alu.add.b); 310 311 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 312 process_mux_deps(state, n, inst->alu.mul.a); 313 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 314 process_mux_deps(state, n, inst->alu.mul.b); 315 316 switch (inst->alu.add.op) { 317 case V3D_QPU_A_VPMSETUP: 318 /* Could distinguish read/write by unpacking the uniform. */ 319 add_write_dep(state, &state->last_vpm, n); 320 add_write_dep(state, &state->last_vpm_read, n); 321 break; 322 323 case V3D_QPU_A_STVPMV: 324 case V3D_QPU_A_STVPMD: 325 case V3D_QPU_A_STVPMP: 326 add_write_dep(state, &state->last_vpm, n); 327 break; 328 329 case V3D_QPU_A_LDVPMV_IN: 330 case V3D_QPU_A_LDVPMD_IN: 331 case V3D_QPU_A_LDVPMG_IN: 332 case V3D_QPU_A_LDVPMP: 333 if (!separate_vpm_segment) 334 add_write_dep(state, &state->last_vpm, n); 335 break; 336 337 case V3D_QPU_A_VPMWT: 338 add_read_dep(state, state->last_vpm, n); 339 break; 340 341 case V3D_QPU_A_MSF: 342 add_read_dep(state, state->last_tlb, n); 343 break; 344 345 case V3D_QPU_A_SETMSF: 346 case V3D_QPU_A_SETREVF: 347 add_write_dep(state, &state->last_tlb, n); 348 break; 349 350 default: 351 break; 352 } 353 354 switch (inst->alu.mul.op) { 355 case V3D_QPU_M_MULTOP: 356 case V3D_QPU_M_UMUL24: 357 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 358 * resets it to 0. 
We could possibly reorder umul24s relative 359 * to each other, but for now just keep all the MUL parts in 360 * order. 361 */ 362 add_write_dep(state, &state->last_rtop, n); 363 break; 364 default: 365 break; 366 } 367 368 if (inst->alu.add.op != V3D_QPU_A_NOP) { 369 process_waddr_deps(state, n, inst->alu.add.waddr, 370 inst->alu.add.magic_write); 371 } 372 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 373 process_waddr_deps(state, n, inst->alu.mul.waddr, 374 inst->alu.mul.magic_write); 375 } 376 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 377 process_waddr_deps(state, n, inst->sig_addr, 378 inst->sig_magic); 379 } 380 381 if (v3d_qpu_writes_r3(devinfo, inst)) 382 add_write_dep(state, &state->last_r[3], n); 383 if (v3d_qpu_writes_r4(devinfo, inst)) 384 add_write_dep(state, &state->last_r[4], n); 385 if (v3d_qpu_writes_r5(devinfo, inst)) 386 add_write_dep(state, &state->last_r[5], n); 387 388 /* If we add any more dependencies here we should consider whether we 389 * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 390 */ 391 if (inst->sig.thrsw) { 392 /* All accumulator contents and flags are undefined after the 393 * switch. 394 */ 395 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 396 add_write_dep(state, &state->last_r[i], n); 397 add_write_dep(state, &state->last_sf, n); 398 add_write_dep(state, &state->last_rtop, n); 399 400 /* Scoreboard-locking operations have to stay after the last 401 * thread switch. 402 */ 403 add_write_dep(state, &state->last_tlb, n); 404 405 add_write_dep(state, &state->last_tmu_write, n); 406 add_write_dep(state, &state->last_tmu_config, n); 407 } 408 409 if (v3d_qpu_waits_on_tmu(inst)) { 410 /* TMU loads are coming from a FIFO, so ordering is important. 
411 */ 412 add_write_dep(state, &state->last_tmu_read, n); 413 /* Keep TMU loads after their TMU lookup terminator */ 414 add_read_dep(state, state->last_tmu_config, n); 415 } 416 417 /* Allow wrtmuc to be reordered with other instructions in the 418 * same TMU sequence by using a read dependency on the last TMU 419 * sequence terminator. 420 */ 421 if (inst->sig.wrtmuc) 422 add_read_dep(state, state->last_tmu_config, n); 423 424 if (inst->sig.ldtlb | inst->sig.ldtlbu) 425 add_write_dep(state, &state->last_tlb, n); 426 427 if (inst->sig.ldvpm) { 428 add_write_dep(state, &state->last_vpm_read, n); 429 430 /* At least for now, we're doing shared I/O segments, so queue 431 * all writes after all reads. 432 */ 433 if (!separate_vpm_segment) 434 add_write_dep(state, &state->last_vpm, n); 435 } 436 437 /* inst->sig.ldunif or sideband uniform read */ 438 if (vir_has_uniform(qinst)) 439 add_write_dep(state, &state->last_unif, n); 440 441 /* Both unifa and ldunifa must preserve ordering */ 442 if (inst->sig.ldunifa || inst->sig.ldunifarf) 443 add_write_dep(state, &state->last_unifa, n); 444 445 if (v3d_qpu_reads_flags(inst)) 446 add_read_dep(state, state->last_sf, n); 447 if (v3d_qpu_writes_flags(inst)) 448 add_write_dep(state, &state->last_sf, n); 449} 450 451static void 452calculate_forward_deps(struct v3d_compile *c, struct dag *dag, 453 struct list_head *schedule_list) 454{ 455 struct schedule_state state; 456 457 memset(&state, 0, sizeof(state)); 458 state.dag = dag; 459 state.devinfo = c->devinfo; 460 state.dir = F; 461 462 list_for_each_entry(struct schedule_node, node, schedule_list, link) 463 calculate_deps(&state, node); 464} 465 466static void 467calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, 468 struct list_head *schedule_list) 469{ 470 struct schedule_state state; 471 472 memset(&state, 0, sizeof(state)); 473 state.dag = dag; 474 state.devinfo = c->devinfo; 475 state.dir = R; 476 477 list_for_each_entry_rev(struct schedule_node, node, 
schedule_list, 478 link) { 479 calculate_deps(&state, (struct schedule_node *)node); 480 } 481} 482 483struct choose_scoreboard { 484 struct dag *dag; 485 int tick; 486 int last_magic_sfu_write_tick; 487 int last_stallable_sfu_reg; 488 int last_stallable_sfu_tick; 489 int last_ldvary_tick; 490 int last_unifa_write_tick; 491 int last_uniforms_reset_tick; 492 int last_thrsw_tick; 493 int last_branch_tick; 494 int last_setmsf_tick; 495 bool first_thrsw_emitted; 496 bool last_thrsw_emitted; 497 bool fixup_ldvary; 498 int ldvary_count; 499}; 500 501static bool 502mux_reads_too_soon(struct choose_scoreboard *scoreboard, 503 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 504{ 505 switch (mux) { 506 case V3D_QPU_MUX_R4: 507 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) 508 return true; 509 break; 510 511 case V3D_QPU_MUX_R5: 512 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 513 return true; 514 break; 515 default: 516 break; 517 } 518 519 return false; 520} 521 522static bool 523reads_too_soon_after_write(struct choose_scoreboard *scoreboard, 524 struct qinst *qinst) 525{ 526 const struct v3d_qpu_instr *inst = &qinst->qpu; 527 528 /* XXX: Branching off of raddr. 
*/ 529 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 530 return false; 531 532 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 533 534 if (inst->alu.add.op != V3D_QPU_A_NOP) { 535 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 536 mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 537 return true; 538 } 539 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 540 mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 541 return true; 542 } 543 } 544 545 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 546 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 547 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 548 return true; 549 } 550 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 551 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 552 return true; 553 } 554 } 555 556 /* XXX: imm */ 557 558 return false; 559} 560 561static bool 562writes_too_soon_after_write(const struct v3d_device_info *devinfo, 563 struct choose_scoreboard *scoreboard, 564 struct qinst *qinst) 565{ 566 const struct v3d_qpu_instr *inst = &qinst->qpu; 567 568 /* Don't schedule any other r4 write too soon after an SFU write. 569 * This would normally be prevented by dependency tracking, but might 570 * occur if a dead SFU computation makes it to scheduling. 
571 */ 572 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 573 v3d_qpu_writes_r4(devinfo, inst)) 574 return true; 575 576 return false; 577} 578 579static bool 580scoreboard_is_locked(struct choose_scoreboard *scoreboard, 581 bool lock_scoreboard_on_first_thrsw) 582{ 583 if (lock_scoreboard_on_first_thrsw) { 584 return scoreboard->first_thrsw_emitted && 585 scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 586 } 587 588 return scoreboard->last_thrsw_emitted && 589 scoreboard->tick - scoreboard->last_thrsw_tick >= 3; 590} 591 592static bool 593pixel_scoreboard_too_soon(struct v3d_compile *c, 594 struct choose_scoreboard *scoreboard, 595 const struct v3d_qpu_instr *inst) 596{ 597 return qpu_inst_is_tlb(inst) && 598 !scoreboard_is_locked(scoreboard, 599 c->lock_scoreboard_on_first_thrsw); 600} 601 602static bool 603qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, 604 uint32_t waddr) { 605 606 if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 607 return false; 608 609 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && 610 inst->raddr_a == waddr) 611 return true; 612 613 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && 614 !inst->sig.small_imm && (inst->raddr_b == waddr)) 615 return true; 616 617 return false; 618} 619 620static bool 621mux_read_stalls(struct choose_scoreboard *scoreboard, 622 const struct v3d_qpu_instr *inst) 623{ 624 return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && 625 qpu_instruction_uses_rf(inst, 626 scoreboard->last_stallable_sfu_reg); 627} 628 629/* We define a max schedule priority to allow negative priorities as result of 630 * substracting this max when an instruction stalls. So instructions that 631 * stall have lower priority than regular instructions. 
*/ 632#define MAX_SCHEDULE_PRIORITY 16 633 634static int 635get_instruction_priority(const struct v3d_device_info *devinfo, 636 const struct v3d_qpu_instr *inst) 637{ 638 uint32_t baseline_score; 639 uint32_t next_score = 0; 640 641 /* Schedule TLB operations as late as possible, to get more 642 * parallelism between shaders. 643 */ 644 if (qpu_inst_is_tlb(inst)) 645 return next_score; 646 next_score++; 647 648 /* Schedule texture read results collection late to hide latency. */ 649 if (v3d_qpu_waits_on_tmu(inst)) 650 return next_score; 651 next_score++; 652 653 /* Default score for things that aren't otherwise special. */ 654 baseline_score = next_score; 655 next_score++; 656 657 /* Schedule texture read setup early to hide their latency better. */ 658 if (v3d_qpu_writes_tmu(devinfo, inst)) 659 return next_score; 660 next_score++; 661 662 /* We should increase the maximum if we assert here */ 663 assert(next_score < MAX_SCHEDULE_PRIORITY); 664 665 return baseline_score; 666} 667 668static bool 669qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, 670 enum v3d_qpu_waddr waddr) 671{ 672 return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || 673 v3d_qpu_magic_waddr_is_sfu(waddr) || 674 v3d_qpu_magic_waddr_is_tlb(waddr) || 675 v3d_qpu_magic_waddr_is_vpm(waddr) || 676 v3d_qpu_magic_waddr_is_tsy(waddr)); 677} 678 679static bool 680qpu_accesses_peripheral(const struct v3d_device_info *devinfo, 681 const struct v3d_qpu_instr *inst) 682{ 683 if (v3d_qpu_uses_vpm(inst)) 684 return true; 685 if (v3d_qpu_uses_sfu(inst)) 686 return true; 687 688 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 689 if (inst->alu.add.op != V3D_QPU_A_NOP && 690 inst->alu.add.magic_write && 691 qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { 692 return true; 693 } 694 695 if (inst->alu.add.op == V3D_QPU_A_TMUWT) 696 return true; 697 698 if (inst->alu.mul.op != V3D_QPU_M_NOP && 699 inst->alu.mul.magic_write && 700 qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { 701 
return true; 702 } 703 } 704 705 return (inst->sig.ldvpm || 706 inst->sig.ldtmu || 707 inst->sig.ldtlb || 708 inst->sig.ldtlbu || 709 inst->sig.wrtmuc); 710} 711 712static bool 713qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, 714 const struct v3d_qpu_instr *a, 715 const struct v3d_qpu_instr *b) 716{ 717 const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); 718 const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); 719 720 /* We can always do one peripheral access per instruction. */ 721 if (!a_uses_peripheral || !b_uses_peripheral) 722 return true; 723 724 if (devinfo->ver < 41) 725 return false; 726 727 /* V3D 4.1 and later allow TMU read along with a VPM read or write, and 728 * WRTMUC with a TMU magic register write (other than tmuc). 729 */ 730 if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || 731 (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { 732 return true; 733 } 734 735 if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || 736 (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { 737 return true; 738 } 739 740 return false; 741} 742 743/* Compute a bitmask of which rf registers are used between 744 * the two instructions. 745 */ 746static uint64_t 747qpu_raddrs_used(const struct v3d_qpu_instr *a, 748 const struct v3d_qpu_instr *b) 749{ 750 assert(a->type == V3D_QPU_INSTR_TYPE_ALU); 751 assert(b->type == V3D_QPU_INSTR_TYPE_ALU); 752 753 uint64_t raddrs_used = 0; 754 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) 755 raddrs_used |= (1ll << a->raddr_a); 756 if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) 757 raddrs_used |= (1ll << a->raddr_b); 758 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) 759 raddrs_used |= (1ll << b->raddr_a); 760 if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) 761 raddrs_used |= (1ll << b->raddr_b); 762 763 return raddrs_used; 764} 765 766/* Take two instructions and attempt to merge their raddr fields 767 * into one merged instruction. 
Returns false if the two instructions 768 * access more than two different rf registers between them, or more 769 * than one rf register and one small immediate. 770 */ 771static bool 772qpu_merge_raddrs(struct v3d_qpu_instr *result, 773 const struct v3d_qpu_instr *add_instr, 774 const struct v3d_qpu_instr *mul_instr) 775{ 776 uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); 777 int naddrs = util_bitcount64(raddrs_used); 778 779 if (naddrs > 2) 780 return false; 781 782 if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { 783 if (naddrs > 1) 784 return false; 785 786 if (add_instr->sig.small_imm && mul_instr->sig.small_imm) 787 if (add_instr->raddr_b != mul_instr->raddr_b) 788 return false; 789 790 result->sig.small_imm = true; 791 result->raddr_b = add_instr->sig.small_imm ? 792 add_instr->raddr_b : mul_instr->raddr_b; 793 } 794 795 if (naddrs == 0) 796 return true; 797 798 int raddr_a = ffsll(raddrs_used) - 1; 799 raddrs_used &= ~(1ll << raddr_a); 800 result->raddr_a = raddr_a; 801 802 if (!result->sig.small_imm) { 803 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && 804 raddr_a == add_instr->raddr_b) { 805 if (add_instr->alu.add.a == V3D_QPU_MUX_B) 806 result->alu.add.a = V3D_QPU_MUX_A; 807 if (add_instr->alu.add.b == V3D_QPU_MUX_B && 808 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 809 result->alu.add.b = V3D_QPU_MUX_A; 810 } 811 } 812 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && 813 raddr_a == mul_instr->raddr_b) { 814 if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) 815 result->alu.mul.a = V3D_QPU_MUX_A; 816 if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && 817 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 818 result->alu.mul.b = V3D_QPU_MUX_A; 819 } 820 } 821 } 822 if (!raddrs_used) 823 return true; 824 825 int raddr_b = ffsll(raddrs_used) - 1; 826 result->raddr_b = raddr_b; 827 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && 828 raddr_b == add_instr->raddr_a) { 829 if (add_instr->alu.add.a == V3D_QPU_MUX_A) 830 
result->alu.add.a = V3D_QPU_MUX_B; 831 if (add_instr->alu.add.b == V3D_QPU_MUX_A && 832 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { 833 result->alu.add.b = V3D_QPU_MUX_B; 834 } 835 } 836 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && 837 raddr_b == mul_instr->raddr_a) { 838 if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) 839 result->alu.mul.a = V3D_QPU_MUX_B; 840 if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && 841 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { 842 result->alu.mul.b = V3D_QPU_MUX_B; 843 } 844 } 845 846 return true; 847} 848 849static bool 850can_do_add_as_mul(enum v3d_qpu_add_op op) 851{ 852 switch (op) { 853 case V3D_QPU_A_ADD: 854 case V3D_QPU_A_SUB: 855 return true; 856 default: 857 return false; 858 } 859} 860 861static enum v3d_qpu_mul_op 862add_op_as_mul_op(enum v3d_qpu_add_op op) 863{ 864 switch (op) { 865 case V3D_QPU_A_ADD: 866 return V3D_QPU_M_ADD; 867 case V3D_QPU_A_SUB: 868 return V3D_QPU_M_SUB; 869 default: 870 unreachable("unexpected add opcode"); 871 } 872} 873 874static void 875qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) 876{ 877 STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); 878 assert(inst->alu.add.op != V3D_QPU_A_NOP); 879 assert(inst->alu.mul.op == V3D_QPU_M_NOP); 880 881 memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul)); 882 inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op); 883 inst->alu.add.op = V3D_QPU_A_NOP; 884 885 inst->flags.mc = inst->flags.ac; 886 inst->flags.mpf = inst->flags.apf; 887 inst->flags.muf = inst->flags.auf; 888 inst->flags.ac = V3D_QPU_COND_NONE; 889 inst->flags.apf = V3D_QPU_PF_NONE; 890 inst->flags.auf = V3D_QPU_UF_NONE; 891} 892 893static bool 894qpu_merge_inst(const struct v3d_device_info *devinfo, 895 struct v3d_qpu_instr *result, 896 const struct v3d_qpu_instr *a, 897 const struct v3d_qpu_instr *b) 898{ 899 if (a->type != V3D_QPU_INSTR_TYPE_ALU || 900 b->type != V3D_QPU_INSTR_TYPE_ALU) { 901 return false; 902 } 903 904 if 
(!qpu_compatible_peripheral_access(devinfo, a, b)) 905 return false; 906 907 struct v3d_qpu_instr merge = *a; 908 const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL; 909 910 struct v3d_qpu_instr mul_inst; 911 if (b->alu.add.op != V3D_QPU_A_NOP) { 912 if (a->alu.add.op == V3D_QPU_A_NOP) { 913 merge.alu.add = b->alu.add; 914 915 merge.flags.ac = b->flags.ac; 916 merge.flags.apf = b->flags.apf; 917 merge.flags.auf = b->flags.auf; 918 919 add_instr = b; 920 mul_instr = a; 921 } 922 /* If a's add op is used but its mul op is not, then see if we 923 * can convert either a's add op or b's add op to a mul op 924 * so we can merge. 925 */ 926 else if (a->alu.mul.op == V3D_QPU_M_NOP && 927 can_do_add_as_mul(b->alu.add.op)) { 928 mul_inst = *b; 929 qpu_convert_add_to_mul(&mul_inst); 930 931 merge.alu.mul = mul_inst.alu.mul; 932 933 merge.flags.mc = b->flags.ac; 934 merge.flags.mpf = b->flags.apf; 935 merge.flags.muf = b->flags.auf; 936 937 add_instr = a; 938 mul_instr = &mul_inst; 939 } else if (a->alu.mul.op == V3D_QPU_M_NOP && 940 can_do_add_as_mul(a->alu.add.op)) { 941 mul_inst = *a; 942 qpu_convert_add_to_mul(&mul_inst); 943 944 merge = mul_inst; 945 merge.alu.add = b->alu.add; 946 947 merge.flags.ac = b->flags.ac; 948 merge.flags.apf = b->flags.apf; 949 merge.flags.auf = b->flags.auf; 950 951 add_instr = b; 952 mul_instr = &mul_inst; 953 } else { 954 return false; 955 } 956 } 957 958 if (b->alu.mul.op != V3D_QPU_M_NOP) { 959 if (a->alu.mul.op != V3D_QPU_M_NOP) 960 return false; 961 merge.alu.mul = b->alu.mul; 962 963 merge.flags.mc = b->flags.mc; 964 merge.flags.mpf = b->flags.mpf; 965 merge.flags.muf = b->flags.muf; 966 967 mul_instr = b; 968 add_instr = a; 969 } 970 971 if (add_instr && mul_instr && 972 !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { 973 return false; 974 } 975 976 merge.sig.thrsw |= b->sig.thrsw; 977 merge.sig.ldunif |= b->sig.ldunif; 978 merge.sig.ldunifrf |= b->sig.ldunifrf; 979 merge.sig.ldunifa |= b->sig.ldunifa; 980 
merge.sig.ldunifarf |= b->sig.ldunifarf; 981 merge.sig.ldtmu |= b->sig.ldtmu; 982 merge.sig.ldvary |= b->sig.ldvary; 983 merge.sig.ldvpm |= b->sig.ldvpm; 984 merge.sig.small_imm |= b->sig.small_imm; 985 merge.sig.ldtlb |= b->sig.ldtlb; 986 merge.sig.ldtlbu |= b->sig.ldtlbu; 987 merge.sig.ucb |= b->sig.ucb; 988 merge.sig.rotate |= b->sig.rotate; 989 merge.sig.wrtmuc |= b->sig.wrtmuc; 990 991 if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 992 v3d_qpu_sig_writes_address(devinfo, &b->sig)) 993 return false; 994 merge.sig_addr |= b->sig_addr; 995 merge.sig_magic |= b->sig_magic; 996 997 uint64_t packed; 998 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); 999 1000 *result = merge; 1001 /* No modifying the real instructions on failure. */ 1002 assert(ok || (a != result && b != result)); 1003 1004 return ok; 1005} 1006 1007static inline bool 1008try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst) 1009{ 1010 return inst->sig.ldunif || inst->sig.ldunifrf; 1011} 1012 1013static bool 1014qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, 1015 struct choose_scoreboard *scoreboard, 1016 const struct qinst *qinst); 1017 1018static struct schedule_node * 1019choose_instruction_to_schedule(struct v3d_compile *c, 1020 struct choose_scoreboard *scoreboard, 1021 struct schedule_node *prev_inst) 1022{ 1023 struct schedule_node *chosen = NULL; 1024 int chosen_prio = 0; 1025 1026 /* Don't pair up anything with a thread switch signal -- emit_thrsw() 1027 * will handle pairing it along with filling the delay slots. 
1028 */ 1029 if (prev_inst) { 1030 if (prev_inst->inst->qpu.sig.thrsw) 1031 return NULL; 1032 } 1033 1034 bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT && 1035 scoreboard->ldvary_count < c->num_inputs; 1036 bool skipped_insts_for_ldvary_pipelining = false; 1037retry: 1038 list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, 1039 dag.link) { 1040 const struct v3d_qpu_instr *inst = &n->inst->qpu; 1041 1042 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) { 1043 skipped_insts_for_ldvary_pipelining = true; 1044 continue; 1045 } 1046 1047 /* Don't choose the branch instruction until it's the last one 1048 * left. We'll move it up to fit its delay slots after we 1049 * choose it. 1050 */ 1051 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 1052 !list_is_singular(&scoreboard->dag->heads)) { 1053 continue; 1054 } 1055 1056 /* We need to have 3 delay slots between a write to unifa and 1057 * a follow-up ldunifa. 1058 */ 1059 if ((inst->sig.ldunifa || inst->sig.ldunifarf) && 1060 scoreboard->tick - scoreboard->last_unifa_write_tick <= 3) 1061 continue; 1062 1063 /* "An instruction must not read from a location in physical 1064 * regfile A or B that was written to by the previous 1065 * instruction." 1066 */ 1067 if (reads_too_soon_after_write(scoreboard, n->inst)) 1068 continue; 1069 1070 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) 1071 continue; 1072 1073 /* "Before doing a TLB access a scoreboard wait must have been 1074 * done. This happens either on the first or last thread 1075 * switch, depending on a setting (scb_wait_on_first_thrsw) in 1076 * the shader state." 1077 */ 1078 if (pixel_scoreboard_too_soon(c, scoreboard, inst)) 1079 continue; 1080 1081 /* ldunif and ldvary both write r5, but ldunif does so a tick 1082 * sooner. If the ldvary's r5 wasn't used, then ldunif might 1083 * otherwise get scheduled so ldunif and ldvary try to update 1084 * r5 in the same tick. 
1085 */ 1086 if ((inst->sig.ldunif || inst->sig.ldunifa) && 1087 scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 1088 continue; 1089 } 1090 1091 /* If we are in a thrsw delay slot check that this instruction 1092 * is valid for that. 1093 */ 1094 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick && 1095 !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard, 1096 n->inst)) { 1097 continue; 1098 } 1099 1100 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1101 /* Don't try to put a branch in the delay slots of another 1102 * branch or a unifa write. 1103 */ 1104 if (scoreboard->last_branch_tick + 3 >= scoreboard->tick) 1105 continue; 1106 if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick) 1107 continue; 1108 1109 /* No branch with cond != 0,2,3 and msfign != 0 after 1110 * setmsf. 1111 */ 1112 if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 && 1113 inst->branch.msfign != V3D_QPU_MSFIGN_NONE && 1114 inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && 1115 inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && 1116 inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { 1117 continue; 1118 } 1119 } 1120 1121 /* If we're trying to pair with another instruction, check 1122 * that they're compatible. 1123 */ 1124 if (prev_inst) { 1125 /* Don't pair up a thread switch signal -- we'll 1126 * handle pairing it when we pick it on its own. 1127 */ 1128 if (inst->sig.thrsw) 1129 continue; 1130 1131 if (prev_inst->inst->uniform != -1 && 1132 n->inst->uniform != -1) 1133 continue; 1134 1135 /* Simulator complains if we have two uniforms loaded in 1136 * the the same instruction, which could happen if we 1137 * have a ldunif or sideband uniform and we pair that 1138 * with ldunifa. 
1139 */ 1140 if (vir_has_uniform(prev_inst->inst) && 1141 (inst->sig.ldunifa || inst->sig.ldunifarf)) { 1142 continue; 1143 } 1144 1145 if ((prev_inst->inst->qpu.sig.ldunifa || 1146 prev_inst->inst->qpu.sig.ldunifarf) && 1147 vir_has_uniform(n->inst)) { 1148 continue; 1149 } 1150 1151 /* Don't merge TLB instructions before we have acquired 1152 * the scoreboard lock. 1153 */ 1154 if (pixel_scoreboard_too_soon(c, scoreboard, inst)) 1155 continue; 1156 1157 /* When we succesfully pair up an ldvary we then try 1158 * to merge it into the previous instruction if 1159 * possible to improve pipelining. Don't pick up the 1160 * ldvary now if the follow-up fixup would place 1161 * it in the delay slots of a thrsw, which is not 1162 * allowed and would prevent the fixup from being 1163 * successul. 1164 */ 1165 if (inst->sig.ldvary && 1166 scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { 1167 continue; 1168 } 1169 1170 struct v3d_qpu_instr merged_inst; 1171 if (!qpu_merge_inst(c->devinfo, &merged_inst, 1172 &prev_inst->inst->qpu, inst)) { 1173 continue; 1174 } 1175 } 1176 1177 int prio = get_instruction_priority(c->devinfo, inst); 1178 1179 if (mux_read_stalls(scoreboard, inst)) { 1180 /* Don't merge an instruction that stalls */ 1181 if (prev_inst) 1182 continue; 1183 else { 1184 /* Any instruction that don't stall will have 1185 * higher scheduling priority */ 1186 prio -= MAX_SCHEDULE_PRIORITY; 1187 assert(prio < 0); 1188 } 1189 } 1190 1191 /* Found a valid instruction. If nothing better comes along, 1192 * this one works. 
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                /* Higher priority wins outright; lower priority is skipped. */
                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                /* Ties on priority are broken by the longer critical path
                 * (delay) to the end of the program.
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}

/* Records the tick of writes to "magic" waddrs that create scheduling
 * hazards: SFU writes and, on V3D 4.x+, unifa writes.
 */
static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr,
                                  const struct v3d_device_info *devinfo)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
                scoreboard->last_unifa_write_tick = scoreboard->tick;
}

/* Records the destination register and tick of a non-magic SFU write so
 * the scheduler can detect follow-up reads that would stall.
 */
static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
                                      const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_instr_is_sfu(inst)) {
                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
        }
}

/* Updates hazard-tracking state after 'inst' has been placed at the
 * current tick: magic/SFU write ticks, setmsf tick and ldvary tick.
 */
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst,
                             const struct v3d_device_info *devinfo)
{
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr,
                                                          devinfo);
                } else {
                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                              inst);
                }

                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
                        scoreboard->last_setmsf_tick = scoreboard->tick;
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr,
                                                          devinfo);
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;
}

/* Debug helper: prints the current DAG heads and their children, with
 * each edge tagged 'w' (write/WAR) or 'r' (read) from edge->data.
 */
static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
                                    enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.
If we do: 1324 * 1325 * mov tmu0_s, a 1326 * <a bit of math> 1327 * mov tmu0_s, b 1328 * load_tmu0 1329 * <more math> 1330 * load_tmu0 1331 * 1332 * we count that as worse than 1333 * 1334 * mov tmu0_s, a 1335 * mov tmu0_s, b 1336 * <lots of math> 1337 * load_tmu0 1338 * <more math> 1339 * load_tmu0 1340 * 1341 * because we associate the first load_tmu0 with the *second* tmu0_s. 1342 */ 1343 if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) && 1344 v3d_qpu_waits_on_tmu(after)) { 1345 return 100; 1346 } 1347 1348 /* Assume that anything depending on us is consuming the SFU result. */ 1349 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 1350 return 3; 1351 1352 return 1; 1353} 1354 1355static uint32_t 1356instruction_latency(const struct v3d_device_info *devinfo, 1357 struct schedule_node *before, struct schedule_node *after) 1358{ 1359 const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 1360 const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 1361 uint32_t latency = 1; 1362 1363 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 1364 after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 1365 return latency; 1366 1367 if (before_inst->alu.add.magic_write) { 1368 latency = MAX2(latency, 1369 magic_waddr_latency(devinfo, 1370 before_inst->alu.add.waddr, 1371 after_inst)); 1372 } 1373 1374 if (before_inst->alu.mul.magic_write) { 1375 latency = MAX2(latency, 1376 magic_waddr_latency(devinfo, 1377 before_inst->alu.mul.waddr, 1378 after_inst)); 1379 } 1380 1381 if (v3d_qpu_instr_is_sfu(before_inst)) 1382 return 2; 1383 1384 return latency; 1385} 1386 1387/** Recursive computation of the delay member of a node. 
*/ 1388static void 1389compute_delay(struct dag_node *node, void *state) 1390{ 1391 struct schedule_node *n = (struct schedule_node *)node; 1392 struct v3d_compile *c = (struct v3d_compile *) state; 1393 1394 n->delay = 1; 1395 1396 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1397 struct schedule_node *child = 1398 (struct schedule_node *)edge->child; 1399 1400 n->delay = MAX2(n->delay, (child->delay + 1401 instruction_latency(c->devinfo, n, 1402 child))); 1403 } 1404} 1405 1406/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() 1407 * should be called on it later to finish pruning the other edges). 1408 */ 1409static void 1410pre_remove_head(struct dag *dag, struct schedule_node *n) 1411{ 1412 list_delinit(&n->dag.link); 1413 1414 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 1415 if (edge->data) 1416 dag_remove_edge(dag, edge); 1417 } 1418} 1419 1420static void 1421mark_instruction_scheduled(const struct v3d_device_info *devinfo, 1422 struct dag *dag, 1423 uint32_t time, 1424 struct schedule_node *node) 1425{ 1426 if (!node) 1427 return; 1428 1429 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { 1430 struct schedule_node *child = 1431 (struct schedule_node *)edge->child; 1432 1433 if (!child) 1434 continue; 1435 1436 uint32_t latency = instruction_latency(devinfo, node, child); 1437 1438 child->unblocked_time = MAX2(child->unblocked_time, 1439 time + latency); 1440 } 1441 dag_prune_head(dag, &node->dag); 1442} 1443 1444static void 1445insert_scheduled_instruction(struct v3d_compile *c, 1446 struct qblock *block, 1447 struct choose_scoreboard *scoreboard, 1448 struct qinst *inst) 1449{ 1450 list_addtail(&inst->link, &block->instructions); 1451 1452 update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); 1453 c->qpu_inst_count++; 1454 scoreboard->tick++; 1455} 1456 1457static struct qinst * 1458vir_nop() 1459{ 1460 struct qreg undef = vir_nop_reg(); 1461 struct qinst *qinst = 
                vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

/* Emits a scheduled NOP at the current tick. */
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

/* Returns whether 'qinst' may occupy delay slot 'slot' (0..2) of the
 * thread-end sequence, per the hardware's restrictions on the final
 * instructions of a program.
 */
static bool
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
                              const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        /* No uniform reads in slots 1 and 2. */
        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* GFXH-1625: TMUWT not allowed in the final instruction. */
                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return false;

                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
                    !inst->sig_magic) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

/**
 * This is called when trying to merge a thrsw back into the instruction stream
 * of instructions that were scheduled *before* the thrsw signal to fill its
 * delay slots. Because the actual execution of the thrsw happens after the
 * delay slots, it is usually safe to do this, but there are some cases that
 * need special care.
 */
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                          const struct qinst *qinst,
                                          uint32_t slot)
{
        /* No scheduling SFU when the result would land in the other
         * thread. The simulator complains for safety, though it
         * would only occur for dead code in our case.
         */
        if (slot > 0 &&
            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                return false;
        }

        if (slot > 0 && qinst->qpu.sig.ldvary)
                return false;

        /* unifa and the following 3 instructions can't overlap a
         * thread switch/end. The docs further clarify that this means
         * the cycle at which the actual thread switch/end happens
         * and not when the thrsw instruction is processed, which would
         * be after the 2 delay slots following the thrsw instruction.
         * This means that we can move up a thrsw up to the instruction
         * right after unifa:
         *
         * unifa, r5
         * thrsw
         * delay slot 1
         * delay slot 2
         * Thread switch happens here, 4 instructions away from unifa
         */
        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
                return false;

        return true;
}

/**
 * This is called for instructions scheduled *after* a thrsw signal that may
 * land in the delay slots of the thrsw. Because these instructions were
 * scheduled after the thrsw, we need to be careful when placing them into
 * the delay slots, since that means that we are moving them ahead of the
 * thread switch and we need to ensure that is not a problem.
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until scoreboard wait is executed, which
         * we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread switch
         * happens since that would make the current thread stall before the
         * switch, which is exactly what we want to avoid with the thrsw
         * instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        return true;
}

/* Returns whether the 'instructions_in_sequence' instructions ending at
 * the current tail and starting at 'qinst' are all valid thrsw delay
 * slots (and, if 'is_thrend', valid thread-end slots too).
 */
static bool
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                        return false;

                if (is_thrend &&
                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
         * or branch.
         */
        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }
        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* Find how far back into previous instructions we can put the THRSW.
         */
        int slots_filled = 0;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                /* The candidate instruction must still be able to pack its
                 * signal bits together with thrsw.
                 */
                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;

                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, scoreboard,
                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                merge_inst = prev_inst;
                if (++slots_filled == 3)
                        break;
        }

        bool needs_free = false;
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        scoreboard->first_thrsw_emitted = true;

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                if (slots_filled <= 1) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
                scoreboard->last_thrsw_emitted = true;
        }

        /* Make sure the thread end executes within the program lifespan */
        if (is_thrend) {
                for (int i = 0; i < 3 - slots_filled; i++) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}

/* Returns whether 'inst' may be placed in the delay slots of a branch. */
static bool
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        if (inst->qpu.sig.thrsw)
                return false;

        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
                return false;

        if (vir_has_uniform(inst))
                return false;

        return true;
}

/* Emits a branch instruction and then tries to move it back over already
 * scheduled instructions to fill its 3 delay slots, NOP-padding the rest.
 */
static void
emit_branch(struct v3d_compile *c,
            struct qblock *block,
            struct choose_scoreboard *scoreboard,
            struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

        /* We should've not picked up a branch for the delay slots of a previous
         * thrsw, branch or unifa write instruction.
         */
        int branch_tick = scoreboard->tick;
        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
        assert(scoreboard->last_branch_tick + 3 < branch_tick);
        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);

        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
         * setmsf.
         */
        bool is_safe_msf_branch =
                inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
        assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
               is_safe_msf_branch);

        /* Insert the branch instruction */
        insert_scheduled_instruction(c, block, scoreboard, inst);

        /* Now see if we can move the branch instruction back into the
         * instruction stream to fill its delay slots
         */
        int slots_filled = 0;
        while (slots_filled < 3 && block->instructions.next != &inst->link) {
                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);

                /* Can't move the branch instruction if that would place it
                 * in the delay slots of other instructions.
                 */
                if (scoreboard->last_branch_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_thrsw_tick + 2 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_unifa_write_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                /* Can't move a conditional branch before the instruction
                 * that writes the flags for its condition.
                 */
                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
                        break;
                }

                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
                        break;

                /* An msf-sensitive branch must not land right after a
                 * setmsf, so check the instruction before the slot too.
                 */
                if (!is_safe_msf_branch) {
                        struct qinst *prev_prev_inst =
                                (struct qinst *) prev_inst->link.prev;
                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
                                break;
                        }
                }

                /* Swap the branch with the previous instruction. */
                list_del(&prev_inst->link);
                list_add(&prev_inst->link, &inst->link);
                slots_filled++;
        }

        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
        scoreboard->last_branch_tick = branch_tick - slots_filled;

        /* Fill any remaining delay slots.
         *
         * For unconditional branches we'll try to fill these with the
         * first instructions in the successor block after scheduling
         * all blocks when setting up branch targets.
         */
        for (int i = 0; i < 3 - slots_filled; i++)
                emit_nop(c, block, scoreboard);
}

/* Returns whether the add (or, if 'add' is false, the mul) ALU operation
 * of 'inst' reads the given register: a mux/accumulator index when
 * 'magic' is set, or a physical regfile index otherwise.
 */
static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop ; nop ; ldvary.r4
 * nop ; fmul r0, r4, rf0 ;
 * fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst
 *
 * into:
 *
 * nop ; nop ; ldvary.r4
 * nop ; fmul r0, r4, rf0 ; ldvary.r1
 * fadd rf13, r0, r5 ; nop; ; <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop ; nop ; ldvary.r4
 * nop ; fmul r0, r4, rf0 ; ldvary.r1
 * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination, if it does, then moving the ldvary before
         * it would overwrite it.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The implicit ldvary destination may not be written to by a signal
         * in the instruction following ldvary. Since we are planning to move
         * ldvary to the previous instruction, this means we need to check if
         * the current instruction has any other signal that could create this
         * conflict. The only other signal that can write to the implicit
         * ldvary destination that is compatible with ldvary in the same
         * instruction is ldunif.
         */
        if (inst->sig.ldunif)
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal */
        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup.
         */
        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* By moving ldvary to the previous instruction we make it update
         * r5 in the current one, so nothing else in it should write r5.
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5 (since our dependency tracking doesn't know that the
         * ldvary write to r5 happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(c->devinfo, inst));

        return true;
}

/* Core scheduling loop for one block: repeatedly picks the best ready
 * instruction (pairing a second one into it when possible), rewrites the
 * uniform stream indices, and emits thrsw/branch sequences specially.
 * Returns the estimated cycle count for the block.
 */
static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                /* Carry the merged instruction's uniform over
                                 * to the chosen one (they can't both have one;
                                 * choose_instruction_to_schedule rejects that
                                 * pairing).
                                 */
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge the
                                                 * follow-up instruction in the
                                                 * ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}

/* Schedules a single block: wraps its instructions in DAG nodes, builds
 * forward/reverse dependencies, computes per-node delays bottom-up, and
 * runs the list scheduler. Returns the block's estimated cycle count.
 */
static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure.
*/ 2229 while (!list_is_empty(&block->instructions)) { 2230 struct qinst *qinst = (struct qinst *)block->instructions.next; 2231 struct schedule_node *n = 2232 rzalloc(mem_ctx, struct schedule_node); 2233 2234 dag_init_node(scoreboard->dag, &n->dag); 2235 n->inst = qinst; 2236 2237 list_del(&qinst->link); 2238 list_addtail(&n->link, &setup_list); 2239 } 2240 2241 calculate_forward_deps(c, scoreboard->dag, &setup_list); 2242 calculate_reverse_deps(c, scoreboard->dag, &setup_list); 2243 2244 dag_traverse_bottom_up(scoreboard->dag, compute_delay, c); 2245 2246 uint32_t cycles = schedule_instructions(c, scoreboard, block, 2247 orig_uniform_contents, 2248 orig_uniform_data, 2249 next_uniform); 2250 2251 ralloc_free(mem_ctx); 2252 scoreboard->dag = NULL; 2253 2254 return cycles; 2255} 2256 2257static void 2258qpu_set_branch_targets(struct v3d_compile *c) 2259{ 2260 vir_for_each_block(block, c) { 2261 /* The end block of the program has no branch. */ 2262 if (!block->successors[0]) 2263 continue; 2264 2265 /* If there was no branch instruction, then the successor 2266 * block must follow immediately after this one. 2267 */ 2268 if (block->branch_qpu_ip == ~0) { 2269 assert(block->end_qpu_ip + 1 == 2270 block->successors[0]->start_qpu_ip); 2271 continue; 2272 } 2273 2274 /* Walk back through the delay slots to find the branch 2275 * instr. 
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                /* Scan backwards over at most the 3 delay-slot positions:
                 * delay_slot_count ends up as the number of trailing NOPs
                 * (free delay slots), and delay_slots_start as the first of
                 * those NOPs in program order.
                 */
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                /* Branch offsets are in bytes, relative to the instruction
                 * after the delay slots (branch_qpu_ip + 4); each QPU
                 * instruction is sizeof(uint64_t) bytes.
                 */
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the uniform
                 * stream: the branch's uniform slot holds a
                 * QUNIFORM_CONSTANT that we overwrite with the byte offset
                 * to the successor's first uniform.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any remaining
                 * delay slots with the initial instructions of the successor
                 * block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        /* The copied instructions execute before the jump
                         * lands, so move the branch target past them.
                         */
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

/**
 * Top-level entry point: schedules all blocks of the program into QPU
 * instructions, rebuilds the uniform stream in scheduled order, emits the
 * program-end THRSW, and resolves branch targets.
 *
 * Returns the total estimated cycle count of the program.
 */
uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
2366 */ 2367 uint32_t *uniform_data = c->uniform_data; 2368 enum quniform_contents *uniform_contents = c->uniform_contents; 2369 c->uniform_contents = ralloc_array(c, enum quniform_contents, 2370 c->num_uniforms); 2371 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 2372 c->uniform_array_size = c->num_uniforms; 2373 uint32_t next_uniform = 0; 2374 2375 struct choose_scoreboard scoreboard; 2376 memset(&scoreboard, 0, sizeof(scoreboard)); 2377 scoreboard.last_ldvary_tick = -10; 2378 scoreboard.last_unifa_write_tick = -10; 2379 scoreboard.last_magic_sfu_write_tick = -10; 2380 scoreboard.last_uniforms_reset_tick = -10; 2381 scoreboard.last_thrsw_tick = -10; 2382 scoreboard.last_branch_tick = -10; 2383 scoreboard.last_setmsf_tick = -10; 2384 scoreboard.last_stallable_sfu_tick = -10; 2385 2386 if (debug) { 2387 fprintf(stderr, "Pre-schedule instructions\n"); 2388 vir_for_each_block(block, c) { 2389 fprintf(stderr, "BLOCK %d\n", block->index); 2390 list_for_each_entry(struct qinst, qinst, 2391 &block->instructions, link) { 2392 v3d_qpu_dump(devinfo, &qinst->qpu); 2393 fprintf(stderr, "\n"); 2394 } 2395 } 2396 fprintf(stderr, "\n"); 2397 } 2398 2399 uint32_t cycles = 0; 2400 vir_for_each_block(block, c) { 2401 block->start_qpu_ip = c->qpu_inst_count; 2402 block->branch_qpu_ip = ~0; 2403 block->start_uniform = next_uniform; 2404 2405 cycles += qpu_schedule_instructions_block(c, 2406 &scoreboard, 2407 block, 2408 uniform_contents, 2409 uniform_data, 2410 &next_uniform); 2411 2412 block->end_qpu_ip = c->qpu_inst_count - 1; 2413 } 2414 2415 /* Emit the program-end THRSW instruction. */; 2416 struct qinst *thrsw = vir_nop(); 2417 thrsw->qpu.sig.thrsw = true; 2418 emit_thrsw(c, end_block, &scoreboard, thrsw, true); 2419 2420 qpu_set_branch_targets(c); 2421 2422 assert(next_uniform == c->num_uniforms); 2423 2424 return cycles; 2425} 2426