qpu_schedule.c revision 01e04c3f
1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2014-2017 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25/** 26 * @file 27 * 28 * The basic model of the list scheduler is to take a basic block, compute a 29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically 30 * pick a DAG head, then put all the children that are now DAG heads into the 31 * list of things to schedule. 32 * 33 * The goal of scheduling here is to pack pairs of operations together in a 34 * single QPU instruction. 35 */ 36 37#include "qpu/qpu_disasm.h" 38#include "v3d_compiler.h" 39#include "util/ralloc.h" 40 41static bool debug; 42 43struct schedule_node_child; 44 45struct schedule_node { 46 struct list_head link; 47 struct qinst *inst; 48 struct schedule_node_child *children; 49 uint32_t child_count; 50 uint32_t child_array_size; 51 uint32_t parent_count; 52 53 /* Longest cycles + instruction_latency() of any parent of this node. */ 54 uint32_t unblocked_time; 55 56 /** 57 * Minimum number of cycles from scheduling this instruction until the 58 * end of the program, based on the slowest dependency chain through 59 * the children. 60 */ 61 uint32_t delay; 62 63 /** 64 * cycles between this instruction being scheduled and when its result 65 * can be consumed. 66 */ 67 uint32_t latency; 68}; 69 70struct schedule_node_child { 71 struct schedule_node *node; 72 bool write_after_read; 73}; 74 75/* When walking the instructions in reverse, we need to swap before/after in 76 * add_dep(). 77 */ 78enum direction { F, R }; 79 80struct schedule_state { 81 const struct v3d_device_info *devinfo; 82 struct schedule_node *last_r[6]; 83 struct schedule_node *last_rf[64]; 84 struct schedule_node *last_sf; 85 struct schedule_node *last_vpm_read; 86 struct schedule_node *last_tmu_write; 87 struct schedule_node *last_tmu_config; 88 struct schedule_node *last_tlb; 89 struct schedule_node *last_vpm; 90 struct schedule_node *last_unif; 91 struct schedule_node *last_rtop; 92 enum direction dir; 93 /* Estimated cycle when the current instruction would start. 
*/ 94 uint32_t time; 95}; 96 97static void 98add_dep(struct schedule_state *state, 99 struct schedule_node *before, 100 struct schedule_node *after, 101 bool write) 102{ 103 bool write_after_read = !write && state->dir == R; 104 105 if (!before || !after) 106 return; 107 108 assert(before != after); 109 110 if (state->dir == R) { 111 struct schedule_node *t = before; 112 before = after; 113 after = t; 114 } 115 116 for (int i = 0; i < before->child_count; i++) { 117 if (before->children[i].node == after && 118 (before->children[i].write_after_read == write_after_read)) { 119 return; 120 } 121 } 122 123 if (before->child_array_size <= before->child_count) { 124 before->child_array_size = MAX2(before->child_array_size * 2, 16); 125 before->children = reralloc(before, before->children, 126 struct schedule_node_child, 127 before->child_array_size); 128 } 129 130 before->children[before->child_count].node = after; 131 before->children[before->child_count].write_after_read = 132 write_after_read; 133 before->child_count++; 134 after->parent_count++; 135} 136 137static void 138add_read_dep(struct schedule_state *state, 139 struct schedule_node *before, 140 struct schedule_node *after) 141{ 142 add_dep(state, before, after, false); 143} 144 145static void 146add_write_dep(struct schedule_state *state, 147 struct schedule_node **before, 148 struct schedule_node *after) 149{ 150 add_dep(state, *before, after, true); 151 *before = after; 152} 153 154static bool 155qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) 156{ 157 if (inst->type != V3D_QPU_INSTR_TYPE_ALU) 158 return false; 159 160 if (inst->alu.add.magic_write && 161 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || 162 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) 163 return true; 164 165 if (inst->alu.mul.magic_write && 166 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || 167 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) 168 return true; 169 170 return false; 171} 172 173static void 174process_mux_deps(struct schedule_state *state, struct schedule_node *n, 175 enum v3d_qpu_mux mux) 176{ 177 switch (mux) { 178 case V3D_QPU_MUX_A: 179 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); 180 break; 181 case V3D_QPU_MUX_B: 182 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); 183 break; 184 default: 185 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); 186 break; 187 } 188} 189 190 191static void 192process_waddr_deps(struct schedule_state *state, struct schedule_node *n, 193 uint32_t waddr, bool magic) 194{ 195 if (!magic) { 196 add_write_dep(state, &state->last_rf[waddr], n); 197 } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { 198 add_write_dep(state, &state->last_tmu_write, n); 199 switch (waddr) { 200 case V3D_QPU_WADDR_TMUS: 201 case V3D_QPU_WADDR_TMUSCM: 202 case V3D_QPU_WADDR_TMUSF: 203 case V3D_QPU_WADDR_TMUSLOD: 204 add_write_dep(state, &state->last_tmu_config, n); 205 break; 206 default: 207 break; 208 } 209 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { 210 /* Handled by v3d_qpu_writes_r4() check. */ 211 } else { 212 switch (waddr) { 213 case V3D_QPU_WADDR_R0: 214 case V3D_QPU_WADDR_R1: 215 case V3D_QPU_WADDR_R2: 216 add_write_dep(state, 217 &state->last_r[waddr - V3D_QPU_WADDR_R0], 218 n); 219 break; 220 case V3D_QPU_WADDR_R3: 221 case V3D_QPU_WADDR_R4: 222 case V3D_QPU_WADDR_R5: 223 /* Handled by v3d_qpu_writes_r*() checks below. 
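 * (Those helpers also cover the implicit writers -- SFU results land in
 * r4 and ldvary/ldunif land in r5 -- so calculate_deps() adds a single
 * write dep per accumulator no matter how it was written.)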
*/ 224 break; 225 226 case V3D_QPU_WADDR_VPM: 227 case V3D_QPU_WADDR_VPMU: 228 add_write_dep(state, &state->last_vpm, n); 229 break; 230 231 case V3D_QPU_WADDR_TLB: 232 case V3D_QPU_WADDR_TLBU: 233 add_write_dep(state, &state->last_tlb, n); 234 break; 235 236 case V3D_QPU_WADDR_NOP: 237 break; 238 239 default: 240 fprintf(stderr, "Unknown waddr %d\n", waddr); 241 abort(); 242 } 243 } 244} 245 246static void 247process_cond_deps(struct schedule_state *state, struct schedule_node *n, 248 enum v3d_qpu_cond cond) 249{ 250 if (cond != V3D_QPU_COND_NONE) 251 add_read_dep(state, state->last_sf, n); 252} 253 254static void 255process_pf_deps(struct schedule_state *state, struct schedule_node *n, 256 enum v3d_qpu_pf pf) 257{ 258 if (pf != V3D_QPU_PF_NONE) 259 add_write_dep(state, &state->last_sf, n); 260} 261 262static void 263process_uf_deps(struct schedule_state *state, struct schedule_node *n, 264 enum v3d_qpu_uf uf) 265{ 266 if (uf != V3D_QPU_UF_NONE) 267 add_write_dep(state, &state->last_sf, n); 268} 269 270/** 271 * Common code for dependencies that need to be tracked both forward and 272 * backward. 273 * 274 * This is for things like "all reads of r4 have to happen between the r4 275 * writes that surround them". 276 */ 277static void 278calculate_deps(struct schedule_state *state, struct schedule_node *n) 279{ 280 const struct v3d_device_info *devinfo = state->devinfo; 281 struct qinst *qinst = n->inst; 282 struct v3d_qpu_instr *inst = &qinst->qpu; 283 284 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 285 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) 286 add_read_dep(state, state->last_sf, n); 287 288 /* XXX: BDI */ 289 /* XXX: BDU */ 290 /* XXX: ub */ 291 /* XXX: raddr_a */ 292 293 add_write_dep(state, &state->last_unif, n); 294 return; 295 } 296 297 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 298 299 /* XXX: LOAD_IMM */ 300 301 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) 302 process_mux_deps(state, n, inst->alu.add.a); 303 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) 304 process_mux_deps(state, n, inst->alu.add.b); 305 306 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) 307 process_mux_deps(state, n, inst->alu.mul.a); 308 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) 309 process_mux_deps(state, n, inst->alu.mul.b); 310 311 switch (inst->alu.add.op) { 312 case V3D_QPU_A_VPMSETUP: 313 /* Could distinguish read/write by unpacking the uniform. */ 314 add_write_dep(state, &state->last_vpm, n); 315 add_write_dep(state, &state->last_vpm_read, n); 316 break; 317 318 case V3D_QPU_A_STVPMV: 319 case V3D_QPU_A_STVPMD: 320 case V3D_QPU_A_STVPMP: 321 add_write_dep(state, &state->last_vpm, n); 322 break; 323 324 case V3D_QPU_A_VPMWT: 325 add_read_dep(state, state->last_vpm, n); 326 break; 327 328 case V3D_QPU_A_MSF: 329 add_read_dep(state, state->last_tlb, n); 330 break; 331 332 case V3D_QPU_A_SETMSF: 333 case V3D_QPU_A_SETREVF: 334 add_write_dep(state, &state->last_tlb, n); 335 break; 336 337 case V3D_QPU_A_FLAPUSH: 338 case V3D_QPU_A_FLBPUSH: 339 case V3D_QPU_A_VFLA: 340 case V3D_QPU_A_VFLNA: 341 case V3D_QPU_A_VFLB: 342 case V3D_QPU_A_VFLNB: 343 add_read_dep(state, state->last_sf, n); 344 break; 345 346 case V3D_QPU_A_FLPOP: 347 add_write_dep(state, &state->last_sf, n); 348 break; 349 350 default: 351 break; 352 } 353 354 switch (inst->alu.mul.op) { 355 case V3D_QPU_M_MULTOP: 356 case V3D_QPU_M_UMUL24: 357 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and 358 * resets it to 0. 
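 * Modeling both as rtop writes below is what keeps each MULTOP/UMUL24
 * pair together.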
We could possibly reorder umul24s relative 359 * to each other, but for now just keep all the MUL parts in 360 * order. 361 */ 362 add_write_dep(state, &state->last_rtop, n); 363 break; 364 default: 365 break; 366 } 367 368 if (inst->alu.add.op != V3D_QPU_A_NOP) { 369 process_waddr_deps(state, n, inst->alu.add.waddr, 370 inst->alu.add.magic_write); 371 } 372 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 373 process_waddr_deps(state, n, inst->alu.mul.waddr, 374 inst->alu.mul.magic_write); 375 } 376 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { 377 process_waddr_deps(state, n, inst->sig_addr, 378 inst->sig_magic); 379 } 380 381 if (v3d_qpu_writes_r3(devinfo, inst)) 382 add_write_dep(state, &state->last_r[3], n); 383 if (v3d_qpu_writes_r4(devinfo, inst)) 384 add_write_dep(state, &state->last_r[4], n); 385 if (v3d_qpu_writes_r5(devinfo, inst)) 386 add_write_dep(state, &state->last_r[5], n); 387 388 if (inst->sig.thrsw) { 389 /* All accumulator contents and flags are undefined after the 390 * switch. 391 */ 392 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) 393 add_write_dep(state, &state->last_r[i], n); 394 add_write_dep(state, &state->last_sf, n); 395 add_write_dep(state, &state->last_rtop, n); 396 397 /* Scoreboard-locking operations have to stay after the last 398 * thread switch. 399 */ 400 add_write_dep(state, &state->last_tlb, n); 401 402 add_write_dep(state, &state->last_tmu_write, n); 403 add_write_dep(state, &state->last_tmu_config, n); 404 } 405 406 if (v3d_qpu_waits_on_tmu(inst)) { 407 /* TMU loads are coming from a FIFO, so ordering is important. 408 */ 409 add_write_dep(state, &state->last_tmu_write, n); 410 } 411 412 if (inst->sig.wrtmuc) 413 add_write_dep(state, &state->last_tmu_config, n); 414 415 if (inst->sig.ldtlb | inst->sig.ldtlbu) 416 add_read_dep(state, state->last_tlb, n); 417 418 if (inst->sig.ldvpm) 419 add_write_dep(state, &state->last_vpm_read, n); 420 421 /* inst->sig.ldunif or sideband uniform read */ 422 if (qinst->uniform != ~0) 423 add_write_dep(state, &state->last_unif, n); 424 425 process_cond_deps(state, n, inst->flags.ac); 426 process_cond_deps(state, n, inst->flags.mc); 427 process_pf_deps(state, n, inst->flags.apf); 428 process_pf_deps(state, n, inst->flags.mpf); 429 process_uf_deps(state, n, inst->flags.auf); 430 process_uf_deps(state, n, inst->flags.muf); 431} 432 433static void 434calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) 435{ 436 struct schedule_state state; 437 438 memset(&state, 0, sizeof(state)); 439 state.devinfo = c->devinfo; 440 state.dir = F; 441 442 list_for_each_entry(struct schedule_node, node, schedule_list, link) 443 calculate_deps(&state, node); 444} 445 446static void 447calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list) 448{ 449 struct list_head *node; 450 struct schedule_state state; 451 452 memset(&state, 0, sizeof(state)); 453 state.devinfo = c->devinfo; 454 state.dir = R; 455 456 for (node = schedule_list->prev; schedule_list != node; node = node->prev) { 457 calculate_deps(&state, (struct schedule_node *)node); 458 } 459} 460 461struct choose_scoreboard { 462 int tick; 463 int last_magic_sfu_write_tick; 464 int last_ldvary_tick; 465 int last_uniforms_reset_tick; 466 int last_thrsw_tick; 467 bool tlb_locked; 468}; 469 470static bool 471mux_reads_too_soon(struct choose_scoreboard *scoreboard, 472 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) 473{ 474 switch (mux) { 475 case V3D_QPU_MUX_R4: 476 if (scoreboard->tick - 
scoreboard->last_magic_sfu_write_tick <= 2) 477 return true; 478 break; 479 480 case V3D_QPU_MUX_R5: 481 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) 482 return true; 483 break; 484 default: 485 break; 486 } 487 488 return false; 489} 490 491static bool 492reads_too_soon_after_write(struct choose_scoreboard *scoreboard, 493 struct qinst *qinst) 494{ 495 const struct v3d_qpu_instr *inst = &qinst->qpu; 496 497 /* XXX: Branching off of raddr. */ 498 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 499 return false; 500 501 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 502 503 if (inst->alu.add.op != V3D_QPU_A_NOP) { 504 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && 505 mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { 506 return true; 507 } 508 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && 509 mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { 510 return true; 511 } 512 } 513 514 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 515 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && 516 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { 517 return true; 518 } 519 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && 520 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { 521 return true; 522 } 523 } 524 525 /* XXX: imm */ 526 527 return false; 528} 529 530static bool 531writes_too_soon_after_write(const struct v3d_device_info *devinfo, 532 struct choose_scoreboard *scoreboard, 533 struct qinst *qinst) 534{ 535 const struct v3d_qpu_instr *inst = &qinst->qpu; 536 537 /* Don't schedule any other r4 write too soon after an SFU write. 538 * This would normally be prevented by dependency tracking, but might 539 * occur if a dead SFU computation makes it to scheduling. 540 */ 541 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && 542 v3d_qpu_writes_r4(devinfo, inst)) 543 return true; 544 545 return false; 546} 547 548static bool 549pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, 550 const struct v3d_qpu_instr *inst) 551{ 552 return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); 553} 554 555static int 556get_instruction_priority(const struct v3d_qpu_instr *inst) 557{ 558 uint32_t baseline_score; 559 uint32_t next_score = 0; 560 561 /* Schedule TLB operations as late as possible, to get more 562 * parallelism between shaders. 563 */ 564 if (qpu_inst_is_tlb(inst)) 565 return next_score; 566 next_score++; 567 568 /* Schedule texture read results collection late to hide latency. */ 569 if (v3d_qpu_waits_on_tmu(inst)) 570 return next_score; 571 next_score++; 572 573 /* Default score for things that aren't otherwise special. */ 574 baseline_score = next_score; 575 next_score++; 576 577 /* Schedule texture read setup early to hide their latency better. 
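 * (These scores only break ties among instructions that are already legal
 * to issue; the dependency DAG and the hazard checks in
 * choose_instruction_to_schedule() decide what can be picked at all.)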
*/ 578 if (v3d_qpu_writes_tmu(inst)) 579 return next_score; 580 next_score++; 581 582 return baseline_score; 583} 584 585static bool 586qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) 587{ 588 return (v3d_qpu_magic_waddr_is_tmu(waddr) || 589 v3d_qpu_magic_waddr_is_sfu(waddr) || 590 v3d_qpu_magic_waddr_is_tlb(waddr) || 591 v3d_qpu_magic_waddr_is_vpm(waddr) || 592 v3d_qpu_magic_waddr_is_tsy(waddr)); 593} 594 595static bool 596qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) 597{ 598 if (v3d_qpu_uses_vpm(inst)) 599 return true; 600 if (v3d_qpu_uses_sfu(inst)) 601 return true; 602 603 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 604 if (inst->alu.add.op != V3D_QPU_A_NOP && 605 inst->alu.add.magic_write && 606 qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { 607 return true; 608 } 609 610 if (inst->alu.add.op == V3D_QPU_A_TMUWT) 611 return true; 612 613 if (inst->alu.mul.op != V3D_QPU_M_NOP && 614 inst->alu.mul.magic_write && 615 qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { 616 return true; 617 } 618 } 619 620 return (inst->sig.ldvpm || 621 inst->sig.ldtmu || 622 inst->sig.ldtlb || 623 inst->sig.ldtlbu || 624 inst->sig.wrtmuc); 625} 626 627static bool 628qpu_merge_inst(const struct v3d_device_info *devinfo, 629 struct v3d_qpu_instr *result, 630 const struct v3d_qpu_instr *a, 631 const struct v3d_qpu_instr *b) 632{ 633 if (a->type != V3D_QPU_INSTR_TYPE_ALU || 634 b->type != V3D_QPU_INSTR_TYPE_ALU) { 635 return false; 636 } 637 638 /* Can't do more than one peripheral access in an instruction. 639 * 640 * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and 641 * WRTMUC with a TMU magic register write (other than tmuc). 642 */ 643 if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) 644 return false; 645 646 struct v3d_qpu_instr merge = *a; 647 648 if (b->alu.add.op != V3D_QPU_A_NOP) { 649 if (a->alu.add.op != V3D_QPU_A_NOP) 650 return false; 651 merge.alu.add = b->alu.add; 652 653 merge.flags.ac = b->flags.ac; 654 merge.flags.apf = b->flags.apf; 655 merge.flags.auf = b->flags.auf; 656 } 657 658 if (b->alu.mul.op != V3D_QPU_M_NOP) { 659 if (a->alu.mul.op != V3D_QPU_M_NOP) 660 return false; 661 merge.alu.mul = b->alu.mul; 662 663 merge.flags.mc = b->flags.mc; 664 merge.flags.mpf = b->flags.mpf; 665 merge.flags.muf = b->flags.muf; 666 } 667 668 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) { 669 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) && 670 a->raddr_a != b->raddr_a) { 671 return false; 672 } 673 merge.raddr_a = b->raddr_a; 674 } 675 676 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { 677 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && 678 (a->raddr_b != b->raddr_b || 679 a->sig.small_imm != b->sig.small_imm)) { 680 return false; 681 } 682 merge.raddr_b = b->raddr_b; 683 } 684 685 merge.sig.thrsw |= b->sig.thrsw; 686 merge.sig.ldunif |= b->sig.ldunif; 687 merge.sig.ldunifrf |= b->sig.ldunifrf; 688 merge.sig.ldunifa |= b->sig.ldunifa; 689 merge.sig.ldunifarf |= b->sig.ldunifarf; 690 merge.sig.ldtmu |= b->sig.ldtmu; 691 merge.sig.ldvary |= b->sig.ldvary; 692 merge.sig.ldvpm |= b->sig.ldvpm; 693 merge.sig.small_imm |= b->sig.small_imm; 694 merge.sig.ldtlb |= b->sig.ldtlb; 695 merge.sig.ldtlbu |= b->sig.ldtlbu; 696 merge.sig.ucb |= b->sig.ucb; 697 merge.sig.rotate |= b->sig.rotate; 698 merge.sig.wrtmuc |= b->sig.wrtmuc; 699 700 if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && 701 v3d_qpu_sig_writes_address(devinfo, &b->sig)) 702 return false; 703 merge.sig_addr |= b->sig_addr; 704 merge.sig_magic |= b->sig_magic; 705 706 uint64_t packed; 707 bool ok = v3d_qpu_instr_pack(devinfo, 
&merge, &packed); 708 709 *result = merge; 710 /* No modifying the real instructions on failure. */ 711 assert(ok || (a != result && b != result)); 712 713 return ok; 714} 715 716static struct schedule_node * 717choose_instruction_to_schedule(const struct v3d_device_info *devinfo, 718 struct choose_scoreboard *scoreboard, 719 struct list_head *schedule_list, 720 struct schedule_node *prev_inst) 721{ 722 struct schedule_node *chosen = NULL; 723 int chosen_prio = 0; 724 725 /* Don't pair up anything with a thread switch signal -- emit_thrsw() 726 * will handle pairing it along with filling the delay slots. 727 */ 728 if (prev_inst) { 729 if (prev_inst->inst->qpu.sig.thrsw) 730 return NULL; 731 } 732 733 list_for_each_entry(struct schedule_node, n, schedule_list, link) { 734 const struct v3d_qpu_instr *inst = &n->inst->qpu; 735 736 /* Don't choose the branch instruction until it's the last one 737 * left. We'll move it up to fit its delay slots after we 738 * choose it. 739 */ 740 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && 741 !list_is_singular(schedule_list)) { 742 continue; 743 } 744 745 /* "An instruction must not read from a location in physical 746 * regfile A or B that was written to by the previous 747 * instruction." 748 */ 749 if (reads_too_soon_after_write(scoreboard, n->inst)) 750 continue; 751 752 if (writes_too_soon_after_write(devinfo, scoreboard, n->inst)) 753 continue; 754 755 /* "A scoreboard wait must not occur in the first two 756 * instructions of a fragment shader. This is either the 757 * explicit Wait for Scoreboard signal or an implicit wait 758 * with the first tile-buffer read or write instruction." 759 */ 760 if (pixel_scoreboard_too_soon(scoreboard, inst)) 761 continue; 762 763 /* ldunif and ldvary both write r5, but ldunif does so a tick 764 * sooner. If the ldvary's r5 wasn't used, then ldunif might 765 * otherwise get scheduled so ldunif and ldvary try to update 766 * r5 in the same tick. 767 */ 768 if ((inst->sig.ldunif || inst->sig.ldunifa) && 769 scoreboard->tick == scoreboard->last_ldvary_tick + 1) { 770 continue; 771 } 772 773 /* If we're trying to pair with another instruction, check 774 * that they're compatible. 775 */ 776 if (prev_inst) { 777 /* Don't pair up a thread switch signal -- we'll 778 * handle pairing it when we pick it on its own. 779 */ 780 if (inst->sig.thrsw) 781 continue; 782 783 if (prev_inst->inst->uniform != -1 && 784 n->inst->uniform != -1) 785 continue; 786 787 /* Don't merge in something that will lock the TLB. 788 * Hopefully what we have in inst will release some 789 * other instructions, allowing us to delay the 790 * TLB-locking instruction until later. 791 */ 792 if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) 793 continue; 794 795 struct v3d_qpu_instr merged_inst; 796 if (!qpu_merge_inst(devinfo, &merged_inst, 797 &prev_inst->inst->qpu, inst)) { 798 continue; 799 } 800 } 801 802 int prio = get_instruction_priority(inst); 803 804 /* Found a valid instruction. If nothing better comes along, 805 * this one works.
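 * Ties are broken below by priority first, then by the delay estimate, so
 * instructions on the longest remaining dependency chain win.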
806 */ 807 if (!chosen) { 808 chosen = n; 809 chosen_prio = prio; 810 continue; 811 } 812 813 if (prio > chosen_prio) { 814 chosen = n; 815 chosen_prio = prio; 816 } else if (prio < chosen_prio) { 817 continue; 818 } 819 820 if (n->delay > chosen->delay) { 821 chosen = n; 822 chosen_prio = prio; 823 } else if (n->delay < chosen->delay) { 824 continue; 825 } 826 } 827 828 return chosen; 829} 830 831static void 832update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, 833 enum v3d_qpu_waddr waddr) 834{ 835 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 836 scoreboard->last_magic_sfu_write_tick = scoreboard->tick; 837} 838 839static void 840update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, 841 const struct v3d_qpu_instr *inst) 842{ 843 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 844 return; 845 846 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); 847 848 if (inst->alu.add.op != V3D_QPU_A_NOP) { 849 if (inst->alu.add.magic_write) { 850 update_scoreboard_for_magic_waddr(scoreboard, 851 inst->alu.add.waddr); 852 } 853 } 854 855 if (inst->alu.mul.op != V3D_QPU_M_NOP) { 856 if (inst->alu.mul.magic_write) { 857 update_scoreboard_for_magic_waddr(scoreboard, 858 inst->alu.mul.waddr); 859 } 860 } 861 862 if (inst->sig.ldvary) 863 scoreboard->last_ldvary_tick = scoreboard->tick; 864 865 if (qpu_inst_is_tlb(inst)) 866 scoreboard->tlb_locked = true; 867} 868 869static void 870dump_state(const struct v3d_device_info *devinfo, 871 struct list_head *schedule_list) 872{ 873 list_for_each_entry(struct schedule_node, n, schedule_list, link) { 874 fprintf(stderr, " t=%4d: ", n->unblocked_time); 875 v3d_qpu_dump(devinfo, &n->inst->qpu); 876 fprintf(stderr, "\n"); 877 878 for (int i = 0; i < n->child_count; i++) { 879 struct schedule_node *child = n->children[i].node; 880 if (!child) 881 continue; 882 883 fprintf(stderr, " - "); 884 v3d_qpu_dump(devinfo, &child->inst->qpu); 885 fprintf(stderr, " (%d parents, %c)\n", 886 child->parent_count, 887 n->children[i].write_after_read ? 'w' : 'r'); 888 } 889 } 890} 891 892static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, 893 const struct v3d_qpu_instr *after) 894{ 895 /* Apply some huge latency between texture fetch requests and getting 896 * their results back. 897 * 898 * FIXME: This is actually pretty bogus. If we do: 899 * 900 * mov tmu0_s, a 901 * <a bit of math> 902 * mov tmu0_s, b 903 * load_tmu0 904 * <more math> 905 * load_tmu0 906 * 907 * we count that as worse than 908 * 909 * mov tmu0_s, a 910 * mov tmu0_s, b 911 * <lots of math> 912 * load_tmu0 913 * <more math> 914 * load_tmu0 915 * 916 * because we associate the first load_tmu0 with the *second* tmu0_s. 917 */ 918 if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after)) 919 return 100; 920 921 /* Assume that anything depending on us is consuming the SFU result. 
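 * The result shows up in r4 a few cycles later, which is also why
 * mux_reads_too_soon() keeps r4 reads more than two ticks away from a
 * magic SFU write.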
*/ 922 if (v3d_qpu_magic_waddr_is_sfu(waddr)) 923 return 3; 924 925 return 1; 926} 927 928static uint32_t 929instruction_latency(struct schedule_node *before, struct schedule_node *after) 930{ 931 const struct v3d_qpu_instr *before_inst = &before->inst->qpu; 932 const struct v3d_qpu_instr *after_inst = &after->inst->qpu; 933 uint32_t latency = 1; 934 935 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || 936 after_inst->type != V3D_QPU_INSTR_TYPE_ALU) 937 return latency; 938 939 if (before_inst->alu.add.magic_write) { 940 latency = MAX2(latency, 941 magic_waddr_latency(before_inst->alu.add.waddr, 942 after_inst)); 943 } 944 945 if (before_inst->alu.mul.magic_write) { 946 latency = MAX2(latency, 947 magic_waddr_latency(before_inst->alu.mul.waddr, 948 after_inst)); 949 } 950 951 return latency; 952} 953 954/** Recursive computation of the delay member of a node. */ 955static void 956compute_delay(struct schedule_node *n) 957{ 958 if (!n->child_count) { 959 n->delay = 1; 960 } else { 961 for (int i = 0; i < n->child_count; i++) { 962 if (!n->children[i].node->delay) 963 compute_delay(n->children[i].node); 964 n->delay = MAX2(n->delay, 965 n->children[i].node->delay + 966 instruction_latency(n, n->children[i].node)); 967 } 968 } 969} 970 971static void 972mark_instruction_scheduled(struct list_head *schedule_list, 973 uint32_t time, 974 struct schedule_node *node, 975 bool war_only) 976{ 977 if (!node) 978 return; 979 980 for (int i = node->child_count - 1; i >= 0; i--) { 981 struct schedule_node *child = 982 node->children[i].node; 983 984 if (!child) 985 continue; 986 987 if (war_only && !node->children[i].write_after_read) 988 continue; 989 990 /* If the requirement is only that the node not appear before 991 * the last read of its destination, then it can be scheduled 992 * immediately after (or paired with!) the thing reading the 993 * destination. 994 */ 995 uint32_t latency = 0; 996 if (!war_only) { 997 latency = instruction_latency(node, 998 node->children[i].node); 999 } 1000 1001 child->unblocked_time = MAX2(child->unblocked_time, 1002 time + latency); 1003 child->parent_count--; 1004 if (child->parent_count == 0) 1005 list_add(&child->link, schedule_list); 1006 1007 node->children[i].node = NULL; 1008 } 1009} 1010 1011static void 1012insert_scheduled_instruction(struct v3d_compile *c, 1013 struct qblock *block, 1014 struct choose_scoreboard *scoreboard, 1015 struct qinst *inst) 1016{ 1017 list_addtail(&inst->link, &block->instructions); 1018 1019 update_scoreboard_for_chosen(scoreboard, &inst->qpu); 1020 c->qpu_inst_count++; 1021 scoreboard->tick++; 1022} 1023 1024static struct qinst * 1025vir_nop() 1026{ 1027 struct qreg undef = { QFILE_NULL, 0 }; 1028 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); 1029 1030 return qinst; 1031} 1032 1033static void 1034emit_nop(struct v3d_compile *c, struct qblock *block, 1035 struct choose_scoreboard *scoreboard) 1036{ 1037 insert_scheduled_instruction(c, block, scoreboard, vir_nop()); 1038} 1039 1040static bool 1041qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, 1042 const struct qinst *qinst, int slot) 1043{ 1044 const struct v3d_qpu_instr *inst = &qinst->qpu; 1045 1046 /* Only TLB Z writes are prohibited in the last slot, but we don't 1047 * have those flagged so prohibit all TLB ops for now. 
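 * (When this check runs, slot 2 is the instruction that would land in the
 * last delay slot after the terminating thread switch, i.e. the final
 * instruction of the program.)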
1048 */ 1049 if (slot == 2 && qpu_inst_is_tlb(inst)) 1050 return false; 1051 1052 if (slot > 0 && qinst->uniform != ~0) 1053 return false; 1054 1055 if (v3d_qpu_uses_vpm(inst)) 1056 return false; 1057 1058 if (inst->sig.ldvary) 1059 return false; 1060 1061 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { 1062 /* GFXH-1625: TMUWT not allowed in the final instruction. */ 1063 if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) 1064 return false; 1065 1066 /* No writing physical registers at the end. */ 1067 if (!inst->alu.add.magic_write || 1068 !inst->alu.mul.magic_write) { 1069 return false; 1070 } 1071 1072 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) 1073 return false; 1074 1075 /* RF0-2 might be overwritten during the delay slots by 1076 * fragment shader setup. 1077 */ 1078 if (inst->raddr_a < 3 && 1079 (inst->alu.add.a == V3D_QPU_MUX_A || 1080 inst->alu.add.b == V3D_QPU_MUX_A || 1081 inst->alu.mul.a == V3D_QPU_MUX_A || 1082 inst->alu.mul.b == V3D_QPU_MUX_A)) { 1083 return false; 1084 } 1085 1086 if (inst->raddr_b < 3 && 1087 !inst->sig.small_imm && 1088 (inst->alu.add.a == V3D_QPU_MUX_B || 1089 inst->alu.add.b == V3D_QPU_MUX_B || 1090 inst->alu.mul.a == V3D_QPU_MUX_B || 1091 inst->alu.mul.b == V3D_QPU_MUX_B)) { 1092 return false; 1093 } 1094 } 1095 1096 return true; 1097} 1098 1099static bool 1100valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, 1101 struct qinst *qinst, int instructions_in_sequence, 1102 bool is_thrend) 1103{ 1104 /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ 1105 if (scoreboard->last_thrsw_tick + 3 > 1106 scoreboard->tick - instructions_in_sequence) { 1107 return false; 1108 } 1109 1110 for (int slot = 0; slot < instructions_in_sequence; slot++) { 1111 /* No scheduling SFU when the result would land in the other 1112 * thread. The simulator complains for safety, though it 1113 * would only occur for dead code in our case. 1114 */ 1115 if (slot > 0 && 1116 qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 1117 (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || 1118 v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { 1119 return false; 1120 } 1121 1122 if (slot > 0 && qinst->qpu.sig.ldvary) 1123 return false; 1124 1125 if (is_thrend && 1126 !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) { 1127 return false; 1128 } 1129 1130 /* Note that the list is circular, so we can only do this up 1131 * to instructions_in_sequence. 1132 */ 1133 qinst = (struct qinst *)qinst->link.next; 1134 } 1135 1136 return true; 1137} 1138 1139/** 1140 * Emits a THRSW signal in the stream, trying to move it up to pair with 1141 * another instruction. 1142 */ 1143static int 1144emit_thrsw(struct v3d_compile *c, 1145 struct qblock *block, 1146 struct choose_scoreboard *scoreboard, 1147 struct qinst *inst, 1148 bool is_thrend) 1149{ 1150 int time = 0; 1151 1152 /* There should be nothing in a thrsw inst being scheduled other than 1153 * the signal bits. 1154 */ 1155 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 1156 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP); 1157 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP); 1158 1159 /* Find how far back into previous instructions we can put the THRSW. 
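 * At most three instructions back, since the switch only takes effect after
 * its three delay slots; whatever we can't cover gets padded with NOPs below.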
*/ 1160 int slots_filled = 0; 1161 struct qinst *merge_inst = NULL; 1162 vir_for_each_inst_rev(prev_inst, block) { 1163 struct v3d_qpu_sig sig = prev_inst->qpu.sig; 1164 sig.thrsw = true; 1165 uint32_t packed_sig; 1166 1167 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) 1168 break; 1169 1170 if (!valid_thrsw_sequence(c, scoreboard, 1171 prev_inst, slots_filled + 1, 1172 is_thrend)) { 1173 break; 1174 } 1175 1176 merge_inst = prev_inst; 1177 if (++slots_filled == 3) 1178 break; 1179 } 1180 1181 bool needs_free = false; 1182 if (merge_inst) { 1183 merge_inst->qpu.sig.thrsw = true; 1184 needs_free = true; 1185 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; 1186 } else { 1187 scoreboard->last_thrsw_tick = scoreboard->tick; 1188 insert_scheduled_instruction(c, block, scoreboard, inst); 1189 time++; 1190 slots_filled++; 1191 merge_inst = inst; 1192 } 1193 1194 /* Insert any extra delay slot NOPs we need. */ 1195 for (int i = 0; i < 3 - slots_filled; i++) { 1196 emit_nop(c, block, scoreboard); 1197 time++; 1198 } 1199 1200 /* If we're emitting the last THRSW (other than program end), then 1201 * signal that to the HW by emitting two THRSWs in a row. 1202 */ 1203 if (inst->is_last_thrsw) { 1204 struct qinst *second_inst = 1205 (struct qinst *)merge_inst->link.next; 1206 second_inst->qpu.sig.thrsw = true; 1207 } 1208 1209 /* If we put our THRSW into another instruction, free up the 1210 * instruction that didn't end up scheduled into the list. 1211 */ 1212 if (needs_free) 1213 free(inst); 1214 1215 return time; 1216} 1217 1218static uint32_t 1219schedule_instructions(struct v3d_compile *c, 1220 struct choose_scoreboard *scoreboard, 1221 struct qblock *block, 1222 struct list_head *schedule_list, 1223 enum quniform_contents *orig_uniform_contents, 1224 uint32_t *orig_uniform_data, 1225 uint32_t *next_uniform) 1226{ 1227 const struct v3d_device_info *devinfo = c->devinfo; 1228 uint32_t time = 0; 1229 1230 if (debug) { 1231 fprintf(stderr, "initial deps:\n"); 1232 dump_state(devinfo, schedule_list); 1233 fprintf(stderr, "\n"); 1234 } 1235 1236 /* Remove non-DAG heads from the list. */ 1237 list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) { 1238 if (n->parent_count != 0) 1239 list_del(&n->link); 1240 } 1241 1242 while (!list_empty(schedule_list)) { 1243 struct schedule_node *chosen = 1244 choose_instruction_to_schedule(devinfo, 1245 scoreboard, 1246 schedule_list, 1247 NULL); 1248 struct schedule_node *merge = NULL; 1249 1250 /* If there are no valid instructions to schedule, drop a NOP 1251 * in. 1252 */ 1253 struct qinst *qinst = chosen ? chosen->inst : vir_nop(); 1254 struct v3d_qpu_instr *inst = &qinst->qpu; 1255 1256 if (debug) { 1257 fprintf(stderr, "t=%4d: current list:\n", 1258 time); 1259 dump_state(devinfo, schedule_list); 1260 fprintf(stderr, "t=%4d: chose: ", time); 1261 v3d_qpu_dump(devinfo, inst); 1262 fprintf(stderr, "\n"); 1263 } 1264 1265 /* We can't mark_instruction_scheduled() the chosen inst until 1266 * we're done identifying instructions to merge, so put the 1267 * merged instructions on a list for a moment. 1268 */ 1269 struct list_head merged_list; 1270 list_inithead(&merged_list); 1271 1272 /* Schedule this instruction onto the QPU list. Also try to 1273 * find an instruction to pair with it. 
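 * Pairing fills the other half (add vs. mul) of the chosen instruction word,
 * provided qpu_merge_inst() can reconcile the signals and read addresses.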
1274 */ 1275 if (chosen) { 1276 time = MAX2(chosen->unblocked_time, time); 1277 list_del(&chosen->link); 1278 mark_instruction_scheduled(schedule_list, time, 1279 chosen, true); 1280 1281 while ((merge = 1282 choose_instruction_to_schedule(devinfo, 1283 scoreboard, 1284 schedule_list, 1285 chosen))) { 1286 time = MAX2(merge->unblocked_time, time); 1287 list_del(&merge->link); 1288 list_addtail(&merge->link, &merged_list); 1289 (void)qpu_merge_inst(devinfo, inst, 1290 inst, &merge->inst->qpu); 1291 if (merge->inst->uniform != -1) { 1292 chosen->inst->uniform = 1293 merge->inst->uniform; 1294 } 1295 1296 if (debug) { 1297 fprintf(stderr, "t=%4d: merging: ", 1298 time); 1299 v3d_qpu_dump(devinfo, &merge->inst->qpu); 1300 fprintf(stderr, "\n"); 1301 fprintf(stderr, " result: "); 1302 v3d_qpu_dump(devinfo, inst); 1303 fprintf(stderr, "\n"); 1304 } 1305 } 1306 } 1307 1308 /* Update the uniform index for the rewritten location -- 1309 * branch target updating will still need to change 1310 * c->uniform_data[] using this index. 1311 */ 1312 if (qinst->uniform != -1) { 1313 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) 1314 block->branch_uniform = *next_uniform; 1315 1316 c->uniform_data[*next_uniform] = 1317 orig_uniform_data[qinst->uniform]; 1318 c->uniform_contents[*next_uniform] = 1319 orig_uniform_contents[qinst->uniform]; 1320 qinst->uniform = *next_uniform; 1321 (*next_uniform)++; 1322 } 1323 1324 if (debug) { 1325 fprintf(stderr, "\n"); 1326 } 1327 1328 /* Now that we've scheduled a new instruction, some of its 1329 * children can be promoted to the list of instructions ready to 1330 * be scheduled. Update the children's unblocked time for this 1331 * DAG edge as we do so. 1332 */ 1333 mark_instruction_scheduled(schedule_list, time, chosen, false); 1334 list_for_each_entry(struct schedule_node, merge, &merged_list, 1335 link) { 1336 mark_instruction_scheduled(schedule_list, time, merge, 1337 false); 1338 1339 /* The merged VIR instruction doesn't get re-added to the 1340 * block, so free it now. 1341 */ 1342 free(merge->inst); 1343 } 1344 1345 if (inst->sig.thrsw) { 1346 time += emit_thrsw(c, block, scoreboard, qinst, false); 1347 } else { 1348 insert_scheduled_instruction(c, block, 1349 scoreboard, qinst); 1350 1351 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { 1352 block->branch_qpu_ip = c->qpu_inst_count - 1; 1353 /* Fill the delay slots. 1354 * 1355 * We should fill these with actual instructions, 1356 * instead, but that will probably need to be done 1357 * after this, once we know what the leading 1358 * instructions of the successors are (so we can 1359 * handle A/B register file write latency) 1360 */ 1361 for (int i = 0; i < 3; i++) 1362 emit_nop(c, block, scoreboard); 1363 } 1364 } 1365 } 1366 1367 return time; 1368} 1369 1370static uint32_t 1371qpu_schedule_instructions_block(struct v3d_compile *c, 1372 struct choose_scoreboard *scoreboard, 1373 struct qblock *block, 1374 enum quniform_contents *orig_uniform_contents, 1375 uint32_t *orig_uniform_data, 1376 uint32_t *next_uniform) 1377{ 1378 void *mem_ctx = ralloc_context(NULL); 1379 struct list_head schedule_list; 1380 1381 list_inithead(&schedule_list); 1382 1383 /* Wrap each instruction in a scheduler structure. 
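 * The qinsts are pulled off the block's instruction list here and get
 * re-added in scheduled order by insert_scheduled_instruction().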
*/ 1384 while (!list_empty(&block->instructions)) { 1385 struct qinst *qinst = (struct qinst *)block->instructions.next; 1386 struct schedule_node *n = 1387 rzalloc(mem_ctx, struct schedule_node); 1388 1389 n->inst = qinst; 1390 1391 list_del(&qinst->link); 1392 list_addtail(&n->link, &schedule_list); 1393 } 1394 1395 calculate_forward_deps(c, &schedule_list); 1396 calculate_reverse_deps(c, &schedule_list); 1397 1398 list_for_each_entry(struct schedule_node, n, &schedule_list, link) { 1399 compute_delay(n); 1400 } 1401 1402 uint32_t cycles = schedule_instructions(c, scoreboard, block, 1403 &schedule_list, 1404 orig_uniform_contents, 1405 orig_uniform_data, 1406 next_uniform); 1407 1408 ralloc_free(mem_ctx); 1409 1410 return cycles; 1411} 1412 1413static void 1414qpu_set_branch_targets(struct v3d_compile *c) 1415{ 1416 vir_for_each_block(block, c) { 1417 /* The end block of the program has no branch. */ 1418 if (!block->successors[0]) 1419 continue; 1420 1421 /* If there was no branch instruction, then the successor 1422 * block must follow immediately after this one. 1423 */ 1424 if (block->branch_qpu_ip == ~0) { 1425 assert(block->end_qpu_ip + 1 == 1426 block->successors[0]->start_qpu_ip); 1427 continue; 1428 } 1429 1430 /* Walk back through the delay slots to find the branch 1431 * instr. 1432 */ 1433 struct list_head *entry = block->instructions.prev; 1434 for (int i = 0; i < 3; i++) 1435 entry = entry->prev; 1436 struct qinst *branch = container_of(entry, branch, link); 1437 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); 1438 1439 /* Make sure that the if-we-don't-jump 1440 * successor was scheduled just after the 1441 * delay slots. 1442 */ 1443 assert(!block->successors[1] || 1444 block->successors[1]->start_qpu_ip == 1445 block->branch_qpu_ip + 4); 1446 1447 branch->qpu.branch.offset = 1448 ((block->successors[0]->start_qpu_ip - 1449 (block->branch_qpu_ip + 4)) * 1450 sizeof(uint64_t)); 1451 1452 /* Set up the relative offset to jump in the 1453 * uniform stream. 1454 * 1455 * Use a temporary here, because 1456 * uniform_data[inst->uniform] may be shared 1457 * between multiple instructions. 1458 */ 1459 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); 1460 c->uniform_data[branch->uniform] = 1461 (block->successors[0]->start_uniform - 1462 (block->branch_uniform + 1)) * 4; 1463 } 1464} 1465 1466uint32_t 1467v3d_qpu_schedule_instructions(struct v3d_compile *c) 1468{ 1469 const struct v3d_device_info *devinfo = c->devinfo; 1470 struct qblock *end_block = list_last_entry(&c->blocks, 1471 struct qblock, link); 1472 1473 /* We reorder the uniforms as we schedule instructions, so save the 1474 * old data off and replace it. 
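 * schedule_instructions() copies each referenced uniform into the new arrays
 * at *next_uniform as instructions are emitted, so the uniform stream ends up
 * matching the new instruction order.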
1475 */ 1476 uint32_t *uniform_data = c->uniform_data; 1477 enum quniform_contents *uniform_contents = c->uniform_contents; 1478 c->uniform_contents = ralloc_array(c, enum quniform_contents, 1479 c->num_uniforms); 1480 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); 1481 c->uniform_array_size = c->num_uniforms; 1482 uint32_t next_uniform = 0; 1483 1484 struct choose_scoreboard scoreboard; 1485 memset(&scoreboard, 0, sizeof(scoreboard)); 1486 scoreboard.last_ldvary_tick = -10; 1487 scoreboard.last_magic_sfu_write_tick = -10; 1488 scoreboard.last_uniforms_reset_tick = -10; 1489 scoreboard.last_thrsw_tick = -10; 1490 1491 if (debug) { 1492 fprintf(stderr, "Pre-schedule instructions\n"); 1493 vir_for_each_block(block, c) { 1494 fprintf(stderr, "BLOCK %d\n", block->index); 1495 list_for_each_entry(struct qinst, qinst, 1496 &block->instructions, link) { 1497 v3d_qpu_dump(devinfo, &qinst->qpu); 1498 fprintf(stderr, "\n"); 1499 } 1500 } 1501 fprintf(stderr, "\n"); 1502 } 1503 1504 uint32_t cycles = 0; 1505 vir_for_each_block(block, c) { 1506 block->start_qpu_ip = c->qpu_inst_count; 1507 block->branch_qpu_ip = ~0; 1508 block->start_uniform = next_uniform; 1509 1510 cycles += qpu_schedule_instructions_block(c, 1511 &scoreboard, 1512 block, 1513 uniform_contents, 1514 uniform_data, 1515 &next_uniform); 1516 1517 block->end_qpu_ip = c->qpu_inst_count - 1; 1518 } 1519 1520 /* Emit the program-end THRSW instruction. */; 1521 struct qinst *thrsw = vir_nop(); 1522 thrsw->qpu.sig.thrsw = true; 1523 emit_thrsw(c, end_block, &scoreboard, thrsw, true); 1524 1525 qpu_set_branch_targets(c); 1526 1527 assert(next_uniform == c->num_uniforms); 1528 1529 return cycles; 1530} 1531