1/* 2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Rob Clark <robclark@freedesktop.org> 25 */ 26 27#include "util/dag.h" 28#include "util/u_math.h" 29 30#include "ir3.h" 31#include "ir3_compiler.h" 32 33#ifdef DEBUG 34#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS) 35#else 36#define SCHED_DEBUG 0 37#endif 38#define d(fmt, ...) \ 39 do { \ 40 if (SCHED_DEBUG) { \ 41 mesa_logi("SCHED: " fmt, ##__VA_ARGS__); \ 42 } \ 43 } while (0) 44 45#define di(instr, fmt, ...) \ 46 do { \ 47 if (SCHED_DEBUG) { \ 48 struct log_stream *stream = mesa_log_streami(); \ 49 mesa_log_stream_printf(stream, "SCHED: " fmt ": ", ##__VA_ARGS__); \ 50 ir3_print_instr_stream(stream, instr); \ 51 mesa_log_stream_destroy(stream); \ 52 } \ 53 } while (0) 54 55/* 56 * Instruction Scheduling: 57 * 58 * A block-level pre-RA scheduler, which works by creating a DAG of 59 * instruction dependencies, and heuristically picking a DAG head 60 * (instruction with no unscheduled dependencies). 61 * 62 * Where possible, it tries to pick instructions that avoid nop delay 63 * slots, but it will prefer to pick instructions that reduce (or do 64 * not increase) the number of live values. 65 * 66 * If the only possible choices are instructions that increase the 67 * number of live values, it will try to pick the one with the earliest 68 * consumer (based on pre-sched program order). 69 * 70 * There are a few special cases that need to be handled, since sched 71 * is currently independent of register allocation. Usages of address 72 * register (a0.x) or predicate register (p0.x) must be serialized. Ie. 73 * if you have two pairs of instructions that write the same special 74 * register and then read it, then those pairs cannot be interleaved. 75 * To solve this, when we are in such a scheduling "critical section", 76 * and we encounter a conflicting write to a special register, we try 77 * to schedule any remaining instructions that use that value first. 78 * 79 * TODO we can detect too-large live_values here.. would be a good place 80 * to "spill" cheap things, like move from uniform/immed. (Constructing 81 * list of ssa def consumers before sched pass would make this easier. 82 * Also, in general it is general it might be best not to re-use load_immed 83 * across blocks. 84 * 85 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce 86 * the # of immediates in play (or at least that would help with 87 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably 88 * do this in a nir pass that inserts fneg/etc? The cp pass should fold 89 * these into src modifiers.. 90 */ 91 92struct ir3_sched_ctx { 93 struct ir3_block *block; /* the current block */ 94 struct dag *dag; 95 96 struct list_head unscheduled_list; /* unscheduled instructions */ 97 struct ir3_instruction *scheduled; /* last scheduled instr */ 98 struct ir3_instruction *addr0; /* current a0.x user, if any */ 99 struct ir3_instruction *addr1; /* current a1.x user, if any */ 100 struct ir3_instruction *pred; /* current p0.x user, if any */ 101 102 struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */ 103 104 int remaining_kills; 105 int remaining_tex; 106 107 bool error; 108 109 int sfu_delay; 110 int tex_delay; 111 112 /* We order the scheduled tex/SFU instructions, and keep track of the 113 * index of the last waited on instruction, so we can know which 114 * instructions are still outstanding (and therefore would require us to 115 * wait for all outstanding instructions before scheduling a use). 116 */ 117 int tex_index, first_outstanding_tex_index; 118 int sfu_index, first_outstanding_sfu_index; 119}; 120 121struct ir3_sched_node { 122 struct dag_node dag; /* must be first for util_dynarray_foreach */ 123 struct ir3_instruction *instr; 124 125 unsigned delay; 126 unsigned max_delay; 127 128 unsigned tex_index; 129 unsigned sfu_index; 130 131 /* For instructions that are a meta:collect src, once we schedule 132 * the first src of the collect, the entire vecN is live (at least 133 * from the PoV of the first RA pass.. the 2nd scalar pass can fill 134 * in some of the gaps, but often not all). So we want to help out 135 * RA, and realize that as soon as we schedule the first collect 136 * src, there is no penalty to schedule the remainder (ie. they 137 * don't make additional values live). In fact we'd prefer to 138 * schedule the rest ASAP to minimize the live range of the vecN. 139 * 140 * For instructions that are the src of a collect, we track the 141 * corresponding collect, and mark them as partially live as soon 142 * as any one of the src's is scheduled. 143 */ 144 struct ir3_instruction *collect; 145 bool partially_live; 146 147 /* Is this instruction a direct or indirect dependency for a kill? 148 * If so, we should prioritize it when possible 149 */ 150 bool kill_path; 151 152 /* This node represents a shader output. A semi-common pattern in 153 * shaders is something along the lines of: 154 * 155 * fragcolor.w = 1.0 156 * 157 * Which we'd prefer to schedule as late as possible, since it 158 * produces a live value that is never killed/consumed. So detect 159 * outputs up-front, and avoid scheduling them unless the reduce 160 * register pressure (or at least are neutral) 161 */ 162 bool output; 163}; 164 165#define foreach_sched_node(__n, __list) \ 166 list_for_each_entry (struct ir3_sched_node, __n, __list, dag.link) 167 168static void sched_node_init(struct ir3_sched_ctx *ctx, 169 struct ir3_instruction *instr); 170static void sched_node_add_dep(struct ir3_instruction *instr, 171 struct ir3_instruction *src, int i); 172 173static bool 174is_scheduled(struct ir3_instruction *instr) 175{ 176 return !!(instr->flags & IR3_INSTR_MARK); 177} 178 179/* check_src_cond() passing a ir3_sched_ctx. */ 180static bool 181sched_check_src_cond(struct ir3_instruction *instr, 182 bool (*cond)(struct ir3_instruction *, 183 struct ir3_sched_ctx *), 184 struct ir3_sched_ctx *ctx) 185{ 186 foreach_ssa_src (src, instr) { 187 /* meta:split/collect aren't real instructions, the thing that 188 * we actually care about is *their* srcs 189 */ 190 if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) { 191 if (sched_check_src_cond(src, cond, ctx)) 192 return true; 193 } else { 194 if (cond(src, ctx)) 195 return true; 196 } 197 } 198 199 return false; 200} 201 202/* Is this a prefetch or tex that hasn't been waited on yet? */ 203 204static bool 205is_outstanding_tex_or_prefetch(struct ir3_instruction *instr, 206 struct ir3_sched_ctx *ctx) 207{ 208 if (!is_tex_or_prefetch(instr)) 209 return false; 210 211 /* The sched node is only valid within the same block, we cannot 212 * really say anything about src's from other blocks 213 */ 214 if (instr->block != ctx->block) 215 return true; 216 217 struct ir3_sched_node *n = instr->data; 218 return n->tex_index >= ctx->first_outstanding_tex_index; 219} 220 221static bool 222is_outstanding_sfu(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) 223{ 224 if (!is_sfu(instr)) 225 return false; 226 227 /* The sched node is only valid within the same block, we cannot 228 * really say anything about src's from other blocks 229 */ 230 if (instr->block != ctx->block) 231 return true; 232 233 struct ir3_sched_node *n = instr->data; 234 return n->sfu_index >= ctx->first_outstanding_sfu_index; 235} 236 237static unsigned 238cycle_count(struct ir3_instruction *instr) 239{ 240 if (instr->opc == OPC_META_COLLECT) { 241 /* Assume that only immed/const sources produce moves */ 242 unsigned n = 0; 243 foreach_src (src, instr) { 244 if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) 245 n++; 246 } 247 return n; 248 } else if (is_meta(instr)) { 249 return 0; 250 } else { 251 return 1; 252 } 253} 254 255static void 256schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) 257{ 258 debug_assert(ctx->block == instr->block); 259 260 /* remove from depth list: 261 */ 262 list_delinit(&instr->node); 263 264 if (writes_addr0(instr)) { 265 debug_assert(ctx->addr0 == NULL); 266 ctx->addr0 = instr; 267 } 268 269 if (writes_addr1(instr)) { 270 debug_assert(ctx->addr1 == NULL); 271 ctx->addr1 = instr; 272 } 273 274 if (writes_pred(instr)) { 275 debug_assert(ctx->pred == NULL); 276 ctx->pred = instr; 277 } 278 279 instr->flags |= IR3_INSTR_MARK; 280 281 di(instr, "schedule"); 282 283 list_addtail(&instr->node, &instr->block->instr_list); 284 ctx->scheduled = instr; 285 286 if (is_kill_or_demote(instr)) { 287 assert(ctx->remaining_kills > 0); 288 ctx->remaining_kills--; 289 } 290 291 struct ir3_sched_node *n = instr->data; 292 293 /* If this instruction is a meta:collect src, mark the remaining 294 * collect srcs as partially live. 295 */ 296 if (n->collect) { 297 foreach_ssa_src (src, n->collect) { 298 if (src->block != instr->block) 299 continue; 300 struct ir3_sched_node *sn = src->data; 301 sn->partially_live = true; 302 } 303 } 304 305 dag_prune_head(ctx->dag, &n->dag); 306 307 unsigned cycles = cycle_count(instr); 308 309 if (is_sfu(instr)) { 310 ctx->sfu_delay = 8; 311 n->sfu_index = ctx->sfu_index++; 312 } else if (!is_meta(instr) && 313 sched_check_src_cond(instr, is_outstanding_sfu, ctx)) { 314 ctx->sfu_delay = 0; 315 ctx->first_outstanding_sfu_index = ctx->sfu_index; 316 } else if (ctx->sfu_delay > 0) { 317 ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay); 318 } 319 320 if (is_tex_or_prefetch(instr)) { 321 /* NOTE that this isn't an attempt to hide texture fetch latency, 322 * but an attempt to hide the cost of switching to another warp. 323 * If we can, we'd like to try to schedule another texture fetch 324 * before scheduling something that would sync. 325 */ 326 ctx->tex_delay = 10; 327 assert(ctx->remaining_tex > 0); 328 ctx->remaining_tex--; 329 n->tex_index = ctx->tex_index++; 330 } else if (!is_meta(instr) && 331 sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, 332 ctx)) { 333 ctx->tex_delay = 0; 334 ctx->first_outstanding_tex_index = ctx->tex_index; 335 } else if (ctx->tex_delay > 0) { 336 ctx->tex_delay -= MIN2(cycles, ctx->tex_delay); 337 } 338} 339 340struct ir3_sched_notes { 341 /* there is at least one kill which could be scheduled, except 342 * for unscheduled bary.f's: 343 */ 344 bool blocked_kill; 345 /* there is at least one instruction that could be scheduled, 346 * except for conflicting address/predicate register usage: 347 */ 348 bool addr0_conflict, addr1_conflict, pred_conflict; 349}; 350 351/* could an instruction be scheduled if specified ssa src was scheduled? */ 352static bool 353could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) 354{ 355 foreach_ssa_src (other_src, instr) { 356 /* if dependency not scheduled, we aren't ready yet: */ 357 if ((src != other_src) && !is_scheduled(other_src)) { 358 return false; 359 } 360 } 361 return true; 362} 363 364/* Check if instruction is ok to schedule. Make sure it is not blocked 365 * by use of addr/predicate register, etc. 366 */ 367static bool 368check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, 369 struct ir3_instruction *instr) 370{ 371 debug_assert(!is_scheduled(instr)); 372 373 if (instr == ctx->split) { 374 /* Don't schedule instructions created by splitting a a0.x/a1.x/p0.x 375 * write until another "normal" instruction has been scheduled. 376 */ 377 return false; 378 } 379 380 if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) { 381 /* avoid texture/memory access if we have unscheduled kills 382 * that could make the expensive operation unnecessary. By 383 * definition, if there are remaining kills, and this instr 384 * is not a dependency of a kill, there are other instructions 385 * that we can choose from. 386 */ 387 struct ir3_sched_node *n = instr->data; 388 if (!n->kill_path) 389 return false; 390 } 391 392 /* For instructions that write address register we need to 393 * make sure there is at least one instruction that uses the 394 * addr value which is otherwise ready. 395 * 396 * NOTE if any instructions use pred register and have other 397 * src args, we would need to do the same for writes_pred().. 398 */ 399 if (writes_addr0(instr)) { 400 struct ir3 *ir = instr->block->shader; 401 bool ready = false; 402 for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) { 403 struct ir3_instruction *indirect = ir->a0_users[i]; 404 if (!indirect) 405 continue; 406 if (indirect->address->def != instr->dsts[0]) 407 continue; 408 ready = could_sched(indirect, instr); 409 } 410 411 /* nothing could be scheduled, so keep looking: */ 412 if (!ready) 413 return false; 414 } 415 416 if (writes_addr1(instr)) { 417 struct ir3 *ir = instr->block->shader; 418 bool ready = false; 419 for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) { 420 struct ir3_instruction *indirect = ir->a1_users[i]; 421 if (!indirect) 422 continue; 423 if (indirect->address->def != instr->dsts[0]) 424 continue; 425 ready = could_sched(indirect, instr); 426 } 427 428 /* nothing could be scheduled, so keep looking: */ 429 if (!ready) 430 return false; 431 } 432 433 /* if this is a write to address/predicate register, and that 434 * register is currently in use, we need to defer until it is 435 * free: 436 */ 437 if (writes_addr0(instr) && ctx->addr0) { 438 debug_assert(ctx->addr0 != instr); 439 notes->addr0_conflict = true; 440 return false; 441 } 442 443 if (writes_addr1(instr) && ctx->addr1) { 444 debug_assert(ctx->addr1 != instr); 445 notes->addr1_conflict = true; 446 return false; 447 } 448 449 if (writes_pred(instr) && ctx->pred) { 450 debug_assert(ctx->pred != instr); 451 notes->pred_conflict = true; 452 return false; 453 } 454 455 /* if the instruction is a kill, we need to ensure *every* 456 * bary.f is scheduled. The hw seems unhappy if the thread 457 * gets killed before the end-input (ei) flag is hit. 458 * 459 * We could do this by adding each bary.f instruction as 460 * virtual ssa src for the kill instruction. But we have 461 * fixed length instr->srcs[]. 462 * 463 * TODO we could handle this by false-deps now, probably. 464 */ 465 if (is_kill_or_demote(instr)) { 466 struct ir3 *ir = instr->block->shader; 467 468 for (unsigned i = 0; i < ir->baryfs_count; i++) { 469 struct ir3_instruction *baryf = ir->baryfs[i]; 470 if (baryf->flags & IR3_INSTR_UNUSED) 471 continue; 472 if (!is_scheduled(baryf)) { 473 notes->blocked_kill = true; 474 return false; 475 } 476 } 477 } 478 479 return true; 480} 481 482/* Find the instr->ip of the closest use of an instruction, in 483 * pre-sched order. This isn't going to be the same as post-sched 484 * order, but it is a reasonable approximation to limit scheduling 485 * instructions *too* early. This is mostly to prevent bad behavior 486 * in cases where we have a large number of possible instructions 487 * to choose, to avoid creating too much parallelism (ie. blowing 488 * up register pressure) 489 * 490 * See 491 * dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread 492 */ 493static int 494nearest_use(struct ir3_instruction *instr) 495{ 496 unsigned nearest = ~0; 497 foreach_ssa_use (use, instr) 498 if (!is_scheduled(use)) 499 nearest = MIN2(nearest, use->ip); 500 501 /* slight hack.. this heuristic tends to push bary.f's to later 502 * in the shader, closer to their uses. But we actually would 503 * prefer to get these scheduled earlier, to unlock varying 504 * storage for more VS jobs: 505 */ 506 if (is_input(instr)) 507 nearest /= 2; 508 509 return nearest; 510} 511 512static bool 513is_only_nonscheduled_use(struct ir3_instruction *instr, 514 struct ir3_instruction *use) 515{ 516 foreach_ssa_use (other_use, instr) { 517 if (other_use != use && !is_scheduled(other_use)) 518 return false; 519 } 520 521 return true; 522} 523 524/* find net change to live values if instruction were scheduled: */ 525static int 526live_effect(struct ir3_instruction *instr) 527{ 528 struct ir3_sched_node *n = instr->data; 529 int new_live = 530 (n->partially_live || !instr->uses || instr->uses->entries == 0) 531 ? 0 532 : dest_regs(instr); 533 int freed_live = 0; 534 535 /* if we schedule something that causes a vecN to be live, 536 * then count all it's other components too: 537 */ 538 if (n->collect) 539 new_live *= n->collect->srcs_count; 540 541 foreach_ssa_src_n (src, n, instr) { 542 if (__is_false_dep(instr, n)) 543 continue; 544 545 if (instr->block != src->block) 546 continue; 547 548 if (is_only_nonscheduled_use(src, instr)) 549 freed_live += dest_regs(src); 550 } 551 552 return new_live - freed_live; 553} 554 555/* Determine if this is an instruction that we'd prefer not to schedule 556 * yet, in order to avoid an (ss)/(sy) sync. This is limited by the 557 * sfu_delay/tex_delay counters, ie. the more cycles it has been since 558 * the last SFU/tex, the less costly a sync would be, and the number of 559 * outstanding SFU/tex instructions to prevent a blowup in register pressure. 560 */ 561static bool 562should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) 563{ 564 if (ctx->sfu_delay) { 565 if (sched_check_src_cond(instr, is_outstanding_sfu, ctx)) 566 return true; 567 } 568 569 /* We mostly just want to try to schedule another texture fetch 570 * before scheduling something that would (sy) sync, so we can 571 * limit this rule to cases where there are remaining texture 572 * fetches 573 */ 574 if (ctx->tex_delay && ctx->remaining_tex) { 575 if (sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx)) 576 return true; 577 } 578 579 /* Avoid scheduling too many outstanding texture or sfu instructions at 580 * once by deferring further tex/SFU instructions. This both prevents 581 * stalls when the queue of texture/sfu instructions becomes too large, 582 * and prevents unacceptably large increases in register pressure from too 583 * many outstanding texture instructions. 584 */ 585 if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr)) 586 return true; 587 588 if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr)) 589 return true; 590 591 return false; 592} 593 594static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx, 595 struct ir3_sched_notes *notes, 596 bool defer, bool avoid_output); 597 598enum choose_instr_dec_rank { 599 DEC_NEUTRAL, 600 DEC_NEUTRAL_READY, 601 DEC_FREED, 602 DEC_FREED_READY, 603}; 604 605static const char * 606dec_rank_name(enum choose_instr_dec_rank rank) 607{ 608 switch (rank) { 609 case DEC_NEUTRAL: 610 return "neutral"; 611 case DEC_NEUTRAL_READY: 612 return "neutral+ready"; 613 case DEC_FREED: 614 return "freed"; 615 case DEC_FREED_READY: 616 return "freed+ready"; 617 default: 618 return NULL; 619 } 620} 621 622/** 623 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code 624 * Scheduling for Register pressure) heuristic. 625 * 626 * Only handles the case of choosing instructions that reduce register pressure 627 * or are even. 628 */ 629static struct ir3_sched_node * 630choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, 631 bool defer) 632{ 633 const char *mode = defer ? "-d" : ""; 634 struct ir3_sched_node *chosen = NULL; 635 enum choose_instr_dec_rank chosen_rank = DEC_NEUTRAL; 636 637 foreach_sched_node (n, &ctx->dag->heads) { 638 if (defer && should_defer(ctx, n->instr)) 639 continue; 640 641 /* Note: mergedregs is only used post-RA, just set it to false */ 642 unsigned d = ir3_delay_calc_prera(ctx->block, n->instr); 643 644 int live = live_effect(n->instr); 645 if (live > 0) 646 continue; 647 648 if (!check_instr(ctx, notes, n->instr)) 649 continue; 650 651 enum choose_instr_dec_rank rank; 652 if (live < 0) { 653 /* Prioritize instrs which free up regs and can be scheduled with no 654 * delay. 655 */ 656 if (d == 0) 657 rank = DEC_FREED_READY; 658 else 659 rank = DEC_FREED; 660 } else { 661 /* Contra the paper, pick a leader with no effect on used regs. This 662 * may open up new opportunities, as otherwise a single-operand instr 663 * consuming a value will tend to block finding freeing that value. 664 * This had a massive effect on reducing spilling on V3D. 665 * 666 * XXX: Should this prioritize ready? 667 */ 668 if (d == 0) 669 rank = DEC_NEUTRAL_READY; 670 else 671 rank = DEC_NEUTRAL; 672 } 673 674 /* Prefer higher-ranked instructions, or in the case of a rank tie, the 675 * highest latency-to-end-of-program instruction. 676 */ 677 if (!chosen || rank > chosen_rank || 678 (rank == chosen_rank && chosen->max_delay < n->max_delay)) { 679 chosen = n; 680 chosen_rank = rank; 681 } 682 } 683 684 if (chosen) { 685 di(chosen->instr, "dec%s: chose (%s)", mode, dec_rank_name(chosen_rank)); 686 return chosen; 687 } 688 689 return choose_instr_inc(ctx, notes, defer, true); 690} 691 692enum choose_instr_inc_rank { 693 INC_DISTANCE, 694 INC_DISTANCE_READY, 695}; 696 697static const char * 698inc_rank_name(enum choose_instr_inc_rank rank) 699{ 700 switch (rank) { 701 case INC_DISTANCE: 702 return "distance"; 703 case INC_DISTANCE_READY: 704 return "distance+ready"; 705 default: 706 return NULL; 707 } 708} 709 710/** 711 * When we can't choose an instruction that reduces register pressure or 712 * is neutral, we end up here to try and pick the least bad option. 713 */ 714static struct ir3_sched_node * 715choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, 716 bool defer, bool avoid_output) 717{ 718 const char *mode = defer ? "-d" : ""; 719 struct ir3_sched_node *chosen = NULL; 720 enum choose_instr_inc_rank chosen_rank = INC_DISTANCE; 721 722 /* 723 * From hear on out, we are picking something that increases 724 * register pressure. So try to pick something which will 725 * be consumed soon: 726 */ 727 unsigned chosen_distance = 0; 728 729 /* Pick the max delay of the remaining ready set. */ 730 foreach_sched_node (n, &ctx->dag->heads) { 731 if (avoid_output && n->output) 732 continue; 733 734 if (defer && should_defer(ctx, n->instr)) 735 continue; 736 737 if (!check_instr(ctx, notes, n->instr)) 738 continue; 739 740 unsigned d = ir3_delay_calc_prera(ctx->block, n->instr); 741 742 enum choose_instr_inc_rank rank; 743 if (d == 0) 744 rank = INC_DISTANCE_READY; 745 else 746 rank = INC_DISTANCE; 747 748 unsigned distance = nearest_use(n->instr); 749 750 if (!chosen || rank > chosen_rank || 751 (rank == chosen_rank && distance < chosen_distance)) { 752 chosen = n; 753 chosen_distance = distance; 754 chosen_rank = rank; 755 } 756 } 757 758 if (chosen) { 759 di(chosen->instr, "inc%s: chose (%s)", mode, inc_rank_name(chosen_rank)); 760 return chosen; 761 } 762 763 return NULL; 764} 765 766/* Handles instruction selections for instructions we want to prioritize 767 * even if csp/csr would not pick them. 768 */ 769static struct ir3_sched_node * 770choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) 771{ 772 struct ir3_sched_node *chosen = NULL; 773 774 foreach_sched_node (n, &ctx->dag->heads) { 775 /* 776 * - phi nodes and inputs must be scheduled first 777 * - split should be scheduled first, so that the vector value is 778 * killed as soon as possible. RA cannot split up the vector and 779 * reuse components that have been killed until it's been killed. 780 * - collect, on the other hand, should be treated as a "normal" 781 * instruction, and may add to register pressure if its sources are 782 * part of another vector or immediates. 783 */ 784 if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT) 785 continue; 786 787 if (!chosen || (chosen->max_delay < n->max_delay)) 788 chosen = n; 789 } 790 791 if (chosen) { 792 di(chosen->instr, "prio: chose (meta)"); 793 return chosen; 794 } 795 796 return NULL; 797} 798 799static void 800dump_state(struct ir3_sched_ctx *ctx) 801{ 802 if (!SCHED_DEBUG) 803 return; 804 805 foreach_sched_node (n, &ctx->dag->heads) { 806 di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay, 807 live_effect(n->instr), ir3_delay_calc_prera(ctx->block, n->instr)); 808 809 util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { 810 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child; 811 812 di(child->instr, " -> (%d parents) ", child->dag.parent_count); 813 } 814 } 815} 816 817/* find instruction to schedule: */ 818static struct ir3_instruction * 819choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes) 820{ 821 struct ir3_sched_node *chosen; 822 823 dump_state(ctx); 824 825 chosen = choose_instr_prio(ctx, notes); 826 if (chosen) 827 return chosen->instr; 828 829 chosen = choose_instr_dec(ctx, notes, true); 830 if (chosen) 831 return chosen->instr; 832 833 chosen = choose_instr_dec(ctx, notes, false); 834 if (chosen) 835 return chosen->instr; 836 837 chosen = choose_instr_inc(ctx, notes, false, false); 838 if (chosen) 839 return chosen->instr; 840 841 return NULL; 842} 843 844static struct ir3_instruction * 845split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr) 846{ 847 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr); 848 di(new_instr, "split instruction"); 849 sched_node_init(ctx, new_instr); 850 return new_instr; 851} 852 853/* "spill" the address registers by remapping any unscheduled 854 * instructions which depend on the current address register 855 * to a clone of the instruction which wrote the address reg. 856 */ 857static struct ir3_instruction * 858split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr, 859 struct ir3_instruction **users, unsigned users_count) 860{ 861 struct ir3_instruction *new_addr = NULL; 862 unsigned i; 863 864 debug_assert(*addr); 865 866 for (i = 0; i < users_count; i++) { 867 struct ir3_instruction *indirect = users[i]; 868 869 if (!indirect) 870 continue; 871 872 /* skip instructions already scheduled: */ 873 if (is_scheduled(indirect)) 874 continue; 875 876 /* remap remaining instructions using current addr 877 * to new addr: 878 */ 879 if (indirect->address->def == (*addr)->dsts[0]) { 880 if (!new_addr) { 881 new_addr = split_instr(ctx, *addr); 882 /* original addr is scheduled, but new one isn't: */ 883 new_addr->flags &= ~IR3_INSTR_MARK; 884 } 885 indirect->address->def = new_addr->dsts[0]; 886 /* don't need to remove old dag edge since old addr is 887 * already scheduled: 888 */ 889 sched_node_add_dep(indirect, new_addr, 0); 890 di(indirect, "new address"); 891 } 892 } 893 894 /* all remaining indirects remapped to new addr: */ 895 *addr = NULL; 896 897 return new_addr; 898} 899 900/* "spill" the predicate register by remapping any unscheduled 901 * instructions which depend on the current predicate register 902 * to a clone of the instruction which wrote the address reg. 903 */ 904static struct ir3_instruction * 905split_pred(struct ir3_sched_ctx *ctx) 906{ 907 struct ir3 *ir; 908 struct ir3_instruction *new_pred = NULL; 909 unsigned i; 910 911 debug_assert(ctx->pred); 912 913 ir = ctx->pred->block->shader; 914 915 for (i = 0; i < ir->predicates_count; i++) { 916 struct ir3_instruction *predicated = ir->predicates[i]; 917 918 if (!predicated) 919 continue; 920 921 /* skip instructions already scheduled: */ 922 if (is_scheduled(predicated)) 923 continue; 924 925 /* remap remaining instructions using current pred 926 * to new pred: 927 * 928 * TODO is there ever a case when pred isn't first 929 * (and only) src? 930 */ 931 if (ssa(predicated->srcs[0]) == ctx->pred) { 932 if (!new_pred) { 933 new_pred = split_instr(ctx, ctx->pred); 934 /* original pred is scheduled, but new one isn't: */ 935 new_pred->flags &= ~IR3_INSTR_MARK; 936 } 937 predicated->srcs[0]->instr = new_pred; 938 /* don't need to remove old dag edge since old pred is 939 * already scheduled: 940 */ 941 sched_node_add_dep(predicated, new_pred, 0); 942 di(predicated, "new predicate"); 943 } 944 } 945 946 if (ctx->block->condition == ctx->pred) { 947 if (!new_pred) { 948 new_pred = split_instr(ctx, ctx->pred); 949 /* original pred is scheduled, but new one isn't: */ 950 new_pred->flags &= ~IR3_INSTR_MARK; 951 } 952 ctx->block->condition = new_pred; 953 d("new branch condition"); 954 } 955 956 /* all remaining predicated remapped to new pred: */ 957 ctx->pred = NULL; 958 959 return new_pred; 960} 961 962static void 963sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) 964{ 965 struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node); 966 967 dag_init_node(ctx->dag, &n->dag); 968 969 n->instr = instr; 970 instr->data = n; 971} 972 973static void 974sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, 975 int i) 976{ 977 /* don't consider dependencies in other blocks: */ 978 if (src->block != instr->block) 979 return; 980 981 /* we could have false-dep's that end up unused: */ 982 if (src->flags & IR3_INSTR_UNUSED) { 983 debug_assert(__is_false_dep(instr, i)); 984 return; 985 } 986 987 struct ir3_sched_node *n = instr->data; 988 struct ir3_sched_node *sn = src->data; 989 990 /* If src is consumed by a collect, track that to realize that once 991 * any of the collect srcs are live, we should hurry up and schedule 992 * the rest. 993 */ 994 if (instr->opc == OPC_META_COLLECT) 995 sn->collect = instr; 996 997 dag_add_edge(&sn->dag, &n->dag, NULL); 998 999 unsigned d = ir3_delayslots(src, instr, i, true); 1000 1001 n->delay = MAX2(n->delay, d); 1002} 1003 1004static void 1005mark_kill_path(struct ir3_instruction *instr) 1006{ 1007 struct ir3_sched_node *n = instr->data; 1008 1009 if (n->kill_path) { 1010 return; 1011 } 1012 1013 n->kill_path = true; 1014 1015 foreach_ssa_src (src, instr) { 1016 if (src->block != instr->block) 1017 continue; 1018 mark_kill_path(src); 1019 } 1020} 1021 1022/* Is it an output? */ 1023static bool 1024is_output_collect(struct ir3_instruction *instr) 1025{ 1026 if (instr->opc != OPC_META_COLLECT) 1027 return false; 1028 1029 foreach_ssa_use (use, instr) { 1030 if (use->opc != OPC_END && use->opc != OPC_CHMASK) 1031 return false; 1032 } 1033 1034 return true; 1035} 1036 1037/* Is it's only use as output? */ 1038static bool 1039is_output_only(struct ir3_instruction *instr) 1040{ 1041 if (!writes_gpr(instr)) 1042 return false; 1043 1044 if (!(instr->dsts[0]->flags & IR3_REG_SSA)) 1045 return false; 1046 1047 foreach_ssa_use (use, instr) 1048 if (!is_output_collect(use)) 1049 return false; 1050 1051 return true; 1052} 1053 1054static void 1055sched_node_add_deps(struct ir3_instruction *instr) 1056{ 1057 /* There's nothing to do for phi nodes, since they always go first. And 1058 * phi nodes can reference sources later in the same block, so handling 1059 * sources is not only unnecessary but could cause problems. 1060 */ 1061 if (instr->opc == OPC_META_PHI) 1062 return; 1063 1064 /* Since foreach_ssa_src() already handles false-dep's we can construct 1065 * the DAG easily in a single pass. 1066 */ 1067 foreach_ssa_src_n (src, i, instr) { 1068 sched_node_add_dep(instr, src, i); 1069 } 1070 1071 /* NOTE that all inputs must be scheduled before a kill, so 1072 * mark these to be prioritized as well: 1073 */ 1074 if (is_kill_or_demote(instr) || is_input(instr)) { 1075 mark_kill_path(instr); 1076 } 1077 1078 if (is_output_only(instr)) { 1079 struct ir3_sched_node *n = instr->data; 1080 n->output = true; 1081 } 1082} 1083 1084static void 1085sched_dag_max_delay_cb(struct dag_node *node, void *state) 1086{ 1087 struct ir3_sched_node *n = (struct ir3_sched_node *)node; 1088 uint32_t max_delay = 0; 1089 1090 util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { 1091 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child; 1092 max_delay = MAX2(child->max_delay, max_delay); 1093 } 1094 1095 n->max_delay = MAX2(n->max_delay, max_delay + n->delay); 1096} 1097 1098static void 1099sched_dag_init(struct ir3_sched_ctx *ctx) 1100{ 1101 ctx->dag = dag_create(ctx); 1102 1103 foreach_instr (instr, &ctx->unscheduled_list) { 1104 sched_node_init(ctx, instr); 1105 sched_node_add_deps(instr); 1106 } 1107 1108 dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL); 1109} 1110 1111static void 1112sched_dag_destroy(struct ir3_sched_ctx *ctx) 1113{ 1114 ralloc_free(ctx->dag); 1115 ctx->dag = NULL; 1116} 1117 1118static void 1119sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) 1120{ 1121 ctx->block = block; 1122 1123 /* addr/pred writes are per-block: */ 1124 ctx->addr0 = NULL; 1125 ctx->addr1 = NULL; 1126 ctx->pred = NULL; 1127 ctx->tex_delay = 0; 1128 ctx->sfu_delay = 0; 1129 ctx->tex_index = ctx->first_outstanding_tex_index = 0; 1130 ctx->sfu_index = ctx->first_outstanding_sfu_index = 0; 1131 1132 /* move all instructions to the unscheduled list, and 1133 * empty the block's instruction list (to which we will 1134 * be inserting). 1135 */ 1136 list_replace(&block->instr_list, &ctx->unscheduled_list); 1137 list_inithead(&block->instr_list); 1138 1139 sched_dag_init(ctx); 1140 1141 ctx->remaining_kills = 0; 1142 ctx->remaining_tex = 0; 1143 foreach_instr_safe (instr, &ctx->unscheduled_list) { 1144 if (is_kill_or_demote(instr)) 1145 ctx->remaining_kills++; 1146 if (is_tex_or_prefetch(instr)) 1147 ctx->remaining_tex++; 1148 } 1149 1150 /* First schedule all meta:input and meta:phi instructions, followed by 1151 * tex-prefetch. We want all of the instructions that load values into 1152 * registers before the shader starts to go before any other instructions. 1153 * But in particular we want inputs to come before prefetches. This is 1154 * because a FS's bary_ij input may not actually be live in the shader, 1155 * but it should not be scheduled on top of any other input (but can be 1156 * overwritten by a tex prefetch) 1157 * 1158 * Note: Because the first block cannot have predecessors, meta:input and 1159 * meta:phi cannot exist in the same block. 1160 */ 1161 foreach_instr_safe (instr, &ctx->unscheduled_list) 1162 if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI) 1163 schedule(ctx, instr); 1164 1165 foreach_instr_safe (instr, &ctx->unscheduled_list) 1166 if (instr->opc == OPC_META_TEX_PREFETCH) 1167 schedule(ctx, instr); 1168 1169 while (!list_is_empty(&ctx->unscheduled_list)) { 1170 struct ir3_sched_notes notes = {0}; 1171 struct ir3_instruction *instr; 1172 1173 instr = choose_instr(ctx, ¬es); 1174 if (instr) { 1175 unsigned delay = ir3_delay_calc_prera(ctx->block, instr); 1176 d("delay=%u", delay); 1177 1178 /* and if we run out of instructions that can be scheduled, 1179 * then it is time for nop's: 1180 */ 1181 debug_assert(delay <= 6); 1182 while (delay > 0) { 1183 ir3_NOP(block); 1184 delay--; 1185 } 1186 1187 schedule(ctx, instr); 1188 1189 /* Since we've scheduled a "real" instruction, we can now 1190 * schedule any split instruction created by the scheduler again. 1191 */ 1192 ctx->split = NULL; 1193 } else { 1194 struct ir3_instruction *new_instr = NULL; 1195 struct ir3 *ir = block->shader; 1196 1197 /* nothing available to schedule.. if we are blocked on 1198 * address/predicate register conflict, then break the 1199 * deadlock by cloning the instruction that wrote that 1200 * reg: 1201 */ 1202 if (notes.addr0_conflict) { 1203 new_instr = 1204 split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count); 1205 } else if (notes.addr1_conflict) { 1206 new_instr = 1207 split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count); 1208 } else if (notes.pred_conflict) { 1209 new_instr = split_pred(ctx); 1210 } else { 1211 d("unscheduled_list:"); 1212 foreach_instr (instr, &ctx->unscheduled_list) 1213 di(instr, "unscheduled: "); 1214 debug_assert(0); 1215 ctx->error = true; 1216 return; 1217 } 1218 1219 if (new_instr) { 1220 list_delinit(&new_instr->node); 1221 list_addtail(&new_instr->node, &ctx->unscheduled_list); 1222 } 1223 1224 /* If we produced a new instruction, do not schedule it next to 1225 * guarantee progress. 1226 */ 1227 ctx->split = new_instr; 1228 } 1229 } 1230 1231 sched_dag_destroy(ctx); 1232} 1233 1234int 1235ir3_sched(struct ir3 *ir) 1236{ 1237 struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx); 1238 1239 foreach_block (block, &ir->block_list) { 1240 foreach_instr (instr, &block->instr_list) { 1241 instr->data = NULL; 1242 } 1243 } 1244 1245 ir3_count_instructions(ir); 1246 ir3_clear_mark(ir); 1247 ir3_find_ssa_uses(ir, ctx, false); 1248 1249 foreach_block (block, &ir->block_list) { 1250 sched_block(ctx, block); 1251 } 1252 1253 int ret = ctx->error ? -1 : 0; 1254 1255 ralloc_free(ctx); 1256 1257 return ret; 1258} 1259 1260static unsigned 1261get_array_id(struct ir3_instruction *instr) 1262{ 1263 /* The expectation is that there is only a single array 1264 * src or dst, ir3_cp should enforce this. 1265 */ 1266 1267 foreach_dst (dst, instr) 1268 if (dst->flags & IR3_REG_ARRAY) 1269 return dst->array.id; 1270 foreach_src (src, instr) 1271 if (src->flags & IR3_REG_ARRAY) 1272 return src->array.id; 1273 1274 unreachable("this was unexpected"); 1275} 1276 1277/* does instruction 'prior' need to be scheduled before 'instr'? */ 1278static bool 1279depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) 1280{ 1281 /* TODO for dependencies that are related to a specific object, ie 1282 * a specific SSBO/image/array, we could relax this constraint to 1283 * make accesses to unrelated objects not depend on each other (at 1284 * least as long as not declared coherent) 1285 */ 1286 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && 1287 prior->barrier_class) || 1288 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && 1289 instr->barrier_class)) 1290 return true; 1291 1292 if (instr->barrier_class & prior->barrier_conflict) { 1293 if (!(instr->barrier_class & 1294 ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) { 1295 /* if only array barrier, then we can further limit false-deps 1296 * by considering the array-id, ie reads/writes to different 1297 * arrays do not depend on each other (no aliasing) 1298 */ 1299 if (get_array_id(instr) != get_array_id(prior)) { 1300 return false; 1301 } 1302 } 1303 1304 return true; 1305 } 1306 1307 return false; 1308} 1309 1310static void 1311add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) 1312{ 1313 struct list_head *prev = instr->node.prev; 1314 struct list_head *next = instr->node.next; 1315 1316 /* add dependencies on previous instructions that must be scheduled 1317 * prior to the current instruction 1318 */ 1319 while (prev != &block->instr_list) { 1320 struct ir3_instruction *pi = 1321 LIST_ENTRY(struct ir3_instruction, prev, node); 1322 1323 prev = prev->prev; 1324 1325 if (is_meta(pi)) 1326 continue; 1327 1328 if (instr->barrier_class == pi->barrier_class) { 1329 ir3_instr_add_dep(instr, pi); 1330 break; 1331 } 1332 1333 if (depends_on(instr, pi)) 1334 ir3_instr_add_dep(instr, pi); 1335 } 1336 1337 /* add dependencies on this instruction to following instructions 1338 * that must be scheduled after the current instruction: 1339 */ 1340 while (next != &block->instr_list) { 1341 struct ir3_instruction *ni = 1342 LIST_ENTRY(struct ir3_instruction, next, node); 1343 1344 next = next->next; 1345 1346 if (is_meta(ni)) 1347 continue; 1348 1349 if (instr->barrier_class == ni->barrier_class) { 1350 ir3_instr_add_dep(ni, instr); 1351 break; 1352 } 1353 1354 if (depends_on(ni, instr)) 1355 ir3_instr_add_dep(ni, instr); 1356 } 1357} 1358 1359/* before scheduling a block, we need to add any necessary false-dependencies 1360 * to ensure that: 1361 * 1362 * (1) barriers are scheduled in the right order wrt instructions related 1363 * to the barrier 1364 * 1365 * (2) reads that come before a write actually get scheduled before the 1366 * write 1367 */ 1368bool 1369ir3_sched_add_deps(struct ir3 *ir) 1370{ 1371 bool progress = false; 1372 1373 foreach_block (block, &ir->block_list) { 1374 foreach_instr (instr, &block->instr_list) { 1375 if (instr->barrier_class) { 1376 add_barrier_deps(block, instr); 1377 progress = true; 1378 } 1379 } 1380 } 1381 1382 return progress; 1383} 1384