nir_schedule.c revision 7ec681f3
/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * build the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results, because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
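
/* A minimal usage sketch (illustrative, not part of the original file): the
 * option fields below are the ones this pass reads, but the concrete values
 * are made up -- see the comment above nir_schedule() at the bottom of this
 * file for how to pick a threshold:
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_FRAGMENT),
 *       .fallback = false,
 *    };
 *    nir_schedule(shader, &options);
 */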

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled that use the value.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have
    * been scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in each class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * In the forward direction we add @after as a child of @before, so that the
 * DAG heads are the instructions with no unscheduled dependencies and we can
 * emit the program from the top down.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, NULL);
   else
      dag_add_edge(&after->dag, &before->dag, NULL);
}

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}
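
/* An illustrative trace of the helpers above (not from the original source):
 * in a forward walk over "r = ...; x = r; y = r; r = ...", both reads get an
 * edge from the first write through add_read_dep(), and the second write gets
 * an edge from the first through add_write_dep(), which then replaces it as
 * the tracked last writer.  Write-after-read edges come from the reverse walk
 * in calculate_reverse_deps(), where add_dep() flips the edge direction, so
 * reads end up free to reorder only between the writes that surround them.
 */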

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we use to add a read_dep while handling
       *   nir_instr_type_tex.
       *
       * * One on the unknown_intrinsic set, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images.
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as that
       * could change its value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
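
/* A sketch of a backend-provided intrinsic callback (illustrative; the
 * callback contract and the nir_schedule_dependency fields are the ones used
 * above, but the choice of intrinsics and the class number are made up).
 * Returning true asks the scheduler to order the intrinsic against the other
 * members of its class:
 *
 *    static bool
 *    my_intrinsic_cb(nir_intrinsic_instr *intr,
 *                    nir_schedule_dependency *dep, void *user_data)
 *    {
 *       switch (intr->intrinsic) {
 *       case nir_intrinsic_load_scratch:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       case nir_intrinsic_store_scratch:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       default:
 *          return false;
 *       }
 *    }
 */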

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}
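
/* Worked example for the callbacks above (illustrative): for
 *
 *    vec1 ssa_3 = fadd ssa_1, ssa_2
 *
 * where this fadd is the last remaining user of both vec4 sources, the src
 * callback credits 4 + 4 freed channels and the def callback charges 1 for
 * the new def, so nir_schedule_regs_freed() returns 7.
 */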

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value would tend to block us from freeing that value. This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
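
/* How the choosers above are combined (see nir_schedule_instructions()):
 * options->fallback selects the minimise-pressure fallback unconditionally;
 * otherwise CSP runs while scoreboard->pressure is below options->threshold,
 * and the scheduler switches to CSR once pressure reaches it.
 */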

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled() */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}
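
/* Time bookkeeping example (illustrative): scheduling a texture op with
 * delay = 100 at scoreboard->time = 10 bumps each consumer's ready_time to
 * 110.  The clock advances by at least 1 per scheduled instruction (or jumps
 * to a chosen node's ready_time), so the ready-set loops in the choosers pass
 * over those consumers until enough other work has been emitted, which is
 * what stretches texture fetches away from their uses.
 */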

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   /* Add the def's own instruction as a "use", so that a def with no readers
    * has its pressure dropped as soon as it is scheduled (see
    * nir_schedule_mark_use()).
    */
   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}
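
/* Illustrative contents (not from the original source): for
 *
 *    ssa_2 = fmul ssa_0, ssa_1
 *    ssa_3 = fadd ssa_2, ssa_0
 *
 * remaining_uses[ssa_0] holds { def(ssa_0), fmul, fadd }: the defining
 * instruction itself plus every reader.  Entries are removed one by one in
 * nir_schedule_mark_use(), and pressure drops when the set empties.
 */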

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                    "Affected instructions:\n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}