/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (C) 2019-2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "midgard_quirks.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/half_float.h"

/* Scheduling for Midgard is complicated, to say the least. ALU instructions
 * must be grouped into VLIW bundles according to the following model:
 *
 *            [VMUL] [SADD]
 *     [VADD] [SMUL] [VLUT]
 *
 * A given instruction can execute on some subset of the units (or a few can
 * execute on all). Instructions can be either vector or scalar; only scalar
 * instructions can execute on SADD/SMUL units. Units on a given line execute
 * in parallel. Subsequent lines execute separately and can pass results
 * directly via pipeline registers r24/r25, bypassing the register file.
 *
 * A bundle can optionally have 128 bits of embedded constants, shared across
 * all of the instructions within a bundle.
 *
 * Instructions consuming conditionals (branches and conditional selects)
 * require their condition to be written into the conditional register (r31)
 * within the same bundle they are consumed in.
 *
 * Fragment writeout requires its argument to be written in full within the
 * same bundle as the branch, with no hanging dependencies.
 *
 * Load/store instructions are also in bundles of simply two instructions, and
 * texture instructions have no bundling.
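 *
 * As an illustration (hypothetical assembly mnemonics, not emitted by this
 * file), a well-formed pair of lines within one bundle might be:
 *
 *    vmul.fmul r24, r1, r2
 *    vadd.fadd r0,  r3, r24
 *
 * where the vadd on the second line reads the vmul result through pipeline
 * register r24 instead of going through the register file.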
 *
 * -------------------------------------------------------------------------
 *
 */

/* We create the dependency graph with per-byte granularity */

#define BYTE_COUNT 16

static void
add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child)
{
   for (unsigned i = 0; i < BYTE_COUNT; ++i) {
      if (!(mask & (1 << i)))
         continue;

      struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i];

      util_dynarray_foreach(parents, unsigned, parent) {
         BITSET_WORD *dependents = instructions[*parent]->dependents;

         /* Already have the dependency */
         if (BITSET_TEST(dependents, child))
            continue;

         BITSET_SET(dependents, child);
         instructions[child]->nr_dependencies++;
      }
   }
}

static void
mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent)
{
   for (unsigned i = 0; i < BYTE_COUNT; ++i) {
      if (!(mask & (1 << i)))
         continue;

      util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent);
   }
}

static void
mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
{
   size_t sz = node_count * BYTE_COUNT;

   struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
   struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);

   for (unsigned i = 0; i < sz; ++i) {
      util_dynarray_init(&last_read[i], NULL);
      util_dynarray_init(&last_write[i], NULL);
   }

   /* Initialize dependency graph */
   for (unsigned i = 0; i < count; ++i) {
      instructions[i]->dependents =
         calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));

      instructions[i]->nr_dependencies = 0;
   }

   unsigned prev_ldst[3] = {~0, ~0, ~0};

   /* Populate dependency graph */
   for (signed i = count - 1; i >= 0; --i) {
      if (instructions[i]->compact_branch)
         continue;

      unsigned dest = instructions[i]->dest;
      unsigned mask = mir_bytemask(instructions[i]);

      mir_foreach_src((*instructions), s) {
         unsigned src = instructions[i]->src[s];

         if (src < node_count) {
            unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
            add_dependency(last_write, src, readmask, instructions, i);
         }
      }

      /* Create a list of dependencies for each type of load/store
       * instruction to prevent reordering.
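       *
       * Loads/stores are split into three ordering categories (shared,
       * scratch, and everything else); each address-taking load/store is
       * chained to the previous one in the same category so that accesses
       * which may alias stay in program order.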
       */
      if (instructions[i]->type == TAG_LOAD_STORE_4 &&
          load_store_opcode_props[instructions[i]->op].props & LDST_ADDRESS) {

         unsigned type = instructions[i]->load_store.arg_reg |
                         instructions[i]->load_store.arg_comp;

         unsigned idx;
         switch (type) {
         case LDST_SHARED: idx = 0; break;
         case LDST_SCRATCH: idx = 1; break;
         default: idx = 2; break;
         }

         unsigned prev = prev_ldst[idx];

         if (prev != ~0) {
            BITSET_WORD *dependents = instructions[prev]->dependents;

            /* Already have the dependency */
            if (BITSET_TEST(dependents, i))
               continue;

            BITSET_SET(dependents, i);
            instructions[i]->nr_dependencies++;
         }

         prev_ldst[idx] = i;
      }

      if (dest < node_count) {
         add_dependency(last_read, dest, mask, instructions, i);
         add_dependency(last_write, dest, mask, instructions, i);
         mark_access(last_write, dest, mask, i);
      }

      mir_foreach_src((*instructions), s) {
         unsigned src = instructions[i]->src[s];

         if (src < node_count) {
            unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
            mark_access(last_read, src, readmask, i);
         }
      }
   }

   /* If there is a branch, all instructions depend on it, as interblock
    * execution must be purely in-order */

   if (instructions[count - 1]->compact_branch) {
      BITSET_WORD *dependents = instructions[count - 1]->dependents;

      for (signed i = count - 2; i >= 0; --i) {
         if (BITSET_TEST(dependents, i))
            continue;

         BITSET_SET(dependents, i);
         instructions[i]->nr_dependencies++;
      }
   }

   /* Free the intermediate structures */
   for (unsigned i = 0; i < sz; ++i) {
      util_dynarray_fini(&last_read[i]);
      util_dynarray_fini(&last_write[i]);
   }

   free(last_read);
   free(last_write);
}

/* Does the mask cover exactly one component (i.e. a scalar)? */

static bool
is_single_component_mask(unsigned mask)
{
   int components = 0;

   for (int c = 0; c < 8; ++c) {
      if (mask & (1 << c))
         components++;
   }

   return components == 1;
}

/* Helpers for scheduling */

static bool
mir_is_scalar(midgard_instruction *ains)
{
   /* Do we try to use it as a vector op? */
   if (!is_single_component_mask(ains->mask))
      return false;

   /* Otherwise, check mode hazards */
   bool could_scalar = true;
   unsigned szd = nir_alu_type_get_type_size(ains->dest_type);
   unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]);
   unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]);

   /* Only 16/32-bit can run on a scalar unit */
   could_scalar &= (szd == 16) || (szd == 32);

   if (ains->src[0] != ~0)
      could_scalar &= (sz0 == 16) || (sz0 == 32);

   if (ains->src[1] != ~0)
      could_scalar &= (sz1 == 16) || (sz1 == 32);

   if (midgard_is_integer_out_op(ains->op) && ains->outmod != midgard_outmod_keeplo)
      return false;

   return could_scalar;
}

/* How many bytes does this ALU instruction add to the bundle?
 */

static unsigned
bytes_for_instruction(midgard_instruction *ains)
{
   if (ains->unit & UNITS_ANY_VECTOR)
      return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
   else if (ains->unit == ALU_ENAB_BRANCH)
      return sizeof(midgard_branch_extended);
   else if (ains->compact_branch)
      return sizeof(uint16_t);
   else
      return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
}

/* We would like to flatten the linked list of midgard_instructions in a bundle
 * to an array of pointers on the heap for easy indexing */

static midgard_instruction **
flatten_mir(midgard_block *block, unsigned *len)
{
   *len = list_length(&block->base.instructions);

   if (!(*len))
      return NULL;

   midgard_instruction **instructions =
      calloc(sizeof(midgard_instruction *), *len);

   unsigned i = 0;

   mir_foreach_instr_in_block(block, ins)
      instructions[i++] = ins;

   return instructions;
}

/* The worklist is the set of instructions that can be scheduled now; that is,
 * the set of instructions with no remaining dependencies */

static void
mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
{
   for (unsigned i = 0; i < count; ++i) {
      if (instructions[i]->nr_dependencies == 0)
         BITSET_SET(worklist, i);
   }
}

/* Update the worklist after an instruction terminates. Remove its edges from
 * the graph and if that causes any node to have no dependencies, add it to the
 * worklist */

static void
mir_update_worklist(
   BITSET_WORD *worklist, unsigned count,
   midgard_instruction **instructions, midgard_instruction *done)
{
   /* Sanity check: if no instruction terminated, there is nothing to do.
    * If the instruction that terminated had dependencies, that makes no
    * sense and means we messed up the worklist. Finally, as the purpose
    * of this routine is to update dependents, we abort early if there are
    * no dependents defined. */

   if (!done)
      return;

   assert(done->nr_dependencies == 0);

   if (!done->dependents)
      return;

   /* We have an instruction with dependents. Iterate each dependent to
    * remove one dependency (`done`), adding dependents to the worklist
    * where possible. */

   unsigned i;
   BITSET_FOREACH_SET(i, done->dependents, count) {
      assert(instructions[i]->nr_dependencies);

      if (!(--instructions[i]->nr_dependencies))
         BITSET_SET(worklist, i);
   }

   free(done->dependents);
}

/* While scheduling, we need to choose instructions satisfying certain
 * criteria. As we schedule backwards, we choose the *last* instruction in the
 * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
 * given predicate. */

struct midgard_predicate {
   /* TAG or ~0 for dont-care */
   unsigned tag;

   /* True if we want to pop off the chosen instruction */
   bool destructive;

   /* For ALU, choose only this unit */
   unsigned unit;

   /* State for bundle constants. constants is the actual constants
    * for the bundle. constant_count is the number of bytes (up to
    * 16) currently in use for constants.
    * When picking in destructive
    * mode, the constants array will be updated, and the instruction
    * will be adjusted to index into the constants array */

   midgard_constants *constants;
   unsigned constant_mask;

   /* Exclude this destination (if not ~0) */
   unsigned exclude;

   /* Don't schedule instructions consuming conditionals (since we already
    * scheduled one). Excludes conditional branches and csel */
   bool no_cond;

   /* Require (or reject) a minimal mask and (if nonzero) given
    * destination. Used for writeout optimizations */

   unsigned mask;
   unsigned no_mask;
   unsigned dest;

   /* Whether to not-care/only/never schedule imov/fmov instructions. This
    * allows non-move instructions to get priority on each unit */
   unsigned move_mode;

   /* For load/store: how many pipeline registers are in use? The two
    * scheduled instructions cannot use more than the 256 bits of pipeline
    * space available or RA will fail (as it would run out of pipeline
    * registers and fail to spill without breaking the schedule) */

   unsigned pipeline_count;
};

static bool
mir_adjust_constant(midgard_instruction *ins, unsigned src,
                    unsigned *bundle_constant_mask,
                    unsigned *comp_mapping,
                    uint8_t *bundle_constants,
                    bool upper)
{
   unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8;
   unsigned type_shift = util_logbase2(type_size);
   unsigned max_comp = mir_components_for_type(ins->src_types[src]);
   unsigned comp_mask = mir_from_bytemask(mir_round_bytemask_up(
                                             mir_bytemask_of_read_components_index(ins, src),
                                             type_size * 8),
                                          type_size * 8);
   unsigned type_mask = (1 << type_size) - 1;

   /* Upper only makes sense for 16-bit (type_size is in bytes) */
   if (type_size != 2 && upper)
      return false;

   /* For 16-bit, we need to stay on either upper or lower halves to avoid
    * disrupting the swizzle */
   unsigned start = upper ? 8 : 0;
   unsigned length = (type_size == 2) ? 8 : 16;

   for (unsigned comp = 0; comp < max_comp; comp++) {
      if (!(comp_mask & (1 << comp)))
         continue;

      uint8_t *constantp = ins->constants.u8 + (type_size * comp);
      unsigned best_reuse_bytes = 0;
      signed best_place = -1;
      unsigned i, j;

      for (i = start; i < (start + length); i += type_size) {
         unsigned reuse_bytes = 0;

         for (j = 0; j < type_size; j++) {
            if (!(*bundle_constant_mask & (1 << (i + j))))
               continue;
            if (constantp[j] != bundle_constants[i + j])
               break;
            if ((i + j) > (start + length))
               break;

            reuse_bytes++;
         }

         /* Select the place where existing bytes can be
          * reused so we leave empty slots to others
          */
         if (j == type_size &&
             (reuse_bytes > best_reuse_bytes || best_place < 0)) {
            best_reuse_bytes = reuse_bytes;
            best_place = i;
            break;
         }
      }

      /* This component couldn't fit in the remaining constant slots,
       * no need to check the remaining components, bail out now
       */
      if (best_place < 0)
         return false;

      memcpy(&bundle_constants[i], constantp, type_size);
      *bundle_constant_mask |= type_mask << best_place;
      comp_mapping[comp] = best_place >> type_shift;
   }

   return true;
}

/* For an instruction that can fit, adjust it to fit and update the constants
 * array, in destructive mode. Returns whether the fitting was successful.
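 *
 * Illustrative example (assumed values, not taken from a real shader): if one
 * instruction already placed the 32-bit constant 0x3f800000 in bytes 0-3 of
 * the embedded constants, a second instruction reading the same value can
 * reuse those bytes; comp_mapping is then composed into its swizzle so it
 * reads component 0 rather than claiming a fresh slot.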
 */

static bool
mir_adjust_constants(midgard_instruction *ins,
                     struct midgard_predicate *pred,
                     bool destructive)
{
   /* No constant, nothing to adjust */
   if (!ins->has_constants)
      return true;

   unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
   unsigned bundle_constant_mask = pred->constant_mask;
   unsigned comp_mapping[2][16] = { };
   uint8_t bundle_constants[16];

   memcpy(bundle_constants, pred->constants, 16);

   /* Let's try to find a place for each active component of the constant
    * register.
    */
   for (unsigned src = 0; src < 2; ++src) {
      if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT))
         continue;

      /* First, try lower half (or whole for !16) */
      if (mir_adjust_constant(ins, src, &bundle_constant_mask,
                              comp_mapping[src], bundle_constants, false))
         continue;

      /* Next, try upper half */
      if (mir_adjust_constant(ins, src, &bundle_constant_mask,
                              comp_mapping[src], bundle_constants, true))
         continue;

      /* Otherwise bail */
      return false;
   }

   /* If non-destructive, we're done */
   if (!destructive)
      return true;

   /* Otherwise update the constant_mask and constant values */
   pred->constant_mask = bundle_constant_mask;
   memcpy(pred->constants, bundle_constants, 16);

   /* Use comp_mapping as a swizzle */
   mir_foreach_src(ins, s) {
      if (ins->src[s] == r_constant)
         mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]);
   }

   return true;
}

/* Conservative estimate of the pipeline registers required for load/store */

static unsigned
mir_pipeline_count(midgard_instruction *ins)
{
   unsigned bytecount = 0;

   mir_foreach_src(ins, i) {
      /* Skip empty source */
      if (ins->src[i] == ~0) continue;

      if (i == 0) {
         /* First source is a vector, worst-case the mask */
         unsigned bytemask = mir_bytemask_of_read_components_index(ins, i);
         unsigned max = util_logbase2(bytemask) + 1;
         bytecount += max;
      } else {
         /* Sources 1 on are scalars */
         bytecount += 4;
      }
   }

   unsigned dwords = DIV_ROUND_UP(bytecount, 16);
   assert(dwords <= 2);

   return dwords;
}

/* Matches FADD x, x with modifiers compatible.
 * Since x + x = x * 2, this holds for
 * any x, including x of the form f(y) for some swizzle/abs/neg function f */

static bool
mir_is_add_2(midgard_instruction *ins)
{
   if (ins->op != midgard_alu_op_fadd)
      return false;

   if (ins->src[0] != ins->src[1])
      return false;

   if (ins->src_types[0] != ins->src_types[1])
      return false;

   for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) {
      if (ins->swizzle[0][i] != ins->swizzle[1][i])
         return false;
   }

   if (ins->src_abs[0] != ins->src_abs[1])
      return false;

   if (ins->src_neg[0] != ins->src_neg[1])
      return false;

   return true;
}

static void
mir_adjust_unit(midgard_instruction *ins, unsigned unit)
{
   /* FADD x, x = FMUL x, #2 */
   if (mir_is_add_2(ins) && (unit & (UNITS_MUL | UNIT_VLUT))) {
      ins->op = midgard_alu_op_fmul;

      ins->src[1] = ~0;
      ins->src_abs[1] = false;
      ins->src_neg[1] = false;

      ins->has_inline_constant = true;
      ins->inline_constant = _mesa_float_to_half(2.0);
   }
}

static unsigned
mir_has_unit(midgard_instruction *ins, unsigned unit)
{
   if (alu_opcode_props[ins->op].props & unit)
      return true;

   /* FADD x, x can run on any adder or any multiplier */
   if (mir_is_add_2(ins))
      return true;

   return false;
}

/* Net change in liveness if an instruction were scheduled. Loosely based on
 * ir3's scheduler. */

static int
mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive)
{
   /* TODO: what if dest is used multiple times? */
   int free_live = 0;

   if (ins->dest < SSA_FIXED_MINIMUM) {
      unsigned bytemask = mir_bytemask(ins);
      bytemask = util_next_power_of_two(bytemask + 1) - 1;
      free_live += util_bitcount(liveness[ins->dest] & bytemask);

      if (destructive)
         liveness[ins->dest] &= ~bytemask;
   }

   int new_live = 0;

   mir_foreach_src(ins, s) {
      unsigned S = ins->src[s];

      bool dupe = false;

      for (unsigned q = 0; q < s; ++q)
         dupe |= (ins->src[q] == S);

      if (dupe)
         continue;

      if (S < SSA_FIXED_MINIMUM) {
         unsigned bytemask = mir_bytemask_of_read_components(ins, S);
         bytemask = util_next_power_of_two(bytemask + 1) - 1;

         /* Count only the new components */
         new_live += util_bitcount(bytemask & ~(liveness[S]));

         if (destructive)
            liveness[S] |= bytemask;
      }
   }

   return new_live - free_live;
}

static midgard_instruction *
mir_choose_instruction(
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned count,
   struct midgard_predicate *predicate)
{
   /* Parse the predicate */
   unsigned tag = predicate->tag;
   unsigned unit = predicate->unit;
   bool scalar = (unit != ~0) && (unit & UNITS_SCALAR);
   bool no_cond = predicate->no_cond;

   unsigned mask = predicate->mask;
   unsigned dest = predicate->dest;
   bool needs_dest = mask & 0xF;

   /* Iterate to find the best instruction satisfying the predicate */
   unsigned i;

   signed best_index = -1;
   signed best_effect = INT_MAX;
   bool best_conditional = false;

   /* Enforce a simple metric limiting distance to keep down register
    * pressure.
    * TODO: replace with liveness tracking for much better
    * results */

   unsigned max_active = 0;
   unsigned max_distance = 36;

#ifndef NDEBUG
   /* Force in-order scheduling */
   if (midgard_debug & MIDGARD_DBG_INORDER)
      max_distance = 1;
#endif

   BITSET_FOREACH_SET(i, worklist, count) {
      max_active = MAX2(max_active, i);
   }

   BITSET_FOREACH_SET(i, worklist, count) {
      if ((max_active - i) >= max_distance)
         continue;

      if (tag != ~0 && instructions[i]->type != tag)
         continue;

      bool alu = (instructions[i]->type == TAG_ALU_4);
      bool ldst = (instructions[i]->type == TAG_LOAD_STORE_4);

      bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
      bool is_move = alu &&
                     (instructions[i]->op == midgard_alu_op_imov ||
                      instructions[i]->op == midgard_alu_op_fmov);

      if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude)
         continue;

      if (alu && !branch && unit != ~0 && !(mir_has_unit(instructions[i], unit)))
         continue;

      /* 0: don't care, 1: no moves, 2: only moves */
      if (predicate->move_mode && ((predicate->move_mode - 1) != is_move))
         continue;

      if (branch && !instructions[i]->compact_branch)
         continue;

      if (alu && scalar && !mir_is_scalar(instructions[i]))
         continue;

      if (alu && predicate->constants && !mir_adjust_constants(instructions[i], predicate, false))
         continue;

      if (needs_dest && instructions[i]->dest != dest)
         continue;

      if (mask && ((~instructions[i]->mask) & mask))
         continue;

      if (instructions[i]->mask & predicate->no_mask)
         continue;

      if (ldst && mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2)
         continue;

      bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->op);
      conditional |= (branch && instructions[i]->branch.conditional);

      if (conditional && no_cond)
         continue;

      int effect = mir_live_effect(liveness, instructions[i], false);

      if (effect > best_effect)
         continue;

      if (effect == best_effect && (signed) i < best_index)
         continue;

      best_effect = effect;
      best_index = i;
      best_conditional = conditional;
   }

   /* Did we find anything? */

   if (best_index < 0)
      return NULL;

   /* If we found something, remove it from the worklist */
   assert(best_index < count);
   midgard_instruction *I = instructions[best_index];

   if (predicate->destructive) {
      BITSET_CLEAR(worklist, best_index);

      if (I->type == TAG_ALU_4)
         mir_adjust_constants(instructions[best_index], predicate, true);

      if (I->type == TAG_LOAD_STORE_4)
         predicate->pipeline_count += mir_pipeline_count(instructions[best_index]);

      if (I->type == TAG_ALU_4)
         mir_adjust_unit(instructions[best_index], unit);

      /* Once we schedule a conditional, we can't again */
      predicate->no_cond |= best_conditional;
      mir_live_effect(liveness, instructions[best_index], true);
   }

   return I;
}

/* Still, we don't choose instructions in a vacuum. We need a way to choose the
 * best bundle type (ALU, load/store, texture). Nondestructive.
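 *
 * Note on the load/store pairing below: if the best instruction is a
 * load/store and an even number of load/stores remain, we first check
 * whether a second load/store could pair with it in the same bundle; if
 * not, we prefer another bundle type so the remaining load/stores can
 * still pair up later, and only fall back to a lone load/store bundle
 * when nothing else is ready.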
 */

static unsigned
mir_choose_bundle(
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned count,
   unsigned num_ldst)
{
   /* At the moment, our algorithm is very simple - use the bundle of the
    * best instruction, regardless of what else could be scheduled
    * alongside it. This is not optimal but it works okay for in-order */

   struct midgard_predicate predicate = {
      .tag = ~0,
      .unit = ~0,
      .destructive = false,
      .exclude = ~0
   };

   midgard_instruction *chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);

   if (chosen && chosen->type == TAG_LOAD_STORE_4 && !(num_ldst % 2)) {
      /* Try to schedule load/store ops in pairs */

      predicate.exclude = chosen->dest;
      predicate.tag = TAG_LOAD_STORE_4;

      chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);
      if (chosen)
         return TAG_LOAD_STORE_4;

      predicate.tag = ~0;

      chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);
      assert(chosen == NULL || chosen->type != TAG_LOAD_STORE_4);

      if (chosen)
         return chosen->type;
      else
         return TAG_LOAD_STORE_4;
   }

   if (chosen)
      return chosen->type;
   else
      return ~0;
}

/* We want to choose an ALU instruction filling a given unit */
static void
mir_choose_alu(midgard_instruction **slot,
               midgard_instruction **instructions,
               uint16_t *liveness,
               BITSET_WORD *worklist, unsigned len,
               struct midgard_predicate *predicate,
               unsigned unit)
{
   /* Did we already schedule to this slot? */
   if ((*slot) != NULL)
      return;

   /* Try to schedule something, if not */
   predicate->unit = unit;
   *slot = mir_choose_instruction(instructions, liveness, worklist, len, predicate);

   /* Store unit upon scheduling */
   if (*slot && !((*slot)->compact_branch))
      (*slot)->unit = unit;
}

/* When we are scheduling a branch/csel, we need the consumed condition in the
 * same bundle, as a pipeline register. There are two options to enable this:
 *
 * - Move the conditional into the bundle. Preferred, but only works if the
 *   conditional is used only once and is from this block.
 * - Copy the conditional.
 *
 * We search for the conditional. If it's in this block, single-use, and
 * without embedded constants, we schedule it immediately. Otherwise, we
 * schedule a move for it.
 *
 * mir_comparison_mobile is a helper to find the moveable condition.
 */

static unsigned
mir_comparison_mobile(
   compiler_context *ctx,
   midgard_instruction **instructions,
   struct midgard_predicate *predicate,
   unsigned count,
   unsigned cond)
{
   if (!mir_single_use(ctx, cond))
      return ~0;

   unsigned ret = ~0;

   for (unsigned i = 0; i < count; ++i) {
      if (instructions[i]->dest != cond)
         continue;

      /* Must fit in an ALU bundle */
      if (instructions[i]->type != TAG_ALU_4)
         return ~0;

      /* If it would itself require a condition, that's recursive */
      if (OP_IS_CSEL(instructions[i]->op))
         return ~0;

      /* We'll need to rewrite to .w but that doesn't work for vector
       * ops that don't replicate (ball/bany), so bail there */

      if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->op].props))
         return ~0;

      /* Ensure it will fit with constants */

      if (!mir_adjust_constants(instructions[i], predicate, false))
         return ~0;

      /* Ensure it is written only once */

      if (ret != ~0)
         return ~0;
      else
         ret = i;
   }

   /* Inject constants now that we are sure we want to */
   if (ret != ~0)
      mir_adjust_constants(instructions[ret], predicate, true);

   return ret;
}

/* Using the information about the moveable conditional itself, we either pop
 * that condition off the worklist for use now, or create a move to
 * artificially schedule instead as a fallback */

static midgard_instruction *
mir_schedule_comparison(
   compiler_context *ctx,
   midgard_instruction **instructions,
   struct midgard_predicate *predicate,
   BITSET_WORD *worklist, unsigned count,
   unsigned cond, bool vector, unsigned *swizzle,
   midgard_instruction *user)
{
   /* TODO: swizzle when scheduling */
   unsigned comp_i =
      (!vector && (swizzle[0] == 0)) ?
      mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0;

   /* If we can, schedule the condition immediately */
   if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
      assert(comp_i < count);
      BITSET_CLEAR(worklist, comp_i);
      return instructions[comp_i];
   }

   /* Otherwise, we insert a move */

   midgard_instruction mov = v_mov(cond, cond);
   mov.mask = vector ? 0xF : 0x1;
   memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1]));

   return mir_insert_instruction_before(ctx, user, mov);
}

/* Most generally, we need instructions writing to r31 in the appropriate
 * components */

static midgard_instruction *
mir_schedule_condition(compiler_context *ctx,
                       struct midgard_predicate *predicate,
                       BITSET_WORD *worklist, unsigned count,
                       midgard_instruction **instructions,
                       midgard_instruction *last)
{
   /* For a branch, the condition is the only argument; for csel, third */
   bool branch = last->compact_branch;
   unsigned condition_index = branch ? 0 : 2;

   /* csel_v is vector; otherwise, conditions are scalar */
   bool vector = !branch && OP_IS_CSEL_V(last->op);

   /* Grab the conditional instruction */

   midgard_instruction *cond = mir_schedule_comparison(
      ctx, instructions, predicate, worklist, count, last->src[condition_index],
      vector, last->swizzle[condition_index], last);

   /* We have exclusive reign over this (possibly move) conditional
    * instruction.
    * We can rewrite into a pipeline conditional register */

   predicate->exclude = cond->dest;
   cond->dest = SSA_FIXED_REGISTER(31);

   if (!vector) {
      cond->mask = (1 << COMPONENT_W);

      mir_foreach_src(cond, s) {
         if (cond->src[s] == ~0)
            continue;

         for (unsigned q = 0; q < 4; ++q)
            cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q];
      }
   }

   /* Schedule the unit: csel is always in the latter pipeline, so a csel
    * condition must be in the former pipeline stage (vmul/sadd),
    * depending on scalar/vector of the instruction itself. A branch must
    * be written from the latter pipeline stage and a branch condition is
    * always scalar, so it is always in smul (exception: ball/bany, which
    * will be vadd) */

   if (branch)
      cond->unit = UNIT_SMUL;
   else
      cond->unit = vector ? UNIT_VMUL : UNIT_SADD;

   return cond;
}

/* Schedules a single bundle of the given type */

static midgard_bundle
mir_schedule_texture(
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned len,
   bool is_vertex)
{
   struct midgard_predicate predicate = {
      .tag = TAG_TEXTURE_4,
      .destructive = true,
      .exclude = ~0
   };

   midgard_instruction *ins =
      mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

   mir_update_worklist(worklist, len, instructions, ins);

   struct midgard_bundle out = {
      .tag = ins->op == midgard_tex_op_barrier ?
                TAG_TEXTURE_4_BARRIER :
             (ins->op == midgard_tex_op_fetch) || is_vertex ?
                TAG_TEXTURE_4_VTX : TAG_TEXTURE_4,
      .instruction_count = 1,
      .instructions = { ins }
   };

   return out;
}

static midgard_bundle
mir_schedule_ldst(
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned len,
   unsigned *num_ldst)
{
   struct midgard_predicate predicate = {
      .tag = TAG_LOAD_STORE_4,
      .destructive = true,
      .exclude = ~0
   };

   /* Try to pick two load/store ops. Second not guaranteed to exist */

   midgard_instruction *ins =
      mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

   midgard_instruction *pair =
      mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

   assert(ins != NULL);

   struct midgard_bundle out = {
      .tag = TAG_LOAD_STORE_4,
      .instruction_count = pair ? 2 : 1,
      .instructions = { ins, pair }
   };

   *num_ldst -= out.instruction_count;

   /* We have to update the worklist atomically, since the two
    * instructions run concurrently (TODO: verify it's not pipelined) */

   mir_update_worklist(worklist, len, instructions, ins);
   mir_update_worklist(worklist, len, instructions, pair);

   return out;
}

static void
mir_schedule_zs_write(
   compiler_context *ctx,
   struct midgard_predicate *predicate,
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned len,
   midgard_instruction *branch,
   midgard_instruction **smul,
   midgard_instruction **vadd,
   midgard_instruction **vlut,
   bool stencil)
{
   bool success = false;
   unsigned idx = stencil ? 3 : 2;
   unsigned src = (branch->src[0] == ~0) ?
                  SSA_FIXED_REGISTER(1) : branch->src[idx];

   predicate->dest = src;
   predicate->mask = 0x1;

   midgard_instruction **units[] = { smul, vadd, vlut };
   unsigned unit_names[] = { UNIT_SMUL, UNIT_VADD, UNIT_VLUT };

   for (unsigned i = 0; i < 3; ++i) {
      if (*(units[i]))
         continue;

      predicate->unit = unit_names[i];
      midgard_instruction *ins =
         mir_choose_instruction(instructions, liveness, worklist, len, predicate);

      if (ins) {
         ins->unit = unit_names[i];
         *(units[i]) = ins;
         success |= true;
         break;
      }
   }

   predicate->dest = predicate->mask = 0;

   if (success)
      return;

   midgard_instruction *mov = ralloc(ctx, midgard_instruction);
   *mov = v_mov(src, make_compiler_temp(ctx));
   mov->mask = 0x1;

   branch->src[idx] = mov->dest;

   if (stencil) {
      unsigned swizzle = (branch->src[0] == ~0) ? COMPONENT_Y : COMPONENT_X;

      for (unsigned c = 0; c < 16; ++c)
         mov->swizzle[1][c] = swizzle;
   }

   for (unsigned i = 0; i < 3; ++i) {
      if (!(*(units[i]))) {
         *(units[i]) = mov;
         mov->unit = unit_names[i];
         return;
      }
   }

   unreachable("Could not schedule Z/S move to any unit");
}

static midgard_bundle
mir_schedule_alu(
   compiler_context *ctx,
   midgard_instruction **instructions,
   uint16_t *liveness,
   BITSET_WORD *worklist, unsigned len)
{
   struct midgard_bundle bundle = {};

   unsigned bytes_emitted = sizeof(bundle.control);

   struct midgard_predicate predicate = {
      .tag = TAG_ALU_4,
      .destructive = true,
      .exclude = ~0,
      .constants = &bundle.constants
   };

   midgard_instruction *vmul = NULL;
   midgard_instruction *vadd = NULL;
   midgard_instruction *vlut = NULL;
   midgard_instruction *smul = NULL;
   midgard_instruction *sadd = NULL;
   midgard_instruction *branch = NULL;

   mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, ALU_ENAB_BR_COMPACT);
   mir_update_worklist(worklist, len, instructions, branch);
   unsigned writeout = branch ? branch->writeout : 0;

   if (branch && branch->branch.conditional) {
      midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch);

      if (cond->unit == UNIT_VADD)
         vadd = cond;
      else if (cond->unit == UNIT_SMUL)
         smul = cond;
      else
         unreachable("Bad condition");
   }

   /* If we have a render target reference, schedule a move for it. Since
    * this will be in sadd, we boost this to prevent scheduling csel into
    * smul */

   if (writeout && (branch->constants.u32[0] || ctx->inputs->is_blend)) {
      sadd = ralloc(ctx, midgard_instruction);
      *sadd = v_mov(~0, make_compiler_temp(ctx));
      sadd->unit = UNIT_SADD;
      sadd->mask = 0x1;
      sadd->has_inline_constant = true;
      sadd->inline_constant = branch->constants.u32[0];
      branch->src[1] = sadd->dest;
      branch->src_types[1] = sadd->dest_type;
   }

   if (writeout) {
      /* Propagate up */
      bundle.last_writeout = branch->last_writeout;

      /* Mask off any conditionals.
       * This prevents csel and csel_v being scheduled into smul
       * since we might not have room for a conditional in vmul/sadd.
       * This is important because both writeout and csel have same-bundle
       * requirements on their dependencies.
       */
      predicate.no_cond = true;
   }

   /* When MRT is in use, writeout loops require r1.w to be filled with a
    * return address for the blend shader to jump to. We always emit the
    * move for blend shaders themselves for ABI reasons. */

   if (writeout && (ctx->inputs->is_blend || ctx->writeout_branch[1])) {
      vadd = ralloc(ctx, midgard_instruction);
      *vadd = v_mov(~0, make_compiler_temp(ctx));

      if (!ctx->inputs->is_blend) {
         vadd->op = midgard_alu_op_iadd;
         vadd->src[0] = SSA_FIXED_REGISTER(31);
         vadd->src_types[0] = nir_type_uint32;

         for (unsigned c = 0; c < 16; ++c)
            vadd->swizzle[0][c] = COMPONENT_X;

         vadd->has_inline_constant = true;
         vadd->inline_constant = 0;
      } else {
         vadd->src[1] = SSA_FIXED_REGISTER(1);
         vadd->src_types[0] = nir_type_uint32;

         for (unsigned c = 0; c < 16; ++c)
            vadd->swizzle[1][c] = COMPONENT_W;
      }

      vadd->unit = UNIT_VADD;
      vadd->mask = 0x1;
      branch->dest = vadd->dest;
      branch->dest_type = vadd->dest_type;
   }

   if (writeout & PAN_WRITEOUT_Z)
      mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, false);

   if (writeout & PAN_WRITEOUT_S)
      mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, true);

   mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, UNIT_SMUL);

   for (unsigned mode = 1; mode < 3; ++mode) {
      predicate.move_mode = mode;
      predicate.no_mask = writeout ? (1 << 3) : 0;
      mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, UNIT_VLUT);
      predicate.no_mask = 0;
      mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, UNIT_VADD);
   }

   /* Reset */
   predicate.move_mode = 0;

   mir_update_worklist(worklist, len, instructions, vlut);
   mir_update_worklist(worklist, len, instructions, vadd);
   mir_update_worklist(worklist, len, instructions, smul);

   bool vadd_csel = vadd && OP_IS_CSEL(vadd->op);
   bool smul_csel = smul && OP_IS_CSEL(smul->op);

   if (vadd_csel || smul_csel) {
      midgard_instruction *ins = vadd_csel ? vadd : smul;
      midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins);

      if (cond->unit == UNIT_VMUL)
         vmul = cond;
      else if (cond->unit == UNIT_SADD)
         sadd = cond;
      else
         unreachable("Bad condition");
   }

   /* Stage 2, let's schedule sadd before vmul for writeout */
   mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, UNIT_SADD);

   /* Check if writeout reads its own register */

   if (writeout) {
      midgard_instruction *stages[] = { sadd, vadd, smul, vlut };
      unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0];
      unsigned writeout_mask = 0x0;
      bool bad_writeout = false;

      for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
         if (!stages[i])
            continue;

         if (stages[i]->dest != src)
            continue;

         writeout_mask |= stages[i]->mask;
         bad_writeout |= mir_has_arg(stages[i], branch->src[0]);
      }

      /* It's possible we'll be able to schedule something into vmul
       * to fill r0. Let's peek into the future, trying to schedule
       * vmul specially that way.
       */

      unsigned full_mask = 0xF;

      if (!bad_writeout && writeout_mask != full_mask) {
         predicate.unit = UNIT_VMUL;
         predicate.dest = src;
         predicate.mask = writeout_mask ^ full_mask;

         struct midgard_instruction *peaked =
            mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

         if (peaked) {
            vmul = peaked;
            vmul->unit = UNIT_VMUL;
            writeout_mask |= predicate.mask;
            assert(writeout_mask == full_mask);
         }

         /* Cleanup */
         predicate.dest = predicate.mask = 0;
      }

      /* Finally, add a move if necessary */
      if (bad_writeout || writeout_mask != full_mask) {
         unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx);

         vmul = ralloc(ctx, midgard_instruction);
         *vmul = v_mov(src, temp);
         vmul->unit = UNIT_VMUL;
         vmul->mask = full_mask ^ writeout_mask;

         /* Rewrite to use our temp */

         for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
            if (stages[i]) {
               mir_rewrite_index_dst_single(stages[i], src, temp);
               mir_rewrite_index_src_single(stages[i], src, temp);
            }
         }

         mir_rewrite_index_src_single(branch, src, temp);
      }
   }

   mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, UNIT_VMUL);

   mir_update_worklist(worklist, len, instructions, vmul);
   mir_update_worklist(worklist, len, instructions, sadd);

   bundle.has_embedded_constants = predicate.constant_mask != 0;

   unsigned padding = 0;

   /* Now that we have finished scheduling, build up the bundle */
   midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch };

   for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
      if (stages[i]) {
         bundle.control |= stages[i]->unit;
         bytes_emitted += bytes_for_instruction(stages[i]);
         bundle.instructions[bundle.instruction_count++] = stages[i];

         /* If we branch, we can't spill to TLS since the store
          * instruction will never get executed. We could try to
          * break the bundle but this is probably easier for
          * now. */

         if (branch)
            stages[i]->no_spill |= (1 << REG_CLASS_WORK);
      }
   }

   /* Pad ALU op to nearest word */

   if (bytes_emitted & 15) {
      padding = 16 - (bytes_emitted & 15);
      bytes_emitted += padding;
   }

   /* Constants must always be quadwords */
   if (bundle.has_embedded_constants)
      bytes_emitted += 16;

   /* Size ALU instruction for tag */
   bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;

   bool tilebuf_wait = branch && branch->compact_branch &&
                       branch->branch.target_type == TARGET_TILEBUF_WAIT;

   /* MRT capable GPUs use a special writeout procedure */
   if ((writeout || tilebuf_wait) && !(ctx->quirks & MIDGARD_NO_UPPER_ALU))
      bundle.tag += 4;

   bundle.padding = padding;
   bundle.control |= bundle.tag;

   return bundle;
}

/* Schedule a single block by iterating its instruction list to create bundles.
 * As we go, tally up the bundle sizes to compute the block size.
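 *
 * Bundles are formed by walking the worklist backwards from the end of the
 * block, so they accumulate in reverse order and are flipped back into
 * program order once the whole block has been scheduled.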
 */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
   /* Copy list to dynamic array */
   unsigned len = 0;
   midgard_instruction **instructions = flatten_mir(block, &len);

   if (!len)
      return;

   /* Calculate dependencies and initial worklist */
   unsigned node_count = ctx->temp_count + 1;
   mir_create_dependency_graph(instructions, len, node_count);

   /* Allocate the worklist */
   size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
   BITSET_WORD *worklist = calloc(sz, 1);
   uint16_t *liveness = calloc(node_count, 2);
   mir_initialize_worklist(worklist, instructions, len);

   /* Count the number of load/store instructions so we know when it's
    * worth trying to schedule them in pairs. */
   unsigned num_ldst = 0;
   for (unsigned i = 0; i < len; ++i) {
      if (instructions[i]->type == TAG_LOAD_STORE_4)
         ++num_ldst;
   }

   struct util_dynarray bundles;
   util_dynarray_init(&bundles, NULL);

   block->quadword_count = 0;

   for (;;) {
      unsigned tag = mir_choose_bundle(instructions, liveness, worklist, len, num_ldst);
      midgard_bundle bundle;

      if (tag == TAG_TEXTURE_4)
         bundle = mir_schedule_texture(instructions, liveness, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT);
      else if (tag == TAG_LOAD_STORE_4)
         bundle = mir_schedule_ldst(instructions, liveness, worklist, len, &num_ldst);
      else if (tag == TAG_ALU_4)
         bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len);
      else
         break;

      for (unsigned i = 0; i < bundle.instruction_count; ++i)
         bundle.instructions[i]->bundle_id =
            ctx->quadword_count + block->quadword_count;

      util_dynarray_append(&bundles, midgard_bundle, bundle);
      block->quadword_count += midgard_tag_props[bundle.tag].size;
   }

   assert(num_ldst == 0);

   /* We emitted bundles backwards; copy into the block in reverse-order */

   util_dynarray_init(&block->bundles, block);
   util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) {
      util_dynarray_append(&block->bundles, midgard_bundle, *bundle);
   }
   util_dynarray_fini(&bundles);

   block->scheduled = true;
   ctx->quadword_count += block->quadword_count;

   /* Reorder instructions to match the bundled order.
    * First remove existing
    * instructions and then recreate the list */

   mir_foreach_instr_in_block_safe(block, ins) {
      list_del(&ins->link);
   }

   mir_foreach_instr_in_block_scheduled_rev(block, ins) {
      list_add(&ins->link, &block->base.instructions);
   }

   free(instructions); /* Allocated by flatten_mir() */
   free(worklist);
   free(liveness);
}

/* Insert moves to ensure we can register allocate load/store registers */
static void
mir_lower_ldst(compiler_context *ctx)
{
   mir_foreach_instr_global_safe(ctx, I) {
      if (I->type != TAG_LOAD_STORE_4) continue;

      mir_foreach_src(I, s) {
         if (s == 0) continue;
         if (I->src[s] == ~0) continue;
         if (I->swizzle[s][0] == 0) continue;

         unsigned temp = make_compiler_temp(ctx);
         midgard_instruction mov = v_mov(I->src[s], temp);
         mov.mask = 0x1;
         mov.dest_type = I->src_types[s];
         for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c)
            mov.swizzle[1][c] = I->swizzle[s][0];

         mir_insert_instruction_before(ctx, I, mov);
         I->src[s] = mov.dest;
         I->swizzle[s][0] = 0;
      }
   }
}

/* Insert moves to ensure we can register allocate blend writeout */
static void
mir_lower_blend_input(compiler_context *ctx)
{
   mir_foreach_block(ctx, _blk) {
      midgard_block *blk = (midgard_block *) _blk;

      if (list_is_empty(&blk->base.instructions))
         continue;

      midgard_instruction *I = mir_last_in_block(blk);

      if (!I || I->type != TAG_ALU_4 || !I->writeout)
         continue;

      mir_foreach_src(I, s) {
         unsigned src = I->src[s];

         if (src >= ctx->temp_count)
            continue;

         if (!_blk->live_out[src])
            continue;

         unsigned temp = make_compiler_temp(ctx);
         midgard_instruction mov = v_mov(src, temp);
         mov.mask = 0xF;
         mov.dest_type = nir_type_uint32;
         mir_insert_instruction_before(ctx, I, mov);
         I->src[s] = mov.dest;
      }
   }
}

void
midgard_schedule_program(compiler_context *ctx)
{
   mir_lower_ldst(ctx);
   midgard_promote_uniforms(ctx);

   /* Must be lowered right before scheduling */
   mir_squeeze_index(ctx);
   mir_lower_special_reads(ctx);

   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      mir_invalidate_liveness(ctx);
      mir_compute_liveness(ctx);
      mir_lower_blend_input(ctx);
   }

   mir_squeeze_index(ctx);

   /* Lowering can introduce some dead moves */

   mir_foreach_block(ctx, _block) {
      midgard_block *block = (midgard_block *) _block;
      midgard_opt_dead_move_eliminate(ctx, block);
      schedule_block(ctx, block);
   }
}