1/* 2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io> 3 * Copyright (C) 2019 Collabora, Ltd. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25#include "compiler.h" 26#include "midgard_ops.h" 27#include "util/u_math.h" 28#include "util/u_memory.h" 29#include "midgard_quirks.h" 30 31struct phys_reg { 32 /* Physical register: 0-31 */ 33 unsigned reg; 34 35 /* Byte offset into the physical register: 0-15 */ 36 unsigned offset; 37 38 /* log2(bytes per component) for fast mul/div */ 39 unsigned shift; 40}; 41 42/* Shift up by reg_offset and horizontally by dst_offset. */ 43 44static void 45offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, unsigned dstshift, unsigned dst_offset) 46{ 47 unsigned out[MIR_VEC_COMPONENTS]; 48 49 signed reg_comp = reg_offset >> srcshift; 50 signed dst_comp = dst_offset >> dstshift; 51 52 unsigned max_component = (16 >> srcshift) - 1; 53 54 assert(reg_comp << srcshift == reg_offset); 55 assert(dst_comp << dstshift == dst_offset); 56 57 for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) { 58 signed comp = MAX2(c - dst_comp, 0); 59 out[c] = MIN2(swizzle[comp] + reg_comp, max_component); 60 } 61 62 memcpy(swizzle, out, sizeof(out)); 63} 64 65/* Helper to return the default phys_reg for a given register */ 66 67static struct phys_reg 68default_phys_reg(int reg, unsigned shift) 69{ 70 struct phys_reg r = { 71 .reg = reg, 72 .offset = 0, 73 .shift = shift 74 }; 75 76 return r; 77} 78 79/* Determine which physical register, swizzle, and mask a virtual 80 * register corresponds to */ 81 82static struct phys_reg 83index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, unsigned shift) 84{ 85 /* Check for special cases */ 86 if (reg == ~0) 87 return default_phys_reg(REGISTER_UNUSED, shift); 88 else if (reg >= SSA_FIXED_MINIMUM) 89 return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift); 90 else if (!l) 91 return default_phys_reg(REGISTER_UNUSED, shift); 92 93 struct phys_reg r = { 94 .reg = l->solutions[reg] / 16, 95 .offset = l->solutions[reg] & 0xF, 96 .shift = shift 97 }; 98 99 /* Report that we actually use this register, and return it */ 100 101 if (r.reg < 16) 102 ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1); 103 104 return r; 105} 106 107static void 108set_class(unsigned *classes, unsigned node, unsigned class) 109{ 110 if (node < SSA_FIXED_MINIMUM && class != classes[node]) { 111 assert(classes[node] == REG_CLASS_WORK); 112 classes[node] = class; 113 } 114} 115 116/* Special register classes impose special constraints on who can read their 117 * values, so check that */ 118 119static bool ASSERTED 120check_read_class(unsigned *classes, unsigned tag, unsigned node) 121{ 122 /* Non-nodes are implicitly ok */ 123 if (node >= SSA_FIXED_MINIMUM) 124 return true; 125 126 switch (classes[node]) { 127 case REG_CLASS_LDST: 128 return (tag == TAG_LOAD_STORE_4); 129 case REG_CLASS_TEXR: 130 return (tag == TAG_TEXTURE_4); 131 case REG_CLASS_TEXW: 132 return (tag != TAG_LOAD_STORE_4); 133 case REG_CLASS_WORK: 134 return IS_ALU(tag); 135 default: 136 unreachable("Invalid class"); 137 } 138} 139 140static bool ASSERTED 141check_write_class(unsigned *classes, unsigned tag, unsigned node) 142{ 143 /* Non-nodes are implicitly ok */ 144 if (node >= SSA_FIXED_MINIMUM) 145 return true; 146 147 switch (classes[node]) { 148 case REG_CLASS_TEXR: 149 return true; 150 case REG_CLASS_TEXW: 151 return (tag == TAG_TEXTURE_4); 152 case REG_CLASS_LDST: 153 case REG_CLASS_WORK: 154 return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4); 155 default: 156 unreachable("Invalid class"); 157 } 158} 159 160/* Prepass before RA to ensure special class restrictions are met. The idea is 161 * to create a bit field of types of instructions that read a particular index. 162 * Later, we'll add moves as appropriate and rewrite to specialize by type. */ 163 164static void 165mark_node_class (unsigned *bitfield, unsigned node) 166{ 167 if (node < SSA_FIXED_MINIMUM) 168 BITSET_SET(bitfield, node); 169} 170 171void 172mir_lower_special_reads(compiler_context *ctx) 173{ 174 size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD); 175 176 /* Bitfields for the various types of registers we could have. aluw can 177 * be written by either ALU or load/store */ 178 179 unsigned *alur = calloc(sz, 1); 180 unsigned *aluw = calloc(sz, 1); 181 unsigned *brar = calloc(sz, 1); 182 unsigned *ldst = calloc(sz, 1); 183 unsigned *texr = calloc(sz, 1); 184 unsigned *texw = calloc(sz, 1); 185 186 /* Pass #1 is analysis, a linear scan to fill out the bitfields */ 187 188 mir_foreach_instr_global(ctx, ins) { 189 switch (ins->type) { 190 case TAG_ALU_4: 191 mark_node_class(aluw, ins->dest); 192 mark_node_class(alur, ins->src[0]); 193 mark_node_class(alur, ins->src[1]); 194 mark_node_class(alur, ins->src[2]); 195 196 if (ins->compact_branch && ins->writeout) 197 mark_node_class(brar, ins->src[0]); 198 199 break; 200 201 case TAG_LOAD_STORE_4: 202 mark_node_class(aluw, ins->dest); 203 mark_node_class(ldst, ins->src[0]); 204 mark_node_class(ldst, ins->src[1]); 205 mark_node_class(ldst, ins->src[2]); 206 mark_node_class(ldst, ins->src[3]); 207 break; 208 209 case TAG_TEXTURE_4: 210 mark_node_class(texr, ins->src[0]); 211 mark_node_class(texr, ins->src[1]); 212 mark_node_class(texr, ins->src[2]); 213 mark_node_class(texw, ins->dest); 214 break; 215 216 default: 217 break; 218 } 219 } 220 221 /* Pass #2 is lowering now that we've analyzed all the classes. 222 * Conceptually, if an index is only marked for a single type of use, 223 * there is nothing to lower. If it is marked for different uses, we 224 * split up based on the number of types of uses. To do so, we divide 225 * into N distinct classes of use (where N>1 by definition), emit N-1 226 * moves from the index to copies of the index, and finally rewrite N-1 227 * of the types of uses to use the corresponding move */ 228 229 unsigned spill_idx = ctx->temp_count; 230 231 for (unsigned i = 0; i < ctx->temp_count; ++i) { 232 bool is_alur = BITSET_TEST(alur, i); 233 bool is_aluw = BITSET_TEST(aluw, i); 234 bool is_brar = BITSET_TEST(brar, i); 235 bool is_ldst = BITSET_TEST(ldst, i); 236 bool is_texr = BITSET_TEST(texr, i); 237 bool is_texw = BITSET_TEST(texw, i); 238 239 /* Analyse to check how many distinct uses there are. ALU ops 240 * (alur) can read the results of the texture pipeline (texw) 241 * but not ldst or texr. Load/store ops (ldst) cannot read 242 * anything but load/store inputs. Texture pipeline cannot read 243 * anything but texture inputs. TODO: Simplify. */ 244 245 bool collision = 246 (is_alur && (is_ldst || is_texr)) || 247 (is_ldst && (is_alur || is_texr || is_texw)) || 248 (is_texr && (is_alur || is_ldst || is_texw)) || 249 (is_texw && (is_aluw || is_ldst || is_texr)) || 250 (is_brar && is_texw); 251 252 if (!collision) 253 continue; 254 255 /* Use the index as-is as the work copy. Emit copies for 256 * special uses */ 257 258 unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, TAG_ALU_4}; 259 bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw, is_brar }; 260 261 for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) { 262 if (!collisions[j]) continue; 263 264 /* When the hazard is from reading, we move and rewrite 265 * sources (typical case). When it's from writing, we 266 * flip the move and rewrite destinations (obscure, 267 * only from control flow -- impossible in SSA) */ 268 269 bool hazard_write = (j == 2); 270 271 unsigned idx = spill_idx++; 272 273 /* Insert move before each read/write, depending on the 274 * hazard we're trying to account for */ 275 276 mir_foreach_instr_global_safe(ctx, pre_use) { 277 if (pre_use->type != classes[j]) 278 continue; 279 280 if (hazard_write) { 281 if (pre_use->dest != i) 282 continue; 283 284 midgard_instruction m = v_mov(idx, i); 285 m.dest_type = pre_use->dest_type; 286 m.src_types[1] = m.dest_type; 287 m.mask = pre_use->mask; 288 289 midgard_instruction *use = mir_next_op(pre_use); 290 assert(use); 291 mir_insert_instruction_before(ctx, use, m); 292 mir_rewrite_index_dst_single(pre_use, i, idx); 293 } else { 294 if (!mir_has_arg(pre_use, i)) 295 continue; 296 297 idx = spill_idx++; 298 299 midgard_instruction m = v_mov(i, idx); 300 m.mask = mir_from_bytemask(mir_round_bytemask_up( 301 mir_bytemask_of_read_components(pre_use, i), 32), 32); 302 mir_insert_instruction_before(ctx, pre_use, m); 303 mir_rewrite_index_src_single(pre_use, i, idx); 304 } 305 } 306 } 307 } 308 309 free(alur); 310 free(aluw); 311 free(brar); 312 free(ldst); 313 free(texr); 314 free(texw); 315} 316 317static void 318mir_compute_interference( 319 compiler_context *ctx, 320 struct lcra_state *l) 321{ 322 /* First, we need liveness information to be computed per block */ 323 mir_compute_liveness(ctx); 324 325 /* We need to force r1.w live throughout a blend shader */ 326 327 if (ctx->inputs->is_blend) { 328 unsigned r1w = ~0; 329 330 mir_foreach_block(ctx, _block) { 331 midgard_block *block = (midgard_block *) _block; 332 mir_foreach_instr_in_block_rev(block, ins) { 333 if (ins->writeout) 334 r1w = ins->dest; 335 } 336 337 if (r1w != ~0) 338 break; 339 } 340 341 mir_foreach_instr_global(ctx, ins) { 342 if (ins->dest < ctx->temp_count) 343 lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF); 344 } 345 } 346 347 /* Now that every block has live_in/live_out computed, we can determine 348 * interference by walking each block linearly. Take live_out at the 349 * end of each block and walk the block backwards. */ 350 351 mir_foreach_block(ctx, _blk) { 352 midgard_block *blk = (midgard_block *) _blk; 353 uint16_t *live = mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t)); 354 355 mir_foreach_instr_in_block_rev(blk, ins) { 356 /* Mark all registers live after the instruction as 357 * interfering with the destination */ 358 359 unsigned dest = ins->dest; 360 361 if (dest < ctx->temp_count) { 362 for (unsigned i = 0; i < ctx->temp_count; ++i) { 363 if (live[i]) { 364 unsigned mask = mir_bytemask(ins); 365 lcra_add_node_interference(l, dest, mask, i, live[i]); 366 } 367 } 368 } 369 370 /* Add blend shader interference: blend shaders might 371 * clobber r0-r3. */ 372 if (ins->compact_branch && ins->writeout) { 373 for (unsigned i = 0; i < ctx->temp_count; ++i) { 374 if (!live[i]) 375 continue; 376 377 for (unsigned j = 0; j < 4; j++) { 378 lcra_add_node_interference(l, ctx->temp_count + j, 379 0xFFFF, 380 i, live[i]); 381 } 382 } 383 } 384 385 /* Update live_in */ 386 mir_liveness_ins_update(live, ins, ctx->temp_count); 387 } 388 389 free(live); 390 } 391} 392 393static bool 394mir_is_64(midgard_instruction *ins) 395{ 396 if (nir_alu_type_get_type_size(ins->dest_type) == 64) 397 return true; 398 399 mir_foreach_src(ins, v) { 400 if (nir_alu_type_get_type_size(ins->src_types[v]) == 64) 401 return true; 402 } 403 404 return false; 405} 406 407/* This routine performs the actual register allocation. It should be succeeded 408 * by install_registers */ 409 410static struct lcra_state * 411allocate_registers(compiler_context *ctx, bool *spilled) 412{ 413 /* The number of vec4 work registers available depends on the number of 414 * register-mapped uniforms and the shader stage. By ABI we limit blend 415 * shaders to 8 registers, should be lower XXX */ 416 int rmu = ctx->info->push.count / 4; 417 int work_count = ctx->inputs->is_blend ? 8 : 16 - MAX2(rmu - 8, 0); 418 419 /* No register allocation to do with no SSA */ 420 421 if (!ctx->temp_count) 422 return NULL; 423 424 /* Initialize LCRA. Allocate extra node at the end for r1-r3 for 425 * interference */ 426 427 struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5); 428 unsigned node_r1 = ctx->temp_count + 1; 429 430 /* Starts of classes, in bytes */ 431 l->class_start[REG_CLASS_WORK] = 16 * 0; 432 l->class_start[REG_CLASS_LDST] = 16 * 26; 433 l->class_start[REG_CLASS_TEXR] = 16 * 28; 434 l->class_start[REG_CLASS_TEXW] = 16 * 28; 435 436 l->class_size[REG_CLASS_WORK] = 16 * work_count; 437 l->class_size[REG_CLASS_LDST] = 16 * 2; 438 l->class_size[REG_CLASS_TEXR] = 16 * 2; 439 l->class_size[REG_CLASS_TEXW] = 16 * 2; 440 441 lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW); 442 443 /* To save space on T*20, we don't have real texture registers. 444 * Instead, tex inputs reuse the load/store pipeline registers, and 445 * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes, 446 * noting that this handles interferences and sizes correctly. */ 447 448 if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) { 449 l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST]; 450 l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK]; 451 } 452 453 unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); 454 unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count); 455 unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count); 456 457 mir_foreach_instr_global(ctx, ins) { 458 /* Swizzles of 32-bit sources on 64-bit instructions need to be 459 * aligned to either bottom (xy) or top (zw). More general 460 * swizzle lowering should happen prior to scheduling (TODO), 461 * but once we get RA we shouldn't disrupt this further. Align 462 * sources of 64-bit instructions. */ 463 464 if (ins->type == TAG_ALU_4 && mir_is_64(ins)) { 465 mir_foreach_src(ins, v) { 466 unsigned s = ins->src[v]; 467 468 if (s < ctx->temp_count) 469 min_alignment[s] = 3; 470 } 471 } 472 473 if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) { 474 mir_foreach_src(ins, v) { 475 unsigned s = ins->src[v]; 476 unsigned size = nir_alu_type_get_type_size(ins->src_types[v]); 477 478 if (s < ctx->temp_count) 479 min_alignment[s] = (size == 64) ? 3 : 2; 480 } 481 } 482 483 if (ins->dest >= SSA_FIXED_MINIMUM) continue; 484 485 unsigned size = nir_alu_type_get_type_size(ins->dest_type); 486 487 if (ins->is_pack) 488 size = 32; 489 490 /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */ 491 int comps1 = util_logbase2(ins->mask); 492 493 int bytes = (comps1 + 1) * (size / 8); 494 495 /* Use the largest class if there's ambiguity, this 496 * handles partial writes */ 497 498 int dest = ins->dest; 499 found_class[dest] = MAX2(found_class[dest], bytes); 500 501 min_alignment[dest] = 502 (size == 16) ? 1 : /* (1 << 1) = 2-byte */ 503 (size == 32) ? 2 : /* (1 << 2) = 4-byte */ 504 (size == 64) ? 3 : /* (1 << 3) = 8-byte */ 505 3; /* 8-bit todo */ 506 507 /* We can't cross xy/zw boundaries. TODO: vec8 can */ 508 if (size == 16) 509 min_bound[dest] = 8; 510 511 mir_foreach_src(ins, s) { 512 unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]); 513 if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM) 514 min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8); 515 } 516 517 /* We don't have a swizzle for the conditional and we don't 518 * want to muck with the conditional itself, so just force 519 * alignment for now */ 520 521 if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) { 522 min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */ 523 524 /* LCRA assumes bound >= alignment */ 525 min_bound[dest] = 16; 526 } 527 528 /* Since ld/st swizzles and masks are 32-bit only, we need them 529 * aligned to enable final packing */ 530 if (ins->type == TAG_LOAD_STORE_4) 531 min_alignment[dest] = MAX2(min_alignment[dest], 2); 532 } 533 534 for (unsigned i = 0; i < ctx->temp_count; ++i) { 535 lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2, 536 min_bound[i] ? min_bound[i] : 16); 537 lcra_restrict_range(l, i, found_class[i]); 538 } 539 540 free(found_class); 541 free(min_alignment); 542 free(min_bound); 543 544 /* Next, we'll determine semantic class. We default to zero (work). 545 * But, if we're used with a special operation, that will force us to a 546 * particular class. Each node must be assigned to exactly one class; a 547 * prepass before RA should have lowered what-would-have-been 548 * multiclass nodes into a series of moves to break it up into multiple 549 * nodes (TODO) */ 550 551 mir_foreach_instr_global(ctx, ins) { 552 /* Check if this operation imposes any classes */ 553 554 if (ins->type == TAG_LOAD_STORE_4) { 555 set_class(l->class, ins->src[0], REG_CLASS_LDST); 556 set_class(l->class, ins->src[1], REG_CLASS_LDST); 557 set_class(l->class, ins->src[2], REG_CLASS_LDST); 558 set_class(l->class, ins->src[3], REG_CLASS_LDST); 559 560 if (OP_IS_VEC4_ONLY(ins->op)) { 561 lcra_restrict_range(l, ins->dest, 16); 562 lcra_restrict_range(l, ins->src[0], 16); 563 lcra_restrict_range(l, ins->src[1], 16); 564 lcra_restrict_range(l, ins->src[2], 16); 565 lcra_restrict_range(l, ins->src[3], 16); 566 } 567 } else if (ins->type == TAG_TEXTURE_4) { 568 set_class(l->class, ins->dest, REG_CLASS_TEXW); 569 set_class(l->class, ins->src[0], REG_CLASS_TEXR); 570 set_class(l->class, ins->src[1], REG_CLASS_TEXR); 571 set_class(l->class, ins->src[2], REG_CLASS_TEXR); 572 set_class(l->class, ins->src[3], REG_CLASS_TEXR); 573 } 574 } 575 576 /* Check that the semantics of the class are respected */ 577 mir_foreach_instr_global(ctx, ins) { 578 assert(check_write_class(l->class, ins->type, ins->dest)); 579 assert(check_read_class(l->class, ins->type, ins->src[0])); 580 assert(check_read_class(l->class, ins->type, ins->src[1])); 581 assert(check_read_class(l->class, ins->type, ins->src[2])); 582 assert(check_read_class(l->class, ins->type, ins->src[3])); 583 } 584 585 /* Mark writeout to r0, depth to r1.x, stencil to r1.y, 586 * render target to r1.z, unknown to r1.w */ 587 mir_foreach_instr_global(ctx, ins) { 588 if (!(ins->compact_branch && ins->writeout)) continue; 589 590 if (ins->src[0] < ctx->temp_count) 591 l->solutions[ins->src[0]] = 0; 592 593 if (ins->src[2] < ctx->temp_count) 594 l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4; 595 596 if (ins->src[3] < ctx->temp_count) 597 l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4; 598 599 if (ins->src[1] < ctx->temp_count) 600 l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4; 601 602 if (ins->dest < ctx->temp_count) 603 l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4; 604 } 605 606 /* Destinations of instructions in a writeout block cannot be assigned 607 * to r1 unless they are actually used as r1 from the writeout itself, 608 * since the writes to r1 are special. A code sequence like: 609 * 610 * sadd.fmov r1.x, [...] 611 * vadd.fadd r0, r1, r2 612 * [writeout branch] 613 * 614 * will misbehave since the r1.x write will be interpreted as a 615 * gl_FragDepth write so it won't show up correctly when r1 is read in 616 * the following segment. We model this as interference. 617 */ 618 619 for (unsigned i = 0; i < 4; ++i) 620 l->solutions[ctx->temp_count + i] = (16 * i); 621 622 mir_foreach_block(ctx, _blk) { 623 midgard_block *blk = (midgard_block *) _blk; 624 625 mir_foreach_bundle_in_block(blk, v) { 626 /* We need at least a writeout and nonwriteout instruction */ 627 if (v->instruction_count < 2) 628 continue; 629 630 /* Branches always come at the end */ 631 midgard_instruction *br = v->instructions[v->instruction_count - 1]; 632 633 if (!br->writeout) 634 continue; 635 636 for (signed i = v->instruction_count - 2; i >= 0; --i) { 637 midgard_instruction *ins = v->instructions[i]; 638 639 if (ins->dest >= ctx->temp_count) 640 continue; 641 642 bool used_as_r1 = (br->dest == ins->dest); 643 644 mir_foreach_src(br, s) 645 used_as_r1 |= (s > 0) && (br->src[s] == ins->dest); 646 647 if (!used_as_r1) 648 lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), node_r1, 0xFFFF); 649 } 650 } 651 } 652 653 /* Precolour blend input to r0. Note writeout is necessarily at the end 654 * and blend shaders are single-RT only so there is only a single 655 * writeout block, so this cannot conflict with the writeout r0 (there 656 * is no need to have an intermediate move) */ 657 658 if (ctx->blend_input != ~0) { 659 assert(ctx->blend_input < ctx->temp_count); 660 l->solutions[ctx->blend_input] = 0; 661 } 662 663 /* Same for the dual-source blend input/output, except here we use r2, 664 * which is also set in the fragment shader. */ 665 666 if (ctx->blend_src1 != ~0) { 667 assert(ctx->blend_src1 < ctx->temp_count); 668 l->solutions[ctx->blend_src1] = (16 * 2); 669 ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3); 670 } 671 672 mir_compute_interference(ctx, l); 673 674 *spilled = !lcra_solve(l); 675 return l; 676} 677 678 679/* Once registers have been decided via register allocation 680 * (allocate_registers), we need to rewrite the MIR to use registers instead of 681 * indices */ 682 683static void 684install_registers_instr( 685 compiler_context *ctx, 686 struct lcra_state *l, 687 midgard_instruction *ins) 688{ 689 unsigned src_shift[MIR_SRC_COUNT]; 690 691 for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) { 692 src_shift[i] = 693 util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8); 694 } 695 696 unsigned dest_shift = 697 util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8); 698 699 switch (ins->type) { 700 case TAG_ALU_4: 701 case TAG_ALU_8: 702 case TAG_ALU_12: 703 case TAG_ALU_16: { 704 if (ins->compact_branch) 705 return; 706 707 struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]); 708 struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]); 709 struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); 710 711 mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); 712 713 unsigned dest_offset = 714 GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 : 715 dest.offset; 716 717 offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift, dest_offset); 718 if (!ins->has_inline_constant) 719 offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift, dest_offset); 720 if (ins->src[0] != ~0) 721 ins->src[0] = SSA_FIXED_REGISTER(src1.reg); 722 if (ins->src[1] != ~0) 723 ins->src[1] = SSA_FIXED_REGISTER(src2.reg); 724 if (ins->dest != ~0) 725 ins->dest = SSA_FIXED_REGISTER(dest.reg); 726 break; 727 } 728 729 case TAG_LOAD_STORE_4: { 730 /* Which physical register we read off depends on 731 * whether we are loading or storing -- think about the 732 * logical dataflow */ 733 734 bool encodes_src = OP_IS_STORE(ins->op); 735 736 if (encodes_src) { 737 struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]); 738 assert(src.reg == 26 || src.reg == 27); 739 740 ins->src[0] = SSA_FIXED_REGISTER(src.reg); 741 offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0); 742 } else { 743 struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift); 744 745 ins->dest = SSA_FIXED_REGISTER(dst.reg); 746 offset_swizzle(ins->swizzle[0], 0, 2, 2, dst.offset); 747 mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); 748 } 749 750 /* We also follow up by actual arguments */ 751 752 for (int i = 1; i <= 3; i++) { 753 unsigned src_index = ins->src[i]; 754 if (src_index != ~0) { 755 struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]); 756 unsigned component = src.offset >> src.shift; 757 assert(component << src.shift == src.offset); 758 ins->src[i] = SSA_FIXED_REGISTER(src.reg); 759 ins->swizzle[i][0] += component; 760 } 761 } 762 763 break; 764 } 765 766 case TAG_TEXTURE_4: { 767 if (ins->op == midgard_tex_op_barrier) 768 break; 769 770 /* Grab RA results */ 771 struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); 772 struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]); 773 struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]); 774 struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]); 775 776 /* First, install the texture coordinate */ 777 if (ins->src[1] != ~0) 778 ins->src[1] = SSA_FIXED_REGISTER(coord.reg); 779 offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0); 780 781 /* Next, install the destination */ 782 if (ins->dest != ~0) 783 ins->dest = SSA_FIXED_REGISTER(dest.reg); 784 offset_swizzle(ins->swizzle[0], 0, 2, dest.shift, 785 dest_shift == 1 ? dest.offset % 8 : 786 dest.offset); 787 mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); 788 789 /* If there is a register LOD/bias, use it */ 790 if (ins->src[2] != ~0) { 791 assert(!(lod.offset & 3)); 792 ins->src[2] = SSA_FIXED_REGISTER(lod.reg); 793 ins->swizzle[2][0] = lod.offset / 4; 794 } 795 796 /* If there is an offset register, install it */ 797 if (ins->src[3] != ~0) { 798 ins->src[3] = SSA_FIXED_REGISTER(offset.reg); 799 ins->swizzle[3][0] = offset.offset / 4; 800 } 801 802 break; 803 } 804 805 default: 806 break; 807 } 808} 809 810static void 811install_registers(compiler_context *ctx, struct lcra_state *l) 812{ 813 mir_foreach_instr_global(ctx, ins) 814 install_registers_instr(ctx, l, ins); 815} 816 817 818/* If register allocation fails, find the best spill node */ 819 820static signed 821mir_choose_spill_node( 822 compiler_context *ctx, 823 struct lcra_state *l) 824{ 825 /* We can't spill a previously spilled value or an unspill */ 826 827 mir_foreach_instr_global(ctx, ins) { 828 if (ins->no_spill & (1 << l->spill_class)) { 829 lcra_set_node_spill_cost(l, ins->dest, -1); 830 831 if (l->spill_class != REG_CLASS_WORK) { 832 mir_foreach_src(ins, s) 833 lcra_set_node_spill_cost(l, ins->src[s], -1); 834 } 835 } 836 } 837 838 return lcra_get_best_spill_node(l); 839} 840 841/* Once we've chosen a spill node, spill it */ 842 843static void 844mir_spill_register( 845 compiler_context *ctx, 846 unsigned spill_node, 847 unsigned spill_class, 848 unsigned *spill_count) 849{ 850 if (spill_class == REG_CLASS_WORK && ctx->inputs->is_blend) 851 unreachable("Blend shader spilling is currently unimplemented"); 852 853 unsigned spill_index = ctx->temp_count; 854 855 /* We have a spill node, so check the class. Work registers 856 * legitimately spill to TLS, but special registers just spill to work 857 * registers */ 858 859 bool is_special = spill_class != REG_CLASS_WORK; 860 bool is_special_w = spill_class == REG_CLASS_TEXW; 861 862 /* Allocate TLS slot (maybe) */ 863 unsigned spill_slot = !is_special ? (*spill_count)++ : 0; 864 865 /* For special reads, figure out how many bytes we need */ 866 unsigned read_bytemask = 0; 867 868 /* If multiple instructions write to this destination, we'll have to 869 * fill from TLS before writing */ 870 unsigned write_count = 0; 871 872 mir_foreach_instr_global_safe(ctx, ins) { 873 read_bytemask |= mir_bytemask_of_read_components(ins, spill_node); 874 if (ins->dest == spill_node) 875 ++write_count; 876 } 877 878 /* For TLS, replace all stores to the spilled node. For 879 * special reads, just keep as-is; the class will be demoted 880 * implicitly. For special writes, spill to a work register */ 881 882 if (!is_special || is_special_w) { 883 if (is_special_w) 884 spill_slot = spill_index++; 885 886 mir_foreach_block(ctx, _block) { 887 midgard_block *block = (midgard_block *) _block; 888 mir_foreach_instr_in_block_safe(block, ins) { 889 if (ins->dest != spill_node) continue; 890 891 /* Note: it's important to match the mask of the spill 892 * with the mask of the instruction whose destination 893 * we're spilling, or otherwise we'll read invalid 894 * components and can fail RA in a subsequent iteration 895 */ 896 897 if (is_special_w) { 898 midgard_instruction st = v_mov(spill_node, spill_slot); 899 st.no_spill |= (1 << spill_class); 900 st.mask = ins->mask; 901 st.dest_type = st.src_types[1] = ins->dest_type; 902 903 /* Hint: don't rewrite this node */ 904 st.hint = true; 905 906 mir_insert_instruction_after_scheduled(ctx, block, ins, st); 907 } else { 908 unsigned dest = spill_index++; 909 910 if (write_count > 1 && mir_bytemask(ins) != 0xF) { 911 midgard_instruction read = 912 v_load_store_scratch(dest, spill_slot, false, 0xF); 913 mir_insert_instruction_before_scheduled(ctx, block, ins, read); 914 } 915 916 ins->dest = dest; 917 ins->no_spill |= (1 << spill_class); 918 919 bool move = false; 920 921 /* In the same bundle, reads of the destination 922 * of the spilt instruction need to be direct */ 923 midgard_instruction *it = ins; 924 while ((it = list_first_entry(&it->link, midgard_instruction, link)) 925 && (it->bundle_id == ins->bundle_id)) { 926 927 if (!mir_has_arg(it, spill_node)) continue; 928 929 mir_rewrite_index_src_single(it, spill_node, dest); 930 931 /* The spilt instruction will write to 932 * a work register for `it` to read but 933 * the spill needs an LD/ST register */ 934 move = true; 935 } 936 937 if (move) 938 dest = spill_index++; 939 940 midgard_instruction st = 941 v_load_store_scratch(dest, spill_slot, true, ins->mask); 942 mir_insert_instruction_after_scheduled(ctx, block, ins, st); 943 944 if (move) { 945 midgard_instruction mv = v_mov(ins->dest, dest); 946 mv.no_spill |= (1 << spill_class); 947 948 mir_insert_instruction_after_scheduled(ctx, block, ins, mv); 949 } 950 } 951 952 if (!is_special) 953 ctx->spills++; 954 } 955 } 956 } 957 958 /* Insert a load from TLS before the first consecutive 959 * use of the node, rewriting to use spilled indices to 960 * break up the live range. Or, for special, insert a 961 * move. Ironically the latter *increases* register 962 * pressure, but the two uses of the spilling mechanism 963 * are somewhat orthogonal. (special spilling is to use 964 * work registers to back special registers; TLS 965 * spilling is to use memory to back work registers) */ 966 967 mir_foreach_block(ctx, _block) { 968 midgard_block *block = (midgard_block *) _block; 969 mir_foreach_instr_in_block(block, ins) { 970 /* We can't rewrite the moves used to spill in the 971 * first place. These moves are hinted. */ 972 if (ins->hint) continue; 973 974 /* If we don't use the spilled value, nothing to do */ 975 if (!mir_has_arg(ins, spill_node)) continue; 976 977 unsigned index = 0; 978 979 if (!is_special_w) { 980 index = ++spill_index; 981 982 midgard_instruction *before = ins; 983 midgard_instruction st; 984 985 if (is_special) { 986 /* Move */ 987 st = v_mov(spill_node, index); 988 st.no_spill |= (1 << spill_class); 989 } else { 990 /* TLS load */ 991 st = v_load_store_scratch(index, spill_slot, false, 0xF); 992 } 993 994 /* Mask the load based on the component count 995 * actually needed to prevent RA loops */ 996 997 st.mask = mir_from_bytemask(mir_round_bytemask_up( 998 read_bytemask, 32), 32); 999 1000 mir_insert_instruction_before_scheduled(ctx, block, before, st); 1001 } else { 1002 /* Special writes already have their move spilled in */ 1003 index = spill_slot; 1004 } 1005 1006 1007 /* Rewrite to use */ 1008 mir_rewrite_index_src_single(ins, spill_node, index); 1009 1010 if (!is_special) 1011 ctx->fills++; 1012 } 1013 } 1014 1015 /* Reset hints */ 1016 1017 mir_foreach_instr_global(ctx, ins) { 1018 ins->hint = false; 1019 } 1020} 1021 1022static void 1023mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) 1024{ 1025 unsigned uniforms = ctx->info->push.count / 4; 1026 unsigned old_work_count = 16 - MAX2(uniforms - 8, 0); 1027 unsigned work_count = 16 - MAX2((new_cutoff - 8), 0); 1028 1029 unsigned min_demote = SSA_FIXED_REGISTER(old_work_count); 1030 unsigned max_demote = SSA_FIXED_REGISTER(work_count); 1031 1032 mir_foreach_block(ctx, _block) { 1033 midgard_block *block = (midgard_block *) _block; 1034 mir_foreach_instr_in_block(block, ins) { 1035 mir_foreach_src(ins, i) { 1036 if (ins->src[i] < min_demote || ins->src[i] >= max_demote) 1037 continue; 1038 1039 midgard_instruction *before = ins; 1040 1041 unsigned temp = make_compiler_temp(ctx); 1042 unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4; 1043 assert(idx < ctx->info->push.count); 1044 1045 ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo); 1046 1047 midgard_instruction ld = { 1048 .type = TAG_LOAD_STORE_4, 1049 .mask = 0xF, 1050 .dest = temp, 1051 .dest_type = ins->src_types[i], 1052 .src = { ~0, ~0, ~0, ~0 }, 1053 .swizzle = SWIZZLE_IDENTITY_4, 1054 .op = midgard_op_ld_ubo_128, 1055 .load_store = { 1056 .index_reg = REGISTER_LDST_ZERO, 1057 }, 1058 .constants.u32[0] = ctx->info->push.words[idx].offset 1059 }; 1060 1061 midgard_pack_ubo_index_imm(&ld.load_store, 1062 ctx->info->push.words[idx].ubo); 1063 1064 mir_insert_instruction_before_scheduled(ctx, block, before, ld); 1065 1066 mir_rewrite_index_src_single(ins, ins->src[i], temp); 1067 } 1068 } 1069 } 1070 1071 ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4); 1072} 1073 1074/* Run register allocation in a loop, spilling until we succeed */ 1075 1076void 1077mir_ra(compiler_context *ctx) 1078{ 1079 struct lcra_state *l = NULL; 1080 bool spilled = false; 1081 int iter_count = 1000; /* max iterations */ 1082 1083 /* Number of 128-bit slots in memory we've spilled into */ 1084 unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16); 1085 1086 1087 mir_create_pipeline_registers(ctx); 1088 1089 do { 1090 if (spilled) { 1091 signed spill_node = mir_choose_spill_node(ctx, l); 1092 unsigned uniforms = ctx->info->push.count / 4; 1093 1094 /* It's a lot cheaper to demote uniforms to get more 1095 * work registers than to spill to TLS. */ 1096 if (l->spill_class == REG_CLASS_WORK && uniforms > 8) { 1097 mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8)); 1098 } else if (spill_node == -1) { 1099 fprintf(stderr, "ERROR: Failed to choose spill node\n"); 1100 lcra_free(l); 1101 return; 1102 } else { 1103 mir_spill_register(ctx, spill_node, l->spill_class, &spill_count); 1104 } 1105 } 1106 1107 mir_squeeze_index(ctx); 1108 mir_invalidate_liveness(ctx); 1109 1110 if (l) { 1111 lcra_free(l); 1112 l = NULL; 1113 } 1114 1115 l = allocate_registers(ctx, &spilled); 1116 } while(spilled && ((iter_count--) > 0)); 1117 1118 if (iter_count <= 0) { 1119 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); 1120 assert(0); 1121 } 1122 1123 /* Report spilling information. spill_count is in 128-bit slots (vec4 x 1124 * fp32), but tls_size is in bytes, so multiply by 16 */ 1125 1126 ctx->info->tls_size = spill_count * 16; 1127 1128 install_registers(ctx, l); 1129 1130 lcra_free(l); 1131} 1132