/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define QPU_R(i) { .magic = false, .index = i }

#define ACC_INDEX     0
#define ACC_COUNT     6
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64

static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
                 struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
                inst->qpu.sig.wrtmuc;
}

static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
                       struct qinst *inst, struct qblock *block)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        if (!inst->qpu.sig.ldtmu)
                return false;

        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                if (scan_inst->qpu.sig.ldtmu)
                        return false;

                if (scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                        return true;
                }

                if (qinst_writes_tmu(devinfo, scan_inst))
                        return true;
        }

        return true;
}

static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}
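
/* A rough feel for the spill-cost weights used below, with the nominal
 * block_scale of 1.0: each use or def of an ordinary temp adds tmu_scale
 * (5.0) to its cost, a use inside an open TMU sequence adds 15.0
 * (tmu_scale * tmu_op_scale), and a use of a rematerializable ldunif temp
 * adds only 1.0, so uniforms end up as the cheapest spill candidates.
 */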

static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                      uint32_t *temp_to_node)
{
        const float tmu_scale = 5;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert new thread switches after
                         * starting output writes.
                         */
                        bool no_spilling =
                                c->threads > 1 && started_last_seg;

                        /* Discourage spilling of TMU operations. */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                if (vir_is_mov_uniform(c, temp)) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        float tmu_op_scale = in_tmu_operation ?
                                                3.0 : 1.0;
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale *
                                                              tmu_op_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;

                                if (vir_is_mov_uniform(c, temp)) {
                                        /* We just rematerialize the uniform
                                         * later.
                                         */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup. We
                         * penalize spills during that time.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block))
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(c->devinfo, inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                if (BITSET_TEST(c->spillable, i))
                        ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
        }

        return ra_get_best_spill_node(g);
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        /* Setting up the spill base is done in the entry block; so change
         * both the current block to emit and the cursor.
         */
        struct qblock *current_block = c->cur_block;
        c->cur_block = vir_entry_block(c);
        c->cursor = vir_before_block(c->cur_block);

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines. We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        /* Restore the current block. */
        c->cur_block = current_block;
        c->cursor = vir_after_block(c->cur_block);
}
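
/* For reference, the base computed above gives each channel of each thread
 * its own slot in the scratch BO:
 *
 *   spill_base = TIDX * spill_size_per_thread + EIDX * 4 + spill_offset
 *
 * e.g. thread index 2, element 5 lands 2 * spill_size_per_thread + 20 bytes
 * past the driver-provided spill offset.
 */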

static struct qinst *
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
                            c->spill_base, vir_uniform_ui(c, spill_offset));
}

static void
v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
                   struct qinst *position, uint32_t spill_offset)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);

        c->cursor = vir_after_inst(position);
        inst->dst = vir_get_temp(c);
        enum v3d_qpu_cond cond = vir_get_cond(inst);
        struct qinst *tmp =
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             inst->dst);
        tmp->qpu.flags.mc = cond;
        tmp = v3d_emit_spill_tmua(c, spill_offset);
        tmp->qpu.flags.ac = cond;
        vir_emit_thrsw(c);
        vir_TMUWT(c);
        c->spills++;
        c->tmu_dirty_rcl = true;
}

static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        c->spill_count++;

        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(last_thrsw && last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        /* We must disable the ldunif optimization if we are spilling uniforms. */
        bool had_disable_ldunif_opt = c->disable_ldunif_opt;
        c->disable_ldunif_opt = true;

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup. We can't
                         * spill/fill any temps during that time, because that
                         * involves inserting a new TMU setup/LDTMU sequence,
                         * so we postpone the spill or move the fill up to not
                         * intrude in the middle of the TMU sequence.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           inst, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence &&
                            qinst_writes_tmu(c->devinfo, inst)) {
                                start_of_tmu_sequence = inst;
                        }

                        /* fills */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (is_uniform) {
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                } else {
                                        /* If we have a postponed spill, we
                                         * don't need a fill as the temp would
                                         * not have been spilled yet.
                                         */
                                        if (postponed_spill)
                                                continue;

                                        if (start_of_tmu_sequence)
                                                c->cursor = vir_before_inst(start_of_tmu_sequence);

                                        v3d_emit_spill_tmua(c, spill_offset);
                                        vir_emit_thrsw(c);
                                        inst->src[i] = vir_LDTMU(c);
                                        c->fills++;
                                }
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (is_uniform) {
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        if (start_of_tmu_sequence)
                                                postponed_spill = inst;
                                        else
                                                v3d_emit_tmu_spill(c, inst, inst,
                                                                   spill_offset);
                                }
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions. There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->disable_ldunif_opt = had_disable_ldunif_opt;
}
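
/* To summarize the rewrite above: ldunif temps never touch scratch memory,
 * their def is simply removed and each use reloads the uniform in place.
 * Every other spilled temp gets a TMU store after each def (TMUD write, TMUA
 * write, thrsw, TMUWT) and a TMU load before each use (TMUA write, thrsw,
 * LDTMU), with both kept outside of any TMU sequence already in flight.
 */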

struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
        struct node_to_temp_map *map;
};

/* Choosing accumulators improves chances of merging QPU instructions
 * due to these merges requiring that at most 2 rf registers are used
 * by the add and mul instructions.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* Favor accumulators if we have less than this number of physical
         * registers. Accumulators have more restrictions (like being
         * invalidated through thrsw), so running out of physical registers
         * even if we have accumulators available can lead to register
         * allocation failures.
         */
        static const int available_rf_threshold = 5;
        int available_rf = 0;
        for (int i = 0; i < PHYS_COUNT; i++) {
                if (BITSET_TEST(regs, PHYS_INDEX + i))
                        available_rf++;
                if (available_rf >= available_rf_threshold)
                        break;
        }
        if (available_rf < available_rf_threshold)
                return true;

        /* Favor accumulators for short-lived temps (our priority represents
         * liveness), to prevent long-lived temps from grabbing accumulators
         * and preventing follow-up instructions from using them, potentially
         * leading to large portions of the shader being unable to use
         * accumulators and therefore merge instructions successfully.
         */
        static const int priority_threshold = 20;
        if (priority <= priority_threshold)
                return true;

        return false;
}

static bool
v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
                    BITSET_WORD *regs,
                    unsigned int *out)
{
        /* Round-robin through our accumulators to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        *out = acc;
                        return true;
                }
        }

        return false;
}

static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
                 BITSET_WORD *regs,
                 unsigned int *out)
{
        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        *out = phys;
                        return true;
                }
        }

        return false;
}
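
/* Selection priority in the callback below: r5 first (so ldunif results stay
 * out of everything else's way), then an accumulator when
 * v3d_ra_favor_accum() says so, then the physical register file, and finally
 * an accumulator as a last resort before giving up.
 */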

static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;
        int r5 = ACC_INDEX + 5;

        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        if (BITSET_TEST(regs, r5))
                return r5;

        unsigned int reg;
        if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
            v3d_ra_select_accum(v3d_ra, regs, &reg)) {
                return reg;
        }

        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
                return reg;

        /* If we ran out of physical registers try to assign an accumulator
         * if we didn't favor that option earlier.
         */
        if (v3d_ra_select_accum(v3d_ra, regs, &reg))
                return reg;

        unreachable("RA must pass us at least one possible reg.");
}

bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          false);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_r5[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}
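
/* With the setup above, class index "threads" exposes PHYS_COUNT >> threads
 * registers of the physical file: 64 rf registers at index 0, 32 at index 1,
 * and 16 at index 2 (the third class only exists on pre-4.x hardware, where
 * max_thread_index is 3).
 */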

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

/**
 * Computes the number of registers to spill in a batch after a register
 * allocation failure.
 */
static uint32_t
get_spill_batch_size(struct v3d_compile *c)
{
        /* Allow up to 10 spills in batches of 1 in any case to avoid any
         * chance of over-spilling if the program requires few spills to
         * compile.
         */
        if (c->spill_count < 10)
                return 1;

        /* If we have to spill more than that we assume performance is not
         * going to be great and we shift focus to batching spills to cut
         * down compile time at the expense of over-spilling.
         */
        return 20;
}

/* Don't emit spills using the TMU until we've dropped thread count first. We
 * may also disable spilling when certain optimizations that are known to
 * increase register pressure are active, so we favor recompiling with
 * optimizations disabled instead of spilling.
 */
static inline bool
tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
{
        return thread_index == 0 && c->tmu_spilling_allowed;
}

#define CLASS_BIT_PHYS                  (1 << 0)
#define CLASS_BIT_ACC                   (1 << 1)
#define CLASS_BIT_R5                    (1 << 4)
#define CLASS_BITS_ANY                  (CLASS_BIT_PHYS | \
                                         CLASS_BIT_ACC | \
                                         CLASS_BIT_R5)
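
/* In v3d_register_allocate() below, every temp starts with CLASS_BITS_ANY in
 * class_bits[] and loses bits as the instructions are scanned: SFU and LDVPM
 * results keep only CLASS_BIT_PHYS, non-ldunif writes lose CLASS_BIT_R5, and
 * temps live across a thrsw lose the accumulator bits. Whatever survives
 * selects one of the classes built in vir_init_reg_sets().
 */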

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        uint32_t UNUSED start_num_temps = c->num_temps;
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
                .map = map,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                         c->num_temps +
                                                         ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches. We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
                acc_nodes[i] = c->num_temps + i;
                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers. We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4
                 * across it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }
                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out).
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }
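
        /* Note on the test above: live ranges effectively behave as half-open
         * [temp_start, temp_end) intervals, so two temps interfere unless one
         * range ends at or before the point where the other begins.
         */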

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         */
        int force_register_spills = 0;
        if (c->spill_size <
            V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                const uint32_t spill_batch_size = get_spill_batch_size(c);

                for (uint32_t i = 0; i < spill_batch_size; i++) {
                        int node = v3d_choose_spill_node(c, g, temp_to_node);
                        if (node == -1)
                                break;

                        /* TMU spills inject thrsw signals that invalidate
                         * accumulators, so we can't batch them.
                         */
                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
                        if (i > 0 && !is_uniform)
                                break;

                        if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
                                v3d_spill_reg(c, map[node].temp);

                                /* Ask the outer loop to call back in. */
                                *spilled = true;

                                /* See comment above about batching TMU
                                 * spills.
                                 */
                                if (!is_uniform) {
                                        assert(i == 0);
                                        break;
                                }
                        } else {
                                break;
                        }
                }

                ralloc_free(g);
                return NULL;
        }

        /* Ensure that we are not accessing temp_to_node out of bounds. We
         * should never trigger this assertion because `c->num_temps` only
         * grows when we spill, in which case we return early and don't get
         * here.
         */
        assert(start_num_temps == c->num_temps);

        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

        ralloc_free(g);

        return temp_registers;
}