/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define QPU_R(i) { .magic = false, .index = i }

#define ACC_INDEX     0
#define ACC_COUNT     6
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64

static inline bool
qinst_writes_tmu(struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(inst->dst.index));
}

static bool
is_last_ldtmu(struct qinst *inst, struct qblock *block)
{
        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                if (scan_inst->qpu.sig.ldtmu)
                        return false;
                if (qinst_writes_tmu(scan_inst))
                        return true;
        }

        return true;
}

static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}

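/* Computes a spill cost for every temp and asks the register allocator for
 * the best spill candidate.
 *
 * Uses of a ldunif result only cost block_scale, since a spilled uniform is
 * just rematerialized with another ldunif at each use; any other def or use
 * has to round-trip through the TMU and is scaled up by tmu_scale.
 * Non-uniform temps accessed somewhere we couldn't legally insert the TMU
 * spill/fill sequence (inside a TMU operation, or in a multi-threaded
 * shader's final thread segment) are removed from c->spillable instead.
 */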
static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                      uint32_t *temp_to_node)
{
        const float tmu_scale = 5;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert a new TMU operation while currently
                         * in a TMU operation, and we can't insert new thread
                         * switches after starting output writes.
                         */
                        bool no_spilling =
                                (in_tmu_operation ||
                                 (c->threads > 1 && started_last_seg));

                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                if (vir_is_mov_uniform(c, temp)) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;

                                if (vir_is_mov_uniform(c, temp)) {
                                        /* We just rematerialize the uniform
                                         * later.
                                         */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup.  We can't
                         * spill/fill any temps during that time, because that
                         * involves inserting a new TMU setup/LDTMU sequence.
                         */
                        if (inst->qpu.sig.ldtmu &&
                            is_last_ldtmu(inst, block))
                                in_tmu_operation = false;

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                if (BITSET_TEST(c->spillable, i))
                        ra_set_node_spill_cost(g, node, spill_costs[i]);
        }

        return ra_get_best_spill_node(g);
}

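/* Scratch addressing used by the spill code below: every spilled temp gets a
 * slot of V3D_CHANNELS * sizeof(uint32_t) bytes, and the address each channel
 * reads or writes works out to
 *
 *      spill_base + spill_offset
 *        = spill_offset                        (the temp's slot)
 *        + TIDX * per-thread spill size        (QUNIFORM_SPILL_SIZE_PER_THREAD)
 *        + EIDX * 4                            (this channel's word)
 *        + the driver's scratch BO offset      (QUNIFORM_SPILL_OFFSET)
 */
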
/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        c->cursor = vir_before_block(vir_entry_block(c));

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines.  We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->cursor = vir_after_block(c->cur_block);
}

static void
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUA),
                     c->spill_base,
                     vir_uniform_ui(c, spill_offset));
}

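/* Rewrites every def and use of spill_temp.  For a ldunif-defined temp the
 * original def is deleted and each use just loads the uniform again; for
 * everything else each use becomes a TMU fill and each def becomes a TMU
 * spill:
 *
 *      fill:   ADD  tmua, spill_base, slot_offset ; thrsw ; ... = LDTMU
 *      spill:  MOV  tmud, temp
 *              ADD  tmua, spill_base, slot_offset ; thrsw ; TMUWT
 */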
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(!last_thrsw || last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        vir_for_each_inst_inorder_safe(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file != QFILE_TEMP ||
                            inst->src[i].index != spill_temp) {
                                continue;
                        }

                        c->cursor = vir_before_inst(inst);

                        if (is_uniform) {
                                struct qreg unif =
                                        vir_uniform(c,
                                                    c->uniform_contents[uniform_index],
                                                    c->uniform_data[uniform_index]);
                                inst->src[i] = unif;
                        } else {
                                v3d_emit_spill_tmua(c, spill_offset);
                                vir_emit_thrsw(c);
                                inst->src[i] = vir_LDTMU(c);
                                c->fills++;
                        }
                }

                if (inst->dst.file == QFILE_TEMP &&
                    inst->dst.index == spill_temp) {
                        if (is_uniform) {
                                c->cursor.link = NULL;
                                vir_remove_instruction(c, inst);
                        } else {
                                c->cursor = vir_after_inst(inst);

                                inst->dst.index = c->num_temps++;
                                vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
                                                        V3D_QPU_WADDR_TMUD),
                                             inst->dst);
                                v3d_emit_spill_tmua(c, spill_offset);
                                vir_emit_thrsw(c);
                                vir_TMUWT(c);
                                c->spills++;
                        }
                }

                /* If we didn't have a last-thrsw inserted by nir_to_vir and
                 * we've been inserting thrsws, then insert a new last_thrsw
                 * right before we start the vpm/tlb sequence for the last
                 * thread segment.
                 */
                if (!is_uniform && !last_thrsw && c->last_thrsw &&
                    (v3d_qpu_writes_vpm(&inst->qpu) ||
                     v3d_qpu_uses_tlb(&inst->qpu))) {
                        c->cursor = vir_before_inst(inst);
                        vir_emit_thrsw(c);

                        last_thrsw = c->last_thrsw;
                        last_thrsw->is_last_thrsw = true;
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        if (last_thrsw)
                c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions.  There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);
}

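/* Layout of the ra_set register numbers used below (and decoded back into
 * struct qpu_reg at the end of v3d_register_allocate()):
 *
 *      ACC_INDEX  .. ACC_INDEX + 5     accumulators r0-r5
 *      PHYS_INDEX .. PHYS_INDEX + 63   physical regfile rf0-rf63
 */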
struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
};

static unsigned int
v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;
        int r5 = ACC_INDEX + 5;

        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        if (BITSET_TEST(regs, r5))
                return r5;

        /* Choose an accumulator if possible (I think it's lower power than
         * phys regs), but round-robin through them to give post-RA
         * instruction selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }

        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        return phys;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

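/* One set of register classes is built per threading level: the classes for
 * higher thread counts only cover a fraction of the physical regfile
 * (PHYS_COUNT >> thread index), r0-r4 act as general accumulators, and r5
 * sits in its own class since it can only hold a single 32-bit value and
 * only a few signals (ldunif/ldvary) write it.
 */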
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          true);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_r5[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys[threads] =
                        ra_alloc_reg_class(compiler->regs);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

#define CLASS_BIT_PHYS                  (1 << 0)
#define CLASS_BIT_ACC                   (1 << 1)
#define CLASS_BIT_R5                    (1 << 4)
#define CLASS_BITS_ANY                  (CLASS_BIT_PHYS | \
                                         CLASS_BIT_ACC | \
                                         CLASS_BIT_R5)

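/* v3d_register_allocate() below either returns the temp -> QPU register
 * mapping, or returns NULL after spilling (with *spilled set, so the caller
 * should simply call it again), or returns NULL without spilling (so the
 * caller has to retry at a lower thread count).  Roughly, the expected
 * driver-side loop looks like the following sketch, where
 * v3d_lower_thread_count() is a made-up name used only for illustration:
 *
 *      while (!(temp_registers = v3d_register_allocate(c, &spilled))) {
 *              if (spilled)
 *                      continue;
 *              if (!v3d_lower_thread_count(c))
 *                      return compile_failure;
 *      }
 */
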
/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                         c->num_temps +
                                                         ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches.  We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
                acc_nodes[i] = c->num_temps + i;
                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers.  We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4
                 * across it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }
                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out)
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

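        /* The remaining class_bits pick the ra class for each temp:
         *
         *      PHYS only             -> reg_class_phys
         *      R5 only               -> reg_class_r5
         *      PHYS | ACC            -> reg_class_phys_or_acc
         *      PHYS | ACC | R5 (ANY) -> reg_class_any
         */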
        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         */
        int force_register_spills = 0;
        if (c->spill_size <
            V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);

                /* Don't emit spills using the TMU until we've dropped thread
                 * count first.
                 */
                if (node != -1 &&
                    (vir_is_mov_uniform(c, map[node].temp) ||
                     thread_index == 0)) {
                        v3d_spill_reg(c, map[node].temp);

                        /* Ask the outer loop to call back in. */
                        *spilled = true;
                }

                ralloc_free(g);
                return NULL;
        }

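        /* Allocation succeeded: translate the winning ra registers back into
         * struct qpu_reg (accumulators become the magic waddrs r0-r5,
         * everything else is an rf index).
         */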
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i]) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = V3D_QPU_WADDR_NOP;
                }
        }

        ralloc_free(g);

        return temp_registers;
}