101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2014 Broadcom 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 
2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg#include "util/ralloc.h" 2501e04c3fSmrg#include "util/register_allocate.h" 2601e04c3fSmrg#include "common/v3d_device_info.h" 2701e04c3fSmrg#include "v3d_compiler.h" 2801e04c3fSmrg 2901e04c3fSmrg#define QPU_R(i) { .magic = false, .index = i } 3001e04c3fSmrg 3101e04c3fSmrg#define ACC_INDEX 0 32ed98bd31Smaya#define ACC_COUNT 6 3301e04c3fSmrg#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) 3401e04c3fSmrg#define PHYS_COUNT 64 3501e04c3fSmrg 36ed98bd31Smayastatic inline bool 377ec681f3Smrgqinst_writes_tmu(const struct v3d_device_info *devinfo, 387ec681f3Smrg struct qinst *inst) 39ed98bd31Smaya{ 40ed98bd31Smaya return (inst->dst.file == QFILE_MAGIC && 417ec681f3Smrg v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) || 427ec681f3Smrg inst->qpu.sig.wrtmuc; 43ed98bd31Smaya} 44ed98bd31Smaya 4501e04c3fSmrgstatic bool 467ec681f3Smrgis_end_of_tmu_sequence(const struct v3d_device_info *devinfo, 477ec681f3Smrg struct qinst *inst, struct qblock *block) 4801e04c3fSmrg{ 497ec681f3Smrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 507ec681f3Smrg inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { 517ec681f3Smrg return true; 527ec681f3Smrg } 537ec681f3Smrg 547ec681f3Smrg if (!inst->qpu.sig.ldtmu) 557ec681f3Smrg return false; 567ec681f3Smrg 57ed98bd31Smaya list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, 5801e04c3fSmrg &block->instructions, link) { 59ed98bd31Smaya if (scan_inst->qpu.sig.ldtmu) 6001e04c3fSmrg return false; 617ec681f3Smrg 627ec681f3Smrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 637ec681f3Smrg inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { 647ec681f3Smrg return true; 657ec681f3Smrg } 667ec681f3Smrg 677ec681f3Smrg if (qinst_writes_tmu(devinfo, scan_inst)) 6801e04c3fSmrg return true; 6901e04c3fSmrg } 7001e04c3fSmrg 7101e04c3fSmrg return true; 7201e04c3fSmrg} 7301e04c3fSmrg 74ed98bd31Smayastatic bool 75ed98bd31Smayavir_is_mov_uniform(struct v3d_compile *c, int temp) 76ed98bd31Smaya{ 77ed98bd31Smaya struct qinst *def = 
c->defs[temp]; 78ed98bd31Smaya 79ed98bd31Smaya return def && def->qpu.sig.ldunif; 80ed98bd31Smaya} 81ed98bd31Smaya 8201e04c3fSmrgstatic int 8301e04c3fSmrgv3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, 8401e04c3fSmrg uint32_t *temp_to_node) 8501e04c3fSmrg{ 86ed98bd31Smaya const float tmu_scale = 5; 8701e04c3fSmrg float block_scale = 1.0; 8801e04c3fSmrg float spill_costs[c->num_temps]; 8901e04c3fSmrg bool in_tmu_operation = false; 9001e04c3fSmrg bool started_last_seg = false; 9101e04c3fSmrg 9201e04c3fSmrg for (unsigned i = 0; i < c->num_temps; i++) 9301e04c3fSmrg spill_costs[i] = 0.0; 9401e04c3fSmrg 9501e04c3fSmrg /* XXX: Scale the cost up when inside of a loop. */ 9601e04c3fSmrg vir_for_each_block(block, c) { 9701e04c3fSmrg vir_for_each_inst(inst, block) { 987ec681f3Smrg /* We can't insert new thread switches after 997ec681f3Smrg * starting output writes. 10001e04c3fSmrg */ 10101e04c3fSmrg bool no_spilling = 1027ec681f3Smrg c->threads > 1 && started_last_seg; 10301e04c3fSmrg 1047ec681f3Smrg /* Discourage spilling of TMU operations */ 10501e04c3fSmrg for (int i = 0; i < vir_get_nsrc(inst); i++) { 10601e04c3fSmrg if (inst->src[i].file != QFILE_TEMP) 10701e04c3fSmrg continue; 10801e04c3fSmrg 10901e04c3fSmrg int temp = inst->src[i].index; 110ed98bd31Smaya if (vir_is_mov_uniform(c, temp)) { 11101e04c3fSmrg spill_costs[temp] += block_scale; 112ed98bd31Smaya } else if (!no_spilling) { 1137ec681f3Smrg float tmu_op_scale = in_tmu_operation ? 1147ec681f3Smrg 3.0 : 1.0; 115ed98bd31Smaya spill_costs[temp] += (block_scale * 1167ec681f3Smrg tmu_scale * 1177ec681f3Smrg tmu_op_scale); 118ed98bd31Smaya } else { 119ed98bd31Smaya BITSET_CLEAR(c->spillable, temp); 12001e04c3fSmrg } 12101e04c3fSmrg } 12201e04c3fSmrg 12301e04c3fSmrg if (inst->dst.file == QFILE_TEMP) { 12401e04c3fSmrg int temp = inst->dst.index; 12501e04c3fSmrg 126ed98bd31Smaya if (vir_is_mov_uniform(c, temp)) { 127ed98bd31Smaya /* We just rematerialize the unform 128ed98bd31Smaya * later. 
129ed98bd31Smaya */ 130ed98bd31Smaya } else if (!no_spilling) { 131ed98bd31Smaya spill_costs[temp] += (block_scale * 132ed98bd31Smaya tmu_scale); 13301e04c3fSmrg } else { 134ed98bd31Smaya BITSET_CLEAR(c->spillable, temp); 13501e04c3fSmrg } 13601e04c3fSmrg } 13701e04c3fSmrg 13801e04c3fSmrg /* Refuse to spill a ldvary's dst, because that means 13901e04c3fSmrg * that ldvary's r5 would end up being used across a 14001e04c3fSmrg * thrsw. 14101e04c3fSmrg */ 14201e04c3fSmrg if (inst->qpu.sig.ldvary) { 14301e04c3fSmrg assert(inst->dst.file == QFILE_TEMP); 14401e04c3fSmrg BITSET_CLEAR(c->spillable, inst->dst.index); 14501e04c3fSmrg } 14601e04c3fSmrg 14701e04c3fSmrg if (inst->is_last_thrsw) 14801e04c3fSmrg started_last_seg = true; 14901e04c3fSmrg 15001e04c3fSmrg if (v3d_qpu_writes_vpm(&inst->qpu) || 15101e04c3fSmrg v3d_qpu_uses_tlb(&inst->qpu)) 15201e04c3fSmrg started_last_seg = true; 15301e04c3fSmrg 15401e04c3fSmrg /* Track when we're in between a TMU setup and the 1557ec681f3Smrg * final LDTMU or TMUWT from that TMU setup. We 1567ec681f3Smrg * penalize spills during that time. 15701e04c3fSmrg */ 1587ec681f3Smrg if (is_end_of_tmu_sequence(c->devinfo, inst, block)) 15901e04c3fSmrg in_tmu_operation = false; 16001e04c3fSmrg 1617ec681f3Smrg if (qinst_writes_tmu(c->devinfo, inst)) 16201e04c3fSmrg in_tmu_operation = true; 16301e04c3fSmrg } 16401e04c3fSmrg } 16501e04c3fSmrg 16601e04c3fSmrg for (unsigned i = 0; i < c->num_temps; i++) { 16701e04c3fSmrg if (BITSET_TEST(c->spillable, i)) 1687ec681f3Smrg ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]); 16901e04c3fSmrg } 17001e04c3fSmrg 17101e04c3fSmrg return ra_get_best_spill_node(g); 17201e04c3fSmrg} 17301e04c3fSmrg 17401e04c3fSmrg/* The spill offset for this thread takes a bit of setup, so do it once at 17501e04c3fSmrg * program start. 
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        /* Setting up the spill base is done in the entry block; so change
         * both the current block to emit and the cursor.
         */
        struct qblock *current_block = c->cur_block;
        c->cur_block = vir_entry_block(c);
        c->cursor = vir_before_block(c->cur_block);

        /* Remember how many temps existed before we emit the setup code, so
         * we can mark only the new ones as non-spillable below.
         */
        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines.  We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        /* spill_base = per-thread offset + per-channel offset + global spill
         * BO offset from the driver.
         */
        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        /* Restore the current block. */
        c->cur_block = current_block;
        c->cursor = vir_after_block(c->cur_block);
}

/* Emits the TMUA address write for a spill/fill of the scratch slot at
 * \p spill_offset, addressed relative to c->spill_base.
 */
static struct qinst *
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
                            c->spill_base, vir_uniform_ui(c, spill_offset));
}


/* Rewrites \p inst to write a fresh temp and emits, after \p position, the
 * TMUD/TMUA writes (predicated with inst's original condition), a thread
 * switch and a TMUWT that store that temp to the scratch slot.
 */
static void
v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
                   struct qinst *position, uint32_t spill_offset)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);

        c->cursor = vir_after_inst(position);
        inst->dst = vir_get_temp(c);
        enum v3d_qpu_cond cond = vir_get_cond(inst);
        struct qinst *tmp =
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             inst->dst);
        /* Propagate the condition to both the data and address writes so a
         * predicated def only spills when it actually executed.
         */
        tmp->qpu.flags.mc = cond;
        tmp = v3d_emit_spill_tmua(c, spill_offset);
        tmp->qpu.flags.ac = cond;
        vir_emit_thrsw(c);
        vir_TMUWT(c);
        c->spills++;
        c->tmu_dirty_rcl = true;
}

/* Spills \p spill_temp: every def is stored to scratch (or deleted, for
 * rematerializable uniforms) and every use is preceded by a fill (or a fresh
 * ldunif).  Spills/fills are never inserted in the middle of a TMU sequence;
 * they are postponed or hoisted around it instead.
 */
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        c->spill_count++;

        /* Uniforms are rematerialized rather than stored to scratch. */
        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                /* First scratch spill in this program: emit the spill-base
                 * setup in the entry block.
                 */
                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(last_thrsw && last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        /* We must disable the ldunif optimization if we are spilling uniforms */
        bool had_disable_ldunif_opt = c->disable_ldunif_opt;
        c->disable_ldunif_opt = true;

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        /* Track when we're in between a TMU setup and the final
                         * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
                         * temps during that time, because that involves inserting a
                         * new TMU setup/LDTMU sequence, so we postpone the spill or
                         * move the fill up to not intrude in the middle of the TMU
                         * sequence.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           inst, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence &&
                            qinst_writes_tmu(c->devinfo, inst)) {
                                start_of_tmu_sequence = inst;
                        }

                        /* fills */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (is_uniform) {
                                        /* Rematerialize: emit a fresh ldunif
                                         * of the original uniform contents.
                                         */
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                } else {
                                        /* If we have a postponed spill, we don't need
                                         * a fill as the temp would not have been
                                         * spilled yet.
                                         */
                                        if (postponed_spill)
                                                continue;
                                        /* Hoist the fill above the TMU
                                         * sequence we are inside of.
                                         */
                                        if (start_of_tmu_sequence)
                                                c->cursor = vir_before_inst(start_of_tmu_sequence);

                                        v3d_emit_spill_tmua(c, spill_offset);
                                        vir_emit_thrsw(c);
                                        inst->src[i] = vir_LDTMU(c);
                                        c->fills++;
                                }
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (is_uniform) {
                                        /* The def is dead: every use was
                                         * replaced by a fresh ldunif above.
                                         */
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        /* Inside a TMU sequence the store is
                                         * postponed until the sequence ends.
                                         */
                                        if (start_of_tmu_sequence)
                                                postponed_spill = inst;
                                        else
                                                v3d_emit_tmu_spill(c, inst, inst,
                                                                   spill_offset);
                                }
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions.  There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->disable_ldunif_opt = had_disable_ldunif_opt;
}

/* Maps a (priority-sorted) RA node back to its temp. */
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;      /* liveness length of the temp */
};

/* Round-robin state and node->temp map handed to the RA selection callback. */
struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
        struct node_to_temp_map *map;
};

/* Choosing accumulators improves chances of merging QPU instructions
 * due to these merges requiring that at most 2 rf registers are used
 * by the add and mul instructions.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* Favor accumulators if we have less than this number of physical
         * registers.  Accumulators have more restrictions (like being
         * invalidated through thrsw), so running out of physical registers
         * even if we have accumulators available can lead to register
         * allocation failures.
392ed98bd31Smaya */ 3937ec681f3Smrg static const int available_rf_threshold = 5; 3947ec681f3Smrg int available_rf = 0 ; 3957ec681f3Smrg for (int i = 0; i < PHYS_COUNT; i++) { 3967ec681f3Smrg if (BITSET_TEST(regs, PHYS_INDEX + i)) 3977ec681f3Smrg available_rf++; 3987ec681f3Smrg if (available_rf >= available_rf_threshold) 3997ec681f3Smrg break; 4007ec681f3Smrg } 4017ec681f3Smrg if (available_rf < available_rf_threshold) 4027ec681f3Smrg return true; 4037ec681f3Smrg 4047ec681f3Smrg /* Favor accumulators for short-lived temps (our priority represents 4057ec681f3Smrg * liveness), to prevent long-lived temps from grabbing accumulators 4067ec681f3Smrg * and preventing follow-up instructions from using them, potentially 4077ec681f3Smrg * leading to large portions of the shader being unable to use 4087ec681f3Smrg * accumulators and therefore merge instructions successfully. 4097ec681f3Smrg */ 4107ec681f3Smrg static const int priority_threshold = 20; 4117ec681f3Smrg if (priority <= priority_threshold) 4127ec681f3Smrg return true; 41301e04c3fSmrg 4147ec681f3Smrg return false; 4157ec681f3Smrg} 4167ec681f3Smrg 4177ec681f3Smrgstatic bool 4187ec681f3Smrgv3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, 4197ec681f3Smrg BITSET_WORD *regs, 4207ec681f3Smrg unsigned int *out) 4217ec681f3Smrg{ 4227ec681f3Smrg /* Round-robin through our accumulators to give post-RA instruction 4237ec681f3Smrg * selection more options. 
42401e04c3fSmrg */ 42501e04c3fSmrg for (int i = 0; i < ACC_COUNT; i++) { 42601e04c3fSmrg int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT; 42701e04c3fSmrg int acc = ACC_INDEX + acc_off; 42801e04c3fSmrg 42901e04c3fSmrg if (BITSET_TEST(regs, acc)) { 43001e04c3fSmrg v3d_ra->next_acc = acc_off + 1; 4317ec681f3Smrg *out = acc; 4327ec681f3Smrg return true; 43301e04c3fSmrg } 43401e04c3fSmrg } 43501e04c3fSmrg 4367ec681f3Smrg return false; 4377ec681f3Smrg} 4387ec681f3Smrg 4397ec681f3Smrgstatic bool 4407ec681f3Smrgv3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, 4417ec681f3Smrg BITSET_WORD *regs, 4427ec681f3Smrg unsigned int *out) 4437ec681f3Smrg{ 44401e04c3fSmrg for (int i = 0; i < PHYS_COUNT; i++) { 44501e04c3fSmrg int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; 44601e04c3fSmrg int phys = PHYS_INDEX + phys_off; 44701e04c3fSmrg 44801e04c3fSmrg if (BITSET_TEST(regs, phys)) { 44901e04c3fSmrg v3d_ra->next_phys = phys_off + 1; 4507ec681f3Smrg *out = phys; 4517ec681f3Smrg return true; 45201e04c3fSmrg } 45301e04c3fSmrg } 45401e04c3fSmrg 4557ec681f3Smrg return false; 4567ec681f3Smrg} 4577ec681f3Smrg 4587ec681f3Smrgstatic unsigned int 4597ec681f3Smrgv3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) 4607ec681f3Smrg{ 4617ec681f3Smrg struct v3d_ra_select_callback_data *v3d_ra = data; 4627ec681f3Smrg int r5 = ACC_INDEX + 5; 4637ec681f3Smrg 4647ec681f3Smrg /* Choose r5 for our ldunifs if possible (nobody else can load to that 4657ec681f3Smrg * reg, and it keeps the QPU cond field free from being occupied by 4667ec681f3Smrg * ldunifrf). 
4677ec681f3Smrg */ 4687ec681f3Smrg if (BITSET_TEST(regs, r5)) 4697ec681f3Smrg return r5; 4707ec681f3Smrg 4717ec681f3Smrg unsigned int reg; 4727ec681f3Smrg if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) && 4737ec681f3Smrg v3d_ra_select_accum(v3d_ra, regs, ®)) { 4747ec681f3Smrg return reg; 4757ec681f3Smrg } 4767ec681f3Smrg 4777ec681f3Smrg if (v3d_ra_select_rf(v3d_ra, regs, ®)) 4787ec681f3Smrg return reg; 4797ec681f3Smrg 4807ec681f3Smrg /* If we ran out of physical registers try to assign an accumulator 4817ec681f3Smrg * if we didn't favor that option earlier. 4827ec681f3Smrg */ 4837ec681f3Smrg if (v3d_ra_select_accum(v3d_ra, regs, ®)) 4847ec681f3Smrg return reg; 4857ec681f3Smrg 48601e04c3fSmrg unreachable("RA must pass us at least one possible reg."); 48701e04c3fSmrg} 48801e04c3fSmrg 48901e04c3fSmrgbool 49001e04c3fSmrgvir_init_reg_sets(struct v3d_compiler *compiler) 49101e04c3fSmrg{ 49201e04c3fSmrg /* Allocate up to 3 regfile classes, for the ways the physical 49301e04c3fSmrg * register file can be divided up for fragment shader threading. 49401e04c3fSmrg */ 49501e04c3fSmrg int max_thread_index = (compiler->devinfo->ver >= 40 ? 
2 : 3); 49601e04c3fSmrg 49701e04c3fSmrg compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, 4987ec681f3Smrg false); 49901e04c3fSmrg if (!compiler->regs) 50001e04c3fSmrg return false; 50101e04c3fSmrg 50201e04c3fSmrg for (int threads = 0; threads < max_thread_index; threads++) { 503ed98bd31Smaya compiler->reg_class_any[threads] = 5047ec681f3Smrg ra_alloc_contig_reg_class(compiler->regs, 1); 505ed98bd31Smaya compiler->reg_class_r5[threads] = 5067ec681f3Smrg ra_alloc_contig_reg_class(compiler->regs, 1); 50701e04c3fSmrg compiler->reg_class_phys_or_acc[threads] = 5087ec681f3Smrg ra_alloc_contig_reg_class(compiler->regs, 1); 50901e04c3fSmrg compiler->reg_class_phys[threads] = 5107ec681f3Smrg ra_alloc_contig_reg_class(compiler->regs, 1); 51101e04c3fSmrg 51201e04c3fSmrg for (int i = PHYS_INDEX; 51301e04c3fSmrg i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { 5147ec681f3Smrg ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); 5157ec681f3Smrg ra_class_add_reg(compiler->reg_class_phys[threads], i); 5167ec681f3Smrg ra_class_add_reg(compiler->reg_class_any[threads], i); 51701e04c3fSmrg } 51801e04c3fSmrg 519ed98bd31Smaya for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { 5207ec681f3Smrg ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); 5217ec681f3Smrg ra_class_add_reg(compiler->reg_class_any[threads], i); 52201e04c3fSmrg } 523ed98bd31Smaya /* r5 can only store a single 32-bit value, so not much can 524ed98bd31Smaya * use it. 
525ed98bd31Smaya */ 5267ec681f3Smrg ra_class_add_reg(compiler->reg_class_r5[threads], 527ed98bd31Smaya ACC_INDEX + 5); 5287ec681f3Smrg ra_class_add_reg(compiler->reg_class_any[threads], 529ed98bd31Smaya ACC_INDEX + 5); 53001e04c3fSmrg } 53101e04c3fSmrg 53201e04c3fSmrg ra_set_finalize(compiler->regs, NULL); 53301e04c3fSmrg 53401e04c3fSmrg return true; 53501e04c3fSmrg} 53601e04c3fSmrg 53701e04c3fSmrgstatic int 53801e04c3fSmrgnode_to_temp_priority(const void *in_a, const void *in_b) 53901e04c3fSmrg{ 54001e04c3fSmrg const struct node_to_temp_map *a = in_a; 54101e04c3fSmrg const struct node_to_temp_map *b = in_b; 54201e04c3fSmrg 54301e04c3fSmrg return a->priority - b->priority; 54401e04c3fSmrg} 54501e04c3fSmrg 5467ec681f3Smrg/** 5477ec681f3Smrg * Computes the number of registers to spill in a batch after a register 5487ec681f3Smrg * allocation failure. 5497ec681f3Smrg */ 5507ec681f3Smrgstatic uint32_t 5517ec681f3Smrgget_spill_batch_size(struct v3d_compile *c) 5527ec681f3Smrg{ 5537ec681f3Smrg /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of 5547ec681f3Smrg * over-spilling if the program requires few spills to compile. 5557ec681f3Smrg */ 5567ec681f3Smrg if (c->spill_count < 10) 5577ec681f3Smrg return 1; 5587ec681f3Smrg 5597ec681f3Smrg /* If we have to spill more than that we assume performance is not going to 5607ec681f3Smrg * be great and we shift focus to batching spills to cut down compile 5617ec681f3Smrg * time at the expense of over-spilling. 5627ec681f3Smrg */ 5637ec681f3Smrg return 20; 5647ec681f3Smrg} 5657ec681f3Smrg 5667ec681f3Smrg/* Don't emit spills using the TMU until we've dropped thread count first. We, 5677ec681f3Smrg * may also disable spilling when certain optimizations that are known to 5687ec681f3Smrg * increase register pressure are active so we favor recompiling with 5697ec681f3Smrg * optimizations disabled instead of spilling. 
 */
static inline bool
tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
{
        return thread_index == 0 && c->tmu_spilling_allowed;
}

/* Per-temp class bits, narrowed as instructions impose restrictions.
 * NOTE(review): CLASS_BIT_R5 is 1 << 4, not 1 << 2 — presumably kept for
 * compatibility with an older bit layout; verify before renumbering.
 */
#define CLASS_BIT_PHYS			(1 << 0)
#define CLASS_BIT_ACC			(1 << 1)
#define CLASS_BIT_R5			(1 << 4)
#define CLASS_BITS_ANY			(CLASS_BIT_PHYS | \
                                         CLASS_BIT_ACC | \
                                         CLASS_BIT_R5)

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 *
 * On failure returns NULL; *spilled tells the caller whether a register was
 * spilled (so it should call back in) or allocation failed outright.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        /* VLAs sized by num_temps; assumes shaders keep temp counts small
         * enough for stack allocation — TODO confirm there is an upper bound.
         */
        uint32_t UNUSED start_num_temps = c->num_temps;
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
                .map = map,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                         c->num_temps +
                                                         ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches.  We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
                acc_nodes[i] = c->num_temps + i;
                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
        }

        /* Sort temps by liveness length (priority) so the select callback
         * can favor accumulators for short-lived temps.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers.  We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4 across
                 * it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }
                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out)
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

        /* Map each temp's final class bits onto one of the prebuilt RA
         * classes for this thread count.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

        /* Add interference between every pair of temps whose live ranges
         * overlap (O(n^2) over temps).
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         * (With force_register_spills == 0 this branch never triggers.)
         */
        int force_register_spills = 0;
        if (c->spill_size <
            V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                const uint32_t spill_batch_size = get_spill_batch_size(c);

                for (uint32_t i = 0; i < spill_batch_size; i++) {
                        int node = v3d_choose_spill_node(c, g, temp_to_node);
                        if (node == -1)
                               break;

                        /* TMU spills inject thrsw signals that invalidate
                         * accumulators, so we can't batch them.
                         */
                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
                        if (i > 0 && !is_uniform)
                                break;

                        if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
                                v3d_spill_reg(c, map[node].temp);

                                /* Ask the outer loop to call back in. */
                                *spilled = true;

                                /* See comment above about batching TMU spills.
                                 */
                                if (!is_uniform) {
                                        assert(i == 0);
                                        break;
                                }
                        } else {
                                break;
                        }
                }

                ralloc_free(g);
                return NULL;
        }

        /* Ensure that we are not accessing temp_to_node out of bounds. We
         * should never trigger this assertion because `c->num_temps` only
         * grows when we spill, in which case we return early and don't get
         * here.
         */
        assert(start_num_temps == c->num_temps);
        /* NOTE(review): calloc result is not NULL-checked before use below;
         * callers appear to treat NULL as "retry after spill", so an OOM
         * check would need a distinct signal — confirm before changing.
         */
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        /* Translate RA register indices back into QPU encodings: low indices
         * are the magic accumulators r0-r5, the rest are RF registers.
         */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

        ralloc_free(g);

        return temp_registers;
}