101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2010 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg * 2301e04c3fSmrg * Authors: 2401e04c3fSmrg * Eric Anholt <eric@anholt.net> 2501e04c3fSmrg * 2601e04c3fSmrg */ 2701e04c3fSmrg 2801e04c3fSmrg/** @file register_allocate.c 2901e04c3fSmrg * 3001e04c3fSmrg * Graph-coloring register allocator. 
 *
 * The basic idea of graph coloring is to make a node in a graph for
 * every thing that needs a register (color) number assigned, and make
 * edges in the graph between nodes that interfere (can't be allocated
 * to the same register at the same time).
 *
 * During the "simplify" process, any node with fewer edges than
 * there are registers means that that node can get assigned a
 * register regardless of what its neighbors choose, so that node is
 * pushed on a stack and removed (with its edges) from the graph.
 * That likely causes other nodes to become trivially colorable as well.
 *
 * Then during the "select" process, nodes are popped off of that
 * stack, their edges restored, and assigned a color different from
 * their neighbors.  Because they were pushed on the stack only when
 * they were trivially colorable, any color chosen won't interfere
 * with the registers to be popped later.
 *
 * The downside to most graph coloring is that real hardware often has
 * limitations, like registers that need to be allocated to a node in
 * pairs, or aligned on some boundary.  This implementation follows
 * the paper "Retargetable Graph-Coloring Register Allocation for
 * Irregular Architectures" by Johan Runeson and Sven-Olof Nyström.
 *
 * In this system, there are register classes each containing various
 * registers, and registers may interfere with other registers.  For
 * example, one might have a class of base registers, and a class of
 * aligned register pairs that would each interfere with their pair of
 * the base registers.
Each node has a register class it needs to be 6001e04c3fSmrg * assigned to. Define p(B) to be the size of register class B, and 6101e04c3fSmrg * q(B,C) to be the number of registers in B that the worst choice 6201e04c3fSmrg * register in C could conflict with. Then, this system replaces the 6301e04c3fSmrg * basic graph coloring test of "fewer edges from this node than there 6401e04c3fSmrg * are registers" with "For this node of class B, the sum of q(B,C) 6501e04c3fSmrg * for each neighbor node of class C is less than pB". 6601e04c3fSmrg * 6701e04c3fSmrg * A nice feature of the pq test is that q(B,C) can be computed once 6801e04c3fSmrg * up front and stored in a 2-dimensional array, so that the cost of 6901e04c3fSmrg * coloring a node is constant with the number of registers. We do 7001e04c3fSmrg * this during ra_set_finalize(). 7101e04c3fSmrg */ 7201e04c3fSmrg 7301e04c3fSmrg#include <stdbool.h> 747ec681f3Smrg#include <stdlib.h> 7501e04c3fSmrg 767ec681f3Smrg#include "blob.h" 7701e04c3fSmrg#include "ralloc.h" 7801e04c3fSmrg#include "main/macros.h" 7901e04c3fSmrg#include "util/bitset.h" 807ec681f3Smrg#include "util/u_dynarray.h" 817ec681f3Smrg#include "u_math.h" 8201e04c3fSmrg#include "register_allocate.h" 837ec681f3Smrg#include "register_allocate_internal.h" 8401e04c3fSmrg 8501e04c3fSmrg/** 8601e04c3fSmrg * Creates a set of registers for the allocator. 8701e04c3fSmrg * 8801e04c3fSmrg * mem_ctx is a ralloc context for the allocator. The reg set may be freed 8901e04c3fSmrg * using ralloc_free(). 
9001e04c3fSmrg */ 9101e04c3fSmrgstruct ra_regs * 9201e04c3fSmrgra_alloc_reg_set(void *mem_ctx, unsigned int count, bool need_conflict_lists) 9301e04c3fSmrg{ 9401e04c3fSmrg unsigned int i; 9501e04c3fSmrg struct ra_regs *regs; 9601e04c3fSmrg 9701e04c3fSmrg regs = rzalloc(mem_ctx, struct ra_regs); 9801e04c3fSmrg regs->count = count; 9901e04c3fSmrg regs->regs = rzalloc_array(regs, struct ra_reg, count); 10001e04c3fSmrg 10101e04c3fSmrg for (i = 0; i < count; i++) { 10201e04c3fSmrg regs->regs[i].conflicts = rzalloc_array(regs->regs, BITSET_WORD, 10301e04c3fSmrg BITSET_WORDS(count)); 10401e04c3fSmrg BITSET_SET(regs->regs[i].conflicts, i); 10501e04c3fSmrg 1067ec681f3Smrg util_dynarray_init(®s->regs[i].conflict_list, 1077ec681f3Smrg need_conflict_lists ? regs->regs : NULL); 1087ec681f3Smrg if (need_conflict_lists) 1097ec681f3Smrg util_dynarray_append(®s->regs[i].conflict_list, unsigned int, i); 11001e04c3fSmrg } 11101e04c3fSmrg 11201e04c3fSmrg return regs; 11301e04c3fSmrg} 11401e04c3fSmrg 11501e04c3fSmrg/** 11601e04c3fSmrg * The register allocator by default prefers to allocate low register numbers, 11701e04c3fSmrg * since it was written for hardware (gen4/5 Intel) that is limited in its 11801e04c3fSmrg * multithreadedness by the number of registers used in a given shader. 11901e04c3fSmrg * 12001e04c3fSmrg * However, for hardware without that restriction, densely packed register 12101e04c3fSmrg * allocation can put serious constraints on instruction scheduling. This 12201e04c3fSmrg * function tells the allocator to rotate around the registers if possible as 12301e04c3fSmrg * it allocates the nodes. 
12401e04c3fSmrg */ 12501e04c3fSmrgvoid 12601e04c3fSmrgra_set_allocate_round_robin(struct ra_regs *regs) 12701e04c3fSmrg{ 12801e04c3fSmrg regs->round_robin = true; 12901e04c3fSmrg} 13001e04c3fSmrg 13101e04c3fSmrgstatic void 13201e04c3fSmrgra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2) 13301e04c3fSmrg{ 13401e04c3fSmrg struct ra_reg *reg1 = ®s->regs[r1]; 13501e04c3fSmrg 1367ec681f3Smrg if (reg1->conflict_list.mem_ctx) { 1377ec681f3Smrg util_dynarray_append(®1->conflict_list, unsigned int, r2); 13801e04c3fSmrg } 13901e04c3fSmrg BITSET_SET(reg1->conflicts, r2); 14001e04c3fSmrg} 14101e04c3fSmrg 14201e04c3fSmrgvoid 14301e04c3fSmrgra_add_reg_conflict(struct ra_regs *regs, unsigned int r1, unsigned int r2) 14401e04c3fSmrg{ 14501e04c3fSmrg if (!BITSET_TEST(regs->regs[r1].conflicts, r2)) { 14601e04c3fSmrg ra_add_conflict_list(regs, r1, r2); 14701e04c3fSmrg ra_add_conflict_list(regs, r2, r1); 14801e04c3fSmrg } 14901e04c3fSmrg} 15001e04c3fSmrg 15101e04c3fSmrg/** 15201e04c3fSmrg * Adds a conflict between base_reg and reg, and also between reg and 15301e04c3fSmrg * anything that base_reg conflicts with. 15401e04c3fSmrg * 15501e04c3fSmrg * This can simplify code for setting up multiple register classes 15601e04c3fSmrg * which are aggregates of some base hardware registers, compared to 15701e04c3fSmrg * explicitly using ra_add_reg_conflict. 
15801e04c3fSmrg */ 15901e04c3fSmrgvoid 16001e04c3fSmrgra_add_transitive_reg_conflict(struct ra_regs *regs, 16101e04c3fSmrg unsigned int base_reg, unsigned int reg) 16201e04c3fSmrg{ 16301e04c3fSmrg ra_add_reg_conflict(regs, reg, base_reg); 16401e04c3fSmrg 1657ec681f3Smrg util_dynarray_foreach(®s->regs[base_reg].conflict_list, unsigned int, 1667ec681f3Smrg r2p) { 1677ec681f3Smrg ra_add_reg_conflict(regs, reg, *r2p); 1687ec681f3Smrg } 1697ec681f3Smrg} 1707ec681f3Smrg 1717ec681f3Smrg/** 1727ec681f3Smrg * Set up conflicts between base_reg and it's two half registers reg0 and 1737ec681f3Smrg * reg1, but take care to not add conflicts between reg0 and reg1. 1747ec681f3Smrg * 1757ec681f3Smrg * This is useful for architectures where full size registers are aliased by 1767ec681f3Smrg * two half size registers (eg 32 bit float and 16 bit float registers). 1777ec681f3Smrg */ 1787ec681f3Smrgvoid 1797ec681f3Smrgra_add_transitive_reg_pair_conflict(struct ra_regs *regs, 1807ec681f3Smrg unsigned int base_reg, unsigned int reg0, unsigned int reg1) 1817ec681f3Smrg{ 1827ec681f3Smrg ra_add_reg_conflict(regs, reg0, base_reg); 1837ec681f3Smrg ra_add_reg_conflict(regs, reg1, base_reg); 1847ec681f3Smrg 1857ec681f3Smrg util_dynarray_foreach(®s->regs[base_reg].conflict_list, unsigned int, i) { 1867ec681f3Smrg unsigned int conflict = *i; 1877ec681f3Smrg if (conflict != reg1) 1887ec681f3Smrg ra_add_reg_conflict(regs, reg0, conflict); 1897ec681f3Smrg if (conflict != reg0) 1907ec681f3Smrg ra_add_reg_conflict(regs, reg1, conflict); 19101e04c3fSmrg } 19201e04c3fSmrg} 19301e04c3fSmrg 19401e04c3fSmrg/** 19501e04c3fSmrg * Makes every conflict on the given register transitive. In other words, 19601e04c3fSmrg * every register that conflicts with r will now conflict with every other 19701e04c3fSmrg * register conflicting with r. 
19801e04c3fSmrg * 19901e04c3fSmrg * This can simplify code for setting up multiple register classes 20001e04c3fSmrg * which are aggregates of some base hardware registers, compared to 20101e04c3fSmrg * explicitly using ra_add_reg_conflict. 20201e04c3fSmrg */ 20301e04c3fSmrgvoid 20401e04c3fSmrgra_make_reg_conflicts_transitive(struct ra_regs *regs, unsigned int r) 20501e04c3fSmrg{ 20601e04c3fSmrg struct ra_reg *reg = ®s->regs[r]; 20701e04c3fSmrg int c; 20801e04c3fSmrg 2097ec681f3Smrg BITSET_FOREACH_SET(c, reg->conflicts, regs->count) { 21001e04c3fSmrg struct ra_reg *other = ®s->regs[c]; 21101e04c3fSmrg unsigned i; 21201e04c3fSmrg for (i = 0; i < BITSET_WORDS(regs->count); i++) 21301e04c3fSmrg other->conflicts[i] |= reg->conflicts[i]; 21401e04c3fSmrg } 21501e04c3fSmrg} 21601e04c3fSmrg 2177ec681f3Smrgstruct ra_class * 21801e04c3fSmrgra_alloc_reg_class(struct ra_regs *regs) 21901e04c3fSmrg{ 22001e04c3fSmrg struct ra_class *class; 22101e04c3fSmrg 22201e04c3fSmrg regs->classes = reralloc(regs->regs, regs->classes, struct ra_class *, 22301e04c3fSmrg regs->class_count + 1); 22401e04c3fSmrg 22501e04c3fSmrg class = rzalloc(regs, struct ra_class); 2267ec681f3Smrg class->regset = regs; 2277ec681f3Smrg 2287ec681f3Smrg /* Users may rely on the class index being allocated in order starting from 0. */ 2297ec681f3Smrg class->index = regs->class_count++; 2307ec681f3Smrg regs->classes[class->index] = class; 23101e04c3fSmrg 23201e04c3fSmrg class->regs = rzalloc_array(class, BITSET_WORD, BITSET_WORDS(regs->count)); 23301e04c3fSmrg 2347ec681f3Smrg return class; 2357ec681f3Smrg} 2367ec681f3Smrg 2377ec681f3Smrg/** 2387ec681f3Smrg * Creates a register class for contiguous register groups of a base register 2397ec681f3Smrg * set. 2407ec681f3Smrg * 2417ec681f3Smrg * A reg set using this type of register class must use only this type of 2427ec681f3Smrg * register class. 
2437ec681f3Smrg */ 2447ec681f3Smrgstruct ra_class * 2457ec681f3Smrgra_alloc_contig_reg_class(struct ra_regs *regs, int contig_len) 2467ec681f3Smrg{ 2477ec681f3Smrg struct ra_class *c = ra_alloc_reg_class(regs); 2487ec681f3Smrg 2497ec681f3Smrg assert(contig_len != 0); 2507ec681f3Smrg c->contig_len = contig_len; 2517ec681f3Smrg 2527ec681f3Smrg return c; 2537ec681f3Smrg} 2547ec681f3Smrg 2557ec681f3Smrgstruct ra_class * 2567ec681f3Smrgra_get_class_from_index(struct ra_regs *regs, unsigned int class) 2577ec681f3Smrg{ 2587ec681f3Smrg return regs->classes[class]; 2597ec681f3Smrg} 2607ec681f3Smrg 2617ec681f3Smrgunsigned int 2627ec681f3Smrgra_class_index(struct ra_class *c) 2637ec681f3Smrg{ 2647ec681f3Smrg return c->index; 26501e04c3fSmrg} 26601e04c3fSmrg 26701e04c3fSmrgvoid 2687ec681f3Smrgra_class_add_reg(struct ra_class *class, unsigned int r) 26901e04c3fSmrg{ 2707ec681f3Smrg assert(r < class->regset->count); 2717ec681f3Smrg assert(r + class->contig_len <= class->regset->count); 27201e04c3fSmrg 27301e04c3fSmrg BITSET_SET(class->regs, r); 27401e04c3fSmrg class->p++; 27501e04c3fSmrg} 27601e04c3fSmrg 27701e04c3fSmrg/** 27801e04c3fSmrg * Returns true if the register belongs to the given class. 27901e04c3fSmrg */ 28001e04c3fSmrgstatic bool 28101e04c3fSmrgreg_belongs_to_class(unsigned int r, struct ra_class *c) 28201e04c3fSmrg{ 28301e04c3fSmrg return BITSET_TEST(c->regs, r); 28401e04c3fSmrg} 28501e04c3fSmrg 28601e04c3fSmrg/** 28701e04c3fSmrg * Must be called after all conflicts and register classes have been 28801e04c3fSmrg * set up and before the register set is used for allocation. 28901e04c3fSmrg * To avoid costly q value computation, use the q_values paramater 29001e04c3fSmrg * to pass precomputed q values to this function. 
29101e04c3fSmrg */ 29201e04c3fSmrgvoid 29301e04c3fSmrgra_set_finalize(struct ra_regs *regs, unsigned int **q_values) 29401e04c3fSmrg{ 29501e04c3fSmrg unsigned int b, c; 29601e04c3fSmrg 29701e04c3fSmrg for (b = 0; b < regs->class_count; b++) { 29801e04c3fSmrg regs->classes[b]->q = ralloc_array(regs, unsigned int, regs->class_count); 29901e04c3fSmrg } 30001e04c3fSmrg 30101e04c3fSmrg if (q_values) { 30201e04c3fSmrg for (b = 0; b < regs->class_count; b++) { 30301e04c3fSmrg for (c = 0; c < regs->class_count; c++) { 30401e04c3fSmrg regs->classes[b]->q[c] = q_values[b][c]; 30501e04c3fSmrg } 30601e04c3fSmrg } 30701e04c3fSmrg } else { 30801e04c3fSmrg /* Compute, for each class B and C, how many regs of B an 30901e04c3fSmrg * allocation to C could conflict with. 31001e04c3fSmrg */ 31101e04c3fSmrg for (b = 0; b < regs->class_count; b++) { 31201e04c3fSmrg for (c = 0; c < regs->class_count; c++) { 3137ec681f3Smrg struct ra_class *class_b = regs->classes[b]; 3147ec681f3Smrg struct ra_class *class_c = regs->classes[c]; 3157ec681f3Smrg 3167ec681f3Smrg if (class_b->contig_len && class_c->contig_len) { 3177ec681f3Smrg if (class_b->contig_len == 1 && class_c->contig_len == 1) { 3187ec681f3Smrg /* If both classes are single registers, then they only 3197ec681f3Smrg * conflict if there are any regs shared between them. This 3207ec681f3Smrg * is a cheap test for a common case. 
3217ec681f3Smrg */ 3227ec681f3Smrg class_b->q[c] = 0; 3237ec681f3Smrg for (int i = 0; i < BITSET_WORDS(regs->count); i++) { 3247ec681f3Smrg if (class_b->regs[i] & class_c->regs[i]) { 3257ec681f3Smrg class_b->q[c] = 1; 3267ec681f3Smrg break; 3277ec681f3Smrg } 3287ec681f3Smrg } 3297ec681f3Smrg } else { 3307ec681f3Smrg int max_possible_conflicts = class_b->contig_len + class_c->contig_len - 1; 3317ec681f3Smrg 3327ec681f3Smrg unsigned int max_conflicts = 0; 3337ec681f3Smrg unsigned int rc; 3347ec681f3Smrg BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) { 3357ec681f3Smrg int start = MAX2(0, (int)rc - class_b->contig_len + 1); 3367ec681f3Smrg int end = MIN2(regs->count, rc + class_c->contig_len); 3377ec681f3Smrg unsigned int conflicts = 0; 3387ec681f3Smrg for (int i = start; i < end; i++) { 3397ec681f3Smrg if (BITSET_TEST(class_b->regs, i)) 3407ec681f3Smrg conflicts++; 3417ec681f3Smrg } 3427ec681f3Smrg max_conflicts = MAX2(max_conflicts, conflicts); 3437ec681f3Smrg /* Unless a class has some restriction like the register 3447ec681f3Smrg * bases are all aligned, then we should quickly find this 3457ec681f3Smrg * limit and exit the loop. 3467ec681f3Smrg */ 3477ec681f3Smrg if (max_conflicts == max_possible_conflicts) 3487ec681f3Smrg break; 3497ec681f3Smrg } 3507ec681f3Smrg class_b->q[c] = max_conflicts; 3517ec681f3Smrg } 3527ec681f3Smrg } else { 3537ec681f3Smrg /* If you're doing contiguous classes, you have to be all in 3547ec681f3Smrg * because I don't want to deal with it. 
3557ec681f3Smrg */ 3567ec681f3Smrg assert(!class_b->contig_len && !class_c->contig_len); 3577ec681f3Smrg 3587ec681f3Smrg unsigned int rc; 3597ec681f3Smrg int max_conflicts = 0; 3607ec681f3Smrg 3617ec681f3Smrg BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) { 3627ec681f3Smrg int conflicts = 0; 3637ec681f3Smrg 3647ec681f3Smrg util_dynarray_foreach(®s->regs[rc].conflict_list, 3657ec681f3Smrg unsigned int, rbp) { 3667ec681f3Smrg unsigned int rb = *rbp; 3677ec681f3Smrg if (reg_belongs_to_class(rb, regs->classes[b])) 3687ec681f3Smrg conflicts++; 3697ec681f3Smrg } 3707ec681f3Smrg max_conflicts = MAX2(max_conflicts, conflicts); 37101e04c3fSmrg } 3727ec681f3Smrg regs->classes[b]->q[c] = max_conflicts; 37301e04c3fSmrg } 37401e04c3fSmrg } 37501e04c3fSmrg } 37601e04c3fSmrg } 37701e04c3fSmrg 37801e04c3fSmrg for (b = 0; b < regs->count; b++) { 3797ec681f3Smrg util_dynarray_fini(®s->regs[b].conflict_list); 3807ec681f3Smrg } 3817ec681f3Smrg 3827ec681f3Smrg bool all_contig = true; 3837ec681f3Smrg for (int c = 0; c < regs->class_count; c++) 3847ec681f3Smrg all_contig &= regs->classes[c]->contig_len != 0; 3857ec681f3Smrg if (all_contig) { 3867ec681f3Smrg /* In this case, we never need the conflicts lists (and it would probably 3877ec681f3Smrg * be a mistake to look at conflicts when doing contiguous classes!), so 3887ec681f3Smrg * free them. TODO: Avoid the allocation in the first place. 
3897ec681f3Smrg */ 3907ec681f3Smrg for (int i = 0; i < regs->count; i++) { 3917ec681f3Smrg ralloc_free(regs->regs[i].conflicts); 3927ec681f3Smrg regs->regs[i].conflicts = NULL; 3937ec681f3Smrg } 3947ec681f3Smrg } 3957ec681f3Smrg} 3967ec681f3Smrg 3977ec681f3Smrgvoid 3987ec681f3Smrgra_set_serialize(const struct ra_regs *regs, struct blob *blob) 3997ec681f3Smrg{ 4007ec681f3Smrg blob_write_uint32(blob, regs->count); 4017ec681f3Smrg blob_write_uint32(blob, regs->class_count); 4027ec681f3Smrg 4037ec681f3Smrg bool is_contig = regs->classes[0]->contig_len != 0; 4047ec681f3Smrg blob_write_uint8(blob, is_contig); 4057ec681f3Smrg 4067ec681f3Smrg if (!is_contig) { 4077ec681f3Smrg for (unsigned int r = 0; r < regs->count; r++) { 4087ec681f3Smrg struct ra_reg *reg = ®s->regs[r]; 4097ec681f3Smrg blob_write_bytes(blob, reg->conflicts, BITSET_WORDS(regs->count) * 4107ec681f3Smrg sizeof(BITSET_WORD)); 4117ec681f3Smrg assert(util_dynarray_num_elements(®->conflict_list, unsigned int) == 0); 4127ec681f3Smrg } 4137ec681f3Smrg } 4147ec681f3Smrg 4157ec681f3Smrg for (unsigned int c = 0; c < regs->class_count; c++) { 4167ec681f3Smrg struct ra_class *class = regs->classes[c]; 4177ec681f3Smrg blob_write_bytes(blob, class->regs, BITSET_WORDS(regs->count) * 4187ec681f3Smrg sizeof(BITSET_WORD)); 4197ec681f3Smrg blob_write_uint32(blob, class->contig_len); 4207ec681f3Smrg blob_write_uint32(blob, class->p); 4217ec681f3Smrg blob_write_bytes(blob, class->q, regs->class_count * sizeof(*class->q)); 42201e04c3fSmrg } 4237ec681f3Smrg 4247ec681f3Smrg blob_write_uint32(blob, regs->round_robin); 4257ec681f3Smrg} 4267ec681f3Smrg 4277ec681f3Smrgstruct ra_regs * 4287ec681f3Smrgra_set_deserialize(void *mem_ctx, struct blob_reader *blob) 4297ec681f3Smrg{ 4307ec681f3Smrg unsigned int reg_count = blob_read_uint32(blob); 4317ec681f3Smrg unsigned int class_count = blob_read_uint32(blob); 4327ec681f3Smrg bool is_contig = blob_read_uint8(blob); 4337ec681f3Smrg 4347ec681f3Smrg struct ra_regs *regs = 
ra_alloc_reg_set(mem_ctx, reg_count, false); 4357ec681f3Smrg assert(regs->count == reg_count); 4367ec681f3Smrg 4377ec681f3Smrg if (is_contig) { 4387ec681f3Smrg for (int i = 0; i < regs->count; i++) { 4397ec681f3Smrg ralloc_free(regs->regs[i].conflicts); 4407ec681f3Smrg regs->regs[i].conflicts = NULL; 4417ec681f3Smrg } 4427ec681f3Smrg } else { 4437ec681f3Smrg for (unsigned int r = 0; r < reg_count; r++) { 4447ec681f3Smrg struct ra_reg *reg = ®s->regs[r]; 4457ec681f3Smrg blob_copy_bytes(blob, reg->conflicts, BITSET_WORDS(reg_count) * 4467ec681f3Smrg sizeof(BITSET_WORD)); 4477ec681f3Smrg } 4487ec681f3Smrg } 4497ec681f3Smrg 4507ec681f3Smrg assert(regs->classes == NULL); 4517ec681f3Smrg regs->classes = ralloc_array(regs->regs, struct ra_class *, class_count); 4527ec681f3Smrg regs->class_count = class_count; 4537ec681f3Smrg 4547ec681f3Smrg for (unsigned int c = 0; c < class_count; c++) { 4557ec681f3Smrg struct ra_class *class = rzalloc(regs, struct ra_class); 4567ec681f3Smrg regs->classes[c] = class; 4577ec681f3Smrg class->regset = regs; 4587ec681f3Smrg class->index = c; 4597ec681f3Smrg 4607ec681f3Smrg class->regs = ralloc_array(class, BITSET_WORD, BITSET_WORDS(reg_count)); 4617ec681f3Smrg blob_copy_bytes(blob, class->regs, BITSET_WORDS(reg_count) * 4627ec681f3Smrg sizeof(BITSET_WORD)); 4637ec681f3Smrg 4647ec681f3Smrg class->contig_len = blob_read_uint32(blob); 4657ec681f3Smrg class->p = blob_read_uint32(blob); 4667ec681f3Smrg 4677ec681f3Smrg class->q = ralloc_array(regs->classes[c], unsigned int, class_count); 4687ec681f3Smrg blob_copy_bytes(blob, class->q, class_count * sizeof(*class->q)); 4697ec681f3Smrg } 4707ec681f3Smrg 4717ec681f3Smrg regs->round_robin = blob_read_uint32(blob); 4727ec681f3Smrg 4737ec681f3Smrg return regs; 47401e04c3fSmrg} 47501e04c3fSmrg 47601e04c3fSmrgstatic void 47701e04c3fSmrgra_add_node_adjacency(struct ra_graph *g, unsigned int n1, unsigned int n2) 47801e04c3fSmrg{ 47901e04c3fSmrg BITSET_SET(g->nodes[n1].adjacency, n2); 48001e04c3fSmrg 
48101e04c3fSmrg assert(n1 != n2); 48201e04c3fSmrg 48301e04c3fSmrg int n1_class = g->nodes[n1].class; 48401e04c3fSmrg int n2_class = g->nodes[n2].class; 48501e04c3fSmrg g->nodes[n1].q_total += g->regs->classes[n1_class]->q[n2_class]; 48601e04c3fSmrg 4877ec681f3Smrg util_dynarray_append(&g->nodes[n1].adjacency_list, unsigned int, n2); 4887ec681f3Smrg} 4897ec681f3Smrg 4907ec681f3Smrgstatic void 4917ec681f3Smrgra_node_remove_adjacency(struct ra_graph *g, unsigned int n1, unsigned int n2) 4927ec681f3Smrg{ 4937ec681f3Smrg BITSET_CLEAR(g->nodes[n1].adjacency, n2); 4947ec681f3Smrg 4957ec681f3Smrg assert(n1 != n2); 4967ec681f3Smrg 4977ec681f3Smrg int n1_class = g->nodes[n1].class; 4987ec681f3Smrg int n2_class = g->nodes[n2].class; 4997ec681f3Smrg g->nodes[n1].q_total -= g->regs->classes[n1_class]->q[n2_class]; 5007ec681f3Smrg 5017ec681f3Smrg util_dynarray_delete_unordered(&g->nodes[n1].adjacency_list, unsigned int, 5027ec681f3Smrg n2); 5037ec681f3Smrg} 5047ec681f3Smrg 5057ec681f3Smrgstatic void 5067ec681f3Smrgra_realloc_interference_graph(struct ra_graph *g, unsigned int alloc) 5077ec681f3Smrg{ 5087ec681f3Smrg if (alloc <= g->alloc) 5097ec681f3Smrg return; 5107ec681f3Smrg 5117ec681f3Smrg /* If we always have a whole number of BITSET_WORDs, it makes it much 5127ec681f3Smrg * easier to memset the top of the growing bitsets. 
5137ec681f3Smrg */ 5147ec681f3Smrg assert(g->alloc % BITSET_WORDBITS == 0); 5157ec681f3Smrg alloc = align64(alloc, BITSET_WORDBITS); 5167ec681f3Smrg 5177ec681f3Smrg g->nodes = reralloc(g, g->nodes, struct ra_node, alloc); 5187ec681f3Smrg 5197ec681f3Smrg unsigned g_bitset_count = BITSET_WORDS(g->alloc); 5207ec681f3Smrg unsigned bitset_count = BITSET_WORDS(alloc); 5217ec681f3Smrg /* For nodes already in the graph, we just have to grow the adjacency set */ 5227ec681f3Smrg for (unsigned i = 0; i < g->alloc; i++) { 5237ec681f3Smrg assert(g->nodes[i].adjacency != NULL); 5247ec681f3Smrg g->nodes[i].adjacency = rerzalloc(g, g->nodes[i].adjacency, BITSET_WORD, 5257ec681f3Smrg g_bitset_count, bitset_count); 5267ec681f3Smrg } 5277ec681f3Smrg 5287ec681f3Smrg /* For new nodes, we have to fully initialize them */ 5297ec681f3Smrg for (unsigned i = g->alloc; i < alloc; i++) { 5307ec681f3Smrg memset(&g->nodes[i], 0, sizeof(g->nodes[i])); 5317ec681f3Smrg g->nodes[i].adjacency = rzalloc_array(g, BITSET_WORD, bitset_count); 5327ec681f3Smrg util_dynarray_init(&g->nodes[i].adjacency_list, g); 5337ec681f3Smrg g->nodes[i].q_total = 0; 5347ec681f3Smrg 5357ec681f3Smrg g->nodes[i].forced_reg = NO_REG; 5367ec681f3Smrg g->nodes[i].reg = NO_REG; 53701e04c3fSmrg } 53801e04c3fSmrg 5397ec681f3Smrg /* These are scratch values and don't need to be zeroed. We'll clear them 5407ec681f3Smrg * as part of ra_select() setup. 
5417ec681f3Smrg */ 5427ec681f3Smrg g->tmp.stack = reralloc(g, g->tmp.stack, unsigned int, alloc); 5437ec681f3Smrg g->tmp.in_stack = reralloc(g, g->tmp.in_stack, BITSET_WORD, bitset_count); 5447ec681f3Smrg 5457ec681f3Smrg g->tmp.reg_assigned = reralloc(g, g->tmp.reg_assigned, BITSET_WORD, 5467ec681f3Smrg bitset_count); 5477ec681f3Smrg g->tmp.pq_test = reralloc(g, g->tmp.pq_test, BITSET_WORD, bitset_count); 5487ec681f3Smrg g->tmp.min_q_total = reralloc(g, g->tmp.min_q_total, unsigned int, 5497ec681f3Smrg bitset_count); 5507ec681f3Smrg g->tmp.min_q_node = reralloc(g, g->tmp.min_q_node, unsigned int, 5517ec681f3Smrg bitset_count); 5527ec681f3Smrg 5537ec681f3Smrg g->alloc = alloc; 55401e04c3fSmrg} 55501e04c3fSmrg 55601e04c3fSmrgstruct ra_graph * 55701e04c3fSmrgra_alloc_interference_graph(struct ra_regs *regs, unsigned int count) 55801e04c3fSmrg{ 55901e04c3fSmrg struct ra_graph *g; 56001e04c3fSmrg 56101e04c3fSmrg g = rzalloc(NULL, struct ra_graph); 56201e04c3fSmrg g->regs = regs; 56301e04c3fSmrg g->count = count; 5647ec681f3Smrg ra_realloc_interference_graph(g, count); 56501e04c3fSmrg 56601e04c3fSmrg return g; 56701e04c3fSmrg} 56801e04c3fSmrg 5697ec681f3Smrgvoid 5707ec681f3Smrgra_resize_interference_graph(struct ra_graph *g, unsigned int count) 5717ec681f3Smrg{ 5727ec681f3Smrg g->count = count; 5737ec681f3Smrg if (count > g->alloc) 5747ec681f3Smrg ra_realloc_interference_graph(g, g->alloc * 2); 5757ec681f3Smrg} 5767ec681f3Smrg 57701e04c3fSmrgvoid ra_set_select_reg_callback(struct ra_graph *g, 5787ec681f3Smrg ra_select_reg_callback callback, 57901e04c3fSmrg void *data) 58001e04c3fSmrg{ 58101e04c3fSmrg g->select_reg_callback = callback; 58201e04c3fSmrg g->select_reg_callback_data = data; 58301e04c3fSmrg} 58401e04c3fSmrg 58501e04c3fSmrgvoid 58601e04c3fSmrgra_set_node_class(struct ra_graph *g, 5877ec681f3Smrg unsigned int n, struct ra_class *class) 5887ec681f3Smrg{ 5897ec681f3Smrg g->nodes[n].class = class->index; 5907ec681f3Smrg} 5917ec681f3Smrg 5927ec681f3Smrgstruct 
ra_class * 5937ec681f3Smrgra_get_node_class(struct ra_graph *g, 5947ec681f3Smrg unsigned int n) 5957ec681f3Smrg{ 5967ec681f3Smrg return g->regs->classes[g->nodes[n].class]; 5977ec681f3Smrg} 5987ec681f3Smrg 5997ec681f3Smrgunsigned int 6007ec681f3Smrgra_add_node(struct ra_graph *g, struct ra_class *class) 60101e04c3fSmrg{ 6027ec681f3Smrg unsigned int n = g->count; 6037ec681f3Smrg ra_resize_interference_graph(g, g->count + 1); 6047ec681f3Smrg 6057ec681f3Smrg ra_set_node_class(g, n, class); 6067ec681f3Smrg 6077ec681f3Smrg return n; 60801e04c3fSmrg} 60901e04c3fSmrg 61001e04c3fSmrgvoid 61101e04c3fSmrgra_add_node_interference(struct ra_graph *g, 61201e04c3fSmrg unsigned int n1, unsigned int n2) 61301e04c3fSmrg{ 6147ec681f3Smrg assert(n1 < g->count && n2 < g->count); 61501e04c3fSmrg if (n1 != n2 && !BITSET_TEST(g->nodes[n1].adjacency, n2)) { 61601e04c3fSmrg ra_add_node_adjacency(g, n1, n2); 61701e04c3fSmrg ra_add_node_adjacency(g, n2, n1); 61801e04c3fSmrg } 61901e04c3fSmrg} 62001e04c3fSmrg 6217ec681f3Smrgvoid 6227ec681f3Smrgra_reset_node_interference(struct ra_graph *g, unsigned int n) 62301e04c3fSmrg{ 6247ec681f3Smrg util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) { 6257ec681f3Smrg ra_node_remove_adjacency(g, *n2p, n); 6267ec681f3Smrg } 62701e04c3fSmrg 6287ec681f3Smrg memset(g->nodes[n].adjacency, 0, 6297ec681f3Smrg BITSET_WORDS(g->count) * sizeof(BITSET_WORD)); 6307ec681f3Smrg util_dynarray_clear(&g->nodes[n].adjacency_list); 63101e04c3fSmrg} 63201e04c3fSmrg 63301e04c3fSmrgstatic void 6347ec681f3Smrgupdate_pq_info(struct ra_graph *g, unsigned int n) 6357ec681f3Smrg{ 6367ec681f3Smrg int i = n / BITSET_WORDBITS; 6377ec681f3Smrg int n_class = g->nodes[n].class; 6387ec681f3Smrg if (g->nodes[n].tmp.q_total < g->regs->classes[n_class]->p) { 6397ec681f3Smrg BITSET_SET(g->tmp.pq_test, n); 6407ec681f3Smrg } else if (g->tmp.min_q_total[i] != UINT_MAX) { 6417ec681f3Smrg /* Only update min_q_total and min_q_node if min_q_total != UINT_MAX so 6427ec681f3Smrg * 
that we don't update while we have stale data and accidentally mark 6437ec681f3Smrg * it as non-stale. Also, in order to remain consistent with the old 6447ec681f3Smrg * naive implementation of the algorithm, we do a lexicographical sort 6457ec681f3Smrg * to ensure that we always choose the node with the highest node index. 6467ec681f3Smrg */ 6477ec681f3Smrg if (g->nodes[n].tmp.q_total < g->tmp.min_q_total[i] || 6487ec681f3Smrg (g->nodes[n].tmp.q_total == g->tmp.min_q_total[i] && 6497ec681f3Smrg n > g->tmp.min_q_node[i])) { 6507ec681f3Smrg g->tmp.min_q_total[i] = g->nodes[n].tmp.q_total; 6517ec681f3Smrg g->tmp.min_q_node[i] = n; 6527ec681f3Smrg } 6537ec681f3Smrg } 6547ec681f3Smrg} 6557ec681f3Smrg 6567ec681f3Smrgstatic void 6577ec681f3Smrgadd_node_to_stack(struct ra_graph *g, unsigned int n) 65801e04c3fSmrg{ 65901e04c3fSmrg int n_class = g->nodes[n].class; 66001e04c3fSmrg 6617ec681f3Smrg assert(!BITSET_TEST(g->tmp.in_stack, n)); 6627ec681f3Smrg 6637ec681f3Smrg util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) { 6647ec681f3Smrg unsigned int n2 = *n2p; 66501e04c3fSmrg unsigned int n2_class = g->nodes[n2].class; 66601e04c3fSmrg 6677ec681f3Smrg if (!BITSET_TEST(g->tmp.in_stack, n2) && 6687ec681f3Smrg !BITSET_TEST(g->tmp.reg_assigned, n2)) { 6697ec681f3Smrg assert(g->nodes[n2].tmp.q_total >= g->regs->classes[n2_class]->q[n_class]); 6707ec681f3Smrg g->nodes[n2].tmp.q_total -= g->regs->classes[n2_class]->q[n_class]; 6717ec681f3Smrg update_pq_info(g, n2); 67201e04c3fSmrg } 67301e04c3fSmrg } 6747ec681f3Smrg 6757ec681f3Smrg g->tmp.stack[g->tmp.stack_count] = n; 6767ec681f3Smrg g->tmp.stack_count++; 6777ec681f3Smrg BITSET_SET(g->tmp.in_stack, n); 6787ec681f3Smrg 6797ec681f3Smrg /* Flag the min_q_total for n's block as dirty so it gets recalculated */ 6807ec681f3Smrg g->tmp.min_q_total[n / BITSET_WORDBITS] = UINT_MAX; 68101e04c3fSmrg} 68201e04c3fSmrg 68301e04c3fSmrg/** 68401e04c3fSmrg * Simplifies the interference graph by pushing all 68501e04c3fSmrg * 
 * trivially-colorable nodes into a stack of nodes to be colored,
 * removing them from the graph, and rinsing and repeating.
 *
 * If we encounter a case where we can't push any nodes on the stack, then
 * we optimistically choose a node and push it on the stack.  We heuristically
 * push the node with the lowest total q value, since it has the fewest
 * neighbors and therefore is most likely to be allocated.
 */
static void
ra_simplify(struct ra_graph *g)
{
   bool progress = true;
   unsigned int stack_optimistic_start = UINT_MAX;

   /* Figure out the high bit and bit mask for the first iteration of a loop
    * over BITSET_WORDs.
    */
   const unsigned int top_word_high_bit = (g->count - 1) % BITSET_WORDBITS;

   /* Do a quick pre-pass to set things up: clear the per-word scratch
    * bitsets, seed each node's reg from forced_reg (marking forced nodes as
    * reg_assigned), reset q_total, and prime the pq/min_q bookkeeping via
    * update_pq_info().
    */
   g->tmp.stack_count = 0;
   for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
        i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
      g->tmp.in_stack[i] = 0;
      g->tmp.reg_assigned[i] = 0;
      g->tmp.pq_test[i] = 0;
      g->tmp.min_q_total[i] = UINT_MAX;
      g->tmp.min_q_node[i] = UINT_MAX;
      for (int j = high_bit; j >= 0; j--) {
         unsigned int n = i * BITSET_WORDBITS + j;
         g->nodes[n].reg = g->nodes[n].forced_reg;
         g->nodes[n].tmp.q_total = g->nodes[n].q_total;
         if (g->nodes[n].reg != NO_REG)
            g->tmp.reg_assigned[i] |= BITSET_BIT(j);
         update_pq_info(g, n);
      }
   }

   while (progress) {
      unsigned int min_q_total = UINT_MAX;
      unsigned int min_q_node = UINT_MAX;

      progress = false;

      for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
           i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
         /* Mask of the bits in this word that correspond to real nodes.
          * NOTE(review): the hardcoded 31 assumes BITSET_WORDBITS == 32 --
          * confirm against the bitset header before widening BITSET_WORD.
          */
         BITSET_WORD mask = ~(BITSET_WORD)0 >> (31 - high_bit);

         /* Nodes already on the stack or pre-assigned a register need no
          * further work; skip the whole word when that covers every node.
          */
         BITSET_WORD skip = g->tmp.in_stack[i] | g->tmp.reg_assigned[i];
         if (skip == mask)
            continue;

         BITSET_WORD pq = g->tmp.pq_test[i] & ~skip;
         if (pq) {
            /* In this case, we have nodes that pass the pq test and can be
             * pushed onto the stack immediately.  This also means that we're
             * guaranteed to make progress and we don't need to bother
             * updating min_q_total because we know we're going to loop again
             * before attempting to do anything optimistic.
             */
            for (int j = high_bit; j >= 0; j--) {
               if (pq & BITSET_BIT(j)) {
                  unsigned int n = i * BITSET_WORDBITS + j;
                  assert(n < g->count);
                  add_node_to_stack(g, n);
                  /* add_node_to_stack() may update pq_test for this word so
                   * we need to update our local copy.
                   */
                  pq = g->tmp.pq_test[i] & ~skip;
                  progress = true;
               }
            }
         } else if (!progress) {
            if (g->tmp.min_q_total[i] == UINT_MAX) {
               /* The min_q_total and min_q_node are dirty because we added
                * one of these nodes to the stack.  It needs to be
                * recalculated.
                */
               for (int j = high_bit; j >= 0; j--) {
                  if (skip & BITSET_BIT(j))
                     continue;

                  unsigned int n = i * BITSET_WORDBITS + j;
                  assert(n < g->count);
                  if (g->nodes[n].tmp.q_total < g->tmp.min_q_total[i]) {
                     g->tmp.min_q_total[i] = g->nodes[n].tmp.q_total;
                     g->tmp.min_q_node[i] = n;
                  }
               }
            }
            /* Track the graph-wide lowest q_total as the candidate for an
             * optimistic push.
             */
            if (g->tmp.min_q_total[i] < min_q_total) {
               min_q_node = g->tmp.min_q_node[i];
               min_q_total = g->tmp.min_q_total[i];
            }
         }
      }

      if (!progress && min_q_total != UINT_MAX) {
         /* Nothing was trivially colorable this pass; optimistically push
          * the lowest-q node and remember where the optimistic portion of
          * the stack begins.
          */
         if (stack_optimistic_start == UINT_MAX)
            stack_optimistic_start = g->tmp.stack_count;

         add_node_to_stack(g, min_q_node);
         progress = true;
      }
   }

   g->tmp.stack_optimistic_start = stack_optimistic_start;
}

/**
 * Returns whether allocating r1 (of class c1) and r2 (of class c2) to two
 * interfering nodes would conflict.
 *
 * For contiguous-register classes this is a range-overlap test on
 * [r, r + contig_len); otherwise it consults the regset's precomputed
 * per-register conflict bitsets.
 */
bool
ra_class_allocations_conflict(struct ra_class *c1, unsigned int r1,
                              struct ra_class *c2, unsigned int r2)
{
   if (c1->contig_len) {
      assert(c2->contig_len);

      int r1_end = r1 + c1->contig_len;
      int r2_end = r2 + c2->contig_len;
      return !(r2 >= r1_end || r1 >= r2_end);
   } else {
      return BITSET_TEST(c1->regset->regs[r1].conflicts, r2);
   }
}

/**
 * Returns the first already-colored neighbor of node n whose allocation
 * conflicts with assigning register r to n, or NULL if r is conflict-free.
 */
static struct ra_node *
ra_find_conflicting_neighbor(struct ra_graph *g, unsigned int n, unsigned int r)
{
   util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) {
      unsigned int n2 = *n2p;

      /* If our adjacent node is in the stack, it's not allocated yet. */
      if (!BITSET_TEST(g->tmp.in_stack, n2) &&
          ra_class_allocations_conflict(g->regs->classes[g->nodes[n].class], r,
                                        g->regs->classes[g->nodes[n2].class], g->nodes[n2].reg)) {
         return &g->nodes[n2];
      }
   }

   return NULL;
}

/* Computes a bitfield of what regs are available for a given register
 * selection.
 *
 * This lets drivers implement a more complicated policy than our simple first
 * or round robin policies (which don't require knowing the whole bitset)
 */
static bool
ra_compute_available_regs(struct ra_graph *g, unsigned int n, BITSET_WORD *regs)
{
   struct ra_class *c = g->regs->classes[g->nodes[n].class];

   /* Populate with the set of regs that are in the node's class. */
   memcpy(regs, c->regs, BITSET_WORDS(g->regs->count) * sizeof(BITSET_WORD));

   /* Remove any regs that conflict with nodes that we're adjacent to and have
    * already colored.
    */
   util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) {
      struct ra_node *n2 = &g->nodes[*n2p];
      struct ra_class *n2c = g->regs->classes[n2->class];

      if (!BITSET_TEST(g->tmp.in_stack, *n2p)) {
         if (c->contig_len) {
            /* Clear every base register whose contiguous span would overlap
             * n2's allocation [n2->reg, n2->reg + n2c->contig_len).
             */
            int start = MAX2(0, (int)n2->reg - c->contig_len + 1);
            int end = MIN2(g->regs->count, n2->reg + n2c->contig_len);
            for (unsigned i = start; i < end; i++)
               BITSET_CLEAR(regs, i);
         } else {
            for (int j = 0; j < BITSET_WORDS(g->regs->count); j++)
               regs[j] &= ~g->regs->regs[n2->reg].conflicts[j];
         }
      }
   }

   /* Report whether any register at all remains available. */
   for (int i = 0; i < BITSET_WORDS(g->regs->count); i++) {
      if (regs[i])
         return true;
   }

   return false;
}

/**
 * Pops nodes from the stack back into the graph, coloring them with
 * registers as they go.
 *
 * If all nodes were trivially colorable, then this must succeed.
If 87301e04c3fSmrg * not (optimistic coloring), then it may return false; 87401e04c3fSmrg */ 87501e04c3fSmrgstatic bool 87601e04c3fSmrgra_select(struct ra_graph *g) 87701e04c3fSmrg{ 87801e04c3fSmrg int start_search_reg = 0; 87901e04c3fSmrg BITSET_WORD *select_regs = NULL; 88001e04c3fSmrg 88101e04c3fSmrg if (g->select_reg_callback) 88201e04c3fSmrg select_regs = malloc(BITSET_WORDS(g->regs->count) * sizeof(BITSET_WORD)); 88301e04c3fSmrg 8847ec681f3Smrg while (g->tmp.stack_count != 0) { 88501e04c3fSmrg unsigned int ri; 88601e04c3fSmrg unsigned int r = -1; 8877ec681f3Smrg int n = g->tmp.stack[g->tmp.stack_count - 1]; 88801e04c3fSmrg struct ra_class *c = g->regs->classes[g->nodes[n].class]; 88901e04c3fSmrg 89001e04c3fSmrg /* set this to false even if we return here so that 89101e04c3fSmrg * ra_get_best_spill_node() considers this node later. 89201e04c3fSmrg */ 8937ec681f3Smrg BITSET_CLEAR(g->tmp.in_stack, n); 89401e04c3fSmrg 89501e04c3fSmrg if (g->select_reg_callback) { 89601e04c3fSmrg if (!ra_compute_available_regs(g, n, select_regs)) { 89701e04c3fSmrg free(select_regs); 89801e04c3fSmrg return false; 89901e04c3fSmrg } 90001e04c3fSmrg 9017ec681f3Smrg r = g->select_reg_callback(n, select_regs, g->select_reg_callback_data); 9027ec681f3Smrg assert(r < g->regs->count); 90301e04c3fSmrg } else { 90401e04c3fSmrg /* Find the lowest-numbered reg which is not used by a member 90501e04c3fSmrg * of the graph adjacent to us. 90601e04c3fSmrg */ 90701e04c3fSmrg for (ri = 0; ri < g->regs->count; ri++) { 90801e04c3fSmrg r = (start_search_reg + ri) % g->regs->count; 90901e04c3fSmrg if (!reg_belongs_to_class(r, c)) 91001e04c3fSmrg continue; 91101e04c3fSmrg 9127ec681f3Smrg struct ra_node *conflicting = ra_find_conflicting_neighbor(g, n, r); 9137ec681f3Smrg if (!conflicting) { 9147ec681f3Smrg /* Found a reg! 
*/ 91501e04c3fSmrg break; 9167ec681f3Smrg } 9177ec681f3Smrg if (g->regs->classes[conflicting->class]->contig_len) { 9187ec681f3Smrg /* Skip to point at the last base reg of the conflicting reg 9197ec681f3Smrg * allocation -- the loop will increment us to check the next reg 9207ec681f3Smrg * after the conflicting allocaiton. 9217ec681f3Smrg */ 9227ec681f3Smrg unsigned conflicting_end = (conflicting->reg + 9237ec681f3Smrg g->regs->classes[conflicting->class]->contig_len - 1); 9247ec681f3Smrg assert(conflicting_end >= r); 9257ec681f3Smrg ri += conflicting_end - r; 9267ec681f3Smrg } 92701e04c3fSmrg } 92801e04c3fSmrg 92901e04c3fSmrg if (ri >= g->regs->count) 93001e04c3fSmrg return false; 93101e04c3fSmrg } 93201e04c3fSmrg 93301e04c3fSmrg g->nodes[n].reg = r; 9347ec681f3Smrg g->tmp.stack_count--; 93501e04c3fSmrg 93601e04c3fSmrg /* Rotate the starting point except for any nodes above the lowest 93701e04c3fSmrg * optimistically colorable node. The likelihood that we will succeed 93801e04c3fSmrg * at allocating optimistically colorable nodes is highly dependent on 93901e04c3fSmrg * the way that the previous nodes popped off the stack are laid out. 94001e04c3fSmrg * The round-robin strategy increases the fragmentation of the register 94101e04c3fSmrg * file and decreases the number of nearby nodes assigned to the same 94201e04c3fSmrg * color, what increases the likelihood of spilling with respect to the 94301e04c3fSmrg * dense packing strategy. 
94401e04c3fSmrg */ 94501e04c3fSmrg if (g->regs->round_robin && 9467ec681f3Smrg g->tmp.stack_count - 1 <= g->tmp.stack_optimistic_start) 94701e04c3fSmrg start_search_reg = r + 1; 94801e04c3fSmrg } 94901e04c3fSmrg 95001e04c3fSmrg free(select_regs); 95101e04c3fSmrg 95201e04c3fSmrg return true; 95301e04c3fSmrg} 95401e04c3fSmrg 95501e04c3fSmrgbool 95601e04c3fSmrgra_allocate(struct ra_graph *g) 95701e04c3fSmrg{ 95801e04c3fSmrg ra_simplify(g); 95901e04c3fSmrg return ra_select(g); 96001e04c3fSmrg} 96101e04c3fSmrg 96201e04c3fSmrgunsigned int 96301e04c3fSmrgra_get_node_reg(struct ra_graph *g, unsigned int n) 96401e04c3fSmrg{ 9657ec681f3Smrg if (g->nodes[n].forced_reg != NO_REG) 9667ec681f3Smrg return g->nodes[n].forced_reg; 9677ec681f3Smrg else 9687ec681f3Smrg return g->nodes[n].reg; 96901e04c3fSmrg} 97001e04c3fSmrg 97101e04c3fSmrg/** 97201e04c3fSmrg * Forces a node to a specific register. This can be used to avoid 97301e04c3fSmrg * creating a register class containing one node when handling data 97401e04c3fSmrg * that must live in a fixed location and is known to not conflict 97501e04c3fSmrg * with other forced register assignment (as is common with shader 97601e04c3fSmrg * input data). These nodes do not end up in the stack during 97701e04c3fSmrg * ra_simplify(), and thus at ra_select() time it is as if they were 97801e04c3fSmrg * the first popped off the stack and assigned their fixed locations. 97901e04c3fSmrg * Nodes that use this function do not need to be assigned a register 98001e04c3fSmrg * class. 98101e04c3fSmrg * 98201e04c3fSmrg * Must be called before ra_simplify(). 
98301e04c3fSmrg */ 98401e04c3fSmrgvoid 98501e04c3fSmrgra_set_node_reg(struct ra_graph *g, unsigned int n, unsigned int reg) 98601e04c3fSmrg{ 9877ec681f3Smrg g->nodes[n].forced_reg = reg; 98801e04c3fSmrg} 98901e04c3fSmrg 99001e04c3fSmrgstatic float 99101e04c3fSmrgra_get_spill_benefit(struct ra_graph *g, unsigned int n) 99201e04c3fSmrg{ 99301e04c3fSmrg float benefit = 0; 99401e04c3fSmrg int n_class = g->nodes[n].class; 99501e04c3fSmrg 99601e04c3fSmrg /* Define the benefit of eliminating an interference between n, n2 99701e04c3fSmrg * through spilling as q(C, B) / p(C). This is similar to the 99801e04c3fSmrg * "count number of edges" approach of traditional graph coloring, 99901e04c3fSmrg * but takes classes into account. 100001e04c3fSmrg */ 10017ec681f3Smrg util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) { 10027ec681f3Smrg unsigned int n2 = *n2p; 100301e04c3fSmrg unsigned int n2_class = g->nodes[n2].class; 100401e04c3fSmrg benefit += ((float)g->regs->classes[n_class]->q[n2_class] / 100501e04c3fSmrg g->regs->classes[n_class]->p); 100601e04c3fSmrg } 100701e04c3fSmrg 100801e04c3fSmrg return benefit; 100901e04c3fSmrg} 101001e04c3fSmrg 101101e04c3fSmrg/** 101201e04c3fSmrg * Returns a node number to be spilled according to the cost/benefit using 101301e04c3fSmrg * the pq test, or -1 if there are no spillable nodes. 101401e04c3fSmrg */ 101501e04c3fSmrgint 101601e04c3fSmrgra_get_best_spill_node(struct ra_graph *g) 101701e04c3fSmrg{ 101801e04c3fSmrg unsigned int best_node = -1; 101901e04c3fSmrg float best_benefit = 0.0; 102001e04c3fSmrg unsigned int n; 102101e04c3fSmrg 102201e04c3fSmrg /* Consider any nodes that we colored successfully or the node we failed to 102301e04c3fSmrg * color for spilling. When we failed to color a node in ra_select(), we 102401e04c3fSmrg * only considered these nodes, so spilling any other ones would not result 102501e04c3fSmrg * in us making progress. 
102601e04c3fSmrg */ 102701e04c3fSmrg for (n = 0; n < g->count; n++) { 102801e04c3fSmrg float cost = g->nodes[n].spill_cost; 102901e04c3fSmrg float benefit; 103001e04c3fSmrg 103101e04c3fSmrg if (cost <= 0.0f) 10327ec681f3Smrg continue; 103301e04c3fSmrg 10347ec681f3Smrg if (BITSET_TEST(g->tmp.in_stack, n)) 103501e04c3fSmrg continue; 103601e04c3fSmrg 103701e04c3fSmrg benefit = ra_get_spill_benefit(g, n); 103801e04c3fSmrg 103901e04c3fSmrg if (benefit / cost > best_benefit) { 10407ec681f3Smrg best_benefit = benefit / cost; 10417ec681f3Smrg best_node = n; 104201e04c3fSmrg } 104301e04c3fSmrg } 104401e04c3fSmrg 104501e04c3fSmrg return best_node; 104601e04c3fSmrg} 104701e04c3fSmrg 104801e04c3fSmrg/** 104901e04c3fSmrg * Only nodes with a spill cost set (cost != 0.0) will be considered 105001e04c3fSmrg * for register spilling. 105101e04c3fSmrg */ 105201e04c3fSmrgvoid 105301e04c3fSmrgra_set_node_spill_cost(struct ra_graph *g, unsigned int n, float cost) 105401e04c3fSmrg{ 105501e04c3fSmrg g->nodes[n].spill_cost = cost; 105601e04c3fSmrg} 1057