17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2020 Collabora Ltd.
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg * Authors (Collabora):
247ec681f3Smrg *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
257ec681f3Smrg */
267ec681f3Smrg
277ec681f3Smrg#include "compiler.h"
287ec681f3Smrg#include "bi_builder.h"
297ec681f3Smrg#include "util/u_memory.h"
307ec681f3Smrg
/* Equation system for one register-allocation attempt. Nodes correspond to
 * SSA values (see bi_get_node); the solver assigns each node a register. */
struct lcra_state {
        /* Number of nodes in the system */
        unsigned node_count;

        /* Per-node bitset of allowed registers: bit r set means register r
         * may be assigned. Zero affinity means the node is not allocated by
         * this pass and is skipped by the solver. */
        uint64_t *affinity;

        /* Linear constraints imposed. Nested array sized upfront, organized
         * as linear[node_left][node_right]; index with
         * (node_left * node_count + node_right).
         *
         * Each element is itself a bit field denoting whether (c_j - c_i) bias
         * is present or not, including negative biases.
         *
         * Note for Bifrost, there are 4 components so the bias is in range
         * [-3, 3] so encoded by 8-bit field. */

        uint8_t *linear;

        /* Before solving, forced registers; after solving, solutions.
         * ~0 denotes "not forced" / "unsolved". */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};
527ec681f3Smrg
537ec681f3Smrg/* This module is an implementation of "Linearly Constrained
547ec681f3Smrg * Register Allocation". The paper is available in PDF form
557ec681f3Smrg * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
567ec681f3Smrg * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
577ec681f3Smrg */
587ec681f3Smrg
597ec681f3Smrgstatic struct lcra_state *
607ec681f3Smrglcra_alloc_equations(unsigned node_count)
617ec681f3Smrg{
627ec681f3Smrg        struct lcra_state *l = calloc(1, sizeof(*l));
637ec681f3Smrg
647ec681f3Smrg        l->node_count = node_count;
657ec681f3Smrg
667ec681f3Smrg        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
677ec681f3Smrg        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
687ec681f3Smrg        l->affinity = calloc(sizeof(l->affinity[0]), node_count);
697ec681f3Smrg
707ec681f3Smrg        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);
717ec681f3Smrg
727ec681f3Smrg        return l;
737ec681f3Smrg}
747ec681f3Smrg
757ec681f3Smrgstatic void
767ec681f3Smrglcra_free(struct lcra_state *l)
777ec681f3Smrg{
787ec681f3Smrg        free(l->linear);
797ec681f3Smrg        free(l->affinity);
807ec681f3Smrg        free(l->solutions);
817ec681f3Smrg        free(l);
827ec681f3Smrg}
837ec681f3Smrg
/* Record interference between nodes i and j given their channel masks. For
 * each relative bias D in [0, 3], if shifting one mask by D channels makes
 * it overlap the other, that bias is forbidden: the corresponding bit
 * (bias + 3) is set in the 8-bit constraint fields, in both directions
 * (forward for row j->i, backward for row i->j). */
static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
{
        /* A node never interferes with itself */
        if (i == j)
                return;

        uint8_t constraint_fw = 0;
        uint8_t constraint_bw = 0;

        for (unsigned D = 0; D < 4; ++D) {
                /* Overlap when j's channels sit D below i's */
                if (cmask_i & (cmask_j << D)) {
                        constraint_bw |= (1 << (3 + D));
                        constraint_fw |= (1 << (3 - D));
                }

                /* Overlap when j's channels sit D above i's */
                if (cmask_i & (cmask_j >> D)) {
                        constraint_fw |= (1 << (3 + D));
                        constraint_bw |= (1 << (3 - D));
                }
        }

        /* Mirror the constraint into both rows of the matrix */
        l->linear[j * l->node_count + i] |= constraint_fw;
        l->linear[i * l->node_count + j] |= constraint_bw;
}
1087ec681f3Smrg
1097ec681f3Smrgstatic bool
1107ec681f3Smrglcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
1117ec681f3Smrg{
1127ec681f3Smrg        uint8_t *row = &l->linear[i * l->node_count];
1137ec681f3Smrg        signed constant = solutions[i];
1147ec681f3Smrg
1157ec681f3Smrg        for (unsigned j = 0; j < l->node_count; ++j) {
1167ec681f3Smrg                if (solutions[j] == ~0) continue;
1177ec681f3Smrg
1187ec681f3Smrg                signed lhs = solutions[j] - constant;
1197ec681f3Smrg
1207ec681f3Smrg                if (lhs < -3 || lhs > 3)
1217ec681f3Smrg                        continue;
1227ec681f3Smrg
1237ec681f3Smrg                if (row[j] & (1 << (lhs + 3)))
1247ec681f3Smrg                        return false;
1257ec681f3Smrg        }
1267ec681f3Smrg
1277ec681f3Smrg        return true;
1287ec681f3Smrg}
1297ec681f3Smrg
1307ec681f3Smrgstatic bool
1317ec681f3Smrglcra_solve(struct lcra_state *l)
1327ec681f3Smrg{
1337ec681f3Smrg        for (unsigned step = 0; step < l->node_count; ++step) {
1347ec681f3Smrg                if (l->solutions[step] != ~0) continue;
1357ec681f3Smrg                if (l->affinity[step] == 0) continue;
1367ec681f3Smrg
1377ec681f3Smrg                bool succ = false;
1387ec681f3Smrg
1397ec681f3Smrg                u_foreach_bit64(r, l->affinity[step]) {
1407ec681f3Smrg                        l->solutions[step] = r;
1417ec681f3Smrg
1427ec681f3Smrg                        if (lcra_test_linear(l, l->solutions, step)) {
1437ec681f3Smrg                                succ = true;
1447ec681f3Smrg                                break;
1457ec681f3Smrg                        }
1467ec681f3Smrg                }
1477ec681f3Smrg
1487ec681f3Smrg                /* Out of registers - prepare to spill */
1497ec681f3Smrg                if (!succ) {
1507ec681f3Smrg                        l->spill_node = step;
1517ec681f3Smrg                        return false;
1527ec681f3Smrg                }
1537ec681f3Smrg        }
1547ec681f3Smrg
1557ec681f3Smrg        return true;
1567ec681f3Smrg}
1577ec681f3Smrg
1587ec681f3Smrg/* Register spilling is implemented with a cost-benefit system. Costs are set
1597ec681f3Smrg * by the user. Benefits are calculated from the constraints. */
1607ec681f3Smrg
1617ec681f3Smrgstatic unsigned
1627ec681f3Smrglcra_count_constraints(struct lcra_state *l, unsigned i)
1637ec681f3Smrg{
1647ec681f3Smrg        unsigned count = 0;
1657ec681f3Smrg        uint8_t *constraints = &l->linear[i * l->node_count];
1667ec681f3Smrg
1677ec681f3Smrg        for (unsigned j = 0; j < l->node_count; ++j)
1687ec681f3Smrg                count += util_bitcount(constraints[j]);
1697ec681f3Smrg
1707ec681f3Smrg        return count;
1717ec681f3Smrg}
1727ec681f3Smrg
1737ec681f3Smrg/* Construct an affinity mask such that the vector with `count` elements does
1747ec681f3Smrg * not intersect any of the registers in the bitset `clobber`. In other words,
1757ec681f3Smrg * an allocated register r needs to satisfy for each i < count: a + i != b.
1767ec681f3Smrg * Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the
1777ec681f3Smrg * entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where
1787ec681f3Smrg * that union is the desired clobber set. That may be written equivalently as
1797ec681f3Smrg * the union over i < n of (B - i), where subtraction is defined elementwise
1807ec681f3Smrg * and corresponds to a shift of the entire bitset.
1817ec681f3Smrg *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 and x is even }
1847ec681f3Smrg */
1857ec681f3Smrg
1867ec681f3Smrg#define EVEN_BITS_MASK (0x5555555555555555ull)
1877ec681f3Smrg
/* Build the affinity mask for a `count`-register vector that must avoid the
 * bitset `clobber` (see the derivation above: the forbidden set is the union
 * over i < count of (clobber >> i)). Also forbids bases whose vector would
 * run off the end of the file, and the middle of a split register file. */
static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t forbidden = 0;

        for (unsigned i = 0; i < count; ++i)
                forbidden |= (clobber >> i);

        if (count > 1) {
                /* Don't allocate past the end of the register file */
                unsigned tail = count - 1;
                uint64_t tail_mask = BITFIELD_MASK(tail);

                forbidden |= tail_mask << (64 - tail);

                if (split_file)
                        forbidden |= tail_mask << (16 - tail);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                forbidden |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~forbidden;
}
2137ec681f3Smrg
/* Walk a block backwards maintaining the live set, recording into `l`:
 * interference between each written node and every node live across the
 * write, plus affinity restrictions (preloaded registers, split file,
 * Valhall staging-register alignment, blend-shader clobbers). Also computes
 * block->reg_live_in as a side effect of the post-RA liveness walk. */
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        /* Skip destinations RA doesn't track (e.g. registers) */
                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);

                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
                        if (aligned_sr && count >= 2)
                                affinity &= EVEN_BITS_MASK;

                        l->affinity[node] &= (affinity >> offset);

                        /* The written node interferes with everything still
                         * live after this instruction */
                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i]) {
                                        lcra_add_node_interference(l, node,
                                                        bi_writemask(ins, d), i, live[i]);
                                }
                        }
                }

                /* Valhall needs >= 64-bit staging reads to be pair-aligned */
                if (aligned_sr && bi_count_read_registers(ins, 0) >= 2) {
                        unsigned node = bi_get_node(ins->src[0]);

                        if (node < node_count)
                                l->affinity[node] &= EVEN_BITS_MASK;
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}
2767ec681f3Smrg
/* Compute interference and affinity for the whole program by recomputing
 * both SSA-level and post-RA (register) liveness and then marking each
 * block in reverse order. */
static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, blk) {
                /* Work on a scratch copy: bi_mark_interference mutates the
                 * live set as it walks the block backwards */
                uint8_t *live = mem_dup(blk->live_out, node_count);

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                node_count, ctx->inputs->is_blend, !full_regs,
                                ctx->arch >= 9);

                free(live);
        }
}
2957ec681f3Smrg
2967ec681f3Smrgstatic struct lcra_state *
2977ec681f3Smrgbi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
2987ec681f3Smrg{
2997ec681f3Smrg        unsigned node_count = bi_max_temp(ctx);
3007ec681f3Smrg        struct lcra_state *l = lcra_alloc_equations(node_count);
3017ec681f3Smrg
3027ec681f3Smrg        /* Blend shaders are restricted to R0-R15. Other shaders at full
3037ec681f3Smrg         * occupancy also can access R48-R63. At half occupancy they can access
3047ec681f3Smrg         * the whole file. */
3057ec681f3Smrg
3067ec681f3Smrg        uint64_t default_affinity =
3077ec681f3Smrg                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
3087ec681f3Smrg                full_regs ? BITFIELD64_MASK(64) :
3097ec681f3Smrg                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
3107ec681f3Smrg
3117ec681f3Smrg        bi_foreach_instr_global(ctx, ins) {
3127ec681f3Smrg                bi_foreach_dest(ins, d) {
3137ec681f3Smrg                        unsigned dest = bi_get_node(ins->dest[d]);
3147ec681f3Smrg
3157ec681f3Smrg                        /* Blend shaders expect the src colour to be in r0-r3 */
3167ec681f3Smrg                        if (ins->op == BI_OPCODE_BLEND &&
3177ec681f3Smrg                            !ctx->inputs->is_blend) {
3187ec681f3Smrg                                unsigned node = bi_get_node(ins->src[0]);
3197ec681f3Smrg                                assert(node < node_count);
3207ec681f3Smrg                                l->solutions[node] = 0;
3217ec681f3Smrg                        }
3227ec681f3Smrg
3237ec681f3Smrg                        if (dest < node_count)
3247ec681f3Smrg                                l->affinity[dest] = default_affinity;
3257ec681f3Smrg                }
3267ec681f3Smrg
3277ec681f3Smrg        }
3287ec681f3Smrg
3297ec681f3Smrg        bi_compute_interference(ctx, l, full_regs);
3307ec681f3Smrg
3317ec681f3Smrg        *success = lcra_solve(l);
3327ec681f3Smrg
3337ec681f3Smrg        return l;
3347ec681f3Smrg}
3357ec681f3Smrg
3367ec681f3Smrgstatic bi_index
3377ec681f3Smrgbi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
3387ec681f3Smrg{
3397ec681f3Smrg        /* Offsets can only be applied when we register allocated an index, or
3407ec681f3Smrg         * alternatively for FAU's encoding */
3417ec681f3Smrg
3427ec681f3Smrg        ASSERTED bool is_offset = (index.offset > 0) &&
3437ec681f3Smrg                (index.type != BI_INDEX_FAU);
3447ec681f3Smrg        unsigned node_count = bi_max_temp(ctx);
3457ec681f3Smrg
3467ec681f3Smrg        /* Did we run RA for this index at all */
3477ec681f3Smrg        if (bi_get_node(index) >= node_count) {
3487ec681f3Smrg                assert(!is_offset);
3497ec681f3Smrg                return index;
3507ec681f3Smrg        }
3517ec681f3Smrg
3527ec681f3Smrg        /* LCRA didn't bother solving this index (how lazy!) */
3537ec681f3Smrg        signed solution = l->solutions[bi_get_node(index)];
3547ec681f3Smrg        if (solution < 0) {
3557ec681f3Smrg                assert(!is_offset);
3567ec681f3Smrg                return index;
3577ec681f3Smrg        }
3587ec681f3Smrg
3597ec681f3Smrg        /* todo: do we want to compose with the subword swizzle? */
3607ec681f3Smrg        bi_index new_index = bi_register(solution + index.offset);
3617ec681f3Smrg        new_index.swizzle = index.swizzle;
3627ec681f3Smrg        new_index.abs = index.abs;
3637ec681f3Smrg        new_index.neg = index.neg;
3647ec681f3Smrg        return new_index;
3657ec681f3Smrg}
3667ec681f3Smrg
3677ec681f3Smrgstatic void
3687ec681f3Smrgbi_install_registers(bi_context *ctx, struct lcra_state *l)
3697ec681f3Smrg{
3707ec681f3Smrg        bi_foreach_instr_global(ctx, ins) {
3717ec681f3Smrg                bi_foreach_dest(ins, d)
3727ec681f3Smrg                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);
3737ec681f3Smrg
3747ec681f3Smrg                bi_foreach_src(ins, s)
3757ec681f3Smrg                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
3767ec681f3Smrg        }
3777ec681f3Smrg}
3787ec681f3Smrg
3797ec681f3Smrgstatic void
3807ec681f3Smrgbi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
3817ec681f3Smrg{
3827ec681f3Smrg        bi_foreach_src(ins, i) {
3837ec681f3Smrg                if (bi_is_equiv(ins->src[i], old)) {
3847ec681f3Smrg                        ins->src[i].type = new.type;
3857ec681f3Smrg                        ins->src[i].reg = new.reg;
3867ec681f3Smrg                        ins->src[i].value = new.value;
3877ec681f3Smrg                }
3887ec681f3Smrg        }
3897ec681f3Smrg}
3907ec681f3Smrg
3917ec681f3Smrg/* If register allocation fails, find the best spill node */
3927ec681f3Smrg
3937ec681f3Smrgstatic signed
3947ec681f3Smrgbi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
3957ec681f3Smrg{
3967ec681f3Smrg        /* Pick a node satisfying bi_spill_register's preconditions */
3977ec681f3Smrg        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));
3987ec681f3Smrg
3997ec681f3Smrg        bi_foreach_instr_global(ctx, ins) {
4007ec681f3Smrg                bi_foreach_dest(ins, d) {
4017ec681f3Smrg                        unsigned node = bi_get_node(ins->dest[d]);
4027ec681f3Smrg
4037ec681f3Smrg                        if (node < l->node_count && ins->no_spill)
4047ec681f3Smrg                                BITSET_SET(no_spill, node);
4057ec681f3Smrg                }
4067ec681f3Smrg        }
4077ec681f3Smrg
4087ec681f3Smrg        unsigned best_benefit = 0.0;
4097ec681f3Smrg        signed best_node = -1;
4107ec681f3Smrg
4117ec681f3Smrg        for (unsigned i = 0; i < l->node_count; ++i) {
4127ec681f3Smrg                if (BITSET_TEST(no_spill, i)) continue;
4137ec681f3Smrg
4147ec681f3Smrg                /* Only spill nodes that interfere with the node failing
4157ec681f3Smrg                 * register allocation. It's pointless to spill anything else */
4167ec681f3Smrg                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;
4177ec681f3Smrg
4187ec681f3Smrg                unsigned benefit = lcra_count_constraints(l, i);
4197ec681f3Smrg
4207ec681f3Smrg                if (benefit > best_benefit) {
4217ec681f3Smrg                        best_benefit = benefit;
4227ec681f3Smrg                        best_node = i;
4237ec681f3Smrg                }
4247ec681f3Smrg        }
4257ec681f3Smrg
4267ec681f3Smrg        free(no_spill);
4277ec681f3Smrg        return best_node;
4287ec681f3Smrg}
4297ec681f3Smrg
4307ec681f3Smrgstatic unsigned
4317ec681f3Smrgbi_count_read_index(bi_instr *I, bi_index index)
4327ec681f3Smrg{
4337ec681f3Smrg        unsigned max = 0;
4347ec681f3Smrg
4357ec681f3Smrg        bi_foreach_src(I, s) {
4367ec681f3Smrg                if (bi_is_equiv(I->src[s], index)) {
4377ec681f3Smrg                        unsigned count = bi_count_read_registers(I, s);
4387ec681f3Smrg                        max = MAX2(max, count + I->src[s].offset);
4397ec681f3Smrg                }
4407ec681f3Smrg        }
4417ec681f3Smrg
4427ec681f3Smrg        return max;
4437ec681f3Smrg}
4447ec681f3Smrg
/* Once we've chosen a spill node, spill it and return the number of bytes spilled */
4467ec681f3Smrg
/* Spill `index` to thread-local storage at `offset`: insert a TLS store
 * after every instruction writing it and a TLS fill before every
 * instruction reading it, rewriting the operands to fresh temporaries.
 * Returns the number of bytes of TLS consumed (4 per spilled channel). */
static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill after every store, fill before every load */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        /* Destination offset selects where within the spill
                         * slot this write lands */
                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        /* Redirect the write to a fresh temporary which must
                         * not itself be spilled */
                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        /* Store the temporary to TLS right after the write */
                        b.cursor = bi_after_instr(I);
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        /* Fill from TLS into a fresh temporary just before
                         * the read, and rewrite the sources to use it */
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_to(&b, bits, tmp,
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}
4917ec681f3Smrg
/* Top-level register allocation. On v7, first attempts allocation with a
 * reduced register file (32 work registers) to improve thread count; if
 * that fails, falls back to the full file, spilling nodes to TLS and
 * retrying until allocation succeeds. Updates ctx->info (work_reg_count,
 * tls_size) and rewrites all operands to registers. */
void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info->tls_size;

        /* Try with reduced register pressure to improve thread count on v7 */
        if (ctx->arch == 7) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info->work_reg_count = 32;
                } else {
                        /* Discard the failed attempt and retry below with the
                         * full register file */
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info->work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        /* Spill the chosen node to reduce pressure, then
                         * retry allocation from scratch */
                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);
                }
        }

        assert(success);
        assert(l != NULL);

        ctx->info->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}
545