/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
        unsigned node_count;
        uint64_t *affinity;
        /* Linear constraints imposed. Nested array sized upfront, organized as
         * linear[node_left][node_right]. That is, calculate indices as
         * linear[(node_left * node_count) + node_right].
         *
         * Each element is itself a bit field; for element linear[i][j], bit
         * (d + 3) being set means the bias (c_j - c_i) = d is forbidden,
         * negative biases included.
         *
         * Note that for Bifrost there are 4 components, so the bias is in the
         * range [-3, 3] and fits in an 8-bit field. */
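
        /* For example, forbidding (c_j - c_i) = 1 sets bit (1 + 3) = 4, i.e.
         * 0x10, in linear[i][j]; forbidding (c_j - c_i) = -2 sets bit
         * (-2 + 3) = 1, i.e. 0x02. lcra_test_linear() checks candidate
         * solutions against exactly these bits. */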

        uint8_t *linear;

        /* Before solving, forced registers; after solving, solutions. */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
        struct lcra_state *l = calloc(1, sizeof(*l));

        l->node_count = node_count;

        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);

        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

        return l;
}

static void
lcra_free(struct lcra_state *l)
{
        free(l->linear);
        free(l->affinity);
        free(l->solutions);
        free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
{
        if (i == j)
                return;

        uint8_t constraint_fw = 0;
        uint8_t constraint_bw = 0;

        for (unsigned D = 0; D < 4; ++D) {
                if (cmask_i & (cmask_j << D)) {
                        constraint_bw |= (1 << (3 + D));
                        constraint_fw |= (1 << (3 - D));
                }

                if (cmask_i & (cmask_j >> D)) {
                        constraint_fw |= (1 << (3 + D));
                        constraint_bw |= (1 << (3 - D));
                }
        }

        l->linear[j * l->node_count + i] |= constraint_fw;
        l->linear[i * l->node_count + j] |= constraint_bw;
}
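
/* Worked example: suppose node i writes component 0 (cmask_i = 0x1) while
 * component 1 of node j is live (cmask_j = 0x2). At D = 1 the second test
 * fires (cmask_i & (cmask_j >> 1)), forbidding c_i - c_j = +1 and, in the
 * mirrored row, c_j - c_i = -1: node i's base register may not sit exactly
 * one above node j's base, which is precisely the placement where the two
 * live components would collide. */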

static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
        uint8_t *row = &l->linear[i * l->node_count];
        signed constant = solutions[i];

        for (unsigned j = 0; j < l->node_count; ++j) {
                if (solutions[j] == ~0) continue;

                signed lhs = solutions[j] - constant;

                if (lhs < -3 || lhs > 3)
                        continue;

                if (row[j] & (1 << (lhs + 3)))
                        return false;
        }

        return true;
}

static bool
lcra_solve(struct lcra_state *l)
{
        for (unsigned step = 0; step < l->node_count; ++step) {
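                /* Nodes whose solution was forced up front (solutions[] is
                 * already != ~0, e.g. the blend source pinned to r0 in
                 * bi_allocate_registers) and nodes that were never given an
                 * affinity are left alone. */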
                if (l->solutions[step] != ~0) continue;
                if (l->affinity[step] == 0) continue;

                bool succ = false;

                u_foreach_bit64(r, l->affinity[step]) {
                        l->solutions[step] = r;

                        if (lcra_test_linear(l, l->solutions, step)) {
                                succ = true;
                                break;
                        }
                }

                /* Out of registers - prepare to spill */
                if (!succ) {
                        l->spill_node = step;
                        return false;
                }
        }

        return true;
}

/* Register spilling is implemented with a cost-benefit system: nodes written
 * by no_spill instructions are never spilled, and among the remaining nodes
 * the benefit of spilling is estimated from the linear constraints. */

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
        unsigned count = 0;
        uint8_t *constraints = &l->linear[i * l->node_count];

        for (unsigned j = 0; j < l->node_count; ++j)
                count += util_bitcount(constraints[j]);

        return count;
}

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * an allocated base register a must satisfy a + i != b for each i < count and
 * each clobbered register b. Equivalently a != b - i, so we need
 * a \notin { b - i : i < count }. For the entire clobber set B, we need
 * a \notin union over b in B of { b - i : i < count }, and that union is the
 * desired clobber set. It may be written equivalently as the union over
 * i < count of (B - i), where subtraction is defined elementwise and
 * corresponds to a shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64, x even }.
 */
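
/* For example, with clobber = { r5 } and count = 2, the clobbered set is
 * { r5 } | ({ r5 } >> 1) = { r4, r5 }: a vector based at r4 would cover r4-r5
 * and one based at r5 would cover r5-r6, so neither base is allowed. */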

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t clobbered = 0;

        for (unsigned i = 0; i < count; ++i)
                clobbered |= (clobber >> i);

        /* Don't allocate past the end of the register file */
        if (count > 1) {
                unsigned excess = count - 1;
                uint64_t mask = BITFIELD_MASK(excess);
                clobbered |= mask << (64 - excess);

                if (split_file)
                        clobbered |= mask << (16 - excess);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                clobbered |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~clobbered;
}
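
/* For instance, a 4-register vector with split_file set gives excess = 3:
 * r61-r63 and r13-r15 are removed as possible bases (a vector starting there
 * would run off the end of its half of the file), and r16-r47 are clobbered
 * outright. */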

static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
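                        /* For example, a single-register write at offset 1
                         * with r4 and r5 preloaded: the write's affinity
                         * clears bits 4 and 5, and shifting right by 1 clears
                         * bits 3 and 4 in the node's affinity, since a node
                         * based at r3 or r4 would place this write on r4 or
                         * r5. */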
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);

                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
                        if (aligned_sr && count >= 2)
                                affinity &= EVEN_BITS_MASK;

                        l->affinity[node] &= (affinity >> offset);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i]) {
                                        lcra_add_node_interference(l, node,
                                                        bi_writemask(ins, d), i, live[i]);
                                }
                        }
                }

                /* Valhall needs >= 64-bit staging reads to be pair-aligned */
                if (aligned_sr && bi_count_read_registers(ins, 0) >= 2) {
                        unsigned node = bi_get_node(ins->src[0]);

                        if (node < node_count)
                                l->affinity[node] &= EVEN_BITS_MASK;
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, blk) {
                uint8_t *live = mem_dup(blk->live_out, node_count);

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                node_count, ctx->inputs->is_blend, !full_regs,
                                ctx->arch >= 9);

                free(live);
        }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count);

        /* Blend shaders are restricted to R0-R15. Other shaders at full
         * occupancy can also access R48-R63; at half occupancy they can
         * access the whole register file. */

        uint64_t default_affinity =
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
                full_regs ? BITFIELD64_MASK(64) :
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
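
        /* That is: 0xffff for blend shaders, all 64 registers with full_regs,
         * and 0xffff00000000ffff (r0-r15 plus r48-r63) otherwise. */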

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned dest = bi_get_node(ins->dest[d]);

                        /* Blend shaders expect the src colour to be in r0-r3 */
                        if (ins->op == BI_OPCODE_BLEND &&
                            !ctx->inputs->is_blend) {
                                unsigned node = bi_get_node(ins->src[0]);
                                assert(node < node_count);
                                l->solutions[node] = 0;
                        }

                        if (dest < node_count)
                                l->affinity[dest] = default_affinity;
                }
        }

        bi_compute_interference(ctx, l, full_regs);

        *success = lcra_solve(l);

        return l;
}

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
        /* Offsets can only be applied when we register allocated an index, or
         * alternatively for FAU's encoding */

        ASSERTED bool is_offset = (index.offset > 0) &&
                (index.type != BI_INDEX_FAU);
        unsigned node_count = bi_max_temp(ctx);

        /* Did we run RA for this index at all? */
        if (bi_get_node(index) >= node_count) {
                assert(!is_offset);
                return index;
        }

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[bi_get_node(index)];
        if (solution < 0) {
                assert(!is_offset);
                return index;
        }

        /* todo: do we want to compose with the subword swizzle? */
        bi_index new_index = bi_register(solution + index.offset);
        new_index.swizzle = index.swizzle;
        new_index.abs = index.abs;
        new_index.neg = index.neg;
        return new_index;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d)
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

                bi_foreach_src(ins, s)
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
        }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
        bi_foreach_src(ins, i) {
                if (bi_is_equiv(ins->src[i], old)) {
                        ins->src[i].type = new.type;
                        ins->src[i].reg = new.reg;
                        ins->src[i].value = new.value;
                }
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node < l->node_count && ins->no_spill)
                                BITSET_SET(no_spill, node);
                }
        }

        unsigned best_benefit = 0;
        signed best_node = -1;

        for (unsigned i = 0; i < l->node_count; ++i) {
                if (BITSET_TEST(no_spill, i)) continue;

                /* Only spill nodes that interfere with the node failing
                 * register allocation. It's pointless to spill anything else */
                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;

                unsigned benefit = lcra_count_constraints(l, i);

                if (benefit > best_benefit) {
                        best_benefit = benefit;
                        best_node = i;
                }
        }

        free(no_spill);
        return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
        unsigned max = 0;

        bi_foreach_src(I, s) {
                if (bi_is_equiv(I->src[s], index)) {
                        unsigned count = bi_count_read_registers(I, s);
                        max = MAX2(max, count + I->src[s].offset);
                }
        }

        return max;
}

/* Once we've chosen a spill node, spill it, returning the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill (store) after every write, fill (load) before every read */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        b.cursor = bi_after_instr(I);
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_to(&b, bits, tmp,
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}

void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info->tls_size;

        /* Try with reduced register pressure to improve thread count on v7 */
        if (ctx->arch == 7) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info->work_reg_count = 32;
                } else {
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the full register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info->work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);
                }
        }

        assert(success);
        assert(l != NULL);

        ctx->info->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}