/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
        unsigned node_count;
        uint64_t *affinity;

        /* Linear constraints imposed. Nested array sized upfront, organized as
         * linear[node_left][node_right]. That is, calculate the index for the
         * constraint between nodes i and j as (i * node_count) + j.
         *
         * Each element is itself a bit field denoting whether a (c_j - c_i)
         * bias is present or not, including negative biases.
         *
         * Note that for Bifrost there are 4 components, so the bias is in the
         * range [-3, 3] and is encoded in an 8-bit field. */

        uint8_t *linear;

        /* Before solving, forced registers; after solving, solutions. */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};
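
/* For illustration only (this sketch is not used by the allocator itself):
 * if node j must not be placed exactly two registers below node i, the
 * forbidden bias is c_j - c_i = -2, recorded by setting bit (-2 + 3) = 1 in
 * linear[(i * node_count) + j]. A candidate assignment is then rejected when
 * the realized bias hits a set bit, e.g.:
 *
 *    signed bias = solutions[j] - solutions[i];                // in [-3, 3]
 *    bool bad = linear[(i * node_count) + j] & (1 << (bias + 3));
 */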
/* This module is an implementation of "Linearly Constrained Register
 * Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
        struct lcra_state *l = calloc(1, sizeof(*l));

        l->node_count = node_count;

        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);

        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

        return l;
}

static void
lcra_free(struct lcra_state *l)
{
        free(l->linear);
        free(l->affinity);
        free(l->solutions);
        free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
                           unsigned j, unsigned cmask_j)
{
        if (i == j)
                return;

        uint8_t constraint_fw = 0;
        uint8_t constraint_bw = 0;

        for (unsigned D = 0; D < 4; ++D) {
                if (cmask_i & (cmask_j << D)) {
                        constraint_bw |= (1 << (3 + D));
                        constraint_fw |= (1 << (3 - D));
                }

                if (cmask_i & (cmask_j >> D)) {
                        constraint_fw |= (1 << (3 + D));
                        constraint_bw |= (1 << (3 - D));
                }
        }

        l->linear[j * l->node_count + i] |= constraint_fw;
        l->linear[i * l->node_count + j] |= constraint_bw;
}
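
/* Worked example, for illustration only: suppose node i writes components
 * {x, y} (cmask_i = 0b0011), node j writes component {x} (cmask_j = 0b0001),
 * and the two are simultaneously live. Their allocations overlap when
 * c_j == c_i (j's x lands on i's x) or c_j == c_i + 1 (j's x lands on i's y),
 * so the biases 0 and +1 are forbidden while -1 remains legal. The loop above
 * detects exactly these cases: cmask_i & (cmask_j << D) is nonzero only for
 * D = 0 and D = 1, setting the corresponding bias bits in both directions. */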
static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
        uint8_t *row = &l->linear[i * l->node_count];
        signed constant = solutions[i];

        for (unsigned j = 0; j < l->node_count; ++j) {
                if (solutions[j] == ~0) continue;

                signed lhs = solutions[j] - constant;

                if (lhs < -3 || lhs > 3)
                        continue;

                if (row[j] & (1 << (lhs + 3)))
                        return false;
        }

        return true;
}

static bool
lcra_solve(struct lcra_state *l)
{
        for (unsigned step = 0; step < l->node_count; ++step) {
                if (l->solutions[step] != ~0) continue;
                if (l->affinity[step] == 0) continue;

                bool succ = false;

                u_foreach_bit64(r, l->affinity[step]) {
                        l->solutions[step] = r;

                        if (lcra_test_linear(l, l->solutions, step)) {
                                succ = true;
                                break;
                        }
                }

                /* Out of registers - prepare to spill */
                if (!succ) {
                        l->spill_node = step;
                        return false;
                }
        }

        return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
        unsigned count = 0;
        uint8_t *constraints = &l->linear[i * l->node_count];

        for (unsigned j = 0; j < l->node_count; ++j)
                count += util_bitcount(constraints[j]);

        return count;
}

/* Construct an affinity mask such that a vector with `count` elements based at
 * register a does not intersect any of the registers in the bitset `clobber`.
 * In other words, the allocated base register a needs to satisfy a + i != b
 * for each i < count and each clobbered register b. Equivalently that's
 * a != b - i, so we need a \notin { b - i : i < count }. For the entire
 * clobber set B, we need a \notin \bigcup_{b \in B} { b - i : i < count },
 * where that union is the desired clobber set. That may be written
 * equivalently as the union over i < count of (B - i), where subtraction is
 * defined elementwise and corresponds to a right shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64, x even }.
 */

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t clobbered = 0;

        for (unsigned i = 0; i < count; ++i)
                clobbered |= (clobber >> i);

        /* Don't allocate past the end of the register file */
        if (count > 1) {
                unsigned excess = count - 1;
                uint64_t mask = BITFIELD_MASK(excess);
                clobbered |= mask << (64 - excess);

                if (split_file)
                        clobbered |= mask << (16 - excess);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                clobbered |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~clobbered;
}
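
/* Worked example, for illustration only:
 *
 *    bi_make_affinity(BITFIELD64_BIT(4), 2, false)
 *
 * ORs in (clobber >> 0) and (clobber >> 1), clobbering {r3, r4}: a pair based
 * at r3 would spill over into the reserved r4. With count = 2 the top register
 * r63 is also clobbered so the pair cannot run off the end of the file; with
 * split_file the same guard is applied at r15, and the whole r16-r47 range is
 * removed as well. The returned affinity is the complement of that set. */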
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
                     uint64_t preload_live, unsigned node_count, bool is_blend,
                     bool split_file, bool aligned_sr)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);

                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
                        if (aligned_sr && count >= 2)
                                affinity &= EVEN_BITS_MASK;

                        l->affinity[node] &= (affinity >> offset);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i]) {
                                        lcra_add_node_interference(l, node,
                                                bi_writemask(ins, d), i, live[i]);
                                }
                        }
                }

                /* Valhall needs >= 64-bit staging reads to be pair-aligned */
                if (aligned_sr && bi_count_read_registers(ins, 0) >= 2) {
                        unsigned node = bi_get_node(ins->src[0]);

                        if (node < node_count)
                                l->affinity[node] &= EVEN_BITS_MASK;
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, blk) {
                uint8_t *live = mem_dup(blk->live_out, node_count);

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                     node_count, ctx->inputs->is_blend, !full_regs,
                                     ctx->arch >= 9);

                free(live);
        }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count);

        /* Blend shaders are restricted to R0-R15. Other shaders at full
         * occupancy also can access R48-R63. At half occupancy they can access
         * the whole file. */

        uint64_t default_affinity =
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
                full_regs ? BITFIELD64_MASK(64) :
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned dest = bi_get_node(ins->dest[d]);

                        /* Blend shaders expect the src colour to be in r0-r3 */
                        if (ins->op == BI_OPCODE_BLEND &&
                            !ctx->inputs->is_blend) {
                                unsigned node = bi_get_node(ins->src[0]);
                                assert(node < node_count);
                                l->solutions[node] = 0;
                        }

                        if (dest < node_count)
                                l->affinity[dest] = default_affinity;
                }
        }

        bi_compute_interference(ctx, l, full_regs);

        *success = lcra_solve(l);

        return l;
}
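
/* Note, for illustration: forcing l->solutions[node] = 0 above pins the blend
 * source colour to the registers starting at r0 before solving; lcra_solve()
 * skips nodes whose solution is already set, so the preset value is honoured
 * rather than re-derived. The affinity masks map to registers as bit sets,
 * e.g. BITFIELD64_MASK(16) = {r0..r15} and BITFIELD64_MASK(16) << 48 =
 * {r48..r63}. */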
static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
        /* Offsets can only be applied when we register allocated an index, or
         * alternatively for FAU's encoding */

        ASSERTED bool is_offset = (index.offset > 0) &&
                (index.type != BI_INDEX_FAU);
        unsigned node_count = bi_max_temp(ctx);

        /* Did we run RA for this index at all? */
        if (bi_get_node(index) >= node_count) {
                assert(!is_offset);
                return index;
        }

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[bi_get_node(index)];
        if (solution < 0) {
                assert(!is_offset);
                return index;
        }

        /* todo: do we want to compose with the subword swizzle? */
        bi_index new_index = bi_register(solution + index.offset);
        new_index.swizzle = index.swizzle;
        new_index.abs = index.abs;
        new_index.neg = index.neg;
        return new_index;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d)
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

                bi_foreach_src(ins, s)
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
        }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
        bi_foreach_src(ins, i) {
                if (bi_is_equiv(ins->src[i], old)) {
                        ins->src[i].type = new.type;
                        ins->src[i].reg = new.reg;
                        ins->src[i].value = new.value;
                }
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node < l->node_count && ins->no_spill)
                                BITSET_SET(no_spill, node);
                }
        }

        unsigned best_benefit = 0;
        signed best_node = -1;

        for (unsigned i = 0; i < l->node_count; ++i) {
                if (BITSET_TEST(no_spill, i)) continue;

                /* Only spill nodes that interfere with the node failing
                 * register allocation. It's pointless to spill anything else */
                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;

                unsigned benefit = lcra_count_constraints(l, i);

                if (benefit > best_benefit) {
                        best_benefit = benefit;
                        best_node = i;
                }
        }

        free(no_spill);
        return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
        unsigned max = 0;

        bi_foreach_src(I, s) {
                if (bi_is_equiv(I->src[s], index)) {
                        unsigned count = bi_count_read_registers(I, s);
                        max = MAX2(max, count + I->src[s].offset);
                }
        }

        return max;
}

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill after every store, fill before every load */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        b.cursor = bi_after_instr(I);
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_to(&b, bits, tmp,
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}
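
/* For illustration only: a spilled node occupying `channels` 32-bit channels
 * gets a thread-local-storage slot of channels * 4 bytes starting at `offset`;
 * a write to channel 2 of the node, for instance, is stored at offset + 8.
 * The caller packs successive spill slots back to back by passing the running
 * spill_count as the next offset and adding the returned size to it. */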
void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info->tls_size;

        /* Try with reduced register pressure to improve thread count on v7 */
        if (ctx->arch == 7) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info->work_reg_count = 32;
                } else {
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info->work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);
                }
        }

        assert(success);
        assert(l != NULL);

        ctx->info->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}