1/* 2 * Copyright © 2014 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "util/ralloc.h" 25#include "util/register_allocate.h" 26#include "vc4_context.h" 27#include "vc4_qir.h" 28#include "vc4_qpu.h" 29 30#define QPU_R(file, index) { QPU_MUX_##file, index } 31 32static const struct qpu_reg vc4_regs[] = { 33 { QPU_MUX_R0, 0}, 34 { QPU_MUX_R1, 0}, 35 { QPU_MUX_R2, 0}, 36 { QPU_MUX_R3, 0}, 37 { QPU_MUX_R4, 0}, 38 QPU_R(A, 0), 39 QPU_R(B, 0), 40 QPU_R(A, 1), 41 QPU_R(B, 1), 42 QPU_R(A, 2), 43 QPU_R(B, 2), 44 QPU_R(A, 3), 45 QPU_R(B, 3), 46 QPU_R(A, 4), 47 QPU_R(B, 4), 48 QPU_R(A, 5), 49 QPU_R(B, 5), 50 QPU_R(A, 6), 51 QPU_R(B, 6), 52 QPU_R(A, 7), 53 QPU_R(B, 7), 54 QPU_R(A, 8), 55 QPU_R(B, 8), 56 QPU_R(A, 9), 57 QPU_R(B, 9), 58 QPU_R(A, 10), 59 QPU_R(B, 10), 60 QPU_R(A, 11), 61 QPU_R(B, 11), 62 QPU_R(A, 12), 63 QPU_R(B, 12), 64 QPU_R(A, 13), 65 QPU_R(B, 13), 66 QPU_R(A, 14), 67 QPU_R(B, 14), 68 QPU_R(A, 15), 69 QPU_R(B, 15), 70 QPU_R(A, 16), 71 QPU_R(B, 16), 72 QPU_R(A, 17), 73 QPU_R(B, 17), 74 QPU_R(A, 18), 75 QPU_R(B, 18), 76 QPU_R(A, 19), 77 QPU_R(B, 19), 78 QPU_R(A, 20), 79 QPU_R(B, 20), 80 QPU_R(A, 21), 81 QPU_R(B, 21), 82 QPU_R(A, 22), 83 QPU_R(B, 22), 84 QPU_R(A, 23), 85 QPU_R(B, 23), 86 QPU_R(A, 24), 87 QPU_R(B, 24), 88 QPU_R(A, 25), 89 QPU_R(B, 25), 90 QPU_R(A, 26), 91 QPU_R(B, 26), 92 QPU_R(A, 27), 93 QPU_R(B, 27), 94 QPU_R(A, 28), 95 QPU_R(B, 28), 96 QPU_R(A, 29), 97 QPU_R(B, 29), 98 QPU_R(A, 30), 99 QPU_R(B, 30), 100 QPU_R(A, 31), 101 QPU_R(B, 31), 102}; 103#define ACC_INDEX 0 104#define ACC_COUNT 5 105#define AB_INDEX (ACC_INDEX + ACC_COUNT) 106#define AB_COUNT 64 107 108static void 109vc4_alloc_reg_set(struct vc4_context *vc4) 110{ 111 assert(vc4_regs[AB_INDEX].addr == 0); 112 assert(vc4_regs[AB_INDEX + 1].addr == 0); 113 STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64); 114 115 if (vc4->regs) 116 return; 117 118 vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); 119 120 /* The physical regfiles split us into two classes, with [0] being the 121 * whole space and [1] being the bottom half (for threaded fragment 122 * shaders). 123 */ 124 for (int i = 0; i < 2; i++) { 125 vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs); 126 vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs); 127 vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs); 128 vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs); 129 vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs); 130 } 131 vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs); 132 133 /* r0-r3 */ 134 for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { 135 ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); 136 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); 137 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); 138 } 139 140 /* R4 gets a special class because it can't be written as a general 141 * purpose register. (it's TMU_NOSWAP as a write address). 142 */ 143 for (int i = 0; i < 2; i++) { 144 ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i], 145 ACC_INDEX + 4); 146 ra_class_add_reg(vc4->regs, vc4->reg_class_any[i], 147 ACC_INDEX + 4); 148 } 149 150 /* A/B */ 151 for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { 152 /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in 153 * vc4_qpu_emit.c 154 */ 155 if (vc4_regs[i].addr == 14) 156 continue; 157 158 ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i); 159 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i); 160 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); 161 162 if (vc4_regs[i].addr < 16) { 163 ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i); 164 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i); 165 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); 166 } 167 168 169 /* A only */ 170 if (((i - AB_INDEX) & 1) == 0) { 171 ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i); 172 ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i); 173 174 if (vc4_regs[i].addr < 16) { 175 ra_class_add_reg(vc4->regs, 176 vc4->reg_class_a[1], i); 177 ra_class_add_reg(vc4->regs, 178 vc4->reg_class_r4_or_a[1], i); 179 } 180 } 181 } 182 183 ra_set_finalize(vc4->regs, NULL); 184} 185 186struct node_to_temp_map { 187 uint32_t temp; 188 uint32_t priority; 189}; 190 191static int 192node_to_temp_priority(const void *in_a, const void *in_b) 193{ 194 const struct node_to_temp_map *a = in_a; 195 const struct node_to_temp_map *b = in_b; 196 197 return a->priority - b->priority; 198} 199 200#define CLASS_BIT_A (1 << 0) 201#define CLASS_BIT_B (1 << 1) 202#define CLASS_BIT_R4 (1 << 2) 203#define CLASS_BIT_R0_R3 (1 << 4) 204 205struct vc4_ra_select_callback_data { 206 uint32_t next_acc; 207 uint32_t next_ab; 208}; 209 210static unsigned int 211vc4_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) 212{ 213 struct vc4_ra_select_callback_data *vc4_ra = data; 214 215 /* If r4 is available, always choose it -- few other things can go 216 * there, and choosing anything else means inserting a mov. 217 */ 218 if (BITSET_TEST(regs, ACC_INDEX + 4)) 219 return ACC_INDEX + 4; 220 221 /* Choose an accumulator if possible (no delay between write and 222 * read), but round-robin through them to give post-RA instruction 223 * selection more options. 224 */ 225 for (int i = 0; i < ACC_COUNT; i++) { 226 int acc_off = (vc4_ra->next_acc + i) % ACC_COUNT; 227 int acc = ACC_INDEX + acc_off; 228 229 if (BITSET_TEST(regs, acc)) { 230 vc4_ra->next_acc = acc_off + 1; 231 return acc; 232 } 233 } 234 235 for (int i = 0; i < AB_COUNT; i++) { 236 int ab_off = (vc4_ra->next_ab + i) % AB_COUNT; 237 int ab = AB_INDEX + ab_off; 238 239 if (BITSET_TEST(regs, ab)) { 240 vc4_ra->next_ab = ab_off + 1; 241 return ab; 242 } 243 } 244 245 unreachable("RA must pass us at least one possible reg."); 246} 247 248/** 249 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 250 * 251 * The return value should be freed by the caller. 252 */ 253struct qpu_reg * 254vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) 255{ 256 struct node_to_temp_map map[c->num_temps]; 257 uint32_t temp_to_node[c->num_temps]; 258 uint8_t class_bits[c->num_temps]; 259 struct qpu_reg *temp_registers = calloc(c->num_temps, 260 sizeof(*temp_registers)); 261 struct vc4_ra_select_callback_data callback_data = { 262 .next_acc = 0, 263 .next_ab = 0, 264 }; 265 266 /* If things aren't ever written (undefined values), just read from 267 * r0. 268 */ 269 for (uint32_t i = 0; i < c->num_temps; i++) 270 temp_registers[i] = qpu_rn(0); 271 272 vc4_alloc_reg_set(vc4); 273 274 struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, 275 c->num_temps); 276 277 /* Compute the live ranges so we can figure out interference. */ 278 qir_calculate_live_intervals(c); 279 280 ra_set_select_reg_callback(g, vc4_ra_select_callback, &callback_data); 281 282 for (uint32_t i = 0; i < c->num_temps; i++) { 283 map[i].temp = i; 284 map[i].priority = c->temp_end[i] - c->temp_start[i]; 285 } 286 qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); 287 for (uint32_t i = 0; i < c->num_temps; i++) { 288 temp_to_node[map[i].temp] = i; 289 } 290 291 /* Figure out our register classes and preallocated registers. We 292 * start with any temp being able to be in any file, then instructions 293 * incrementally remove bits that the temp definitely can't be in. 294 */ 295 memset(class_bits, 296 CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3, 297 sizeof(class_bits)); 298 299 int ip = 0; 300 qir_for_each_inst_inorder(inst, c) { 301 if (qir_writes_r4(inst)) { 302 /* This instruction writes r4 (and optionally moves 303 * its result to a temp), so nothing else can be 304 * stored in r4 across it. 305 */ 306 for (int i = 0; i < c->num_temps; i++) { 307 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 308 class_bits[i] &= ~CLASS_BIT_R4; 309 } 310 311 /* If we're doing a conditional write of something 312 * writing R4 (math, tex results), then make sure that 313 * we store in a temp so that we actually 314 * conditionally move the result. 315 */ 316 if (inst->cond != QPU_COND_ALWAYS) 317 class_bits[inst->dst.index] &= ~CLASS_BIT_R4; 318 } else { 319 /* R4 can't be written as a general purpose 320 * register. (it's TMU_NOSWAP as a write address). 321 */ 322 if (inst->dst.file == QFILE_TEMP) 323 class_bits[inst->dst.index] &= ~CLASS_BIT_R4; 324 } 325 326 switch (inst->op) { 327 case QOP_FRAG_Z: 328 ra_set_node_reg(g, temp_to_node[inst->dst.index], 329 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1); 330 break; 331 332 case QOP_FRAG_W: 333 ra_set_node_reg(g, temp_to_node[inst->dst.index], 334 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); 335 break; 336 337 case QOP_ROT_MUL: 338 assert(inst->src[0].file == QFILE_TEMP); 339 class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; 340 break; 341 342 case QOP_THRSW: 343 /* All accumulators are invalidated across a thread 344 * switch. 345 */ 346 for (int i = 0; i < c->num_temps; i++) { 347 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 348 class_bits[i] &= ~(CLASS_BIT_R0_R3 | 349 CLASS_BIT_R4); 350 } 351 break; 352 353 default: 354 break; 355 } 356 357 if (inst->dst.pack && !qir_is_mul(inst)) { 358 /* The non-MUL pack flags require an A-file dst 359 * register. 360 */ 361 class_bits[inst->dst.index] &= CLASS_BIT_A; 362 } 363 364 /* Apply restrictions for src unpacks. The integer unpacks 365 * can only be done from regfile A, while float unpacks can be 366 * either A or R4. 367 */ 368 for (int i = 0; i < qir_get_nsrc(inst); i++) { 369 if (inst->src[i].file == QFILE_TEMP && 370 inst->src[i].pack) { 371 if (qir_is_float_input(inst)) { 372 class_bits[inst->src[i].index] &= 373 CLASS_BIT_A | CLASS_BIT_R4; 374 } else { 375 class_bits[inst->src[i].index] &= 376 CLASS_BIT_A; 377 } 378 } 379 } 380 381 ip++; 382 } 383 384 for (uint32_t i = 0; i < c->num_temps; i++) { 385 int node = temp_to_node[i]; 386 387 switch (class_bits[i]) { 388 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: 389 ra_set_node_class(g, node, 390 vc4->reg_class_any[c->fs_threaded]); 391 break; 392 case CLASS_BIT_A | CLASS_BIT_B: 393 ra_set_node_class(g, node, 394 vc4->reg_class_a_or_b[c->fs_threaded]); 395 break; 396 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: 397 ra_set_node_class(g, node, 398 vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); 399 break; 400 case CLASS_BIT_A | CLASS_BIT_R4: 401 ra_set_node_class(g, node, 402 vc4->reg_class_r4_or_a[c->fs_threaded]); 403 break; 404 case CLASS_BIT_A: 405 ra_set_node_class(g, node, 406 vc4->reg_class_a[c->fs_threaded]); 407 break; 408 case CLASS_BIT_R0_R3: 409 ra_set_node_class(g, node, vc4->reg_class_r0_r3); 410 break; 411 412 default: 413 /* DDX/DDY used across thread switched might get us 414 * here. 415 */ 416 if (c->fs_threaded) { 417 c->failed = true; 418 free(temp_registers); 419 return NULL; 420 } 421 422 fprintf(stderr, "temp %d: bad class bits: 0x%x\n", 423 i, class_bits[i]); 424 abort(); 425 break; 426 } 427 } 428 429 for (uint32_t i = 0; i < c->num_temps; i++) { 430 for (uint32_t j = i + 1; j < c->num_temps; j++) { 431 if (!(c->temp_start[i] >= c->temp_end[j] || 432 c->temp_start[j] >= c->temp_end[i])) { 433 ra_add_node_interference(g, 434 temp_to_node[i], 435 temp_to_node[j]); 436 } 437 } 438 } 439 440 bool ok = ra_allocate(g); 441 if (!ok) { 442 if (!c->fs_threaded) { 443 fprintf(stderr, "Failed to register allocate:\n"); 444 qir_dump(c); 445 } 446 447 c->failed = true; 448 free(temp_registers); 449 return NULL; 450 } 451 452 for (uint32_t i = 0; i < c->num_temps; i++) { 453 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; 454 455 /* If the value's never used, just write to the NOP register 456 * for clarity in debug output. 457 */ 458 if (c->temp_start[i] == c->temp_end[i]) 459 temp_registers[i] = qpu_ra(QPU_W_NOP); 460 } 461 462 ralloc_free(g); 463 464 return temp_registers; 465} 466