1/* 2 * Copyright © 2014 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "util/ralloc.h" 25#include "util/register_allocate.h" 26#include "vc4_context.h" 27#include "vc4_qir.h" 28#include "vc4_qpu.h" 29 30#define QPU_R(file, index) { QPU_MUX_##file, index } 31 32static const struct qpu_reg vc4_regs[] = { 33 { QPU_MUX_R0, 0}, 34 { QPU_MUX_R1, 0}, 35 { QPU_MUX_R2, 0}, 36 { QPU_MUX_R3, 0}, 37 { QPU_MUX_R4, 0}, 38 QPU_R(A, 0), 39 QPU_R(B, 0), 40 QPU_R(A, 1), 41 QPU_R(B, 1), 42 QPU_R(A, 2), 43 QPU_R(B, 2), 44 QPU_R(A, 3), 45 QPU_R(B, 3), 46 QPU_R(A, 4), 47 QPU_R(B, 4), 48 QPU_R(A, 5), 49 QPU_R(B, 5), 50 QPU_R(A, 6), 51 QPU_R(B, 6), 52 QPU_R(A, 7), 53 QPU_R(B, 7), 54 QPU_R(A, 8), 55 QPU_R(B, 8), 56 QPU_R(A, 9), 57 QPU_R(B, 9), 58 QPU_R(A, 10), 59 QPU_R(B, 10), 60 QPU_R(A, 11), 61 QPU_R(B, 11), 62 QPU_R(A, 12), 63 QPU_R(B, 12), 64 QPU_R(A, 13), 65 QPU_R(B, 13), 66 QPU_R(A, 14), 67 QPU_R(B, 14), 68 QPU_R(A, 15), 69 QPU_R(B, 15), 70 QPU_R(A, 16), 71 QPU_R(B, 16), 72 QPU_R(A, 17), 73 QPU_R(B, 17), 74 QPU_R(A, 18), 75 QPU_R(B, 18), 76 QPU_R(A, 19), 77 QPU_R(B, 19), 78 QPU_R(A, 20), 79 QPU_R(B, 20), 80 QPU_R(A, 21), 81 QPU_R(B, 21), 82 QPU_R(A, 22), 83 QPU_R(B, 22), 84 QPU_R(A, 23), 85 QPU_R(B, 23), 86 QPU_R(A, 24), 87 QPU_R(B, 24), 88 QPU_R(A, 25), 89 QPU_R(B, 25), 90 QPU_R(A, 26), 91 QPU_R(B, 26), 92 QPU_R(A, 27), 93 QPU_R(B, 27), 94 QPU_R(A, 28), 95 QPU_R(B, 28), 96 QPU_R(A, 29), 97 QPU_R(B, 29), 98 QPU_R(A, 30), 99 QPU_R(B, 30), 100 QPU_R(A, 31), 101 QPU_R(B, 31), 102}; 103#define ACC_INDEX 0 104#define ACC_COUNT 5 105#define AB_INDEX (ACC_INDEX + ACC_COUNT) 106#define AB_COUNT 64 107 108static void 109vc4_alloc_reg_set(struct vc4_context *vc4) 110{ 111 assert(vc4_regs[AB_INDEX].addr == 0); 112 assert(vc4_regs[AB_INDEX + 1].addr == 0); 113 STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64); 114 115 if (vc4->regs) 116 return; 117 118 vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), false); 119 120 /* The physical regfiles split us into two classes, with [0] being the 121 * whole space and [1] being the bottom half (for threaded fragment 122 * shaders). 123 */ 124 for (int i = 0; i < 2; i++) { 125 vc4->reg_class_any[i] = ra_alloc_contig_reg_class(vc4->regs, 1); 126 vc4->reg_class_a_or_b[i] = ra_alloc_contig_reg_class(vc4->regs, 1); 127 vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_contig_reg_class(vc4->regs, 1); 128 vc4->reg_class_r4_or_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1); 129 vc4->reg_class_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1); 130 } 131 vc4->reg_class_r0_r3 = ra_alloc_contig_reg_class(vc4->regs, 1); 132 133 /* r0-r3 */ 134 for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { 135 ra_class_add_reg(vc4->reg_class_r0_r3, i); 136 ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i); 137 ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i); 138 } 139 140 /* R4 gets a special class because it can't be written as a general 141 * purpose register. (it's TMU_NOSWAP as a write address). 142 */ 143 for (int i = 0; i < 2; i++) { 144 ra_class_add_reg(vc4->reg_class_r4_or_a[i], ACC_INDEX + 4); 145 ra_class_add_reg(vc4->reg_class_any[i], ACC_INDEX + 4); 146 } 147 148 /* A/B */ 149 for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { 150 /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in 151 * vc4_qpu_emit.c 152 */ 153 if (vc4_regs[i].addr == 14) 154 continue; 155 156 ra_class_add_reg(vc4->reg_class_any[0], i); 157 ra_class_add_reg(vc4->reg_class_a_or_b[0], i); 158 ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i); 159 160 if (vc4_regs[i].addr < 16) { 161 ra_class_add_reg(vc4->reg_class_any[1], i); 162 ra_class_add_reg(vc4->reg_class_a_or_b[1], i); 163 ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i); 164 } 165 166 167 /* A only */ 168 if (((i - AB_INDEX) & 1) == 0) { 169 ra_class_add_reg(vc4->reg_class_a[0], i); 170 ra_class_add_reg(vc4->reg_class_r4_or_a[0], i); 171 172 if (vc4_regs[i].addr < 16) { 173 ra_class_add_reg(vc4->reg_class_a[1], i); 174 ra_class_add_reg(vc4->reg_class_r4_or_a[1], i); 175 } 176 } 177 } 178 179 ra_set_finalize(vc4->regs, NULL); 180} 181 182struct node_to_temp_map { 183 uint32_t temp; 184 uint32_t priority; 185}; 186 187static int 188node_to_temp_priority(const void *in_a, const void *in_b) 189{ 190 const struct node_to_temp_map *a = in_a; 191 const struct node_to_temp_map *b = in_b; 192 193 return a->priority - b->priority; 194} 195 196#define CLASS_BIT_A (1 << 0) 197#define CLASS_BIT_B (1 << 1) 198#define CLASS_BIT_R4 (1 << 2) 199#define CLASS_BIT_R0_R3 (1 << 4) 200 201struct vc4_ra_select_callback_data { 202 uint32_t next_acc; 203 uint32_t next_ab; 204}; 205 206static unsigned int 207vc4_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) 208{ 209 struct vc4_ra_select_callback_data *vc4_ra = data; 210 211 /* If r4 is available, always choose it -- few other things can go 212 * there, and choosing anything else means inserting a mov. 213 */ 214 if (BITSET_TEST(regs, ACC_INDEX + 4)) 215 return ACC_INDEX + 4; 216 217 /* Choose an accumulator if possible (no delay between write and 218 * read), but round-robin through them to give post-RA instruction 219 * selection more options. 220 */ 221 for (int i = 0; i < ACC_COUNT; i++) { 222 int acc_off = (vc4_ra->next_acc + i) % ACC_COUNT; 223 int acc = ACC_INDEX + acc_off; 224 225 if (BITSET_TEST(regs, acc)) { 226 vc4_ra->next_acc = acc_off + 1; 227 return acc; 228 } 229 } 230 231 for (int i = 0; i < AB_COUNT; i++) { 232 int ab_off = (vc4_ra->next_ab + i) % AB_COUNT; 233 int ab = AB_INDEX + ab_off; 234 235 if (BITSET_TEST(regs, ab)) { 236 vc4_ra->next_ab = ab_off + 1; 237 return ab; 238 } 239 } 240 241 unreachable("RA must pass us at least one possible reg."); 242} 243 244/** 245 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 246 * 247 * The return value should be freed by the caller. 248 */ 249struct qpu_reg * 250vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) 251{ 252 struct node_to_temp_map map[c->num_temps]; 253 uint32_t temp_to_node[c->num_temps]; 254 uint8_t class_bits[c->num_temps]; 255 struct qpu_reg *temp_registers = calloc(c->num_temps, 256 sizeof(*temp_registers)); 257 struct vc4_ra_select_callback_data callback_data = { 258 .next_acc = 0, 259 .next_ab = 0, 260 }; 261 262 /* If things aren't ever written (undefined values), just read from 263 * r0. 264 */ 265 for (uint32_t i = 0; i < c->num_temps; i++) 266 temp_registers[i] = qpu_rn(0); 267 268 vc4_alloc_reg_set(vc4); 269 270 struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, 271 c->num_temps); 272 273 /* Compute the live ranges so we can figure out interference. */ 274 qir_calculate_live_intervals(c); 275 276 ra_set_select_reg_callback(g, vc4_ra_select_callback, &callback_data); 277 278 for (uint32_t i = 0; i < c->num_temps; i++) { 279 map[i].temp = i; 280 map[i].priority = c->temp_end[i] - c->temp_start[i]; 281 } 282 qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); 283 for (uint32_t i = 0; i < c->num_temps; i++) { 284 temp_to_node[map[i].temp] = i; 285 } 286 287 /* Figure out our register classes and preallocated registers. We 288 * start with any temp being able to be in any file, then instructions 289 * incrementally remove bits that the temp definitely can't be in. 290 */ 291 memset(class_bits, 292 CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3, 293 sizeof(class_bits)); 294 295 int ip = 0; 296 qir_for_each_inst_inorder(inst, c) { 297 if (qir_writes_r4(inst)) { 298 /* This instruction writes r4 (and optionally moves 299 * its result to a temp), so nothing else can be 300 * stored in r4 across it. 301 */ 302 for (int i = 0; i < c->num_temps; i++) { 303 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 304 class_bits[i] &= ~CLASS_BIT_R4; 305 } 306 307 /* If we're doing a conditional write of something 308 * writing R4 (math, tex results), then make sure that 309 * we store in a temp so that we actually 310 * conditionally move the result. 311 */ 312 if (inst->cond != QPU_COND_ALWAYS) 313 class_bits[inst->dst.index] &= ~CLASS_BIT_R4; 314 } else { 315 /* R4 can't be written as a general purpose 316 * register. (it's TMU_NOSWAP as a write address). 317 */ 318 if (inst->dst.file == QFILE_TEMP) 319 class_bits[inst->dst.index] &= ~CLASS_BIT_R4; 320 } 321 322 switch (inst->op) { 323 case QOP_FRAG_Z: 324 ra_set_node_reg(g, temp_to_node[inst->dst.index], 325 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1); 326 break; 327 328 case QOP_FRAG_W: 329 ra_set_node_reg(g, temp_to_node[inst->dst.index], 330 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); 331 break; 332 333 case QOP_ROT_MUL: 334 assert(inst->src[0].file == QFILE_TEMP); 335 class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; 336 break; 337 338 case QOP_THRSW: 339 /* All accumulators are invalidated across a thread 340 * switch. 341 */ 342 for (int i = 0; i < c->num_temps; i++) { 343 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 344 class_bits[i] &= ~(CLASS_BIT_R0_R3 | 345 CLASS_BIT_R4); 346 } 347 break; 348 349 default: 350 break; 351 } 352 353 if (inst->dst.pack && !qir_is_mul(inst)) { 354 /* The non-MUL pack flags require an A-file dst 355 * register. 356 */ 357 class_bits[inst->dst.index] &= CLASS_BIT_A; 358 } 359 360 /* Apply restrictions for src unpacks. The integer unpacks 361 * can only be done from regfile A, while float unpacks can be 362 * either A or R4. 363 */ 364 for (int i = 0; i < qir_get_nsrc(inst); i++) { 365 if (inst->src[i].file == QFILE_TEMP && 366 inst->src[i].pack) { 367 if (qir_is_float_input(inst)) { 368 class_bits[inst->src[i].index] &= 369 CLASS_BIT_A | CLASS_BIT_R4; 370 } else { 371 class_bits[inst->src[i].index] &= 372 CLASS_BIT_A; 373 } 374 } 375 } 376 377 ip++; 378 } 379 380 for (uint32_t i = 0; i < c->num_temps; i++) { 381 int node = temp_to_node[i]; 382 383 switch (class_bits[i]) { 384 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: 385 ra_set_node_class(g, node, 386 vc4->reg_class_any[c->fs_threaded]); 387 break; 388 case CLASS_BIT_A | CLASS_BIT_B: 389 ra_set_node_class(g, node, 390 vc4->reg_class_a_or_b[c->fs_threaded]); 391 break; 392 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: 393 ra_set_node_class(g, node, 394 vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); 395 break; 396 case CLASS_BIT_A | CLASS_BIT_R4: 397 ra_set_node_class(g, node, 398 vc4->reg_class_r4_or_a[c->fs_threaded]); 399 break; 400 case CLASS_BIT_A: 401 ra_set_node_class(g, node, 402 vc4->reg_class_a[c->fs_threaded]); 403 break; 404 case CLASS_BIT_R0_R3: 405 ra_set_node_class(g, node, vc4->reg_class_r0_r3); 406 break; 407 408 default: 409 /* DDX/DDY used across thread switched might get us 410 * here. 411 */ 412 if (c->fs_threaded) { 413 c->failed = true; 414 free(temp_registers); 415 return NULL; 416 } 417 418 fprintf(stderr, "temp %d: bad class bits: 0x%x\n", 419 i, class_bits[i]); 420 abort(); 421 break; 422 } 423 } 424 425 for (uint32_t i = 0; i < c->num_temps; i++) { 426 for (uint32_t j = i + 1; j < c->num_temps; j++) { 427 if (!(c->temp_start[i] >= c->temp_end[j] || 428 c->temp_start[j] >= c->temp_end[i])) { 429 ra_add_node_interference(g, 430 temp_to_node[i], 431 temp_to_node[j]); 432 } 433 } 434 } 435 436 bool ok = ra_allocate(g); 437 if (!ok) { 438 if (!c->fs_threaded) { 439 fprintf(stderr, "Failed to register allocate:\n"); 440 qir_dump(c); 441 } 442 443 c->failed = true; 444 free(temp_registers); 445 return NULL; 446 } 447 448 for (uint32_t i = 0; i < c->num_temps; i++) { 449 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; 450 451 /* If the value's never used, just write to the NOP register 452 * for clarity in debug output. 453 */ 454 if (c->temp_start[i] == c->temp_end[i]) 455 temp_registers[i] = qpu_ra(QPU_W_NOP); 456 } 457 458 ralloc_free(g); 459 460 return temp_registers; 461} 462