1/* 2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Rob Clark <robclark@freedesktop.org> 25 */ 26 27#include "util/u_math.h" 28#include "util/register_allocate.h" 29#include "util/ralloc.h" 30#include "util/bitset.h" 31 32#include "ir3.h" 33#include "ir3_compiler.h" 34 35/* 36 * Register Assignment: 37 * 38 * Uses the register_allocate util, which implements graph coloring 39 * algo with interference classes. To handle the cases where we need 40 * consecutive registers (for example, texture sample instructions), 41 * we model these as larger (double/quad/etc) registers which conflict 42 * with the corresponding registers in other classes. 43 * 44 * Additionally we create additional classes for half-regs, which 45 * do not conflict with the full-reg classes. We do need at least 46 * sizes 1-4 (to deal w/ texture sample instructions output to half- 47 * reg). At the moment we don't create the higher order half-reg 48 * classes as half-reg frequently does not have enough precision 49 * for texture coords at higher resolutions. 50 * 51 * There are some additional cases that we need to handle specially, 52 * as the graph coloring algo doesn't understand "partial writes". 53 * For example, a sequence like: 54 * 55 * add r0.z, ... 56 * sam (f32)(xy)r0.x, ... 57 * ... 58 * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord 59 * 60 * In this scenario, we treat r0.xyz as class size 3, which is written 61 * (from a use/def perspective) at the 'add' instruction and ignore the 62 * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the 63 * defining instruction, as it is the first to partially write r0.xyz. 64 * 65 * Note i965 has a similar scenario, which they solve with a virtual 66 * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after 67 * register assignment. But for us that is horrible from a scheduling 68 * standpoint. Instead what we do is use idea of 'definer' instruction. 69 * Ie. the first instruction (lowest ip) to write to the variable is the 70 * one we consider from use/def perspective when building interference 71 * graph. (Other instructions which write other variable components 72 * just define the variable some more.) 73 * 74 * Arrays of arbitrary size are handled via pre-coloring a consecutive 75 * sequence of registers. Additional scalar (single component) reg 76 * names are allocated starting at ctx->class_base[total_class_count] 77 * (see arr->base), which are pre-colored. In the use/def graph direct 78 * access is treated as a single element use/def, and indirect access 79 * is treated as use or def of all array elements. (Only the first 80 * def is tracked, in case of multiple indirect writes, etc.) 81 * 82 * TODO arrays that fit in one of the pre-defined class sizes should 83 * not need to be pre-colored, but instead could be given a normal 84 * vreg name. (Ignoring this for now since it is a good way to work 85 * out the kinks with arbitrary sized arrays.) 86 * 87 * TODO might be easier for debugging to split this into two passes, 88 * the first assigning vreg names in a way that we could ir3_print() 89 * the result. 90 */ 91 92static const unsigned class_sizes[] = { 93 1, 2, 3, 4, 94 4 + 4, /* txd + 1d/2d */ 95 4 + 6, /* txd + 3d */ 96}; 97#define class_count ARRAY_SIZE(class_sizes) 98 99static const unsigned half_class_sizes[] = { 100 1, 2, 3, 4, 101}; 102#define half_class_count ARRAY_SIZE(half_class_sizes) 103 104/* seems to just be used for compute shaders? Seems like vec1 and vec3 105 * are sufficient (for now?) 106 */ 107static const unsigned high_class_sizes[] = { 108 1, 3, 109}; 110#define high_class_count ARRAY_SIZE(high_class_sizes) 111 112#define total_class_count (class_count + half_class_count + high_class_count) 113 114/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ 115#define NUM_REGS (4 * 48) /* r0 to r47 */ 116#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */ 117#define FIRST_HIGH_REG (4 * 48) 118/* Number of virtual regs in a given class: */ 119#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1)) 120#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1)) 121#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1)) 122 123#define HALF_OFFSET (class_count) 124#define HIGH_OFFSET (class_count + half_class_count) 125 126/* register-set, created one time, used for all shaders: */ 127struct ir3_ra_reg_set { 128 struct ra_regs *regs; 129 unsigned int classes[class_count]; 130 unsigned int half_classes[half_class_count]; 131 unsigned int high_classes[high_class_count]; 132 /* maps flat virtual register space to base gpr: */ 133 uint16_t *ra_reg_to_gpr; 134 /* maps cls,gpr to flat virtual register space: */ 135 uint16_t **gpr_to_ra_reg; 136}; 137 138static void 139build_q_values(unsigned int **q_values, unsigned off, 140 const unsigned *sizes, unsigned count) 141{ 142 for (unsigned i = 0; i < count; i++) { 143 q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count); 144 145 /* From register_allocate.c: 146 * 147 * q(B,C) (indexed by C, B is this register class) in 148 * Runeson/Nyström paper. This is "how many registers of B could 149 * the worst choice register from C conflict with". 150 * 151 * If we just let the register allocation algorithm compute these 152 * values, is extremely expensive. However, since all of our 153 * registers are laid out, we can very easily compute them 154 * ourselves. View the register from C as fixed starting at GRF n 155 * somewhere in the middle, and the register from B as sliding back 156 * and forth. Then the first register to conflict from B is the 157 * one starting at n - class_size[B] + 1 and the last register to 158 * conflict will start at n + class_size[B] - 1. Therefore, the 159 * number of conflicts from B is class_size[B] + class_size[C] - 1. 160 * 161 * +-+-+-+-+-+-+ +-+-+-+-+-+-+ 162 * B | | | | | |n| --> | | | | | | | 163 * +-+-+-+-+-+-+ +-+-+-+-+-+-+ 164 * +-+-+-+-+-+ 165 * C |n| | | | | 166 * +-+-+-+-+-+ 167 * 168 * (Idea copied from brw_fs_reg_allocate.cpp) 169 */ 170 for (unsigned j = 0; j < count; j++) 171 q_values[i + off][j + off] = sizes[i] + sizes[j] - 1; 172 } 173} 174 175/* One-time setup of RA register-set, which describes all the possible 176 * "virtual" registers and their interferences. Ie. double register 177 * occupies (and conflicts with) two single registers, and so forth. 178 * Since registers do not need to be aligned to their class size, they 179 * can conflict with other registers in the same class too. Ie: 180 * 181 * Single (base) | Double 182 * --------------+--------------- 183 * R0 | D0 184 * R1 | D0 D1 185 * R2 | D1 D2 186 * R3 | D2 187 * .. and so on.. 188 * 189 * (NOTE the disassembler uses notation like r0.x/y/z/w but those are 190 * really just four scalar registers. Don't let that confuse you.) 191 */ 192struct ir3_ra_reg_set * 193ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) 194{ 195 struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); 196 unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base; 197 unsigned int **q_values; 198 199 /* calculate # of regs across all classes: */ 200 ra_reg_count = 0; 201 for (unsigned i = 0; i < class_count; i++) 202 ra_reg_count += CLASS_REGS(i); 203 for (unsigned i = 0; i < half_class_count; i++) 204 ra_reg_count += HALF_CLASS_REGS(i); 205 for (unsigned i = 0; i < high_class_count; i++) 206 ra_reg_count += HIGH_CLASS_REGS(i); 207 208 /* allocate and populate q_values: */ 209 q_values = ralloc_array(set, unsigned *, total_class_count); 210 211 build_q_values(q_values, 0, class_sizes, class_count); 212 build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count); 213 build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count); 214 215 /* allocate the reg-set.. */ 216 set->regs = ra_alloc_reg_set(set, ra_reg_count, true); 217 set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); 218 set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); 219 220 /* .. and classes */ 221 reg = 0; 222 for (unsigned i = 0; i < class_count; i++) { 223 set->classes[i] = ra_alloc_reg_class(set->regs); 224 225 set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); 226 227 for (unsigned j = 0; j < CLASS_REGS(i); j++) { 228 ra_class_add_reg(set->regs, set->classes[i], reg); 229 230 set->ra_reg_to_gpr[reg] = j; 231 set->gpr_to_ra_reg[i][j] = reg; 232 233 for (unsigned br = j; br < j + class_sizes[i]; br++) 234 ra_add_transitive_reg_conflict(set->regs, br, reg); 235 236 reg++; 237 } 238 } 239 240 first_half_reg = reg; 241 base = HALF_OFFSET; 242 243 for (unsigned i = 0; i < half_class_count; i++) { 244 set->half_classes[i] = ra_alloc_reg_class(set->regs); 245 246 set->gpr_to_ra_reg[base + i] = 247 ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); 248 249 for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { 250 ra_class_add_reg(set->regs, set->half_classes[i], reg); 251 252 set->ra_reg_to_gpr[reg] = j; 253 set->gpr_to_ra_reg[base + i][j] = reg; 254 255 for (unsigned br = j; br < j + half_class_sizes[i]; br++) 256 ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg); 257 258 reg++; 259 } 260 } 261 262 first_high_reg = reg; 263 base = HIGH_OFFSET; 264 265 for (unsigned i = 0; i < high_class_count; i++) { 266 set->high_classes[i] = ra_alloc_reg_class(set->regs); 267 268 set->gpr_to_ra_reg[base + i] = 269 ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i)); 270 271 for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) { 272 ra_class_add_reg(set->regs, set->high_classes[i], reg); 273 274 set->ra_reg_to_gpr[reg] = j; 275 set->gpr_to_ra_reg[base + i][j] = reg; 276 277 for (unsigned br = j; br < j + high_class_sizes[i]; br++) 278 ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg); 279 280 reg++; 281 } 282 } 283 284 /* starting a6xx, half precision regs conflict w/ full precision regs: */ 285 if (compiler->gpu_id >= 600) { 286 /* because of transitivity, we can get away with just setting up 287 * conflicts between the first class of full and half regs: 288 */ 289 for (unsigned i = 0; i < half_class_count; i++) { 290 /* NOTE there are fewer half class sizes, but they match the 291 * first N full class sizes.. but assert in case that ever 292 * accidentially changes: 293 */ 294 debug_assert(class_sizes[i] == half_class_sizes[i]); 295 for (unsigned j = 0; j < CLASS_REGS(i) / 2; j++) { 296 unsigned freg = set->gpr_to_ra_reg[i][j]; 297 unsigned hreg0 = set->gpr_to_ra_reg[i + HALF_OFFSET][(j * 2) + 0]; 298 unsigned hreg1 = set->gpr_to_ra_reg[i + HALF_OFFSET][(j * 2) + 1]; 299 300 ra_add_transitive_reg_conflict(set->regs, freg, hreg0); 301 ra_add_transitive_reg_conflict(set->regs, freg, hreg1); 302 } 303 } 304 305 // TODO also need to update q_values, but for now: 306 ra_set_finalize(set->regs, NULL); 307 } else { 308 ra_set_finalize(set->regs, q_values); 309 } 310 311 ralloc_free(q_values); 312 313 return set; 314} 315 316/* additional block-data (per-block) */ 317struct ir3_ra_block_data { 318 BITSET_WORD *def; /* variables defined before used in block */ 319 BITSET_WORD *use; /* variables used before defined in block */ 320 BITSET_WORD *livein; /* which defs reach entry point of block */ 321 BITSET_WORD *liveout; /* which defs reach exit point of block */ 322}; 323 324/* additional instruction-data (per-instruction) */ 325struct ir3_ra_instr_data { 326 /* cached instruction 'definer' info: */ 327 struct ir3_instruction *defn; 328 int off, sz, cls; 329}; 330 331/* register-assign context, per-shader */ 332struct ir3_ra_ctx { 333 struct ir3 *ir; 334 gl_shader_stage type; 335 bool frag_face; 336 337 struct ir3_ra_reg_set *set; 338 struct ra_graph *g; 339 unsigned alloc_count; 340 /* one per class, plus one slot for arrays: */ 341 unsigned class_alloc_count[total_class_count + 1]; 342 unsigned class_base[total_class_count + 1]; 343 unsigned instr_cnt; 344 unsigned *def, *use; /* def/use table */ 345 struct ir3_ra_instr_data *instrd; 346}; 347 348/* does it conflict? */ 349static inline bool 350intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) 351{ 352 return !((a_start >= b_end) || (b_start >= a_end)); 353} 354 355static bool 356is_half(struct ir3_instruction *instr) 357{ 358 return !!(instr->regs[0]->flags & IR3_REG_HALF); 359} 360 361static bool 362is_high(struct ir3_instruction *instr) 363{ 364 return !!(instr->regs[0]->flags & IR3_REG_HIGH); 365} 366 367static int 368size_to_class(unsigned sz, bool half, bool high) 369{ 370 if (high) { 371 for (unsigned i = 0; i < high_class_count; i++) 372 if (high_class_sizes[i] >= sz) 373 return i + HIGH_OFFSET; 374 } else if (half) { 375 for (unsigned i = 0; i < half_class_count; i++) 376 if (half_class_sizes[i] >= sz) 377 return i + HALF_OFFSET; 378 } else { 379 for (unsigned i = 0; i < class_count; i++) 380 if (class_sizes[i] >= sz) 381 return i; 382 } 383 debug_assert(0); 384 return -1; 385} 386 387static bool 388writes_gpr(struct ir3_instruction *instr) 389{ 390 if (is_store(instr)) 391 return false; 392 /* is dest a normal temp register: */ 393 struct ir3_register *reg = instr->regs[0]; 394 if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) 395 return false; 396 if ((reg->num == regid(REG_A0, 0)) || 397 (reg->num == regid(REG_P0, 0))) 398 return false; 399 return true; 400} 401 402static bool 403instr_before(struct ir3_instruction *a, struct ir3_instruction *b) 404{ 405 if (a->flags & IR3_INSTR_UNUSED) 406 return false; 407 return (a->ip < b->ip); 408} 409 410static struct ir3_instruction * 411get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, 412 int *sz, int *off) 413{ 414 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; 415 struct ir3_instruction *d = NULL; 416 417 if (id->defn) { 418 *sz = id->sz; 419 *off = id->off; 420 return id->defn; 421 } 422 423 if (instr->opc == OPC_META_FI) { 424 /* What about the case where collect is subset of array, we 425 * need to find the distance between where actual array starts 426 * and fanin.. that probably doesn't happen currently. 427 */ 428 struct ir3_register *src; 429 int dsz, doff; 430 431 /* note: don't use foreach_ssa_src as this gets called once 432 * while assigning regs (which clears SSA flag) 433 */ 434 foreach_src_n(src, n, instr) { 435 struct ir3_instruction *dd; 436 if (!src->instr) 437 continue; 438 439 dd = get_definer(ctx, src->instr, &dsz, &doff); 440 441 if ((!d) || instr_before(dd, d)) { 442 d = dd; 443 *sz = dsz; 444 *off = doff - n; 445 } 446 } 447 448 } else if (instr->cp.right || instr->cp.left) { 449 /* covers also the meta:fo case, which ends up w/ single 450 * scalar instructions for each component: 451 */ 452 struct ir3_instruction *f = ir3_neighbor_first(instr); 453 454 /* by definition, the entire sequence forms one linked list 455 * of single scalar register nodes (even if some of them may 456 * be fanouts from a texture sample (for example) instr. We 457 * just need to walk the list finding the first element of 458 * the group defined (lowest ip) 459 */ 460 int cnt = 0; 461 462 /* need to skip over unused in the group: */ 463 while (f && (f->flags & IR3_INSTR_UNUSED)) { 464 f = f->cp.right; 465 cnt++; 466 } 467 468 while (f) { 469 if ((!d) || instr_before(f, d)) 470 d = f; 471 if (f == instr) 472 *off = cnt; 473 f = f->cp.right; 474 cnt++; 475 } 476 477 *sz = cnt; 478 479 } else { 480 /* second case is looking directly at the instruction which 481 * produces multiple values (eg, texture sample), rather 482 * than the fanout nodes that point back to that instruction. 483 * This isn't quite right, because it may be part of a larger 484 * group, such as: 485 * 486 * sam (f32)(xyzw)r0.x, ... 487 * add r1.x, ... 488 * add r1.y, ... 489 * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) 490 * 491 * need to come up with a better way to handle that case. 492 */ 493 if (instr->address) { 494 *sz = instr->regs[0]->size; 495 } else { 496 *sz = util_last_bit(instr->regs[0]->wrmask); 497 } 498 *off = 0; 499 d = instr; 500 } 501 502 if (d->opc == OPC_META_FO) { 503 struct ir3_instruction *dd; 504 int dsz, doff; 505 506 dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); 507 508 /* by definition, should come before: */ 509 debug_assert(instr_before(dd, d)); 510 511 *sz = MAX2(*sz, dsz); 512 513 if (instr->opc == OPC_META_FO) 514 *off = MAX2(*off, instr->fo.off); 515 516 d = dd; 517 } 518 519 debug_assert(d->opc != OPC_META_FO); 520 521 id->defn = d; 522 id->sz = *sz; 523 id->off = *off; 524 525 return d; 526} 527 528static void 529ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) 530{ 531 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { 532 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; 533 if (instr->regs_count == 0) 534 continue; 535 /* couple special cases: */ 536 if (writes_addr(instr) || writes_pred(instr)) { 537 id->cls = -1; 538 } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { 539 id->cls = total_class_count; 540 } else { 541 /* and the normal case: */ 542 id->defn = get_definer(ctx, instr, &id->sz, &id->off); 543 id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn)); 544 545 /* this is a bit of duct-tape.. if we have a scenario like: 546 * 547 * sam (f32)(x) out.x, ... 548 * sam (f32)(x) out.y, ... 549 * 550 * Then the fanout/split meta instructions for the two different 551 * tex instructions end up grouped as left/right neighbors. The 552 * upshot is that in when you get_definer() on one of the meta:fo's 553 * you get definer as the first sam with sz=2, but when you call 554 * get_definer() on the either of the sam's you get itself as the 555 * definer with sz=1. 556 * 557 * (We actually avoid this scenario exactly, the neighbor links 558 * prevent one of the output mov's from being eliminated, so this 559 * hack should be enough. But probably we need to rethink how we 560 * find the "defining" instruction.) 561 * 562 * TODO how do we figure out offset properly... 563 */ 564 if (id->defn != instr) { 565 struct ir3_ra_instr_data *did = &ctx->instrd[id->defn->ip]; 566 if (did->sz < id->sz) { 567 did->sz = id->sz; 568 did->cls = id->cls; 569 } 570 } 571 } 572 } 573} 574 575/* give each instruction a name (and ip), and count up the # of names 576 * of each class 577 */ 578static void 579ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) 580{ 581 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { 582 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; 583 584#ifdef DEBUG 585 instr->name = ~0; 586#endif 587 588 ctx->instr_cnt++; 589 590 if (instr->regs_count == 0) 591 continue; 592 593 if (!writes_gpr(instr)) 594 continue; 595 596 if (id->defn != instr) 597 continue; 598 599 /* arrays which don't fit in one of the pre-defined class 600 * sizes are pre-colored: 601 */ 602 if ((id->cls >= 0) && (id->cls < total_class_count)) { 603 instr->name = ctx->class_alloc_count[id->cls]++; 604 ctx->alloc_count++; 605 } 606 } 607} 608 609static void 610ra_init(struct ir3_ra_ctx *ctx) 611{ 612 unsigned n, base; 613 614 ir3_clear_mark(ctx->ir); 615 n = ir3_count_instructions(ctx->ir); 616 617 ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); 618 619 list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { 620 ra_block_find_definers(ctx, block); 621 } 622 623 list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { 624 ra_block_name_instructions(ctx, block); 625 } 626 627 /* figure out the base register name for each class. The 628 * actual ra name is class_base[cls] + instr->name; 629 */ 630 ctx->class_base[0] = 0; 631 for (unsigned i = 1; i <= total_class_count; i++) { 632 ctx->class_base[i] = ctx->class_base[i-1] + 633 ctx->class_alloc_count[i-1]; 634 } 635 636 /* and vreg names for array elements: */ 637 base = ctx->class_base[total_class_count]; 638 list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { 639 arr->base = base; 640 ctx->class_alloc_count[total_class_count] += arr->length; 641 base += arr->length; 642 } 643 ctx->alloc_count += ctx->class_alloc_count[total_class_count]; 644 645 ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); 646 ralloc_steal(ctx->g, ctx->instrd); 647 ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); 648 ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); 649} 650 651static unsigned 652__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) 653{ 654 unsigned name; 655 debug_assert(cls >= 0); 656 debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. */ 657 name = ctx->class_base[cls] + defn->name; 658 debug_assert(name < ctx->alloc_count); 659 return name; 660} 661 662static int 663ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) 664{ 665 /* TODO handle name mapping for arrays */ 666 return __ra_name(ctx, id->cls, id->defn); 667} 668 669static void 670ra_destroy(struct ir3_ra_ctx *ctx) 671{ 672 ralloc_free(ctx->g); 673} 674 675static void 676ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) 677{ 678 struct ir3_ra_block_data *bd; 679 unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); 680 681#define def(name, instr) \ 682 do { \ 683 /* defined on first write: */ \ 684 if (!ctx->def[name]) \ 685 ctx->def[name] = instr->ip; \ 686 ctx->use[name] = instr->ip; \ 687 BITSET_SET(bd->def, name); \ 688 } while(0); 689 690#define use(name, instr) \ 691 do { \ 692 ctx->use[name] = MAX2(ctx->use[name], instr->ip); \ 693 if (!BITSET_TEST(bd->def, name)) \ 694 BITSET_SET(bd->use, name); \ 695 } while(0); 696 697 bd = rzalloc(ctx->g, struct ir3_ra_block_data); 698 699 bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); 700 bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); 701 bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); 702 bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); 703 704 block->data = bd; 705 706 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { 707 struct ir3_instruction *src; 708 struct ir3_register *reg; 709 710 if (instr->regs_count == 0) 711 continue; 712 713 /* There are a couple special cases to deal with here: 714 * 715 * fanout: used to split values from a higher class to a lower 716 * class, for example split the results of a texture fetch 717 * into individual scalar values; We skip over these from 718 * a 'def' perspective, and for a 'use' we walk the chain 719 * up to the defining instruction. 720 * 721 * fanin: used to collect values from lower class and assemble 722 * them together into a higher class, for example arguments 723 * to texture sample instructions; We consider these to be 724 * defined at the earliest fanin source. 725 * 726 * Most of this is handled in the get_definer() helper. 727 * 728 * In either case, we trace the instruction back to the original 729 * definer and consider that as the def/use ip. 730 */ 731 732 if (writes_gpr(instr)) { 733 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; 734 struct ir3_register *dst = instr->regs[0]; 735 736 if (dst->flags & IR3_REG_ARRAY) { 737 struct ir3_array *arr = 738 ir3_lookup_array(ctx->ir, dst->array.id); 739 unsigned i; 740 741 arr->start_ip = MIN2(arr->start_ip, instr->ip); 742 arr->end_ip = MAX2(arr->end_ip, instr->ip); 743 744 /* set the node class now.. in case we don't encounter 745 * this array dst again. From register_alloc algo's 746 * perspective, these are all single/scalar regs: 747 */ 748 for (i = 0; i < arr->length; i++) { 749 unsigned name = arr->base + i; 750 ra_set_node_class(ctx->g, name, ctx->set->classes[0]); 751 } 752 753 /* indirect write is treated like a write to all array 754 * elements, since we don't know which one is actually 755 * written: 756 */ 757 if (dst->flags & IR3_REG_RELATIV) { 758 for (i = 0; i < arr->length; i++) { 759 unsigned name = arr->base + i; 760 def(name, instr); 761 } 762 } else { 763 unsigned name = arr->base + dst->array.offset; 764 def(name, instr); 765 } 766 767 } else if (id->defn == instr) { 768 unsigned name = ra_name(ctx, id); 769 770 /* since we are in SSA at this point: */ 771 debug_assert(!BITSET_TEST(bd->use, name)); 772 773 def(name, id->defn); 774 775 if (is_high(id->defn)) { 776 ra_set_node_class(ctx->g, name, 777 ctx->set->high_classes[id->cls - HIGH_OFFSET]); 778 } else if (is_half(id->defn)) { 779 ra_set_node_class(ctx->g, name, 780 ctx->set->half_classes[id->cls - HALF_OFFSET]); 781 } else { 782 ra_set_node_class(ctx->g, name, 783 ctx->set->classes[id->cls]); 784 } 785 } 786 } 787 788 foreach_src(reg, instr) { 789 if (reg->flags & IR3_REG_ARRAY) { 790 struct ir3_array *arr = 791 ir3_lookup_array(ctx->ir, reg->array.id); 792 arr->start_ip = MIN2(arr->start_ip, instr->ip); 793 arr->end_ip = MAX2(arr->end_ip, instr->ip); 794 795 /* indirect read is treated like a read fromall array 796 * elements, since we don't know which one is actually 797 * read: 798 */ 799 if (reg->flags & IR3_REG_RELATIV) { 800 unsigned i; 801 for (i = 0; i < arr->length; i++) { 802 unsigned name = arr->base + i; 803 use(name, instr); 804 } 805 } else { 806 unsigned name = arr->base + reg->array.offset; 807 use(name, instr); 808 /* NOTE: arrays are not SSA so unconditionally 809 * set use bit: 810 */ 811 BITSET_SET(bd->use, name); 812 debug_assert(reg->array.offset < arr->length); 813 } 814 } else if ((src = ssa(reg)) && writes_gpr(src)) { 815 unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); 816 use(name, instr); 817 } 818 } 819 } 820} 821 822static bool 823ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) 824{ 825 unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); 826 bool progress = false; 827 828 list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { 829 struct ir3_ra_block_data *bd = block->data; 830 831 /* update livein: */ 832 for (unsigned i = 0; i < bitset_words; i++) { 833 BITSET_WORD new_livein = 834 (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); 835 836 if (new_livein & ~bd->livein[i]) { 837 bd->livein[i] |= new_livein; 838 progress = true; 839 } 840 } 841 842 /* update liveout: */ 843 for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { 844 struct ir3_block *succ = block->successors[j]; 845 struct ir3_ra_block_data *succ_bd; 846 847 if (!succ) 848 continue; 849 850 succ_bd = succ->data; 851 852 for (unsigned i = 0; i < bitset_words; i++) { 853 BITSET_WORD new_liveout = 854 (succ_bd->livein[i] & ~bd->liveout[i]); 855 856 if (new_liveout) { 857 bd->liveout[i] |= new_liveout; 858 progress = true; 859 } 860 } 861 } 862 } 863 864 return progress; 865} 866 867static void 868print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) 869{ 870 bool first = true; 871 debug_printf(" %s:", name); 872 for (unsigned i = 0; i < cnt; i++) { 873 if (BITSET_TEST(bs, i)) { 874 if (!first) 875 debug_printf(","); 876 debug_printf(" %04u", i); 877 first = false; 878 } 879 } 880 debug_printf("\n"); 881} 882 883static void 884ra_add_interference(struct ir3_ra_ctx *ctx) 885{ 886 struct ir3 *ir = ctx->ir; 887 888 /* initialize array live ranges: */ 889 list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { 890 arr->start_ip = ~0; 891 arr->end_ip = 0; 892 } 893 894 /* compute live ranges (use/def) on a block level, also updating 895 * block's def/use bitmasks (used below to calculate per-block 896 * livein/liveout): 897 */ 898 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { 899 ra_block_compute_live_ranges(ctx, block); 900 } 901 902 /* update per-block livein/liveout: */ 903 while (ra_compute_livein_liveout(ctx)) {} 904 905 if (ir3_shader_debug & IR3_DBG_OPTMSGS) { 906 debug_printf("AFTER LIVEIN/OUT:\n"); 907 ir3_print(ir); 908 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { 909 struct ir3_ra_block_data *bd = block->data; 910 debug_printf("block%u:\n", block_id(block)); 911 print_bitset(" def", bd->def, ctx->alloc_count); 912 print_bitset(" use", bd->use, ctx->alloc_count); 913 print_bitset(" l/i", bd->livein, ctx->alloc_count); 914 print_bitset(" l/o", bd->liveout, ctx->alloc_count); 915 } 916 list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { 917 debug_printf("array%u:\n", arr->id); 918 debug_printf(" length: %u\n", arr->length); 919 debug_printf(" start_ip: %u\n", arr->start_ip); 920 debug_printf(" end_ip: %u\n", arr->end_ip); 921 } 922 } 923 924 /* extend start/end ranges based on livein/liveout info from cfg: */ 925 list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { 926 struct ir3_ra_block_data *bd = block->data; 927 928 for (unsigned i = 0; i < ctx->alloc_count; i++) { 929 if (BITSET_TEST(bd->livein, i)) { 930 ctx->def[i] = MIN2(ctx->def[i], block->start_ip); 931 ctx->use[i] = MAX2(ctx->use[i], block->start_ip); 932 } 933 934 if (BITSET_TEST(bd->liveout, i)) { 935 ctx->def[i] = MIN2(ctx->def[i], block->end_ip); 936 ctx->use[i] = MAX2(ctx->use[i], block->end_ip); 937 } 938 } 939 940 list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { 941 for (unsigned i = 0; i < arr->length; i++) { 942 if (BITSET_TEST(bd->livein, i + arr->base)) { 943 arr->start_ip = MIN2(arr->start_ip, block->start_ip); 944 } 945 if (BITSET_TEST(bd->livein, i + arr->base)) { 946 arr->end_ip = MAX2(arr->end_ip, block->end_ip); 947 } 948 } 949 } 950 } 951 952 /* need to fix things up to keep outputs live: */ 953 for (unsigned i = 0; i < ir->noutputs; i++) { 954 struct ir3_instruction *instr = ir->outputs[i]; 955 if (!instr) 956 continue; 957 unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); 958 ctx->use[name] = ctx->instr_cnt; 959 } 960 961 for (unsigned i = 0; i < ctx->alloc_count; i++) { 962 for (unsigned j = 0; j < ctx->alloc_count; j++) { 963 if (intersects(ctx->def[i], ctx->use[i], 964 ctx->def[j], ctx->use[j])) { 965 ra_add_node_interference(ctx->g, i, j); 966 } 967 } 968 } 969} 970 971/* some instructions need fix-up if dst register is half precision: */ 972static void fixup_half_instr_dst(struct ir3_instruction *instr) 973{ 974 switch (opc_cat(instr->opc)) { 975 case 1: /* move instructions */ 976 instr->cat1.dst_type = half_type(instr->cat1.dst_type); 977 break; 978 case 3: 979 switch (instr->opc) { 980 case OPC_MAD_F32: 981 instr->opc = OPC_MAD_F16; 982 break; 983 case OPC_SEL_B32: 984 instr->opc = OPC_SEL_B16; 985 break; 986 case OPC_SEL_S32: 987 instr->opc = OPC_SEL_S16; 988 break; 989 case OPC_SEL_F32: 990 instr->opc = OPC_SEL_F16; 991 break; 992 case OPC_SAD_S32: 993 instr->opc = OPC_SAD_S16; 994 break; 995 /* instructions may already be fixed up: */ 996 case OPC_MAD_F16: 997 case OPC_SEL_B16: 998 case OPC_SEL_S16: 999 case OPC_SEL_F16: 1000 case OPC_SAD_S16: 1001 break; 1002 default: 1003 assert(0); 1004 break; 1005 } 1006 break; 1007 case 5: 1008 instr->cat5.type = half_type(instr->cat5.type); 1009 break; 1010 } 1011} 1012/* some instructions need fix-up if src register is half precision: */ 1013static void fixup_half_instr_src(struct ir3_instruction *instr) 1014{ 1015 switch (instr->opc) { 1016 case OPC_MOV: 1017 instr->cat1.src_type = half_type(instr->cat1.src_type); 1018 break; 1019 default: 1020 break; 1021 } 1022} 1023 1024/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first 1025 * array access(es) which do not have any previous access to depend 1026 * on from scheduling point of view 1027 */ 1028static void 1029reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, 1030 struct ir3_instruction *instr) 1031{ 1032 struct ir3_ra_instr_data *id; 1033 1034 if (reg->flags & IR3_REG_ARRAY) { 1035 struct ir3_array *arr = 1036 ir3_lookup_array(ctx->ir, reg->array.id); 1037 unsigned name = arr->base + reg->array.offset; 1038 unsigned r = ra_get_node_reg(ctx->g, name); 1039 unsigned num = ctx->set->ra_reg_to_gpr[r]; 1040 1041 if (reg->flags & IR3_REG_RELATIV) { 1042 reg->array.offset = num; 1043 } else { 1044 reg->num = num; 1045 reg->flags &= ~IR3_REG_SSA; 1046 } 1047 1048 reg->flags &= ~IR3_REG_ARRAY; 1049 } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { 1050 unsigned name = ra_name(ctx, id); 1051 unsigned r = ra_get_node_reg(ctx->g, name); 1052 unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; 1053 1054 debug_assert(!(reg->flags & IR3_REG_RELATIV)); 1055 1056 if (is_high(id->defn)) 1057 num += FIRST_HIGH_REG; 1058 1059 reg->num = num; 1060 reg->flags &= ~IR3_REG_SSA; 1061 1062 if (is_half(id->defn)) 1063 reg->flags |= IR3_REG_HALF; 1064 } 1065} 1066 1067static void 1068ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) 1069{ 1070 list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { 1071 struct ir3_register *reg; 1072 1073 if (instr->regs_count == 0) 1074 continue; 1075 1076 if (writes_gpr(instr)) { 1077 reg_assign(ctx, instr->regs[0], instr); 1078 if (instr->regs[0]->flags & IR3_REG_HALF) 1079 fixup_half_instr_dst(instr); 1080 } 1081 1082 foreach_src_n(reg, n, instr) { 1083 struct ir3_instruction *src = reg->instr; 1084 /* Note: reg->instr could be null for IR3_REG_ARRAY */ 1085 if (!(src || (reg->flags & IR3_REG_ARRAY))) 1086 continue; 1087 reg_assign(ctx, instr->regs[n+1], src); 1088 if (instr->regs[n+1]->flags & IR3_REG_HALF) 1089 fixup_half_instr_src(instr); 1090 } 1091 } 1092} 1093 1094static int 1095ra_alloc(struct ir3_ra_ctx *ctx) 1096{ 1097 /* pre-assign array elements: 1098 */ 1099 list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { 1100 unsigned base = 0; 1101 1102 if (arr->end_ip == 0) 1103 continue; 1104 1105 /* figure out what else we conflict with which has already 1106 * been assigned: 1107 */ 1108retry: 1109 list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { 1110 if (arr2 == arr) 1111 break; 1112 if (arr2->end_ip == 0) 1113 continue; 1114 /* if it intersects with liverange AND register range.. */ 1115 if (intersects(arr->start_ip, arr->end_ip, 1116 arr2->start_ip, arr2->end_ip) && 1117 intersects(base, base + arr->length, 1118 arr2->reg, arr2->reg + arr2->length)) { 1119 base = MAX2(base, arr2->reg + arr2->length); 1120 goto retry; 1121 } 1122 } 1123 1124 arr->reg = base; 1125 1126 for (unsigned i = 0; i < arr->length; i++) { 1127 unsigned name, reg; 1128 1129 name = arr->base + i; 1130 reg = ctx->set->gpr_to_ra_reg[0][base++]; 1131 1132 ra_set_node_reg(ctx->g, name, reg); 1133 } 1134 } 1135 1136 if (!ra_allocate(ctx->g)) 1137 return -1; 1138 1139 list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { 1140 ra_block_alloc(ctx, block); 1141 } 1142 1143 return 0; 1144} 1145 1146int ir3_ra(struct ir3 *ir, gl_shader_stage type, 1147 bool frag_coord, bool frag_face) 1148{ 1149 struct ir3_ra_ctx ctx = { 1150 .ir = ir, 1151 .type = type, 1152 .frag_face = frag_face, 1153 .set = ir->compiler->set, 1154 }; 1155 int ret; 1156 1157 ra_init(&ctx); 1158 ra_add_interference(&ctx); 1159 ret = ra_alloc(&ctx); 1160 ra_destroy(&ctx); 1161 1162 return ret; 1163} 1164