1/* 2 * Copyright (C) 2021 Valve Corporation 3 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25#include "ir3_ra.h" 26#include "util/rb_tree.h" 27#include "util/u_math.h" 28#include "ir3_shader.h" 29 30/* This file implements an SSA-based register allocator. Unlike other 31 * SSA-based allocators, it handles vector split/collect "smartly," meaning 32 * that multiple values may share the same register interval. From the 33 * perspective of the allocator itself, only the top-level intervals matter, 34 * and the allocator is only concerned with allocating top-level intervals, 35 * which may mean moving other top-level intervals around. Other intervals, 36 * like the destination of a split instruction or the source of a collect 37 * instruction, are "locked" to their parent interval. The details of this are 38 * mostly handled by ir3_merge_regs and ir3_reg_ctx. 39 * 40 * We currently don't do any backtracking, but we do use the merge sets as a 41 * form of affinity to try to avoid moves from phis/splits/collects. Each 42 * merge set is what a more "classic" graph-coloring or live-range based 43 * allocator would consider a single register, but here we use it as merely a 44 * hint, except when multiple overlapping values are live at the same time. 45 * Each merge set has a "preferred" register, and we try to honor that when 46 * allocating values in the merge set. 47 */ 48 49/* ir3_reg_ctx implementation. */ 50 51static int 52ir3_reg_interval_cmp(const struct rb_node *node, const void *data) 53{ 54 unsigned reg = *(const unsigned *)data; 55 const struct ir3_reg_interval *interval = 56 ir3_rb_node_to_interval_const(node); 57 if (interval->reg->interval_start > reg) 58 return -1; 59 else if (interval->reg->interval_end <= reg) 60 return 1; 61 else 62 return 0; 63} 64 65static struct ir3_reg_interval * 66ir3_reg_interval_search(struct rb_tree *tree, unsigned offset) 67{ 68 struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp); 69 return node ? ir3_rb_node_to_interval(node) : NULL; 70} 71 72static struct ir3_reg_interval * 73ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset) 74{ 75 struct rb_node *node = 76 rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp); 77 return node ? ir3_rb_node_to_interval(node) : NULL; 78} 79 80/* Get the interval covering the reg, or the closest to the right if it 81 * doesn't exist. 82 */ 83static struct ir3_reg_interval * 84ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset) 85{ 86 struct ir3_reg_interval *interval = 87 ir3_reg_interval_search_sloppy(tree, offset); 88 if (!interval) { 89 return NULL; 90 } else if (interval->reg->interval_end > offset) { 91 return interval; 92 } else { 93 /* There is no interval covering reg, and ra_file_search_sloppy() 94 * returned the closest range to the left, so the next interval to the 95 * right should be the closest to the right. 96 */ 97 return ir3_reg_interval_next_or_null(interval); 98 } 99} 100 101static int 102ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) 103{ 104 const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a); 105 const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b); 106 return b->reg->interval_start - a->reg->interval_start; 107} 108 109static void 110interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree, 111 struct ir3_reg_interval *interval) 112{ 113 struct ir3_reg_interval *right = 114 ir3_reg_interval_search_right(tree, interval->reg->interval_start); 115 if (right && right->reg->interval_start < interval->reg->interval_end) { 116 /* We disallow trees where different members have different half-ness. 117 * This means that we can't treat bitcasts as copies like normal 118 * split/collect, so something like this would require an extra copy 119 * in mergedregs mode, and count as 4 half-units of register pressure 120 * instead of 2: 121 * 122 * f16vec2 foo = unpackFloat2x16(bar) 123 * ... = foo.x 124 * ... = bar 125 * 126 * However, relaxing this rule would open a huge can of worms. What 127 * happens when there's a vector of 16 things, and the fifth element 128 * has been bitcasted as a half-reg? Would that element alone have to 129 * be small enough to be used as a half-reg source? Let's keep that 130 * can of worms firmly shut for now. 131 */ 132 assert((interval->reg->flags & IR3_REG_HALF) == 133 (right->reg->flags & IR3_REG_HALF)); 134 135 if (right->reg->interval_end <= interval->reg->interval_end && 136 right->reg->interval_start >= interval->reg->interval_start) { 137 /* Check if we're inserting something that's already inserted */ 138 assert(interval != right); 139 140 /* "right" is contained in "interval" and must become a child of 141 * it. There may be further children too. 142 */ 143 for (struct ir3_reg_interval *next = ir3_reg_interval_next(right); 144 right && right->reg->interval_start < interval->reg->interval_end; 145 right = next, next = ir3_reg_interval_next_or_null(next)) { 146 /* "right" must be contained in "interval." */ 147 assert(right->reg->interval_end <= interval->reg->interval_end); 148 assert((interval->reg->flags & IR3_REG_HALF) == 149 (right->reg->flags & IR3_REG_HALF)); 150 if (!right->parent) 151 ctx->interval_delete(ctx, right); 152 right->parent = interval; 153 rb_tree_remove(tree, &right->node); 154 rb_tree_insert(&interval->children, &right->node, 155 ir3_reg_interval_insert_cmp); 156 } 157 } else { 158 /* "right" must contain "interval," since intervals must form a 159 * tree. 160 */ 161 assert(right->reg->interval_start <= interval->reg->interval_start); 162 interval->parent = right; 163 interval_insert(ctx, &right->children, interval); 164 return; 165 } 166 } 167 168 if (!interval->parent) 169 ctx->interval_add(ctx, interval); 170 rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp); 171 interval->inserted = true; 172} 173 174void 175ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, 176 struct ir3_reg_interval *interval) 177{ 178 rb_tree_init(&interval->children); 179 interval->parent = NULL; 180 interval_insert(ctx, &ctx->intervals, interval); 181} 182 183/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */ 184static void 185ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx, 186 struct ir3_reg_interval *interval) 187{ 188 interval->parent = NULL; 189 interval_insert(ctx, &ctx->intervals, interval); 190} 191 192void 193ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, 194 struct ir3_reg_interval *interval) 195{ 196 if (interval->parent) { 197 rb_tree_remove(&interval->parent->children, &interval->node); 198 } else { 199 ctx->interval_delete(ctx, interval); 200 rb_tree_remove(&ctx->intervals, &interval->node); 201 } 202 203 rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children, 204 node) { 205 rb_tree_remove(&interval->children, &child->node); 206 child->parent = interval->parent; 207 208 if (interval->parent) { 209 rb_tree_insert(&child->parent->children, &child->node, 210 ir3_reg_interval_insert_cmp); 211 } else { 212 ctx->interval_readd(ctx, interval, child); 213 rb_tree_insert(&ctx->intervals, &child->node, 214 ir3_reg_interval_insert_cmp); 215 } 216 } 217 218 interval->inserted = false; 219} 220 221static void 222_mark_free(struct ir3_reg_interval *interval) 223{ 224 interval->inserted = false; 225 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { 226 _mark_free(child); 227 } 228} 229 230/* Remove an interval and all its children from the tree. */ 231void 232ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, 233 struct ir3_reg_interval *interval) 234{ 235 assert(!interval->parent); 236 237 ctx->interval_delete(ctx, interval); 238 rb_tree_remove(&ctx->intervals, &interval->node); 239 _mark_free(interval); 240} 241 242/* Used when popping an interval to be shuffled around. Don't disturb children 243 * so that it can be later reinserted. 244 */ 245static void 246ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx, 247 struct ir3_reg_interval *interval) 248{ 249 assert(!interval->parent); 250 251 ctx->interval_delete(ctx, interval); 252 rb_tree_remove(&ctx->intervals, &interval->node); 253} 254 255static void 256interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval, 257 unsigned indent) 258{ 259 for (unsigned i = 0; i < indent; i++) 260 mesa_log_stream_printf(stream, "\t"); 261 mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name, 262 interval->reg->interval_start); 263 264 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { 265 interval_dump(stream, child, indent + 1); 266 } 267 268 for (unsigned i = 0; i < indent; i++) 269 mesa_log_stream_printf(stream, "\t"); 270 mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name, 271 interval->reg->interval_end); 272} 273 274void 275ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval) 276{ 277 interval_dump(stream, interval, 0); 278} 279 280/* These are the core datastructures used by the register allocator. First 281 * ra_interval and ra_file, which are used for intra-block tracking and use 282 * the ir3_reg_ctx infrastructure: 283 */ 284 285struct ra_interval { 286 struct ir3_reg_interval interval; 287 288 struct rb_node physreg_node; 289 physreg_t physreg_start, physreg_end; 290 291 /* True if this is a source of the current instruction which is entirely 292 * killed. This means we can allocate the dest over it, but we can't break 293 * it up. 294 */ 295 bool is_killed; 296 297 /* True if this interval cannot be moved from its position. This is only 298 * used for precolored inputs to ensure that other inputs don't get 299 * allocated on top of them. 300 */ 301 bool frozen; 302}; 303 304struct ra_file { 305 struct ir3_reg_ctx reg_ctx; 306 307 BITSET_DECLARE(available, RA_MAX_FILE_SIZE); 308 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); 309 310 struct rb_tree physreg_intervals; 311 312 unsigned size; 313 unsigned start; 314}; 315 316/* State for inter-block tracking. When we split a live range to make space 317 * for a vector, we may need to insert fixup code when a block has multiple 318 * predecessors that have moved the same live value to different registers. 319 * This keeps track of state required to do that. 320 */ 321 322struct ra_block_state { 323 /* Map of defining ir3_register -> physreg it was allocated to at the end 324 * of the block. 325 */ 326 struct hash_table *renames; 327 328 /* For loops, we need to process a block before all its predecessors have 329 * been processed. In particular, we need to pick registers for values 330 * without knowing if all the predecessors have been renamed. This keeps 331 * track of the registers we chose so that when we visit the back-edge we 332 * can move them appropriately. If all predecessors have been visited 333 * before this block is visited then we don't need to fill this out. This 334 * is a map from ir3_register -> physreg. 335 */ 336 struct hash_table *entry_regs; 337 338 /* True if the block has been visited and "renames" is complete. 339 */ 340 bool visited; 341}; 342 343struct ra_parallel_copy { 344 struct ra_interval *interval; 345 physreg_t src; 346}; 347 348/* The main context: */ 349 350struct ra_ctx { 351 /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom 352 * half of this file too. 353 */ 354 struct ra_file full; 355 356 /* hr0.x - hr63.w, only used without merged-regs. */ 357 struct ra_file half; 358 359 /* Shared regs. */ 360 struct ra_file shared; 361 362 struct ir3_liveness *live; 363 364 struct ir3_block *block; 365 366 const struct ir3_compiler *compiler; 367 gl_shader_stage stage; 368 369 /* Pending moves of top-level intervals that will be emitted once we're 370 * finished: 371 */ 372 DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies); 373 374 struct ra_interval *intervals; 375 struct ra_block_state *blocks; 376 377 bool merged_regs; 378}; 379 380#define foreach_interval(interval, file) \ 381 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \ 382 physreg_node) 383#define foreach_interval_rev(interval, file) \ 384 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \ 385 physreg_node) 386#define foreach_interval_safe(interval, file) \ 387 rb_tree_foreach_safe (struct ra_interval, interval, \ 388 &(file)->physreg_intervals, physreg_node) 389#define foreach_interval_rev_safe(interval, file) \ 390 rb_tree_foreach_rev_safe(struct ra_interval, interval, \ 391 &(file)->physreg_intervals, physreg_node) 392 393static struct ra_interval * 394rb_node_to_interval(struct rb_node *node) 395{ 396 return rb_node_data(struct ra_interval, node, physreg_node); 397} 398 399static const struct ra_interval * 400rb_node_to_interval_const(const struct rb_node *node) 401{ 402 return rb_node_data(struct ra_interval, node, physreg_node); 403} 404 405static struct ra_interval * 406ra_interval_next(struct ra_interval *interval) 407{ 408 struct rb_node *next = rb_node_next(&interval->physreg_node); 409 return next ? rb_node_to_interval(next) : NULL; 410} 411 412static struct ra_interval * 413ra_interval_next_or_null(struct ra_interval *interval) 414{ 415 return interval ? ra_interval_next(interval) : NULL; 416} 417 418static int 419ra_interval_cmp(const struct rb_node *node, const void *data) 420{ 421 physreg_t reg = *(const physreg_t *)data; 422 const struct ra_interval *interval = rb_node_to_interval_const(node); 423 if (interval->physreg_start > reg) 424 return -1; 425 else if (interval->physreg_end <= reg) 426 return 1; 427 else 428 return 0; 429} 430 431static struct ra_interval * 432ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) 433{ 434 struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp); 435 return node ? rb_node_to_interval(node) : NULL; 436} 437 438/* Get the interval covering the reg, or the closest to the right if it 439 * doesn't exist. 440 */ 441static struct ra_interval * 442ra_interval_search_right(struct rb_tree *tree, physreg_t reg) 443{ 444 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); 445 if (!interval) { 446 return NULL; 447 } else if (interval->physreg_end > reg) { 448 return interval; 449 } else { 450 /* There is no interval covering reg, and ra_file_search_sloppy() 451 * returned the closest range to the left, so the next interval to the 452 * right should be the closest to the right. 453 */ 454 return ra_interval_next_or_null(interval); 455 } 456} 457 458static struct ra_interval * 459ra_file_search_right(struct ra_file *file, physreg_t reg) 460{ 461 return ra_interval_search_right(&file->physreg_intervals, reg); 462} 463 464static int 465ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) 466{ 467 const struct ra_interval *a = rb_node_to_interval_const(_a); 468 const struct ra_interval *b = rb_node_to_interval_const(_b); 469 return b->physreg_start - a->physreg_start; 470} 471 472static struct ra_interval * 473ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) 474{ 475 return rb_node_data(struct ra_interval, interval, interval); 476} 477 478static struct ra_file * 479ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx) 480{ 481 return rb_node_data(struct ra_file, ctx, reg_ctx); 482} 483 484static void 485interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) 486{ 487 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); 488 struct ra_file *file = ir3_reg_ctx_to_file(ctx); 489 490 /* We can assume in this case that physreg_start/physreg_end is already 491 * initialized. 492 */ 493 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { 494 BITSET_CLEAR(file->available, i); 495 BITSET_CLEAR(file->available_to_evict, i); 496 } 497 498 rb_tree_insert(&file->physreg_intervals, &interval->physreg_node, 499 ra_interval_insert_cmp); 500} 501 502static void 503interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) 504{ 505 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); 506 struct ra_file *file = ir3_reg_ctx_to_file(ctx); 507 508 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { 509 BITSET_SET(file->available, i); 510 BITSET_SET(file->available_to_evict, i); 511 } 512 513 rb_tree_remove(&file->physreg_intervals, &interval->physreg_node); 514} 515 516static void 517interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, 518 struct ir3_reg_interval *_child) 519{ 520 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); 521 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); 522 523 child->physreg_start = 524 parent->physreg_start + (child->interval.reg->interval_start - 525 parent->interval.reg->interval_start); 526 child->physreg_end = 527 child->physreg_start + 528 (child->interval.reg->interval_end - child->interval.reg->interval_start); 529 530 interval_add(ctx, _child); 531} 532 533static void 534ra_file_init(struct ra_file *file) 535{ 536 for (unsigned i = 0; i < file->size; i++) { 537 BITSET_SET(file->available, i); 538 BITSET_SET(file->available_to_evict, i); 539 } 540 541 rb_tree_init(&file->reg_ctx.intervals); 542 rb_tree_init(&file->physreg_intervals); 543 544 file->reg_ctx.interval_add = interval_add; 545 file->reg_ctx.interval_delete = interval_delete; 546 file->reg_ctx.interval_readd = interval_readd; 547} 548 549static void 550ra_file_insert(struct ra_file *file, struct ra_interval *interval) 551{ 552 assert(interval->physreg_start < interval->physreg_end); 553 assert(interval->physreg_end <= file->size); 554 if (interval->interval.reg->flags & IR3_REG_HALF) 555 assert(interval->physreg_end <= RA_HALF_SIZE); 556 557 ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); 558} 559 560static void 561ra_file_remove(struct ra_file *file, struct ra_interval *interval) 562{ 563 ir3_reg_interval_remove(&file->reg_ctx, &interval->interval); 564} 565 566static void 567ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval) 568{ 569 assert(!interval->interval.parent); 570 571 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { 572 BITSET_SET(file->available, i); 573 } 574 575 interval->is_killed = true; 576} 577 578static void 579ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval) 580{ 581 assert(!interval->interval.parent); 582 583 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { 584 BITSET_CLEAR(file->available, i); 585 } 586 587 interval->is_killed = false; 588} 589 590static physreg_t 591ra_interval_get_physreg(const struct ra_interval *interval) 592{ 593 unsigned child_start = interval->interval.reg->interval_start; 594 595 while (interval->interval.parent) { 596 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); 597 } 598 599 return interval->physreg_start + 600 (child_start - interval->interval.reg->interval_start); 601} 602 603static unsigned 604ra_interval_get_num(const struct ra_interval *interval) 605{ 606 return ra_physreg_to_num(ra_interval_get_physreg(interval), 607 interval->interval.reg->flags); 608} 609 610static void 611ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) 612{ 613 ir3_reg_interval_init(&interval->interval, reg); 614 interval->is_killed = false; 615 interval->frozen = false; 616} 617 618static void 619ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) 620{ 621 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); 622 623 ir3_reg_interval_dump(stream, &interval->interval); 624} 625 626static void 627ra_file_dump(struct log_stream *stream, struct ra_file *file) 628{ 629 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 630 physreg_node) { 631 ra_interval_dump(stream, interval); 632 } 633 634 unsigned start, end; 635 mesa_log_stream_printf(stream, "available:\n"); 636 BITSET_FOREACH_RANGE (start, end, file->available, file->size) { 637 mesa_log_stream_printf(stream, "%u-%u ", start, end); 638 } 639 mesa_log_stream_printf(stream, "\n"); 640 641 mesa_log_stream_printf(stream, "available to evict:\n"); 642 BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) { 643 mesa_log_stream_printf(stream, "%u-%u ", start, end); 644 } 645 mesa_log_stream_printf(stream, "\n"); 646 mesa_log_stream_printf(stream, "start: %u\n", file->start); 647} 648 649static void 650ra_ctx_dump(struct ra_ctx *ctx) 651{ 652 struct log_stream *stream = mesa_log_streami(); 653 mesa_log_stream_printf(stream, "full:\n"); 654 ra_file_dump(stream, &ctx->full); 655 mesa_log_stream_printf(stream, "half:\n"); 656 ra_file_dump(stream, &ctx->half); 657 mesa_log_stream_printf(stream, "shared:"); 658 ra_file_dump(stream, &ctx->shared); 659 mesa_log_stream_destroy(stream); 660} 661 662static unsigned 663reg_file_size(struct ra_file *file, struct ir3_register *reg) 664{ 665 /* Half-regs can only take up the first half of the combined regfile */ 666 if (reg->flags & IR3_REG_HALF) 667 return MIN2(file->size, RA_HALF_SIZE); 668 else 669 return file->size; 670} 671 672/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple 673 * top-level intervals at once. Pop multiple intervals, then push them back in 674 * any order. 675 */ 676 677struct ra_removed_interval { 678 struct ra_interval *interval; 679 unsigned size; 680}; 681 682static struct ra_removed_interval 683ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, 684 struct ra_interval *interval) 685{ 686 assert(!interval->interval.parent); 687 688 /* Check if we've already moved this reg before */ 689 unsigned pcopy_index; 690 for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; 691 pcopy_index++) { 692 if (ctx->parallel_copies[pcopy_index].interval == interval) 693 break; 694 } 695 696 if (pcopy_index == ctx->parallel_copies_count) { 697 array_insert(ctx, ctx->parallel_copies, 698 (struct ra_parallel_copy){ 699 .interval = interval, 700 .src = interval->physreg_start, 701 }); 702 } 703 704 ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval); 705 706 return (struct ra_removed_interval){ 707 .interval = interval, 708 .size = interval->physreg_end - interval->physreg_start, 709 }; 710} 711 712static void 713ra_push_interval(struct ra_ctx *ctx, struct ra_file *file, 714 const struct ra_removed_interval *removed, physreg_t dst) 715{ 716 struct ra_interval *interval = removed->interval; 717 718 interval->physreg_start = dst; 719 interval->physreg_end = dst + removed->size; 720 721 ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval); 722} 723 724/* Pick up the interval and place it at "dst". */ 725static void 726ra_move_interval(struct ra_ctx *ctx, struct ra_file *file, 727 struct ra_interval *interval, physreg_t dst) 728{ 729 struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval); 730 ra_push_interval(ctx, file, &temp, dst); 731} 732 733static bool 734get_reg_specified(struct ra_file *file, struct ir3_register *reg, 735 physreg_t physreg, bool is_source) 736{ 737 for (unsigned i = 0; i < reg_size(reg); i++) { 738 if (!BITSET_TEST(is_source ? file->available_to_evict : file->available, 739 physreg + i)) 740 return false; 741 } 742 743 return true; 744} 745 746/* Try to evict any registers conflicting with the proposed spot "physreg" for 747 * "reg". That is, move them to other places so that we can allocate "physreg" 748 * here. 749 */ 750 751static bool 752try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, 753 struct ir3_register *reg, physreg_t physreg, 754 unsigned *_eviction_count, bool is_source, bool speculative) 755{ 756 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); 757 memcpy(available_to_evict, file->available_to_evict, 758 sizeof(available_to_evict)); 759 760 BITSET_DECLARE(available, RA_MAX_FILE_SIZE); 761 memcpy(available, file->available, sizeof(available)); 762 763 for (unsigned i = 0; i < reg_size(reg); i++) { 764 BITSET_CLEAR(available_to_evict, physreg + i); 765 BITSET_CLEAR(available, physreg + i); 766 } 767 768 unsigned eviction_count = 0; 769 /* Iterate over each range conflicting with physreg */ 770 for (struct ra_interval *conflicting = ra_file_search_right(file, physreg), 771 *next = ra_interval_next_or_null(conflicting); 772 conflicting != NULL && 773 conflicting->physreg_start < physreg + reg_size(reg); 774 conflicting = next, next = ra_interval_next_or_null(next)) { 775 if (!is_source && conflicting->is_killed) 776 continue; 777 778 if (conflicting->frozen) { 779 assert(speculative); 780 return false; 781 } 782 783 unsigned conflicting_file_size = 784 reg_file_size(file, conflicting->interval.reg); 785 unsigned avail_start, avail_end; 786 bool evicted = false; 787 BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict, 788 conflicting_file_size) { 789 unsigned size = avail_end - avail_start; 790 791 /* non-half registers must be aligned */ 792 if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && 793 avail_start % 2 == 1) { 794 avail_start++; 795 size--; 796 } 797 798 if (size >= conflicting->physreg_end - conflicting->physreg_start) { 799 for (unsigned i = 0; 800 i < conflicting->physreg_end - conflicting->physreg_start; i++) 801 BITSET_CLEAR(available_to_evict, avail_start + i); 802 eviction_count += 803 conflicting->physreg_end - conflicting->physreg_start; 804 if (!speculative) 805 ra_move_interval(ctx, file, conflicting, avail_start); 806 evicted = true; 807 break; 808 } 809 } 810 811 if (evicted) 812 continue; 813 814 /* If we couldn't evict this range, we may be able to swap it with a 815 * killed range to acheive the same effect. 816 */ 817 foreach_interval (killed, file) { 818 if (!killed->is_killed) 819 continue; 820 821 if (killed->physreg_end - killed->physreg_start != 822 conflicting->physreg_end - conflicting->physreg_start) 823 continue; 824 825 if (killed->physreg_end > conflicting_file_size || 826 conflicting->physreg_end > reg_file_size(file, killed->interval.reg)) 827 continue; 828 829 /* We can't swap the killed range if it partially/fully overlaps the 830 * space we're trying to allocate or (in speculative mode) if it's 831 * already been swapped and will overlap when we actually evict. 832 */ 833 bool killed_available = true; 834 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { 835 if (!BITSET_TEST(available, i)) { 836 killed_available = false; 837 break; 838 } 839 } 840 841 if (!killed_available) 842 continue; 843 844 /* Check for alignment if one is a full reg */ 845 if ((!(killed->interval.reg->flags & IR3_REG_HALF) || 846 !(conflicting->interval.reg->flags & IR3_REG_HALF)) && 847 (killed->physreg_start % 2 != 0 || 848 conflicting->physreg_start % 2 != 0)) 849 continue; 850 851 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { 852 BITSET_CLEAR(available, i); 853 } 854 /* Because this will generate swaps instead of moves, multiply the 855 * cost by 2. 856 */ 857 eviction_count += (killed->physreg_end - killed->physreg_start) * 2; 858 if (!speculative) { 859 physreg_t killed_start = killed->physreg_start, 860 conflicting_start = conflicting->physreg_start; 861 struct ra_removed_interval killed_removed = 862 ra_pop_interval(ctx, file, killed); 863 struct ra_removed_interval conflicting_removed = 864 ra_pop_interval(ctx, file, conflicting); 865 ra_push_interval(ctx, file, &killed_removed, conflicting_start); 866 ra_push_interval(ctx, file, &conflicting_removed, killed_start); 867 } 868 869 evicted = true; 870 break; 871 } 872 873 if (!evicted) 874 return false; 875 } 876 877 *_eviction_count = eviction_count; 878 return true; 879} 880 881static int 882removed_interval_cmp(const void *_i1, const void *_i2) 883{ 884 const struct ra_removed_interval *i1 = _i1; 885 const struct ra_removed_interval *i2 = _i2; 886 887 /* We sort the registers as follows: 888 * 889 * |--------------------------------------------------------------------| 890 * | | | | | 891 * | Half live-through | Half killed | Full killed | Full live-through | 892 * | | | | | 893 * |--------------------------------------------------------------------| 894 * | | 895 * | Destination | 896 * | | 897 * |-----------------| 898 * 899 * Half-registers have to be first so that they stay in the low half of 900 * the register file. Then half and full killed must stay together so that 901 * there's a contiguous range where we can put the register. With this 902 * structure we should be able to accomodate any collection of intervals 903 * such that the total number of half components is within the half limit 904 * and the combined components are within the full limit. 905 */ 906 907 unsigned i1_align = reg_elem_size(i1->interval->interval.reg); 908 unsigned i2_align = reg_elem_size(i2->interval->interval.reg); 909 if (i1_align > i2_align) 910 return 1; 911 if (i1_align < i2_align) 912 return -1; 913 914 if (i1_align == 1) { 915 if (i2->interval->is_killed) 916 return -1; 917 if (i1->interval->is_killed) 918 return 1; 919 } else { 920 if (i2->interval->is_killed) 921 return 1; 922 if (i1->interval->is_killed) 923 return -1; 924 } 925 926 return 0; 927} 928 929/* "Compress" all the live intervals so that there is enough space for the 930 * destination register. As there can be gaps when a more-aligned interval 931 * follows a less-aligned interval, this also sorts them to remove such 932 * "padding", which may be required when space is very tight. This isn't 933 * amazing, but should be used only as a last resort in case the register file 934 * is almost full and badly fragmented. 935 * 936 * Return the physreg to use. 937 */ 938static physreg_t 939compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size, 940 unsigned align, bool is_source) 941{ 942 DECLARE_ARRAY(struct ra_removed_interval, intervals); 943 intervals_count = intervals_sz = 0; 944 intervals = NULL; 945 946 unsigned removed_size = 0, removed_half_size = 0; 947 unsigned file_size = 948 align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size; 949 physreg_t start_reg = 0; 950 951 foreach_interval_rev_safe (interval, file) { 952 /* Check if we can sort the intervals *after* this one and have enough 953 * space leftover to accomodate "size" units. Also check that we have 954 * enough space leftover for half-registers, if we're inserting a 955 * half-register (otherwise we only shift any half-registers down so they 956 * should be safe). 957 */ 958 if (interval->physreg_end + size + removed_size <= file->size && 959 (align != 1 || 960 interval->physreg_end + size + removed_half_size <= file_size)) { 961 start_reg = interval->physreg_end; 962 break; 963 } 964 965 /* We assume that all frozen intervals are at the start and that we 966 * can avoid popping them. 967 */ 968 assert(!interval->frozen); 969 970 /* Killed sources don't count because they go at the end and can 971 * overlap the register we're trying to add, unless it's a source. 972 */ 973 if (!interval->is_killed || is_source) { 974 removed_size += interval->physreg_end - interval->physreg_start; 975 if (interval->interval.reg->flags & IR3_REG_HALF) { 976 removed_half_size += interval->physreg_end - 977 interval->physreg_start; 978 } 979 } 980 981 /* Now that we've done the accounting, pop this off */ 982 d("popping interval %u physreg %u\n", interval->interval.reg->name, 983 interval->physreg_start); 984 array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval)); 985 } 986 987 /* TODO: In addition to skipping registers at the beginning that are 988 * well-packed, we should try to skip registers at the end. 989 */ 990 991 qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp); 992 993 physreg_t physreg = start_reg; 994 physreg_t ret_reg = (physreg_t)~0; 995 for (unsigned i = 0; i < intervals_count; i++) { 996 if (ret_reg == (physreg_t)~0 && 997 ((intervals[i].interval->is_killed && !is_source) || 998 !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) { 999 ret_reg = ALIGN(physreg, align); 1000 } 1001 1002 if (ret_reg != (physreg_t)~0 && 1003 (is_source || !intervals[i].interval->is_killed)) { 1004 physreg = MAX2(physreg, ret_reg + size); 1005 } 1006 1007 if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) { 1008 physreg = ALIGN(physreg, 2); 1009 } 1010 1011 if (physreg + intervals[i].size > 1012 reg_file_size(file, intervals[i].interval->interval.reg)) { 1013 d("ran out of room for interval %u!\n", 1014 intervals[i].interval->interval.reg->name); 1015 unreachable("reg pressure calculation was wrong!"); 1016 return 0; 1017 } 1018 1019 d("pushing interval %u physreg %u\n", 1020 intervals[i].interval->interval.reg->name, physreg); 1021 ra_push_interval(ctx, file, &intervals[i], physreg); 1022 1023 physreg += intervals[i].size; 1024 } 1025 1026 if (ret_reg == (physreg_t)~0) 1027 ret_reg = physreg; 1028 1029 ret_reg = ALIGN(ret_reg, align); 1030 if (ret_reg + size > file_size) { 1031 d("ran out of room for the new interval!\n"); 1032 unreachable("reg pressure calculation was wrong!"); 1033 return 0; 1034 } 1035 1036 return ret_reg; 1037} 1038 1039static void 1040update_affinity(struct ra_file *file, struct ir3_register *reg, 1041 physreg_t physreg) 1042{ 1043 if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0) 1044 return; 1045 1046 if (physreg < reg->merge_set_offset) 1047 return; 1048 1049 if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size) 1050 return; 1051 1052 reg->merge_set->preferred_reg = physreg - reg->merge_set_offset; 1053} 1054 1055/* Try to find free space for a register without shuffling anything. This uses 1056 * a round-robin algorithm to reduce false dependencies. 1057 */ 1058static physreg_t 1059find_best_gap(struct ra_file *file, unsigned file_size, unsigned size, 1060 unsigned align, bool is_source) 1061{ 1062 /* This can happen if we create a very large merge set. Just bail out in that 1063 * case. 1064 */ 1065 if (size > file_size) 1066 return (physreg_t) ~0; 1067 1068 BITSET_WORD *available = 1069 is_source ? file->available_to_evict : file->available; 1070 1071 unsigned start = ALIGN(file->start, align) % (file_size - size + align); 1072 unsigned candidate = start; 1073 do { 1074 bool is_available = true; 1075 for (unsigned i = 0; i < size; i++) { 1076 if (!BITSET_TEST(available, candidate + i)) { 1077 is_available = false; 1078 break; 1079 } 1080 } 1081 1082 if (is_available) { 1083 file->start = (candidate + size) % file_size; 1084 return candidate; 1085 } 1086 1087 candidate += align; 1088 if (candidate + size > file_size) 1089 candidate = 0; 1090 } while (candidate != start); 1091 1092 return (physreg_t)~0; 1093} 1094 1095static struct ra_file * 1096ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg) 1097{ 1098 if (reg->flags & IR3_REG_SHARED) 1099 return &ctx->shared; 1100 else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF)) 1101 return &ctx->full; 1102 else 1103 return &ctx->half; 1104} 1105 1106/* This is the main entrypoint for picking a register. Pick a free register 1107 * for "reg", shuffling around sources if necessary. In the normal case where 1108 * "is_source" is false, this register can overlap with killed sources 1109 * (intervals with "is_killed == true"). If "is_source" is true, then 1110 * is_killed is ignored and the register returned must not overlap with killed 1111 * sources. This must be used for tied registers, because we're actually 1112 * allocating the destination and the tied source at the same time. 1113 */ 1114 1115static physreg_t 1116get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg, 1117 bool is_source) 1118{ 1119 unsigned file_size = reg_file_size(file, reg); 1120 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) { 1121 physreg_t preferred_reg = 1122 reg->merge_set->preferred_reg + reg->merge_set_offset; 1123 if (preferred_reg < file_size && 1124 preferred_reg % reg_elem_size(reg) == 0 && 1125 get_reg_specified(file, reg, preferred_reg, is_source)) 1126 return preferred_reg; 1127 } 1128 1129 /* If this register is a subset of a merge set which we have not picked a 1130 * register for, first try to allocate enough space for the entire merge 1131 * set. 1132 */ 1133 unsigned size = reg_size(reg); 1134 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && 1135 size < reg->merge_set->size) { 1136 physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size, 1137 reg->merge_set->alignment, is_source); 1138 if (best_reg != (physreg_t)~0u) { 1139 best_reg += reg->merge_set_offset; 1140 return best_reg; 1141 } 1142 } 1143 1144 /* For ALU and SFU instructions, if the src reg is avail to pick, use it. 1145 * Because this doesn't introduce unnecessary dependencies, and it 1146 * potentially avoids needing (ss) syncs for write after read hazards for 1147 * SFU instructions: 1148 */ 1149 if (is_sfu(reg->instr) || is_alu(reg->instr)) { 1150 for (unsigned i = 0; i < reg->instr->srcs_count; i++) { 1151 struct ir3_register *src = reg->instr->srcs[i]; 1152 if (!ra_reg_is_src(src)) 1153 continue; 1154 if (ra_get_file(ctx, src) == file && reg_size(src) >= size) { 1155 struct ra_interval *src_interval = &ctx->intervals[src->def->name]; 1156 physreg_t src_physreg = ra_interval_get_physreg(src_interval); 1157 if (src_physreg % reg_elem_size(reg) == 0 && 1158 src_physreg + size <= file_size && 1159 get_reg_specified(file, reg, src_physreg, is_source)) 1160 return src_physreg; 1161 } 1162 } 1163 } 1164 1165 physreg_t best_reg = 1166 find_best_gap(file, file_size, size, reg_elem_size(reg), is_source); 1167 if (best_reg != (physreg_t)~0u) { 1168 return best_reg; 1169 } 1170 1171 /* Ok, we couldn't find anything that fits. Here is where we have to start 1172 * moving things around to make stuff fit. First try solely evicting 1173 * registers in the way. 1174 */ 1175 unsigned best_eviction_count = ~0; 1176 for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) { 1177 unsigned eviction_count; 1178 if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) { 1179 if (eviction_count < best_eviction_count) { 1180 best_eviction_count = eviction_count; 1181 best_reg = i; 1182 } 1183 } 1184 } 1185 1186 if (best_eviction_count != ~0) { 1187 ASSERTED bool result = try_evict_regs( 1188 ctx, file, reg, best_reg, &best_eviction_count, is_source, false); 1189 assert(result); 1190 return best_reg; 1191 } 1192 1193 /* Use the dumb fallback only if try_evict_regs() fails. */ 1194 return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg), 1195 is_source); 1196} 1197 1198static void 1199assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, 1200 unsigned num) 1201{ 1202 if (reg->flags & IR3_REG_ARRAY) { 1203 reg->array.base = num; 1204 if (reg->flags & IR3_REG_RELATIV) 1205 reg->array.offset += num; 1206 else 1207 reg->num = num + reg->array.offset; 1208 } else { 1209 reg->num = num; 1210 } 1211} 1212 1213static void 1214mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src) 1215{ 1216 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1217 1218 if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed || 1219 interval->interval.parent || 1220 !rb_tree_is_empty(&interval->interval.children)) 1221 return; 1222 1223 ra_file_mark_killed(ra_get_file(ctx, src), interval); 1224} 1225 1226static void 1227insert_dst(struct ra_ctx *ctx, struct ir3_register *dst) 1228{ 1229 struct ra_file *file = ra_get_file(ctx, dst); 1230 struct ra_interval *interval = &ctx->intervals[dst->name]; 1231 1232 d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval)); 1233 1234 if (!(dst->flags & IR3_REG_UNUSED)) 1235 ra_file_insert(file, interval); 1236 1237 assign_reg(dst->instr, dst, ra_interval_get_num(interval)); 1238} 1239 1240static void 1241allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, 1242 physreg_t physreg) 1243{ 1244 struct ra_file *file = ra_get_file(ctx, dst); 1245 struct ra_interval *interval = &ctx->intervals[dst->name]; 1246 update_affinity(file, dst, physreg); 1247 1248 ra_interval_init(interval, dst); 1249 interval->physreg_start = physreg; 1250 interval->physreg_end = physreg + reg_size(dst); 1251} 1252 1253static void 1254allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst) 1255{ 1256 struct ra_file *file = ra_get_file(ctx, dst); 1257 1258 struct ir3_register *tied = dst->tied; 1259 if (tied) { 1260 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; 1261 struct ra_interval *dst_interval = &ctx->intervals[dst->name]; 1262 physreg_t tied_physreg = ra_interval_get_physreg(tied_interval); 1263 if (tied_interval->is_killed) { 1264 /* The easy case: the source is killed, so we can just reuse it 1265 * for the destination. 1266 */ 1267 allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval)); 1268 } else { 1269 /* The source is live-through, so we need to get a free register 1270 * (which is free for both the source and destination!), copy the 1271 * original source to it, then use that for the source and 1272 * destination. 1273 */ 1274 physreg_t physreg = get_reg(ctx, file, dst, true); 1275 allocate_dst_fixed(ctx, dst, physreg); 1276 array_insert(ctx, ctx->parallel_copies, 1277 (struct ra_parallel_copy){ 1278 .interval = dst_interval, 1279 .src = tied_physreg, 1280 }); 1281 } 1282 1283 return; 1284 } 1285 1286 /* All the hard work is done by get_reg here. */ 1287 physreg_t physreg = get_reg(ctx, file, dst, false); 1288 1289 allocate_dst_fixed(ctx, dst, physreg); 1290} 1291 1292static void 1293assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, 1294 struct ir3_register *src) 1295{ 1296 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1297 struct ra_file *file = ra_get_file(ctx, src); 1298 1299 struct ir3_register *tied = src->tied; 1300 physreg_t physreg; 1301 if (tied) { 1302 struct ra_interval *tied_interval = &ctx->intervals[tied->name]; 1303 physreg = ra_interval_get_physreg(tied_interval); 1304 } else { 1305 physreg = ra_interval_get_physreg(interval); 1306 } 1307 1308 assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags)); 1309 1310 if (src->flags & IR3_REG_FIRST_KILL) 1311 ra_file_remove(file, interval); 1312} 1313 1314/* Insert a parallel copy instruction before the instruction with the parallel 1315 * copy entries we've built up. 1316 */ 1317static void 1318insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) 1319{ 1320 if (ctx->parallel_copies_count == 0) 1321 return; 1322 1323 struct ir3_instruction *pcopy = 1324 ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1325 ctx->parallel_copies_count, ctx->parallel_copies_count); 1326 1327 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { 1328 struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; 1329 struct ir3_register *reg = 1330 ir3_dst_create(pcopy, INVALID_REG, 1331 entry->interval->interval.reg->flags & ~IR3_REG_SSA); 1332 reg->size = entry->interval->interval.reg->size; 1333 reg->wrmask = entry->interval->interval.reg->wrmask; 1334 assign_reg(pcopy, reg, ra_interval_get_num(entry->interval)); 1335 } 1336 1337 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { 1338 struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; 1339 struct ir3_register *reg = 1340 ir3_src_create(pcopy, INVALID_REG, 1341 entry->interval->interval.reg->flags & ~IR3_REG_SSA); 1342 reg->size = entry->interval->interval.reg->size; 1343 reg->wrmask = entry->interval->interval.reg->wrmask; 1344 assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags)); 1345 } 1346 1347 list_del(&pcopy->node); 1348 list_addtail(&pcopy->node, &instr->node); 1349 ctx->parallel_copies_count = 0; 1350} 1351 1352static void 1353handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) 1354{ 1355 /* First, mark sources as going-to-be-killed while allocating the dest. */ 1356 ra_foreach_src (src, instr) { 1357 mark_src_killed(ctx, src); 1358 } 1359 1360 /* Allocate the destination. */ 1361 ra_foreach_dst (dst, instr) { 1362 allocate_dst(ctx, dst); 1363 } 1364 1365 /* Now handle sources. Go backward so that in case there are multiple 1366 * sources with the same def and that def is killed we only remove it at 1367 * the end. 1368 */ 1369 ra_foreach_src_rev (src, instr) { 1370 assign_src(ctx, instr, src); 1371 } 1372 1373 /* Now finally insert the destination into the map. */ 1374 ra_foreach_dst (dst, instr) { 1375 insert_dst(ctx, dst); 1376 } 1377 1378 insert_parallel_copy_instr(ctx, instr); 1379} 1380 1381static void 1382handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) 1383{ 1384 struct ir3_register *dst = instr->dsts[0]; 1385 struct ir3_register *src = instr->srcs[0]; 1386 1387 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { 1388 handle_normal_instr(ctx, instr); 1389 return; 1390 } 1391 1392 struct ra_interval *src_interval = &ctx->intervals[src->def->name]; 1393 1394 physreg_t physreg = ra_interval_get_physreg(src_interval); 1395 assign_src(ctx, instr, src); 1396 1397 allocate_dst_fixed( 1398 ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset); 1399 insert_dst(ctx, dst); 1400} 1401 1402static void 1403handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) 1404{ 1405 struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set; 1406 unsigned dst_offset = instr->dsts[0]->merge_set_offset; 1407 1408 if (!dst_set || dst_set->regs_count == 1) { 1409 handle_normal_instr(ctx, instr); 1410 return; 1411 } 1412 1413 /* We need to check if any of the sources are contained in an interval 1414 * that is at least as large as the vector. In this case, we should put 1415 * the vector inside that larger interval. (There should be one 1416 * unambiguous place to put it, because values sharing the same merge set 1417 * should be allocated together.) This can happen in a case like: 1418 * 1419 * ssa_1 (wrmask=0xf) = ... 1420 * ssa_2 = split ssa_1 off:0 1421 * ssa_3 = split ssa_1 off:1 1422 * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3 1423 * ... = (kill)ssa_1 1424 * ... = (kill)ssa_4 1425 * 1426 * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it. 1427 */ 1428 physreg_t dst_fixed = (physreg_t)~0u; 1429 1430 ra_foreach_src (src, instr) { 1431 if (src->flags & IR3_REG_FIRST_KILL) { 1432 mark_src_killed(ctx, src); 1433 } 1434 1435 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1436 1437 if (src->def->merge_set != dst_set || interval->is_killed) 1438 continue; 1439 while (interval->interval.parent != NULL) { 1440 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); 1441 } 1442 if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) { 1443 dst_fixed = interval->physreg_start - 1444 interval->interval.reg->merge_set_offset + dst_offset; 1445 } else { 1446 /* For sources whose root interval is smaller than the 1447 * destination (i.e. the normal case), we will shuffle them 1448 * around after allocating the destination. Mark them killed so 1449 * that the destination can be allocated over them, even if they 1450 * aren't actually killed. 1451 */ 1452 ra_file_mark_killed(ra_get_file(ctx, src), interval); 1453 } 1454 } 1455 1456 if (dst_fixed != (physreg_t)~0u) 1457 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed); 1458 else 1459 allocate_dst(ctx, instr->dsts[0]); 1460 1461 /* Remove the temporary is_killed we added */ 1462 ra_foreach_src (src, instr) { 1463 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1464 while (interval->interval.parent != NULL) { 1465 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); 1466 } 1467 1468 /* Filter out cases where it actually should be killed */ 1469 if (interval != &ctx->intervals[src->def->name] || 1470 !(src->flags & IR3_REG_KILL)) { 1471 ra_file_unmark_killed(ra_get_file(ctx, src), interval); 1472 } 1473 } 1474 1475 ra_foreach_src_rev (src, instr) { 1476 assign_src(ctx, instr, src); 1477 } 1478 1479 /* We need to do this before insert_dst(), so that children of the 1480 * destination which got marked as killed and then shuffled around to make 1481 * space for the destination have the correct pcopy destination that 1482 * matches what we assign the source of the collect to in assign_src(). 1483 * 1484 * TODO: In this case we'll wind up copying the value in the pcopy and 1485 * then again in the collect. We could avoid one of those by updating the 1486 * pcopy destination to match up with the final location of the source 1487 * after the collect and making the collect a no-op. However this doesn't 1488 * seem to happen often. 1489 */ 1490 insert_parallel_copy_instr(ctx, instr); 1491 1492 /* Note: insert_dst will automatically shuffle around any intervals that 1493 * are a child of the collect by making them children of the collect. 1494 */ 1495 1496 insert_dst(ctx, instr->dsts[0]); 1497} 1498 1499/* Parallel copies before RA should only be at the end of the block, for 1500 * phi's. For these we only need to fill in the sources, and then we fill in 1501 * the destinations in the successor block. 1502 */ 1503static void 1504handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) 1505{ 1506 ra_foreach_src_rev (src, instr) { 1507 assign_src(ctx, instr, src); 1508 } 1509} 1510 1511/* Some inputs may need to be precolored. We need to handle those first, so 1512 * that other non-precolored inputs don't accidentally get allocated over 1513 * them. Inputs are the very first thing in the shader, so it shouldn't be a 1514 * problem to allocate them to a specific physreg. 1515 */ 1516 1517static void 1518handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1519{ 1520 if (instr->dsts[0]->num == INVALID_REG) 1521 return; 1522 1523 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1524 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]); 1525 allocate_dst_fixed(ctx, instr->dsts[0], physreg); 1526 insert_dst(ctx, instr->dsts[0]); 1527 interval->frozen = true; 1528} 1529 1530static void 1531handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1532{ 1533 if (instr->dsts[0]->num != INVALID_REG) 1534 return; 1535 1536 allocate_dst(ctx, instr->dsts[0]); 1537 1538 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); 1539 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1540 ra_file_insert(file, interval); 1541} 1542 1543static void 1544assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) 1545{ 1546 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; 1547 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); 1548 1549 if (instr->dsts[0]->num == INVALID_REG) { 1550 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval)); 1551 } else { 1552 interval->frozen = false; 1553 } 1554 1555 if (instr->dsts[0]->flags & IR3_REG_UNUSED) 1556 ra_file_remove(file, interval); 1557 1558 ra_foreach_src_rev (src, instr) 1559 assign_src(ctx, instr, src); 1560} 1561 1562/* chmask is a bit weird, because it has pre-colored sources due to the need 1563 * to pass some registers to the next stage. Fortunately there are only at 1564 * most two, and there should be no other live values by the time we get to 1565 * this instruction, so we only have to do the minimum and don't need any 1566 * fancy fallbacks. 1567 * 1568 * TODO: Add more complete handling of precolored sources, e.g. for function 1569 * argument handling. We'd need a way to mark sources as fixed so that they 1570 * don't get moved around when placing other sources in the fallback case, and 1571 * a duplication of much of the logic in get_reg(). This also opens another 1572 * can of worms, e.g. what if the precolored source is a split of a vector 1573 * which is still live -- this breaks our assumption that splits don't incur 1574 * any "extra" register requirements and we'd have to break it out of the 1575 * parent ra_interval. 1576 */ 1577 1578static void 1579handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src) 1580{ 1581 struct ra_file *file = ra_get_file(ctx, src); 1582 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1583 physreg_t physreg = ra_reg_get_physreg(src); 1584 1585 if (ra_interval_get_num(interval) == src->num) 1586 return; 1587 1588 /* Try evicting stuff in our way if it isn't free. This won't move 1589 * anything unless it overlaps with our precolored physreg, so we don't 1590 * have to worry about evicting other precolored sources. 1591 */ 1592 if (!get_reg_specified(file, src, physreg, true)) { 1593 unsigned eviction_count; 1594 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, 1595 false)) { 1596 unreachable("failed to evict for precolored source!"); 1597 return; 1598 } 1599 } 1600 1601 ra_move_interval(ctx, file, interval, physreg); 1602} 1603 1604static void 1605handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr) 1606{ 1607 /* Note: we purposely don't mark sources as killed, so that we can reuse 1608 * some of the get_reg() machinery as-if the source is a destination. 1609 * Marking it as killed would make e.g. get_reg_specified() wouldn't work 1610 * correctly. 1611 */ 1612 ra_foreach_src (src, instr) { 1613 assert(src->num != INVALID_REG); 1614 handle_precolored_source(ctx, src); 1615 } 1616 1617 ra_foreach_src (src, instr) { 1618 struct ra_file *file = ra_get_file(ctx, src); 1619 struct ra_interval *interval = &ctx->intervals[src->def->name]; 1620 if (src->flags & IR3_REG_FIRST_KILL) 1621 ra_file_remove(file, interval); 1622 } 1623 1624 insert_parallel_copy_instr(ctx, instr); 1625} 1626 1627static physreg_t 1628read_register(struct ra_ctx *ctx, struct ir3_block *block, 1629 struct ir3_register *def) 1630{ 1631 struct ra_block_state *state = &ctx->blocks[block->index]; 1632 if (state->renames) { 1633 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def); 1634 if (entry) { 1635 return (physreg_t)(uintptr_t)entry->data; 1636 } 1637 } 1638 1639 return ra_reg_get_physreg(def); 1640} 1641 1642static void 1643handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) 1644{ 1645 physreg_t physreg = ~0; 1646 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { 1647 struct ir3_block *pred = ctx->block->predecessors[i]; 1648 struct ra_block_state *pred_state = &ctx->blocks[pred->index]; 1649 1650 if (!pred_state->visited) 1651 continue; 1652 1653 physreg = read_register(ctx, pred, def); 1654 break; 1655 } 1656 1657 assert(physreg != (physreg_t)~0); 1658 1659 struct ra_interval *interval = &ctx->intervals[def->name]; 1660 struct ra_file *file = ra_get_file(ctx, def); 1661 ra_interval_init(interval, def); 1662 interval->physreg_start = physreg; 1663 interval->physreg_end = physreg + reg_size(def); 1664 ra_file_insert(file, interval); 1665} 1666 1667static void 1668handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) 1669{ 1670 /* Skip parallelcopy's which in the original program are only used as phi 1671 * arguments. Even though phi arguments are live out, they are only 1672 * assigned when the phi is. 1673 */ 1674 if (def->instr->opc == OPC_META_PARALLEL_COPY) 1675 return; 1676 1677 struct ra_block_state *state = &ctx->blocks[ctx->block->index]; 1678 struct ra_interval *interval = &ctx->intervals[def->name]; 1679 physreg_t physreg = ra_interval_get_physreg(interval); 1680 if (physreg != ra_reg_get_physreg(def)) { 1681 if (!state->renames) 1682 state->renames = _mesa_pointer_hash_table_create(ctx); 1683 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg); 1684 } 1685} 1686 1687static void 1688handle_phi(struct ra_ctx *ctx, struct ir3_register *def) 1689{ 1690 struct ra_file *file = ra_get_file(ctx, def); 1691 struct ra_interval *interval = &ctx->intervals[def->name]; 1692 1693 /* phis are always scalar, so they should already be the smallest possible 1694 * size. However they may be coalesced with other live-in values/phi 1695 * nodes, so check for that here. 1696 */ 1697 struct ir3_reg_interval *parent_ir3 = 1698 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start); 1699 physreg_t physreg; 1700 if (parent_ir3) { 1701 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3); 1702 physreg = ra_interval_get_physreg(parent) + 1703 (def->interval_start - parent_ir3->reg->interval_start); 1704 } else { 1705 physreg = get_reg(ctx, file, def, false); 1706 } 1707 1708 allocate_dst_fixed(ctx, def, physreg); 1709 1710 ra_file_insert(file, interval); 1711} 1712 1713static void 1714assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) 1715{ 1716 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]); 1717 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name]; 1718 assert(!interval->interval.parent); 1719 unsigned num = ra_interval_get_num(interval); 1720 assign_reg(phi, phi->dsts[0], num); 1721 1722 /* Assign the parallelcopy sources of this phi */ 1723 for (unsigned i = 0; i < phi->srcs_count; i++) { 1724 if (phi->srcs[i]->def) { 1725 assign_reg(phi, phi->srcs[i], num); 1726 assign_reg(phi, phi->srcs[i]->def, num); 1727 } 1728 } 1729 1730 if (phi->dsts[0]->flags & IR3_REG_UNUSED) 1731 ra_file_remove(file, interval); 1732} 1733 1734/* When we split a live range, we sometimes need to emit fixup code at the end 1735 * of a block. For example, something like: 1736 * 1737 * a = ... 1738 * if (...) { 1739 * ... 1740 * a' = a 1741 * b = ... // a evicted to make room for b 1742 * ... 1743 * } 1744 * ... = a 1745 * 1746 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces 1747 * to insert another copy "a = a'" at the end of the if. Normally this would 1748 * also entail adding a phi node, but since we're about to go out of SSA 1749 * anyway we just insert an extra move. Note, however, that "b" might be used 1750 * in a phi node at the end of the if and share registers with "a", so we 1751 * have to be careful to extend any preexisting parallelcopy instruction 1752 * instead of creating our own in order to guarantee that they properly get 1753 * swapped. 1754 */ 1755 1756static void 1757insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src, 1758 struct ir3_register *reg) 1759{ 1760 struct ir3_instruction *old_pcopy = NULL; 1761 if (!list_is_empty(&block->instr_list)) { 1762 struct ir3_instruction *last = 1763 LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node); 1764 if (last->opc == OPC_META_PARALLEL_COPY) 1765 old_pcopy = last; 1766 } 1767 1768 unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0; 1769 struct ir3_instruction *pcopy = ir3_instr_create( 1770 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1); 1771 1772 for (unsigned i = 0; i < old_pcopy_srcs; i++) { 1773 old_pcopy->dsts[i]->instr = pcopy; 1774 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i]; 1775 } 1776 1777 struct ir3_register *dst_reg = 1778 ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA); 1779 dst_reg->wrmask = reg->wrmask; 1780 dst_reg->size = reg->size; 1781 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags)); 1782 1783 for (unsigned i = 0; i < old_pcopy_srcs; i++) { 1784 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i]; 1785 } 1786 1787 struct ir3_register *src_reg = 1788 ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA); 1789 src_reg->wrmask = reg->wrmask; 1790 src_reg->size = reg->size; 1791 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags)); 1792 1793 if (old_pcopy) 1794 list_del(&old_pcopy->node); 1795} 1796 1797static void 1798insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) 1799{ 1800 physreg_t physreg = ra_interval_get_physreg(interval); 1801 1802 bool shared = interval->interval.reg->flags & IR3_REG_SHARED; 1803 struct ir3_block **predecessors = 1804 shared ? ctx->block->physical_predecessors : ctx->block->predecessors; 1805 unsigned predecessors_count = shared 1806 ? ctx->block->physical_predecessors_count 1807 : ctx->block->predecessors_count; 1808 1809 for (unsigned i = 0; i < predecessors_count; i++) { 1810 struct ir3_block *pred = predecessors[i]; 1811 struct ra_block_state *pred_state = &ctx->blocks[pred->index]; 1812 1813 if (!pred_state->visited) 1814 continue; 1815 1816 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); 1817 if (pred_reg != physreg) { 1818 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); 1819 1820 /* This is a bit tricky, but when visiting the destination of a 1821 * physical-only edge, we have two predecessors (the if and the 1822 * header block) and both have multiple successors. We pick the 1823 * register for all live-ins from the normal edge, which should 1824 * guarantee that there's no need for shuffling things around in 1825 * the normal predecessor as long as there are no phi nodes, but 1826 * we still may need to insert fixup code in the physical 1827 * predecessor (i.e. the last block of the if) and that has 1828 * another successor (the block after the if) so we need to update 1829 * the renames state for when we process the other successor. This 1830 * crucially depends on the other successor getting processed 1831 * after this. 1832 * 1833 * For normal (non-physical) edges we disallow critical edges so 1834 * that hacks like this aren't necessary. 1835 */ 1836 if (!pred_state->renames) 1837 pred_state->renames = _mesa_pointer_hash_table_create(ctx); 1838 _mesa_hash_table_insert(pred_state->renames, interval->interval.reg, 1839 (void *)(uintptr_t)physreg); 1840 } 1841 } 1842} 1843 1844static void 1845insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file) 1846{ 1847 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index]; 1848 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 1849 physreg_node) { 1850 /* Skip phi nodes. This needs to happen after phi nodes are allocated, 1851 * because we may have to move live-ins around to make space for phi 1852 * nodes, but we shouldn't be handling phi nodes here. 1853 */ 1854 if (BITSET_TEST(live_in, interval->interval.reg->name)) 1855 insert_live_in_move(ctx, interval); 1856 } 1857} 1858 1859static void 1860insert_entry_regs(struct ra_block_state *state, struct ra_file *file) 1861{ 1862 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 1863 physreg_node) { 1864 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg, 1865 (void *)(uintptr_t)interval->physreg_start); 1866 } 1867} 1868 1869static void 1870insert_live_in_moves(struct ra_ctx *ctx) 1871{ 1872 insert_file_live_in_moves(ctx, &ctx->full); 1873 insert_file_live_in_moves(ctx, &ctx->half); 1874 insert_file_live_in_moves(ctx, &ctx->shared); 1875 1876 /* If not all predecessors are visited, insert live-in regs so that 1877 * insert_live_out_moves() will work. 1878 */ 1879 bool all_preds_visited = true; 1880 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { 1881 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) { 1882 all_preds_visited = false; 1883 break; 1884 } 1885 } 1886 1887 if (!all_preds_visited) { 1888 struct ra_block_state *state = &ctx->blocks[ctx->block->index]; 1889 state->entry_regs = _mesa_pointer_hash_table_create(ctx); 1890 1891 insert_entry_regs(state, &ctx->full); 1892 insert_entry_regs(state, &ctx->half); 1893 insert_entry_regs(state, &ctx->shared); 1894 } 1895} 1896 1897static void 1898insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval) 1899{ 1900 for (unsigned i = 0; i < 2; i++) { 1901 if (!ctx->block->successors[i]) 1902 continue; 1903 1904 struct ir3_block *succ = ctx->block->successors[i]; 1905 struct ra_block_state *succ_state = &ctx->blocks[succ->index]; 1906 1907 if (!succ_state->visited) 1908 continue; 1909 1910 struct hash_entry *entry = _mesa_hash_table_search( 1911 succ_state->entry_regs, interval->interval.reg); 1912 if (!entry) 1913 continue; 1914 1915 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data; 1916 if (new_reg != interval->physreg_start) { 1917 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start, 1918 interval->interval.reg); 1919 } 1920 } 1921} 1922 1923static void 1924insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file) 1925{ 1926 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, 1927 physreg_node) { 1928 insert_live_out_move(ctx, interval); 1929 } 1930} 1931 1932static void 1933insert_live_out_moves(struct ra_ctx *ctx) 1934{ 1935 insert_file_live_out_moves(ctx, &ctx->full); 1936 insert_file_live_out_moves(ctx, &ctx->half); 1937 insert_file_live_out_moves(ctx, &ctx->shared); 1938} 1939 1940static void 1941handle_block(struct ra_ctx *ctx, struct ir3_block *block) 1942{ 1943 ctx->block = block; 1944 1945 /* Reset the register files from the last block */ 1946 ra_file_init(&ctx->full); 1947 ra_file_init(&ctx->half); 1948 ra_file_init(&ctx->shared); 1949 1950 /* Handle live-ins, phis, and input meta-instructions. These all appear 1951 * live at the beginning of the block, and interfere with each other 1952 * therefore need to be allocated "in parallel". This means that we 1953 * have to allocate all of them, inserting them into the file, and then 1954 * delay updating the IR until all of them are allocated. 1955 * 1956 * Handle precolored inputs first, because we need to make sure that other 1957 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes 1958 * and inputs at the same time, because the first block doesn't have 1959 * predecessors. Therefore handle_live_in doesn't have to worry about 1960 * them. 1961 */ 1962 1963 foreach_instr (instr, &block->instr_list) { 1964 if (instr->opc == OPC_META_INPUT) 1965 handle_precolored_input(ctx, instr); 1966 else 1967 break; 1968 } 1969 1970 unsigned name; 1971 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], 1972 ctx->live->definitions_count) { 1973 struct ir3_register *reg = ctx->live->definitions[name]; 1974 handle_live_in(ctx, reg); 1975 } 1976 1977 foreach_instr (instr, &block->instr_list) { 1978 if (instr->opc == OPC_META_PHI) 1979 handle_phi(ctx, instr->dsts[0]); 1980 else if (instr->opc == OPC_META_INPUT || 1981 instr->opc == OPC_META_TEX_PREFETCH) 1982 handle_input(ctx, instr); 1983 else 1984 break; 1985 } 1986 1987 /* After this point, every live-in/phi/input has an interval assigned to 1988 * it. We delay actually assigning values until everything has been 1989 * allocated, so we can simply ignore any parallel copy entries created 1990 * when shuffling them around. 1991 */ 1992 ctx->parallel_copies_count = 0; 1993 1994 insert_live_in_moves(ctx); 1995 1996 if (RA_DEBUG) { 1997 d("after live-in block %u:\n", block->index); 1998 ra_ctx_dump(ctx); 1999 } 2000 2001 /* Now we're done with processing live-ins, and can handle the body of the 2002 * block. 2003 */ 2004 foreach_instr (instr, &block->instr_list) { 2005 di(instr, "processing"); 2006 2007 if (instr->opc == OPC_META_PHI) 2008 assign_phi(ctx, instr); 2009 else if (instr->opc == OPC_META_INPUT || 2010 instr->opc == OPC_META_TEX_PREFETCH) 2011 assign_input(ctx, instr); 2012 else if (instr->opc == OPC_META_SPLIT) 2013 handle_split(ctx, instr); 2014 else if (instr->opc == OPC_META_COLLECT) 2015 handle_collect(ctx, instr); 2016 else if (instr->opc == OPC_META_PARALLEL_COPY) 2017 handle_pcopy(ctx, instr); 2018 else if (instr->opc == OPC_CHMASK) 2019 handle_chmask(ctx, instr); 2020 else 2021 handle_normal_instr(ctx, instr); 2022 2023 if (RA_DEBUG) 2024 ra_ctx_dump(ctx); 2025 } 2026 2027 insert_live_out_moves(ctx); 2028 2029 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index], 2030 ctx->live->definitions_count) { 2031 struct ir3_register *reg = ctx->live->definitions[name]; 2032 handle_live_out(ctx, reg); 2033 } 2034 2035 ctx->blocks[block->index].visited = true; 2036} 2037 2038static unsigned 2039calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) 2040{ 2041 /* Registers are allocated in units of vec4, so switch from units of 2042 * half-regs to vec4. 2043 */ 2044 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4); 2045 2046 bool double_threadsize = ir3_should_double_threadsize(v, reg_count); 2047 2048 unsigned target = reg_count; 2049 unsigned reg_independent_max_waves = 2050 ir3_get_reg_independent_max_waves(v, double_threadsize); 2051 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves( 2052 v->shader->compiler, reg_count, double_threadsize); 2053 unsigned target_waves = 2054 MIN2(reg_independent_max_waves, reg_dependent_max_waves); 2055 2056 while (target <= RA_FULL_SIZE / (2 * 4) && 2057 ir3_should_double_threadsize(v, target) == double_threadsize && 2058 ir3_get_reg_dependent_max_waves(v->shader->compiler, target, 2059 double_threadsize) >= target_waves) 2060 target++; 2061 2062 return (target - 1) * 2 * 4; 2063} 2064 2065static void 2066add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, 2067 bool merged_regs) 2068{ 2069 unsigned size = reg_size(reg); 2070 if (reg->flags & IR3_REG_HALF) 2071 pressure->half += size; 2072 if (!(reg->flags & IR3_REG_HALF) || merged_regs) 2073 pressure->full += size; 2074} 2075 2076static void 2077dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) 2078{ 2079} 2080 2081static void 2082dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) 2083{ 2084} 2085 2086static void 2087dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, 2088 struct ir3_reg_interval *child) 2089{ 2090} 2091 2092/* Calculate the minimum possible limit on register pressure so that spilling 2093 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. 2094 */ 2095 2096static void 2097calc_min_limit_pressure(struct ir3_shader_variant *v, 2098 struct ir3_liveness *live, 2099 struct ir3_pressure *limit) 2100{ 2101 struct ir3_block *start = ir3_start_block(v->ir); 2102 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); 2103 struct ir3_reg_interval *intervals = 2104 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); 2105 2106 ctx->interval_add = dummy_interval_add; 2107 ctx->interval_delete = dummy_interval_delete; 2108 ctx->interval_readd = dummy_interval_readd; 2109 2110 limit->full = limit->half = 0; 2111 2112 struct ir3_pressure cur_pressure = {0}; 2113 foreach_instr (input, &start->instr_list) { 2114 if (input->opc != OPC_META_INPUT && 2115 input->opc != OPC_META_TEX_PREFETCH) 2116 break; 2117 2118 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); 2119 } 2120 2121 limit->full = MAX2(limit->full, cur_pressure.full); 2122 limit->half = MAX2(limit->half, cur_pressure.half); 2123 2124 foreach_instr (input, &start->instr_list) { 2125 if (input->opc != OPC_META_INPUT && 2126 input->opc != OPC_META_TEX_PREFETCH) 2127 break; 2128 2129 /* pre-colored inputs may have holes, which increases the pressure. */ 2130 struct ir3_register *dst = input->dsts[0]; 2131 if (dst->num != INVALID_REG) { 2132 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); 2133 if (dst->flags & IR3_REG_HALF) 2134 limit->half = MAX2(limit->half, physreg); 2135 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) 2136 limit->full = MAX2(limit->full, physreg); 2137 } 2138 } 2139 2140 foreach_block (block, &v->ir->block_list) { 2141 rb_tree_init(&ctx->intervals); 2142 2143 unsigned name; 2144 BITSET_FOREACH_SET (name, live->live_in[block->index], 2145 live->definitions_count) { 2146 struct ir3_register *reg = live->definitions[name]; 2147 ir3_reg_interval_init(&intervals[reg->name], reg); 2148 ir3_reg_interval_insert(ctx, &intervals[reg->name]); 2149 } 2150 2151 foreach_instr (instr, &block->instr_list) { 2152 ra_foreach_dst (dst, instr) { 2153 ir3_reg_interval_init(&intervals[dst->name], dst); 2154 } 2155 /* phis and parallel copies can be deleted via spilling */ 2156 2157 if (instr->opc == OPC_META_PHI) { 2158 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); 2159 continue; 2160 } 2161 2162 if (instr->opc == OPC_META_PARALLEL_COPY) 2163 continue; 2164 2165 cur_pressure = (struct ir3_pressure) {0}; 2166 2167 ra_foreach_dst (dst, instr) { 2168 if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) 2169 add_pressure(&cur_pressure, dst, v->mergedregs); 2170 } 2171 2172 ra_foreach_src_rev (src, instr) { 2173 /* We currently don't support spilling the parent of a source when 2174 * making space for sources, so we have to keep track of the 2175 * intervals and figure out the root of the tree to figure out how 2176 * much space we need. 2177 * 2178 * TODO: We should probably support this in the spiller. 2179 */ 2180 struct ir3_reg_interval *interval = &intervals[src->def->name]; 2181 while (interval->parent) 2182 interval = interval->parent; 2183 add_pressure(&cur_pressure, interval->reg, v->mergedregs); 2184 2185 if (src->flags & IR3_REG_FIRST_KILL) 2186 ir3_reg_interval_remove(ctx, &intervals[src->def->name]); 2187 } 2188 2189 limit->full = MAX2(limit->full, cur_pressure.full); 2190 limit->half = MAX2(limit->half, cur_pressure.half); 2191 2192 cur_pressure = (struct ir3_pressure) {0}; 2193 2194 ra_foreach_dst (dst, instr) { 2195 ir3_reg_interval_init(&intervals[dst->name], dst); 2196 ir3_reg_interval_insert(ctx, &intervals[dst->name]); 2197 add_pressure(&cur_pressure, dst, v->mergedregs); 2198 } 2199 2200 limit->full = MAX2(limit->full, cur_pressure.full); 2201 limit->half = MAX2(limit->half, cur_pressure.half); 2202 } 2203 } 2204 2205 /* Account for the base register, which needs to be available everywhere. */ 2206 limit->full += 2; 2207 2208 ralloc_free(ctx); 2209} 2210 2211int 2212ir3_ra(struct ir3_shader_variant *v) 2213{ 2214 ir3_calc_dominance(v->ir); 2215 2216 ir3_create_parallel_copies(v->ir); 2217 2218 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); 2219 2220 ctx->merged_regs = v->mergedregs; 2221 ctx->compiler = v->shader->compiler; 2222 ctx->stage = v->type; 2223 2224 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir); 2225 2226 ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); 2227 2228 ir3_merge_regs(live, v->ir); 2229 2230 struct ir3_pressure max_pressure; 2231 ir3_calc_pressure(v, live, &max_pressure); 2232 d("max pressure:"); 2233 d("\tfull: %u", max_pressure.full); 2234 d("\thalf: %u", max_pressure.half); 2235 d("\tshared: %u", max_pressure.shared); 2236 2237 /* TODO: calculate half/full limit correctly for CS with barrier */ 2238 struct ir3_pressure limit_pressure; 2239 limit_pressure.full = RA_FULL_SIZE; 2240 limit_pressure.half = RA_HALF_SIZE; 2241 limit_pressure.shared = RA_SHARED_SIZE; 2242 2243 /* If requested, lower the limit so that spilling happens more often. */ 2244 if (ir3_shader_debug & IR3_DBG_SPILLALL) 2245 calc_min_limit_pressure(v, live, &limit_pressure); 2246 2247 if (max_pressure.shared > limit_pressure.shared) { 2248 /* TODO shared reg -> normal reg spilling */ 2249 d("shared max pressure exceeded!"); 2250 goto fail; 2251 } 2252 2253 bool spilled = false; 2254 if (max_pressure.full > limit_pressure.full || 2255 max_pressure.half > limit_pressure.half) { 2256 if (!v->shader->compiler->has_pvtmem) { 2257 d("max pressure exceeded!"); 2258 goto fail; 2259 } 2260 d("max pressure exceeded, spilling!"); 2261 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); 2262 ir3_calc_pressure(v, live, &max_pressure); 2263 assert(max_pressure.full <= limit_pressure.full && 2264 max_pressure.half <= limit_pressure.half); 2265 spilled = true; 2266 } 2267 2268 ctx->live = live; 2269 ctx->intervals = 2270 rzalloc_array(ctx, struct ra_interval, live->definitions_count); 2271 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count); 2272 2273 ctx->full.size = calc_target_full_pressure(v, max_pressure.full); 2274 d("full size: %u", ctx->full.size); 2275 2276 if (!v->mergedregs) 2277 ctx->half.size = RA_HALF_SIZE; 2278 2279 ctx->shared.size = RA_SHARED_SIZE; 2280 2281 ctx->full.start = ctx->half.start = ctx->shared.start = 0; 2282 2283 foreach_block (block, &v->ir->block_list) 2284 handle_block(ctx, block); 2285 2286 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count); 2287 2288 /* Strip array-ness and SSA-ness at the end, because various helpers still 2289 * need to work even on definitions that have already been assigned. For 2290 * example, we need to preserve array-ness so that array live-ins have the 2291 * right size. 2292 */ 2293 foreach_block (block, &v->ir->block_list) { 2294 foreach_instr (instr, &block->instr_list) { 2295 for (unsigned i = 0; i < instr->dsts_count; i++) { 2296 instr->dsts[i]->flags &= ~IR3_REG_SSA; 2297 2298 /* Parallel copies of array registers copy the whole register, and 2299 * we need some way to let the parallel copy code know that this was 2300 * an array whose size is determined by reg->size. So keep the array 2301 * flag on those. spill/reload also need to work on the entire 2302 * array. 2303 */ 2304 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) 2305 instr->dsts[i]->flags &= ~IR3_REG_ARRAY; 2306 } 2307 2308 for (unsigned i = 0; i < instr->srcs_count; i++) { 2309 instr->srcs[i]->flags &= ~IR3_REG_SSA; 2310 2311 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) 2312 instr->srcs[i]->flags &= ~IR3_REG_ARRAY; 2313 } 2314 } 2315 } 2316 2317 ir3_debug_print(v->ir, "AFTER: register allocation"); 2318 2319 if (spilled) { 2320 IR3_PASS(v->ir, ir3_lower_spill); 2321 } 2322 2323 ir3_lower_copies(v); 2324 2325 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); 2326 2327 ralloc_free(ctx); 2328 2329 return 0; 2330fail: 2331 ralloc_free(ctx); 2332 return -1; 2333} 2334