1/* 2 * Copyright (C) 2021 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include "ir3_compiler.h" 25#include "ir3_ra.h" 26#include "ralloc.h" 27 28/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch 29 * of parallelcopy's to trivially turn the program into CSSA form. Then we 30 * try to "merge" SSA def's into "merge sets" which could be allocated to a 31 * single register in order to eliminate copies. First we merge phi nodes, 32 * which should always succeed because of the parallelcopy's we inserted, and 33 * then we try to coalesce the copies we introduced. 34 * 35 * The merged registers are used for three purposes: 36 * 37 * 1. We always use the same pvtmem slot for spilling all SSA defs in each 38 * merge set. This prevents us from having to insert memory-to-memory copies 39 * in the spiller and makes sure we don't insert unecessary copies. 40 * 2. When two values are live at the same time, part of the same merge 41 * set, and they overlap each other in the merge set, they always occupy 42 * overlapping physical registers in RA. This reduces register pressure and 43 * copies in several important scenarios: 44 * - When sources of a collect are used later by something else, we don't 45 * have to introduce copies. 46 * - We can handle sequences of extracts that "explode" a vector into its 47 * components without any additional copying. 48 * 3. We use the merge sets for affinities in register allocation: That is, we 49 * try to allocate all the definitions in the same merge set to the 50 * same/compatible registers. This helps us e.g. allocate sources of a collect 51 * to contiguous registers without too much special code in RA. 52 * 53 * In a "normal" register allocator, or when spilling, we'd just merge 54 * registers in the same merge set to the same register, but with SSA-based 55 * register allocation we may have to split the live interval. 56 * 57 * The implementation is based on "Revisiting Out-of-SSA Translation for 58 * Correctness, CodeQuality, and Efficiency," and is broadly similar to the 59 * implementation in nir_from_ssa, with the twist that we also try to coalesce 60 * META_SPLIT and META_COLLECT. This makes this pass more complicated but 61 * prevents us from needing to handle these specially in RA and the spiller, 62 * which are already complicated enough. This also forces us to implement that 63 * value-comparison optimization they explain, as without it we wouldn't be 64 * able to coalesce META_SPLIT even in the simplest of cases. 65 */ 66 67/* In order to dynamically reconstruct the dominance forest, we need the 68 * instructions ordered by a preorder traversal of the dominance tree: 69 */ 70 71static unsigned 72index_instrs(struct ir3_block *block, unsigned index) 73{ 74 foreach_instr (instr, &block->instr_list) 75 instr->ip = index++; 76 77 for (unsigned i = 0; i < block->dom_children_count; i++) 78 index = index_instrs(block->dom_children[i], index); 79 80 return index; 81} 82 83/* Definitions within a merge set are ordered by instr->ip as set above: */ 84 85static bool 86def_after(struct ir3_register *a, struct ir3_register *b) 87{ 88 return a->instr->ip > b->instr->ip; 89} 90 91static bool 92def_dominates(struct ir3_register *a, struct ir3_register *b) 93{ 94 if (def_after(a, b)) { 95 return false; 96 } else if (a->instr->block == b->instr->block) { 97 return def_after(b, a); 98 } else { 99 return ir3_block_dominates(a->instr->block, b->instr->block); 100 } 101} 102 103/* This represents a region inside a register. The offset is relative to the 104 * start of the register, and offset + size <= size(reg). 105 */ 106struct def_value { 107 struct ir3_register *reg; 108 unsigned offset, size; 109}; 110 111/* Chase any copies to get the source of a region inside a register. This is 112 * Value(a) in the paper. 113 */ 114static struct def_value 115chase_copies(struct def_value value) 116{ 117 while (true) { 118 struct ir3_instruction *instr = value.reg->instr; 119 if (instr->opc == OPC_META_SPLIT) { 120 value.offset += instr->split.off * reg_elem_size(value.reg); 121 value.reg = instr->srcs[0]->def; 122 } else if (instr->opc == OPC_META_COLLECT) { 123 if (value.offset % reg_elem_size(value.reg) != 0 || 124 value.size > reg_elem_size(value.reg) || 125 value.offset + value.size > reg_size(value.reg)) 126 break; 127 struct ir3_register *src = 128 instr->srcs[value.offset / reg_elem_size(value.reg)]; 129 if (!src->def) 130 break; 131 value.offset = 0; 132 value.reg = src->def; 133 } else { 134 /* TODO: parallelcopy */ 135 break; 136 } 137 } 138 139 return value; 140} 141 142/* This represents an entry in the merge set, and consists of a register + 143 * offset from the merge set base. 144 */ 145struct merge_def { 146 struct ir3_register *reg; 147 unsigned offset; 148}; 149 150static bool 151can_skip_interference(const struct merge_def *a, const struct merge_def *b) 152{ 153 unsigned a_start = a->offset; 154 unsigned b_start = b->offset; 155 unsigned a_end = a_start + reg_size(a->reg); 156 unsigned b_end = b_start + reg_size(b->reg); 157 158 /* Registers that don't overlap never interfere */ 159 if (a_end <= b_start || b_end <= a_start) 160 return true; 161 162 /* Disallow skipping interference unless one definition contains the 163 * other. This restriction is important for register allocation, because 164 * it means that at any given point in the program, the live values in a 165 * given merge set will form a tree. If they didn't, then one live value 166 * would partially overlap another, and they would have overlapping live 167 * ranges because they're live at the same point. This simplifies register 168 * allocation and spilling. 169 */ 170 if (!((a_start <= b_start && a_end >= b_end) || 171 (b_start <= a_start && b_end >= a_end))) 172 return false; 173 174 /* For each register, chase the intersection of a and b to find the 175 * ultimate source. 176 */ 177 unsigned start = MAX2(a_start, b_start); 178 unsigned end = MIN2(a_end, b_end); 179 struct def_value a_value = chase_copies((struct def_value){ 180 .reg = a->reg, 181 .offset = start - a_start, 182 .size = end - start, 183 }); 184 struct def_value b_value = chase_copies((struct def_value){ 185 .reg = b->reg, 186 .offset = start - b_start, 187 .size = end - start, 188 }); 189 return a_value.reg == b_value.reg && a_value.offset == b_value.offset; 190} 191 192static struct ir3_merge_set * 193get_merge_set(struct ir3_register *def) 194{ 195 if (def->merge_set) 196 return def->merge_set; 197 198 struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); 199 set->preferred_reg = ~0; 200 set->interval_start = ~0; 201 set->spill_slot = ~0; 202 set->size = reg_size(def); 203 set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2; 204 set->regs_count = 1; 205 set->regs = ralloc(set, struct ir3_register *); 206 set->regs[0] = def; 207 208 return set; 209} 210 211/* Merges b into a */ 212static struct ir3_merge_set * 213merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset) 214{ 215 if (b_offset < 0) 216 return merge_merge_sets(b, a, -b_offset); 217 218 struct ir3_register **new_regs = 219 rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count); 220 221 unsigned a_index = 0, b_index = 0, new_index = 0; 222 for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) { 223 if (b_index < b->regs_count && 224 (a_index == a->regs_count || 225 def_after(a->regs[a_index], b->regs[b_index]))) { 226 new_regs[new_index] = b->regs[b_index++]; 227 new_regs[new_index]->merge_set_offset += b_offset; 228 } else { 229 new_regs[new_index] = a->regs[a_index++]; 230 } 231 new_regs[new_index]->merge_set = a; 232 } 233 234 assert(new_index == a->regs_count + b->regs_count); 235 236 /* Technically this should be the lcm, but because alignment is only 1 or 237 * 2 so far this should be ok. 238 */ 239 a->alignment = MAX2(a->alignment, b->alignment); 240 a->regs_count += b->regs_count; 241 ralloc_free(a->regs); 242 a->regs = new_regs; 243 a->size = MAX2(a->size, b->size + b_offset); 244 245 return a; 246} 247 248static bool 249merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a, 250 struct ir3_merge_set *b, int b_offset) 251{ 252 if (b_offset < 0) 253 return merge_sets_interfere(live, b, a, -b_offset); 254 255 struct merge_def dom[a->regs_count + b->regs_count]; 256 unsigned a_index = 0, b_index = 0; 257 int dom_index = -1; 258 259 /* Reject trying to merge the sets if the alignment doesn't work out */ 260 if (b_offset % a->alignment != 0) 261 return true; 262 263 while (a_index < a->regs_count || b_index < b->regs_count) { 264 struct merge_def current; 265 if (a_index == a->regs_count) { 266 current.reg = b->regs[b_index]; 267 current.offset = current.reg->merge_set_offset + b_offset; 268 b_index++; 269 } else if (b_index == b->regs_count) { 270 current.reg = a->regs[a_index]; 271 current.offset = current.reg->merge_set_offset; 272 a_index++; 273 } else { 274 if (def_after(b->regs[b_index], a->regs[a_index])) { 275 current.reg = a->regs[a_index]; 276 current.offset = current.reg->merge_set_offset; 277 a_index++; 278 } else { 279 current.reg = b->regs[b_index]; 280 current.offset = current.reg->merge_set_offset + b_offset; 281 b_index++; 282 } 283 } 284 285 while (dom_index >= 0 && 286 !def_dominates(dom[dom_index].reg, current.reg)) { 287 dom_index--; 288 } 289 290 /* TODO: in the original paper, just dom[dom_index] needs to be 291 * checked for interference. We implement the value-chasing extension 292 * as well as support for sub-registers, which complicates this 293 * significantly because it's no longer the case that if a dominates b 294 * dominates c and a and b don't interfere then we only need to check 295 * interference between b and c to be sure a and c don't interfere -- 296 * this means we may have to check for interference against values 297 * higher in the stack then dom[dom_index]. In the paper there's a 298 * description of a way to do less interference tests with the 299 * value-chasing extension, but we'd have to come up with something 300 * ourselves for handling the similar problems that come up with 301 * allowing values to contain subregisters. For now we just test 302 * everything in the stack. 303 */ 304 for (int i = 0; i <= dom_index; i++) { 305 if (can_skip_interference(¤t, &dom[i])) 306 continue; 307 308 /* Ok, now we actually have to check interference. Since we know 309 * that dom[i] dominates current, this boils down to checking 310 * whether dom[i] is live after current. 311 */ 312 if (ir3_def_live_after(live, dom[i].reg, current.reg->instr)) 313 return true; 314 } 315 316 dom[++dom_index] = current; 317 } 318 319 return false; 320} 321 322static void 323try_merge_defs(struct ir3_liveness *live, struct ir3_register *a, 324 struct ir3_register *b, unsigned b_offset) 325{ 326 struct ir3_merge_set *a_set = get_merge_set(a); 327 struct ir3_merge_set *b_set = get_merge_set(b); 328 329 if (a_set == b_set) { 330 /* Note: Even in this case we may not always successfully be able to 331 * coalesce this copy, if the offsets don't line up. But in any 332 * case, we can't do anything. 333 */ 334 return; 335 } 336 337 int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; 338 339 if (!merge_sets_interfere(live, a_set, b_set, b_set_offset)) 340 merge_merge_sets(a_set, b_set, b_set_offset); 341} 342 343void 344ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) 345{ 346 struct ir3_merge_set *a_set = get_merge_set(a); 347 struct ir3_merge_set *b_set = get_merge_set(b); 348 349 if (a_set == b_set) 350 return; 351 352 int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; 353 merge_merge_sets(a_set, b_set, b_set_offset); 354} 355 356static void 357coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) 358{ 359 for (unsigned i = 0; i < phi->srcs_count; i++) { 360 if (phi->srcs[i]->def) 361 try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0); 362 } 363} 364 365static void 366aggressive_coalesce_parallel_copy(struct ir3_liveness *live, 367 struct ir3_instruction *pcopy) 368{ 369 for (unsigned i = 0; i < pcopy->dsts_count; i++) { 370 if (!(pcopy->srcs[i]->flags & IR3_REG_SSA)) 371 continue; 372 try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0); 373 } 374} 375 376static void 377aggressive_coalesce_split(struct ir3_liveness *live, 378 struct ir3_instruction *split) 379{ 380 try_merge_defs(live, split->srcs[0]->def, split->dsts[0], 381 split->split.off * reg_elem_size(split->dsts[0])); 382} 383 384static void 385aggressive_coalesce_collect(struct ir3_liveness *live, 386 struct ir3_instruction *collect) 387{ 388 for (unsigned i = 0, offset = 0; i < collect->srcs_count; 389 offset += reg_elem_size(collect->srcs[i]), i++) { 390 if (!(collect->srcs[i]->flags & IR3_REG_SSA)) 391 continue; 392 try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset); 393 } 394} 395 396static void 397create_parallel_copy(struct ir3_block *block) 398{ 399 for (unsigned i = 0; i < 2; i++) { 400 if (!block->successors[i]) 401 continue; 402 403 struct ir3_block *succ = block->successors[i]; 404 405 unsigned pred_idx = ir3_block_get_pred_index(succ, block); 406 407 unsigned phi_count = 0; 408 foreach_instr (phi, &succ->instr_list) { 409 if (phi->opc != OPC_META_PHI) 410 break; 411 412 /* Avoid undef */ 413 if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 414 !phi->srcs[pred_idx]->def) 415 continue; 416 417 /* We don't support critical edges. If we were to support them, 418 * we'd need to insert parallel copies after the phi node to solve 419 * the lost-copy problem. 420 */ 421 assert(i == 0 && !block->successors[1]); 422 phi_count++; 423 } 424 425 if (phi_count == 0) 426 continue; 427 428 struct ir3_register *src[phi_count]; 429 unsigned j = 0; 430 foreach_instr (phi, &succ->instr_list) { 431 if (phi->opc != OPC_META_PHI) 432 break; 433 if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 434 !phi->srcs[pred_idx]->def) 435 continue; 436 src[j++] = phi->srcs[pred_idx]; 437 } 438 assert(j == phi_count); 439 440 struct ir3_instruction *pcopy = 441 ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count); 442 443 for (j = 0; j < phi_count; j++) { 444 struct ir3_register *reg = __ssa_dst(pcopy); 445 reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY); 446 reg->size = src[j]->size; 447 reg->wrmask = src[j]->wrmask; 448 } 449 450 for (j = 0; j < phi_count; j++) { 451 pcopy->srcs[pcopy->srcs_count++] = 452 ir3_reg_clone(block->shader, src[j]); 453 } 454 455 j = 0; 456 foreach_instr (phi, &succ->instr_list) { 457 if (phi->opc != OPC_META_PHI) 458 break; 459 if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 460 !phi->srcs[pred_idx]->def) 461 continue; 462 phi->srcs[pred_idx]->def = pcopy->dsts[j]; 463 phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags; 464 j++; 465 } 466 assert(j == phi_count); 467 } 468} 469 470void 471ir3_create_parallel_copies(struct ir3 *ir) 472{ 473 foreach_block (block, &ir->block_list) { 474 create_parallel_copy(block); 475 } 476} 477 478static void 479index_merge_sets(struct ir3_liveness *live, struct ir3 *ir) 480{ 481 unsigned offset = 0; 482 foreach_block (block, &ir->block_list) { 483 foreach_instr (instr, &block->instr_list) { 484 for (unsigned i = 0; i < instr->dsts_count; i++) { 485 struct ir3_register *dst = instr->dsts[i]; 486 487 unsigned dst_offset; 488 struct ir3_merge_set *merge_set = dst->merge_set; 489 unsigned size = reg_size(dst); 490 if (merge_set) { 491 if (merge_set->interval_start == ~0) { 492 merge_set->interval_start = offset; 493 offset += merge_set->size; 494 } 495 dst_offset = merge_set->interval_start + dst->merge_set_offset; 496 } else { 497 dst_offset = offset; 498 offset += size; 499 } 500 501 dst->interval_start = dst_offset; 502 dst->interval_end = dst_offset + size; 503 } 504 } 505 } 506 507 live->interval_offset = offset; 508} 509 510#define RESET "\x1b[0m" 511#define BLUE "\x1b[0;34m" 512#define SYN_SSA(x) BLUE x RESET 513 514static void 515dump_merge_sets(struct ir3 *ir) 516{ 517 d("merge sets:"); 518 struct set *merge_sets = _mesa_pointer_set_create(NULL); 519 foreach_block (block, &ir->block_list) { 520 foreach_instr (instr, &block->instr_list) { 521 for (unsigned i = 0; i < instr->dsts_count; i++) { 522 struct ir3_register *dst = instr->dsts[i]; 523 524 struct ir3_merge_set *merge_set = dst->merge_set; 525 if (!merge_set || _mesa_set_search(merge_sets, merge_set)) 526 continue; 527 528 d("merge set, size %u, align %u:", merge_set->size, 529 merge_set->alignment); 530 for (unsigned j = 0; j < merge_set->regs_count; j++) { 531 struct ir3_register *reg = merge_set->regs[j]; 532 d("\t" SYN_SSA("ssa_%u") ":%u, offset %u", 533 reg->instr->serialno, reg->name, reg->merge_set_offset); 534 } 535 536 _mesa_set_add(merge_sets, merge_set); 537 } 538 } 539 } 540 541 ralloc_free(merge_sets); 542} 543 544void 545ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) 546{ 547 index_instrs(ir3_start_block(ir), 0); 548 549 /* First pass: coalesce phis, which must be together. */ 550 foreach_block (block, &ir->block_list) { 551 foreach_instr (instr, &block->instr_list) { 552 if (instr->opc != OPC_META_PHI) 553 break; 554 555 coalesce_phi(live, instr); 556 } 557 } 558 559 /* Second pass: aggressively coalesce parallelcopy, split, collect */ 560 foreach_block (block, &ir->block_list) { 561 foreach_instr (instr, &block->instr_list) { 562 switch (instr->opc) { 563 case OPC_META_SPLIT: 564 aggressive_coalesce_split(live, instr); 565 break; 566 case OPC_META_COLLECT: 567 aggressive_coalesce_collect(live, instr); 568 break; 569 case OPC_META_PARALLEL_COPY: 570 aggressive_coalesce_parallel_copy(live, instr); 571 break; 572 default: 573 break; 574 } 575 } 576 } 577 578 index_merge_sets(live, ir); 579 580 if (ir3_shader_debug & IR3_DBG_RAMSGS) 581 dump_merge_sets(ir); 582} 583