/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

/* This file lowers parallel copies (and the collect/split/phi meta
 * instructions, which reduce to copies once registers have been assigned)
 * into sequences of actual mov/swz/xor instructions.
 *
 * Note on units: physregs are indexed in half-register units -- a full
 * register occupies two consecutive physreg slots (see copy_entry_size()
 * and split_32bit_copy(), which step by 1 for the upper half).
 */

/* The source of one pending copy: an immediate, an entry in the const file,
 * or (when flags == 0) a physical register.
 */
struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;       /* valid when flags has IR3_REG_IMMED */
      physreg_t reg;      /* valid when flags == 0 */
      unsigned const_num; /* valid when flags has IR3_REG_CONST */
   };
};

/* One pending copy, src -> dst. */
struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done; /* set once the copy has been emitted (or proved a no-op) */

   struct copy_src src;
};

/* Size of the copy in physreg (half-register) units: 1 for a half register,
 * 2 for a full register.
 */
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

/* Translate an ir3_register source into a copy_src. "offset" (in physregs)
 * is only meaningful for actual register sources; immediates and consts
 * ignore it.
 */
static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

/* Emit "xor dst, src1, src2" immediately before "instr". Used by do_swap()
 * to implement the xor swap trick on generations without swz.
 */
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction * xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

/* Emit instructions before "instr" that swap the contents of the physical
 * registers entry->src.reg and entry->dst. The source must be an actual
 * register (not an immediate/const), since those cannot be written back to.
 */
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gen < 5) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      /* Classic three-xor in-place swap: a ^= b; b ^= a; a ^= b. */
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

/* Emit instructions before "instr" that copy entry->src (register, immediate,
 * or const) into the physical register entry->dst.
 */
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* Similar to in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* Half-reg source above the directly-addressable range: read the
       * overlapping full register instead and extract the right 16 bits
       * (low half via a narrowing cov, high half via a 16-bit shift).
       */
      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

/* Scratch state for resolving one batch of parallel copies within a single
 * register file. All arrays are indexed by physreg (half-reg units).
 */
struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

/* A copy is blocked if executing it would clobber a physreg that some
 * not-yet-executed copy still needs to read.
 */
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

/* Split a full-register copy into two half-register copies: the existing
 * entry is narrowed to the low half in place, and a new entry for the high
 * half is appended to ctx->entries (so the enclosing loops will pick it up).
 * Note: physreg_use_count is unchanged -- per-half counts were already
 * accumulated when the original entry was registered.
 */
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

/* Sequentialize the parallel copies in ctx, emitting real instructions
 * before "instr". First drains all unblocked copies (acyclic paths in the
 * transfer graph), splitting partially-blocked 32-bit copies as needed,
 * then breaks the remaining cycles with swaps.
 */
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    * n_1 -> n_2 -> ... -> n_i
    *         ^             |
    *         |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    * n_1 -> n_2 -> ... -> n_i
    *  ^                    |
    *  |---------------------|
    *
    * and this must be only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    * n_1 -> ... -> n_i
    *  ^             |
    *  |--------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      /* Only register sources can remain: immediate/const copies are never
       * blocked and were all drained by step 1.
       */
      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}

/* Lower one instruction's batch of parallel copies. Copies in different
 * register files cannot interfere, so they are resolved in independent
 * passes: shared regs first, then -- depending on whether half and full
 * regs live in one merged file -- either a single combined pass or separate
 * half-reg and full-reg passes.
 */
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->shader->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up since they don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   }
}

/* Pass entry point: walk the shader and replace every parallel-copy,
 * collect, and split meta instruction with real mov/swz/xor instructions,
 * then delete the meta instruction. Phi nodes are simply deleted --
 * presumably RA has already arranged (via parallel copies) for their
 * sources and destinations to agree, so nothing needs to be emitted; TODO
 * confirm against the RA pass.
 */
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   /* Declares the growable array "copies" plus its copies_count/copies_sz
    * bookkeeping, reused across all meta instructions.
    */
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            /* One copy entry per element of each destination. */
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            /* Collect: gather each source into consecutive registers of
             * the destination.
             */
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            /* Split: copy the selected element (split.off) out of the
             * source vector.
             */
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}