/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Post-RA lowering of parallel-copy-like meta instructions
 * (OPC_META_PARALLEL_COPY / OPC_META_COLLECT / OPC_META_SPLIT / OPC_META_PHI)
 * into concrete mov/swz/xor/shift sequences, resolving the transfer graph
 * of simultaneous copies (including cycles) between physical registers.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

/* The source of one elementary copy. `flags` selects which union member is
 * active: IR3_REG_IMMED -> imm, IR3_REG_CONST -> const_num, 0 -> reg (a
 * physical register).
 */
struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

/* One pending elementary copy src -> dst. `flags` carries the register-class
 * bits (IR3_REG_HALF / IR3_REG_SHARED) shared by src and dst; `done` marks
 * entries that have already been emitted during resolution.
 */
struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

/* Size of the copy in half-register units: 1 for a half reg, 2 for a full
 * reg. Physregs throughout this file are indexed in half-reg units.
 */
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

/* Build a copy_src from an ir3 source register, applying `offset` (in
 * half-reg units) when the source is a plain physical register.
 */
static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

/* Emit `dst = src1 ^ src2` immediately before `instr`. Used as a building
 * block for the xor-swap trick on generations without swz (see do_swap()).
 */
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction * xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

/* Swap the registers entry->src.reg and entry->dst in place, emitting the
 * needed instructions before `instr`. The src must be a real register
 * (no immediate/const flags). Half-regs that live beyond the directly
 * addressable half-reg range are handled via a full-reg temporary.
 */
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessable to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessable space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gen < 5) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      /* (rpt1) makes the swz write both of its destinations, performing the
       * two halves of the swap in one instruction.
       */
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

/* Emit a single copy entry as instructions before `instr`: a plain mov (or
 * read_first macro for shared regs) in the common case, with special paths
 * for half-reg sources/destinations that fall beyond RA_HALF_SIZE and thus
 * cannot be addressed as half registers directly.
 */
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         /* Swap the full reg containing dst down into the addressable
          * temporary, copy into it there, then swap it back.
          */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* Similar to in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* Half-reg source beyond the addressable range: read it through the
       * containing full register and extract the right 16-bit half.
       */
      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   /* For immed/const sources, src_num above is meaningless (src.reg was not
    * set); patch in the actual immediate value or const register number.
    */
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

/* Bookkeeping for resolving one batch of parallel copies within a single
 * register file.
 */
struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

/* A copy is blocked if any half-reg slot of its destination is still needed
 * as the source of another pending copy.
 */
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

/* Split a pending full (32-bit) register copy into two half-reg copies:
 * `entry` is shrunk in place to cover the low half and a new entry is
 * appended for the high half. Only valid for register sources.
 */
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   /* entry still owns physreg_dst[entry->dst]; the new high-half entry takes
    * over the dst + 1 slot.
    */
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

/* Resolve all pending copies in ctx (one register file's worth), emitting
 * instructions before `instr`. Classic parallel-copy sequentialization:
 * emit unblocked copies until only cycles remain, split partially-blocked
 * full copies to make progress, then break cycles with swaps.
 */
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can 32-bit copies which are only blocked on one
       * 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}

/* Partition the copies by register file and resolve each file independently,
 * since copies in different files cannot block each other: shared regs
 * first, then (depending on mergedregs) either one combined half+full file
 * or separate half and full files.
 */
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->shader->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up since they don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   }
}

/* Entry point: walk every block of the shader, expand each parallel-copy,
 * collect, and split meta instruction into elementary copy entries, lower
 * them to real instructions, and delete the meta instruction.
 */
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               /* Expand a (possibly multi-element) copy into one entry per
                * element so the resolver only deals with scalar copies.
                */
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            /* Each collect source becomes a copy into consecutive elements
             * of the destination vector.
             */
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            /* A split extracts one element (split.off) out of a vector. */
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            /* Phis are deleted without emitting anything — presumably RA has
             * already coalesced/placed phi sources and destinations via the
             * parallel copies above. NOTE(review): confirm against ir3_ra.
             */
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}