17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io> 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "agx_compiler.h" 257ec681f3Smrg#include "agx_minifloat.h" 267ec681f3Smrg 277ec681f3Smrg/* AGX peephole optimizer responsible for instruction combining. It operates in 287ec681f3Smrg * a forward direction and a backward direction, in each case traversing in 297ec681f3Smrg * source order. SSA means the forward pass satisfies the invariant: 307ec681f3Smrg * 317ec681f3Smrg * Every def is visited before any of its uses. 327ec681f3Smrg * 337ec681f3Smrg * Dually, the backend pass satisfies the invariant: 347ec681f3Smrg * 357ec681f3Smrg * Every use of a def is visited before the def. 367ec681f3Smrg * 377ec681f3Smrg * This means the forward pass can propagate modifiers forward, whereas the 387ec681f3Smrg * backwards pass propagates modifiers backward. Consider an example: 397ec681f3Smrg * 407ec681f3Smrg * 1 = fabs 0 417ec681f3Smrg * 2 = fround 1 427ec681f3Smrg * 3 = fsat 1 437ec681f3Smrg * 447ec681f3Smrg * The forwards pass would propagate the fabs to the fround (since we can 457ec681f3Smrg * lookup the fabs from the fround source and do the replacement). By contrast 467ec681f3Smrg * the backwards pass would propagate the fsat back to the fround (since when 477ec681f3Smrg * we see the fround we know it has only a single user, fsat). Propagatable 487ec681f3Smrg * instruction have natural directions (like pushforwards and pullbacks). 497ec681f3Smrg * 507ec681f3Smrg * We are careful to update the tracked state whenever we modify an instruction 517ec681f3Smrg * to ensure the passes are linear-time and converge in a single iteration. 527ec681f3Smrg * 537ec681f3Smrg * Size conversions are worth special discussion. Consider the snippet: 547ec681f3Smrg * 557ec681f3Smrg * 2 = fadd 0, 1 567ec681f3Smrg * 3 = f2f16 2 577ec681f3Smrg * 4 = fround 3 587ec681f3Smrg * 597ec681f3Smrg * A priori, we can move the f2f16 in either direction. But it's not equal -- 607ec681f3Smrg * if we move it up to the fadd, we get FP16 for two instructions, whereas if 617ec681f3Smrg * we push it into the fround, we effectively get FP32 for two instructions. So 627ec681f3Smrg * f2f16 is backwards. Likewise, consider 637ec681f3Smrg * 647ec681f3Smrg * 2 = fadd 0, 1 657ec681f3Smrg * 3 = f2f32 1 667ec681f3Smrg * 4 = fround 3 677ec681f3Smrg * 687ec681f3Smrg * This time if we move f2f32 up to the fadd, we get FP32 for two, but if we 697ec681f3Smrg * move it down to the fround, we get FP16 to too. So f2f32 is backwards. 707ec681f3Smrg */ 717ec681f3Smrg 727ec681f3Smrgstatic bool 737ec681f3Smrgagx_is_fmov(agx_instr *def) 747ec681f3Smrg{ 757ec681f3Smrg return (def->op == AGX_OPCODE_FADD) 767ec681f3Smrg && agx_is_equiv(def->src[1], agx_negzero()); 777ec681f3Smrg} 787ec681f3Smrg 797ec681f3Smrg/* Compose floating-point modifiers with floating-point sources */ 807ec681f3Smrg 817ec681f3Smrgstatic agx_index 827ec681f3Smrgagx_compose_float_src(agx_index to, agx_index from) 837ec681f3Smrg{ 847ec681f3Smrg if (to.abs) 857ec681f3Smrg from.neg = false; 867ec681f3Smrg 877ec681f3Smrg from.abs |= to.abs; 887ec681f3Smrg from.neg |= to.neg; 897ec681f3Smrg 907ec681f3Smrg return from; 917ec681f3Smrg} 927ec681f3Smrg 937ec681f3Smrgstatic void 947ec681f3Smrgagx_optimizer_fmov(agx_instr **defs, agx_instr *ins, unsigned srcs) 957ec681f3Smrg{ 967ec681f3Smrg for (unsigned s = 0; s < srcs; ++s) { 977ec681f3Smrg agx_index src = ins->src[s]; 987ec681f3Smrg if (src.type != AGX_INDEX_NORMAL) continue; 997ec681f3Smrg 1007ec681f3Smrg agx_instr *def = defs[src.value]; 1017ec681f3Smrg if (!agx_is_fmov(def)) continue; 1027ec681f3Smrg if (def->saturate) continue; 1037ec681f3Smrg 1047ec681f3Smrg ins->src[s] = agx_compose_float_src(src, def->src[0]); 1057ec681f3Smrg } 1067ec681f3Smrg} 1077ec681f3Smrg 1087ec681f3Smrgstatic void 1097ec681f3Smrgagx_optimizer_inline_imm(agx_instr **defs, agx_instr *I, 1107ec681f3Smrg unsigned srcs, bool is_float) 1117ec681f3Smrg{ 1127ec681f3Smrg for (unsigned s = 0; s < srcs; ++s) { 1137ec681f3Smrg agx_index src = I->src[s]; 1147ec681f3Smrg if (src.type != AGX_INDEX_NORMAL) continue; 1157ec681f3Smrg 1167ec681f3Smrg agx_instr *def = defs[src.value]; 1177ec681f3Smrg if (def->op != AGX_OPCODE_MOV_IMM) continue; 1187ec681f3Smrg 1197ec681f3Smrg uint8_t value = def->imm; 1207ec681f3Smrg bool float_src = is_float; 1217ec681f3Smrg 1227ec681f3Smrg /* cmpselsrc takes integer immediates only */ 1237ec681f3Smrg if (s >= 2 && I->op == AGX_OPCODE_FCMPSEL) float_src = false; 1247ec681f3Smrg 1257ec681f3Smrg if (float_src) { 1267ec681f3Smrg bool fp16 = (def->dest[0].size == AGX_SIZE_16); 1277ec681f3Smrg assert(fp16 || (def->dest[0].size == AGX_SIZE_32)); 1287ec681f3Smrg 1297ec681f3Smrg float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm); 1307ec681f3Smrg if (!agx_minifloat_exact(f)) continue; 1317ec681f3Smrg 1327ec681f3Smrg value = agx_minifloat_encode(f); 1337ec681f3Smrg } else if (value != def->imm) { 1347ec681f3Smrg continue; 1357ec681f3Smrg } 1367ec681f3Smrg 1377ec681f3Smrg I->src[s].type = AGX_INDEX_IMMEDIATE; 1387ec681f3Smrg I->src[s].value = value; 1397ec681f3Smrg } 1407ec681f3Smrg} 1417ec681f3Smrg 1427ec681f3Smrgstatic bool 1437ec681f3Smrgagx_optimizer_fmov_rev(agx_instr *I, agx_instr *use) 1447ec681f3Smrg{ 1457ec681f3Smrg if (!agx_is_fmov(use)) return false; 1467ec681f3Smrg if (use->src[0].neg || use->src[0].abs) return false; 1477ec681f3Smrg 1487ec681f3Smrg /* saturate(saturate(x)) = saturate(x) */ 1497ec681f3Smrg I->saturate |= use->saturate; 1507ec681f3Smrg I->dest[0] = use->dest[0]; 1517ec681f3Smrg return true; 1527ec681f3Smrg} 1537ec681f3Smrg 1547ec681f3Smrgstatic void 1557ec681f3Smrgagx_optimizer_forward(agx_context *ctx) 1567ec681f3Smrg{ 1577ec681f3Smrg agx_instr **defs = calloc(ctx->alloc, sizeof(*defs)); 1587ec681f3Smrg 1597ec681f3Smrg agx_foreach_instr_global(ctx, I) { 1607ec681f3Smrg struct agx_opcode_info info = agx_opcodes_info[I->op]; 1617ec681f3Smrg 1627ec681f3Smrg for (unsigned d = 0; d < info.nr_dests; ++d) { 1637ec681f3Smrg if (I->dest[d].type == AGX_INDEX_NORMAL) 1647ec681f3Smrg defs[I->dest[d].value] = I; 1657ec681f3Smrg } 1667ec681f3Smrg 1677ec681f3Smrg /* Propagate fmov down */ 1687ec681f3Smrg if (info.is_float) 1697ec681f3Smrg agx_optimizer_fmov(defs, I, info.nr_srcs); 1707ec681f3Smrg 1717ec681f3Smrg /* Inline immediates if we can. TODO: systematic */ 1727ec681f3Smrg if (I->op != AGX_OPCODE_ST_VARY && I->op != AGX_OPCODE_ST_TILE && I->op != AGX_OPCODE_P_EXTRACT && I->op != AGX_OPCODE_P_COMBINE) 1737ec681f3Smrg agx_optimizer_inline_imm(defs, I, info.nr_srcs, info.is_float); 1747ec681f3Smrg } 1757ec681f3Smrg 1767ec681f3Smrg free(defs); 1777ec681f3Smrg} 1787ec681f3Smrg 1797ec681f3Smrgstatic void 1807ec681f3Smrgagx_optimizer_backward(agx_context *ctx) 1817ec681f3Smrg{ 1827ec681f3Smrg agx_instr **uses = calloc(ctx->alloc, sizeof(*uses)); 1837ec681f3Smrg BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple)); 1847ec681f3Smrg 1857ec681f3Smrg agx_foreach_instr_global_rev(ctx, I) { 1867ec681f3Smrg struct agx_opcode_info info = agx_opcodes_info[I->op]; 1877ec681f3Smrg 1887ec681f3Smrg for (unsigned s = 0; s < info.nr_srcs; ++s) { 1897ec681f3Smrg if (I->src[s].type == AGX_INDEX_NORMAL) { 1907ec681f3Smrg unsigned v = I->src[s].value; 1917ec681f3Smrg 1927ec681f3Smrg if (uses[v]) 1937ec681f3Smrg BITSET_SET(multiple, v); 1947ec681f3Smrg else 1957ec681f3Smrg uses[v] = I; 1967ec681f3Smrg } 1977ec681f3Smrg } 1987ec681f3Smrg 1997ec681f3Smrg if (info.nr_dests != 1) 2007ec681f3Smrg continue; 2017ec681f3Smrg 2027ec681f3Smrg if (I->dest[0].type != AGX_INDEX_NORMAL) 2037ec681f3Smrg continue; 2047ec681f3Smrg 2057ec681f3Smrg agx_instr *use = uses[I->dest[0].value]; 2067ec681f3Smrg 2077ec681f3Smrg if (!use || BITSET_TEST(multiple, I->dest[0].value)) 2087ec681f3Smrg continue; 2097ec681f3Smrg 2107ec681f3Smrg /* Destination has a single use, try to propagate */ 2117ec681f3Smrg if (info.is_float && agx_optimizer_fmov_rev(I, use)) { 2127ec681f3Smrg agx_remove_instruction(use); 2137ec681f3Smrg continue; 2147ec681f3Smrg } 2157ec681f3Smrg } 2167ec681f3Smrg 2177ec681f3Smrg free(uses); 2187ec681f3Smrg free(multiple); 2197ec681f3Smrg} 2207ec681f3Smrg 2217ec681f3Smrgvoid 2227ec681f3Smrgagx_optimizer(agx_context *ctx) 2237ec681f3Smrg{ 2247ec681f3Smrg agx_optimizer_backward(ctx); 2257ec681f3Smrg agx_optimizer_forward(ctx); 2267ec681f3Smrg} 227