/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#include "aco_builder.h" 267ec681f3Smrg#include "aco_ir.h" 277ec681f3Smrg 287ec681f3Smrg#include "common/sid.h" 297ec681f3Smrg 307ec681f3Smrg#include "util/memstream.h" 317ec681f3Smrg 327ec681f3Smrg#include <algorithm> 337ec681f3Smrg#include <map> 347ec681f3Smrg#include <vector> 357ec681f3Smrg 367ec681f3Smrgnamespace aco { 377ec681f3Smrg 387ec681f3Smrgstruct constaddr_info { 397ec681f3Smrg unsigned getpc_end; 407ec681f3Smrg unsigned add_literal; 417ec681f3Smrg}; 427ec681f3Smrg 437ec681f3Smrgstruct asm_context { 447ec681f3Smrg Program* program; 457ec681f3Smrg enum chip_class chip_class; 467ec681f3Smrg std::vector<std::pair<int, SOPP_instruction*>> branches; 477ec681f3Smrg std::map<unsigned, constaddr_info> constaddrs; 487ec681f3Smrg const int16_t* opcode; 497ec681f3Smrg // TODO: keep track of branch instructions referring blocks 507ec681f3Smrg // and, when emitting the block, correct the offset in instr 517ec681f3Smrg asm_context(Program* program_) : program(program_), chip_class(program->chip_class) 527ec681f3Smrg { 537ec681f3Smrg if (chip_class <= GFX7) 547ec681f3Smrg opcode = &instr_info.opcode_gfx7[0]; 557ec681f3Smrg else if (chip_class <= GFX9) 567ec681f3Smrg opcode = &instr_info.opcode_gfx9[0]; 577ec681f3Smrg else if (chip_class >= GFX10) 587ec681f3Smrg opcode = &instr_info.opcode_gfx10[0]; 597ec681f3Smrg } 607ec681f3Smrg 617ec681f3Smrg int subvector_begin_pos = -1; 627ec681f3Smrg}; 637ec681f3Smrg 647ec681f3Smrgunsigned 657ec681f3Smrgget_mimg_nsa_dwords(const Instruction* instr) 667ec681f3Smrg{ 677ec681f3Smrg unsigned addr_dwords = instr->operands.size() - 3; 687ec681f3Smrg for (unsigned i = 1; i < addr_dwords; i++) { 697ec681f3Smrg if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) 707ec681f3Smrg return DIV_ROUND_UP(addr_dwords - 1, 4); 717ec681f3Smrg } 727ec681f3Smrg return 0; 737ec681f3Smrg} 747ec681f3Smrg 757ec681f3Smrgvoid 
767ec681f3Smrgemit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr) 777ec681f3Smrg{ 787ec681f3Smrg /* lower remaining pseudo-instructions */ 797ec681f3Smrg if (instr->opcode == aco_opcode::p_constaddr_getpc) { 807ec681f3Smrg ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1; 817ec681f3Smrg 827ec681f3Smrg instr->opcode = aco_opcode::s_getpc_b64; 837ec681f3Smrg instr->operands.pop_back(); 847ec681f3Smrg } else if (instr->opcode == aco_opcode::p_constaddr_addlo) { 857ec681f3Smrg ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1; 867ec681f3Smrg 877ec681f3Smrg instr->opcode = aco_opcode::s_add_u32; 887ec681f3Smrg instr->operands[1] = Operand::zero(); 897ec681f3Smrg instr->operands[1].setFixed(PhysReg(255)); 907ec681f3Smrg } 917ec681f3Smrg 927ec681f3Smrg uint32_t opcode = ctx.opcode[(int)instr->opcode]; 937ec681f3Smrg if (opcode == (uint32_t)-1) { 947ec681f3Smrg char* outmem; 957ec681f3Smrg size_t outsize; 967ec681f3Smrg struct u_memstream mem; 977ec681f3Smrg u_memstream_open(&mem, &outmem, &outsize); 987ec681f3Smrg FILE* const memf = u_memstream_get(&mem); 997ec681f3Smrg 1007ec681f3Smrg fprintf(memf, "Unsupported opcode: "); 1017ec681f3Smrg aco_print_instr(instr, memf); 1027ec681f3Smrg u_memstream_close(&mem); 1037ec681f3Smrg 1047ec681f3Smrg aco_err(ctx.program, outmem); 1057ec681f3Smrg free(outmem); 1067ec681f3Smrg 1077ec681f3Smrg abort(); 1087ec681f3Smrg } 1097ec681f3Smrg 1107ec681f3Smrg switch (instr->format) { 1117ec681f3Smrg case Format::SOP2: { 1127ec681f3Smrg uint32_t encoding = (0b10 << 30); 1137ec681f3Smrg encoding |= opcode << 23; 1147ec681f3Smrg encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; 1157ec681f3Smrg encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0; 1167ec681f3Smrg encoding |= !instr->operands.empty() ? 
instr->operands[0].physReg() : 0; 1177ec681f3Smrg out.push_back(encoding); 1187ec681f3Smrg break; 1197ec681f3Smrg } 1207ec681f3Smrg case Format::SOPK: { 1217ec681f3Smrg SOPK_instruction& sopk = instr->sopk(); 1227ec681f3Smrg 1237ec681f3Smrg if (instr->opcode == aco_opcode::s_subvector_loop_begin) { 1247ec681f3Smrg assert(ctx.chip_class >= GFX10); 1257ec681f3Smrg assert(ctx.subvector_begin_pos == -1); 1267ec681f3Smrg ctx.subvector_begin_pos = out.size(); 1277ec681f3Smrg } else if (instr->opcode == aco_opcode::s_subvector_loop_end) { 1287ec681f3Smrg assert(ctx.chip_class >= GFX10); 1297ec681f3Smrg assert(ctx.subvector_begin_pos != -1); 1307ec681f3Smrg /* Adjust s_subvector_loop_begin instruction to the address after the end */ 1317ec681f3Smrg out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos); 1327ec681f3Smrg /* Adjust s_subvector_loop_end instruction to the address after the beginning */ 1337ec681f3Smrg sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size()); 1347ec681f3Smrg ctx.subvector_begin_pos = -1; 1357ec681f3Smrg } 1367ec681f3Smrg 1377ec681f3Smrg uint32_t encoding = (0b1011 << 28); 1387ec681f3Smrg encoding |= opcode << 23; 1397ec681f3Smrg encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) 1407ec681f3Smrg ? instr->definitions[0].physReg() << 16 1417ec681f3Smrg : !instr->operands.empty() && instr->operands[0].physReg() <= 127 1427ec681f3Smrg ? instr->operands[0].physReg() << 16 1437ec681f3Smrg : 0; 1447ec681f3Smrg encoding |= sopk.imm; 1457ec681f3Smrg out.push_back(encoding); 1467ec681f3Smrg break; 1477ec681f3Smrg } 1487ec681f3Smrg case Format::SOP1: { 1497ec681f3Smrg uint32_t encoding = (0b101111101 << 23); 1507ec681f3Smrg if (opcode >= 55 && ctx.chip_class <= GFX9) { 1517ec681f3Smrg assert(ctx.chip_class == GFX9 && opcode < 60); 1527ec681f3Smrg opcode = opcode - 4; 1537ec681f3Smrg } 1547ec681f3Smrg encoding |= !instr->definitions.empty() ? 
instr->definitions[0].physReg() << 16 : 0; 1557ec681f3Smrg encoding |= opcode << 8; 1567ec681f3Smrg encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; 1577ec681f3Smrg out.push_back(encoding); 1587ec681f3Smrg break; 1597ec681f3Smrg } 1607ec681f3Smrg case Format::SOPC: { 1617ec681f3Smrg uint32_t encoding = (0b101111110 << 23); 1627ec681f3Smrg encoding |= opcode << 16; 1637ec681f3Smrg encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0; 1647ec681f3Smrg encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; 1657ec681f3Smrg out.push_back(encoding); 1667ec681f3Smrg break; 1677ec681f3Smrg } 1687ec681f3Smrg case Format::SOPP: { 1697ec681f3Smrg SOPP_instruction& sopp = instr->sopp(); 1707ec681f3Smrg uint32_t encoding = (0b101111111 << 23); 1717ec681f3Smrg encoding |= opcode << 16; 1727ec681f3Smrg encoding |= (uint16_t)sopp.imm; 1737ec681f3Smrg if (sopp.block != -1) { 1747ec681f3Smrg sopp.pass_flags = 0; 1757ec681f3Smrg ctx.branches.emplace_back(out.size(), &sopp); 1767ec681f3Smrg } 1777ec681f3Smrg out.push_back(encoding); 1787ec681f3Smrg break; 1797ec681f3Smrg } 1807ec681f3Smrg case Format::SMEM: { 1817ec681f3Smrg SMEM_instruction& smem = instr->smem(); 1827ec681f3Smrg bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); 1837ec681f3Smrg bool is_load = !instr->definitions.empty(); 1847ec681f3Smrg uint32_t encoding = 0; 1857ec681f3Smrg 1867ec681f3Smrg if (ctx.chip_class <= GFX7) { 1877ec681f3Smrg encoding = (0b11000 << 27); 1887ec681f3Smrg encoding |= opcode << 22; 1897ec681f3Smrg encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0; 1907ec681f3Smrg encoding |= instr->operands.size() ? 
(instr->operands[0].physReg() >> 1) << 9 : 0; 1917ec681f3Smrg if (instr->operands.size() >= 2) { 1927ec681f3Smrg if (!instr->operands[1].isConstant()) { 1937ec681f3Smrg encoding |= instr->operands[1].physReg().reg(); 1947ec681f3Smrg } else if (instr->operands[1].constantValue() >= 1024) { 1957ec681f3Smrg encoding |= 255; /* SQ_SRC_LITERAL */ 1967ec681f3Smrg } else { 1977ec681f3Smrg encoding |= instr->operands[1].constantValue() >> 2; 1987ec681f3Smrg encoding |= 1 << 8; 1997ec681f3Smrg } 2007ec681f3Smrg } 2017ec681f3Smrg out.push_back(encoding); 2027ec681f3Smrg /* SMRD instructions can take a literal on GFX7 */ 2037ec681f3Smrg if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && 2047ec681f3Smrg instr->operands[1].constantValue() >= 1024) 2057ec681f3Smrg out.push_back(instr->operands[1].constantValue() >> 2); 2067ec681f3Smrg return; 2077ec681f3Smrg } 2087ec681f3Smrg 2097ec681f3Smrg if (ctx.chip_class <= GFX9) { 2107ec681f3Smrg encoding = (0b110000 << 26); 2117ec681f3Smrg assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 2127ec681f3Smrg encoding |= smem.nv ? 1 << 15 : 0; 2137ec681f3Smrg } else { 2147ec681f3Smrg encoding = (0b111101 << 26); 2157ec681f3Smrg assert(!smem.nv); /* Non-volatile is not supported on GFX10 */ 2167ec681f3Smrg encoding |= smem.dlc ? 1 << 14 : 0; 2177ec681f3Smrg } 2187ec681f3Smrg 2197ec681f3Smrg encoding |= opcode << 18; 2207ec681f3Smrg encoding |= smem.glc ? 1 << 16 : 0; 2217ec681f3Smrg 2227ec681f3Smrg if (ctx.chip_class <= GFX9) { 2237ec681f3Smrg if (instr->operands.size() >= 2) 2247ec681f3Smrg encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */ 2257ec681f3Smrg } 2267ec681f3Smrg if (ctx.chip_class == GFX9) { 2277ec681f3Smrg encoding |= soe ? 1 << 14 : 0; 2287ec681f3Smrg } 2297ec681f3Smrg 2307ec681f3Smrg if (is_load || instr->operands.size() >= 3) { /* SDATA */ 2317ec681f3Smrg encoding |= (is_load ? 
instr->definitions[0].physReg() : instr->operands[2].physReg()) 2327ec681f3Smrg << 6; 2337ec681f3Smrg } 2347ec681f3Smrg if (instr->operands.size() >= 1) { /* SBASE */ 2357ec681f3Smrg encoding |= instr->operands[0].physReg() >> 1; 2367ec681f3Smrg } 2377ec681f3Smrg 2387ec681f3Smrg out.push_back(encoding); 2397ec681f3Smrg encoding = 0; 2407ec681f3Smrg 2417ec681f3Smrg int32_t offset = 0; 2427ec681f3Smrg uint32_t soffset = ctx.chip_class >= GFX10 2437ec681f3Smrg ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ 2447ec681f3Smrg : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on 2457ec681f3Smrg GFX8 and below) */ 2467ec681f3Smrg if (instr->operands.size() >= 2) { 2477ec681f3Smrg const Operand& op_off1 = instr->operands[1]; 2487ec681f3Smrg if (ctx.chip_class <= GFX9) { 2497ec681f3Smrg offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg(); 2507ec681f3Smrg } else { 2517ec681f3Smrg /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an 2527ec681f3Smrg * SGPR */ 2537ec681f3Smrg if (op_off1.isConstant()) { 2547ec681f3Smrg offset = op_off1.constantValue(); 2557ec681f3Smrg } else { 2567ec681f3Smrg soffset = op_off1.physReg(); 2577ec681f3Smrg assert(!soe); /* There is no place to put the other SGPR offset, if any */ 2587ec681f3Smrg } 2597ec681f3Smrg } 2607ec681f3Smrg 2617ec681f3Smrg if (soe) { 2627ec681f3Smrg const Operand& op_off2 = instr->operands.back(); 2637ec681f3Smrg assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant 2647ec681f3Smrg and an SGPR at the same time */ 2657ec681f3Smrg assert(!op_off2.isConstant()); 2667ec681f3Smrg soffset = op_off2.physReg(); 2677ec681f3Smrg } 2687ec681f3Smrg } 2697ec681f3Smrg encoding |= offset; 2707ec681f3Smrg encoding |= soffset << 25; 2717ec681f3Smrg 2727ec681f3Smrg out.push_back(encoding); 2737ec681f3Smrg return; 2747ec681f3Smrg } 2757ec681f3Smrg case Format::VOP2: { 2767ec681f3Smrg uint32_t encoding = 0; 
2777ec681f3Smrg encoding |= opcode << 25; 2787ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()) << 17; 2797ec681f3Smrg encoding |= (0xFF & instr->operands[1].physReg()) << 9; 2807ec681f3Smrg encoding |= instr->operands[0].physReg(); 2817ec681f3Smrg out.push_back(encoding); 2827ec681f3Smrg break; 2837ec681f3Smrg } 2847ec681f3Smrg case Format::VOP1: { 2857ec681f3Smrg uint32_t encoding = (0b0111111 << 25); 2867ec681f3Smrg if (!instr->definitions.empty()) 2877ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()) << 17; 2887ec681f3Smrg encoding |= opcode << 9; 2897ec681f3Smrg if (!instr->operands.empty()) 2907ec681f3Smrg encoding |= instr->operands[0].physReg(); 2917ec681f3Smrg out.push_back(encoding); 2927ec681f3Smrg break; 2937ec681f3Smrg } 2947ec681f3Smrg case Format::VOPC: { 2957ec681f3Smrg uint32_t encoding = (0b0111110 << 25); 2967ec681f3Smrg encoding |= opcode << 17; 2977ec681f3Smrg encoding |= (0xFF & instr->operands[1].physReg()) << 9; 2987ec681f3Smrg encoding |= instr->operands[0].physReg(); 2997ec681f3Smrg out.push_back(encoding); 3007ec681f3Smrg break; 3017ec681f3Smrg } 3027ec681f3Smrg case Format::VINTRP: { 3037ec681f3Smrg Interp_instruction& interp = instr->vintrp(); 3047ec681f3Smrg uint32_t encoding = 0; 3057ec681f3Smrg 3067ec681f3Smrg if (instr->opcode == aco_opcode::v_interp_p1ll_f16 || 3077ec681f3Smrg instr->opcode == aco_opcode::v_interp_p1lv_f16 || 3087ec681f3Smrg instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || 3097ec681f3Smrg instr->opcode == aco_opcode::v_interp_p2_f16) { 3107ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 3117ec681f3Smrg encoding = (0b110100 << 26); 3127ec681f3Smrg } else if (ctx.chip_class >= GFX10) { 3137ec681f3Smrg encoding = (0b110101 << 26); 3147ec681f3Smrg } else { 3157ec681f3Smrg unreachable("Unknown chip_class."); 3167ec681f3Smrg } 3177ec681f3Smrg 3187ec681f3Smrg encoding |= opcode << 16; 3197ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()); 
3207ec681f3Smrg out.push_back(encoding); 3217ec681f3Smrg 3227ec681f3Smrg encoding = 0; 3237ec681f3Smrg encoding |= interp.attribute; 3247ec681f3Smrg encoding |= interp.component << 6; 3257ec681f3Smrg encoding |= instr->operands[0].physReg() << 9; 3267ec681f3Smrg if (instr->opcode == aco_opcode::v_interp_p2_f16 || 3277ec681f3Smrg instr->opcode == aco_opcode::v_interp_p2_legacy_f16 || 3287ec681f3Smrg instr->opcode == aco_opcode::v_interp_p1lv_f16) { 3297ec681f3Smrg encoding |= instr->operands[2].physReg() << 18; 3307ec681f3Smrg } 3317ec681f3Smrg out.push_back(encoding); 3327ec681f3Smrg } else { 3337ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 3347ec681f3Smrg encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */ 3357ec681f3Smrg } else { 3367ec681f3Smrg encoding = (0b110010 << 26); 3377ec681f3Smrg } 3387ec681f3Smrg 3397ec681f3Smrg assert(encoding); 3407ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()) << 18; 3417ec681f3Smrg encoding |= opcode << 16; 3427ec681f3Smrg encoding |= interp.attribute << 10; 3437ec681f3Smrg encoding |= interp.component << 8; 3447ec681f3Smrg if (instr->opcode == aco_opcode::v_interp_mov_f32) 3457ec681f3Smrg encoding |= (0x3 & instr->operands[0].constantValue()); 3467ec681f3Smrg else 3477ec681f3Smrg encoding |= (0xFF & instr->operands[0].physReg()); 3487ec681f3Smrg out.push_back(encoding); 3497ec681f3Smrg } 3507ec681f3Smrg break; 3517ec681f3Smrg } 3527ec681f3Smrg case Format::DS: { 3537ec681f3Smrg DS_instruction& ds = instr->ds(); 3547ec681f3Smrg uint32_t encoding = (0b110110 << 26); 3557ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 3567ec681f3Smrg encoding |= opcode << 17; 3577ec681f3Smrg encoding |= (ds.gds ? 1 : 0) << 16; 3587ec681f3Smrg } else { 3597ec681f3Smrg encoding |= opcode << 18; 3607ec681f3Smrg encoding |= (ds.gds ? 
1 : 0) << 17; 3617ec681f3Smrg } 3627ec681f3Smrg encoding |= ((0xFF & ds.offset1) << 8); 3637ec681f3Smrg encoding |= (0xFFFF & ds.offset0); 3647ec681f3Smrg out.push_back(encoding); 3657ec681f3Smrg encoding = 0; 3667ec681f3Smrg unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; 3677ec681f3Smrg encoding |= (0xFF & reg) << 24; 3687ec681f3Smrg reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) 3697ec681f3Smrg ? instr->operands[2].physReg() 3707ec681f3Smrg : 0; 3717ec681f3Smrg encoding |= (0xFF & reg) << 16; 3727ec681f3Smrg reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) 3737ec681f3Smrg ? instr->operands[1].physReg() 3747ec681f3Smrg : 0; 3757ec681f3Smrg encoding |= (0xFF & reg) << 8; 3767ec681f3Smrg encoding |= (0xFF & instr->operands[0].physReg()); 3777ec681f3Smrg out.push_back(encoding); 3787ec681f3Smrg break; 3797ec681f3Smrg } 3807ec681f3Smrg case Format::MUBUF: { 3817ec681f3Smrg MUBUF_instruction& mubuf = instr->mubuf(); 3827ec681f3Smrg uint32_t encoding = (0b111000 << 26); 3837ec681f3Smrg encoding |= opcode << 18; 3847ec681f3Smrg encoding |= (mubuf.lds ? 1 : 0) << 16; 3857ec681f3Smrg encoding |= (mubuf.glc ? 1 : 0) << 14; 3867ec681f3Smrg encoding |= (mubuf.idxen ? 1 : 0) << 13; 3877ec681f3Smrg assert(!mubuf.addr64 || ctx.chip_class <= GFX7); 3887ec681f3Smrg if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7) 3897ec681f3Smrg encoding |= (mubuf.addr64 ? 1 : 0) << 15; 3907ec681f3Smrg encoding |= (mubuf.offen ? 1 : 0) << 12; 3917ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 3927ec681f3Smrg assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 3937ec681f3Smrg encoding |= (mubuf.slc ? 1 : 0) << 17; 3947ec681f3Smrg } else if (ctx.chip_class >= GFX10) { 3957ec681f3Smrg encoding |= (mubuf.dlc ? 
1 : 0) << 15; 3967ec681f3Smrg } 3977ec681f3Smrg encoding |= 0x0FFF & mubuf.offset; 3987ec681f3Smrg out.push_back(encoding); 3997ec681f3Smrg encoding = 0; 4007ec681f3Smrg if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) { 4017ec681f3Smrg encoding |= (mubuf.slc ? 1 : 0) << 22; 4027ec681f3Smrg } 4037ec681f3Smrg encoding |= instr->operands[2].physReg() << 24; 4047ec681f3Smrg encoding |= (mubuf.tfe ? 1 : 0) << 23; 4057ec681f3Smrg encoding |= (instr->operands[0].physReg() >> 2) << 16; 4067ec681f3Smrg unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() 4077ec681f3Smrg : instr->definitions[0].physReg(); 4087ec681f3Smrg encoding |= (0xFF & reg) << 8; 4097ec681f3Smrg encoding |= (0xFF & instr->operands[1].physReg()); 4107ec681f3Smrg out.push_back(encoding); 4117ec681f3Smrg break; 4127ec681f3Smrg } 4137ec681f3Smrg case Format::MTBUF: { 4147ec681f3Smrg MTBUF_instruction& mtbuf = instr->mtbuf(); 4157ec681f3Smrg 4167ec681f3Smrg uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf.dfmt, mtbuf.nfmt); 4177ec681f3Smrg uint32_t encoding = (0b111010 << 26); 4187ec681f3Smrg assert(img_format <= 0x7F); 4197ec681f3Smrg assert(!mtbuf.dlc || ctx.chip_class >= GFX10); 4207ec681f3Smrg encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */ 4217ec681f3Smrg encoding |= (mtbuf.glc ? 1 : 0) << 14; 4227ec681f3Smrg encoding |= (mtbuf.idxen ? 1 : 0) << 13; 4237ec681f3Smrg encoding |= (mtbuf.offen ? 
1 : 0) << 12; 4247ec681f3Smrg encoding |= 0x0FFF & mtbuf.offset; 4257ec681f3Smrg encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */ 4267ec681f3Smrg 4277ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 4287ec681f3Smrg encoding |= opcode << 15; 4297ec681f3Smrg } else { 4307ec681f3Smrg encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */ 4317ec681f3Smrg } 4327ec681f3Smrg 4337ec681f3Smrg out.push_back(encoding); 4347ec681f3Smrg encoding = 0; 4357ec681f3Smrg 4367ec681f3Smrg encoding |= instr->operands[2].physReg() << 24; 4377ec681f3Smrg encoding |= (mtbuf.tfe ? 1 : 0) << 23; 4387ec681f3Smrg encoding |= (mtbuf.slc ? 1 : 0) << 22; 4397ec681f3Smrg encoding |= (instr->operands[0].physReg() >> 2) << 16; 4407ec681f3Smrg unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() 4417ec681f3Smrg : instr->definitions[0].physReg(); 4427ec681f3Smrg encoding |= (0xFF & reg) << 8; 4437ec681f3Smrg encoding |= (0xFF & instr->operands[1].physReg()); 4447ec681f3Smrg 4457ec681f3Smrg if (ctx.chip_class >= GFX10) { 4467ec681f3Smrg encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */ 4477ec681f3Smrg } 4487ec681f3Smrg 4497ec681f3Smrg out.push_back(encoding); 4507ec681f3Smrg break; 4517ec681f3Smrg } 4527ec681f3Smrg case Format::MIMG: { 4537ec681f3Smrg unsigned nsa_dwords = get_mimg_nsa_dwords(instr); 4547ec681f3Smrg assert(!nsa_dwords || ctx.chip_class >= GFX10); 4557ec681f3Smrg 4567ec681f3Smrg MIMG_instruction& mimg = instr->mimg(); 4577ec681f3Smrg uint32_t encoding = (0b111100 << 26); 4587ec681f3Smrg encoding |= mimg.slc ? 1 << 25 : 0; 4597ec681f3Smrg encoding |= (opcode & 0x7f) << 18; 4607ec681f3Smrg encoding |= (opcode >> 7) & 1; 4617ec681f3Smrg encoding |= mimg.lwe ? 1 << 17 : 0; 4627ec681f3Smrg encoding |= mimg.tfe ? 1 << 16 : 0; 4637ec681f3Smrg encoding |= mimg.glc ? 1 << 13 : 0; 4647ec681f3Smrg encoding |= mimg.unrm ? 
1 << 12 : 0; 4657ec681f3Smrg if (ctx.chip_class <= GFX9) { 4667ec681f3Smrg assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */ 4677ec681f3Smrg assert(!mimg.r128); 4687ec681f3Smrg encoding |= mimg.a16 ? 1 << 15 : 0; 4697ec681f3Smrg encoding |= mimg.da ? 1 << 14 : 0; 4707ec681f3Smrg } else { 4717ec681f3Smrg encoding |= mimg.r128 ? 1 << 15 4727ec681f3Smrg : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ 4737ec681f3Smrg encoding |= nsa_dwords << 1; 4747ec681f3Smrg encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */ 4757ec681f3Smrg encoding |= mimg.dlc ? 1 << 7 : 0; 4767ec681f3Smrg } 4777ec681f3Smrg encoding |= (0xF & mimg.dmask) << 8; 4787ec681f3Smrg out.push_back(encoding); 4797ec681f3Smrg encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */ 4807ec681f3Smrg if (!instr->definitions.empty()) { 4817ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */ 4827ec681f3Smrg } else if (!instr->operands[2].isUndefined()) { 4837ec681f3Smrg encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */ 4847ec681f3Smrg } 4857ec681f3Smrg encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */ 4867ec681f3Smrg if (!instr->operands[1].isUndefined()) 4877ec681f3Smrg encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */ 4887ec681f3Smrg 4897ec681f3Smrg assert(!mimg.d16 || ctx.chip_class >= GFX9); 4907ec681f3Smrg encoding |= mimg.d16 ? 1 << 31 : 0; 4917ec681f3Smrg if (ctx.chip_class >= GFX10) { 4927ec681f3Smrg /* GFX10: A16 still exists, but is in a different place */ 4937ec681f3Smrg encoding |= mimg.a16 ? 
1 << 30 : 0; 4947ec681f3Smrg } 4957ec681f3Smrg 4967ec681f3Smrg out.push_back(encoding); 4977ec681f3Smrg 4987ec681f3Smrg if (nsa_dwords) { 4997ec681f3Smrg out.resize(out.size() + nsa_dwords); 5007ec681f3Smrg std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords); 5017ec681f3Smrg for (unsigned i = 0; i < instr->operands.size() - 4u; i++) 5027ec681f3Smrg nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8); 5037ec681f3Smrg } 5047ec681f3Smrg break; 5057ec681f3Smrg } 5067ec681f3Smrg case Format::FLAT: 5077ec681f3Smrg case Format::SCRATCH: 5087ec681f3Smrg case Format::GLOBAL: { 5097ec681f3Smrg FLAT_instruction& flat = instr->flatlike(); 5107ec681f3Smrg uint32_t encoding = (0b110111 << 26); 5117ec681f3Smrg encoding |= opcode << 18; 5127ec681f3Smrg if (ctx.chip_class <= GFX9) { 5137ec681f3Smrg assert(flat.offset <= 0x1fff); 5147ec681f3Smrg encoding |= flat.offset & 0x1fff; 5157ec681f3Smrg } else if (instr->isFlat()) { 5167ec681f3Smrg /* GFX10 has a 12-bit immediate OFFSET field, 5177ec681f3Smrg * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug 5187ec681f3Smrg */ 5197ec681f3Smrg assert(flat.offset == 0); 5207ec681f3Smrg } else { 5217ec681f3Smrg assert(flat.offset <= 0xfff); 5227ec681f3Smrg encoding |= flat.offset & 0xfff; 5237ec681f3Smrg } 5247ec681f3Smrg if (instr->isScratch()) 5257ec681f3Smrg encoding |= 1 << 14; 5267ec681f3Smrg else if (instr->isGlobal()) 5277ec681f3Smrg encoding |= 2 << 14; 5287ec681f3Smrg encoding |= flat.lds ? 1 << 13 : 0; 5297ec681f3Smrg encoding |= flat.glc ? 1 << 16 : 0; 5307ec681f3Smrg encoding |= flat.slc ? 1 << 17 : 0; 5317ec681f3Smrg if (ctx.chip_class >= GFX10) { 5327ec681f3Smrg assert(!flat.nv); 5337ec681f3Smrg encoding |= flat.dlc ? 
1 << 12 : 0; 5347ec681f3Smrg } else { 5357ec681f3Smrg assert(!flat.dlc); 5367ec681f3Smrg } 5377ec681f3Smrg out.push_back(encoding); 5387ec681f3Smrg encoding = (0xFF & instr->operands[0].physReg()); 5397ec681f3Smrg if (!instr->definitions.empty()) 5407ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()) << 24; 5417ec681f3Smrg if (instr->operands.size() >= 3) 5427ec681f3Smrg encoding |= (0xFF & instr->operands[2].physReg()) << 8; 5437ec681f3Smrg if (!instr->operands[1].isUndefined()) { 5447ec681f3Smrg assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F); 5457ec681f3Smrg assert(instr->format != Format::FLAT); 5467ec681f3Smrg encoding |= instr->operands[1].physReg() << 16; 5477ec681f3Smrg } else if (instr->format != Format::FLAT || 5487ec681f3Smrg ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ 5497ec681f3Smrg if (ctx.chip_class <= GFX9) 5507ec681f3Smrg encoding |= 0x7F << 16; 5517ec681f3Smrg else 5527ec681f3Smrg encoding |= sgpr_null << 16; 5537ec681f3Smrg } 5547ec681f3Smrg encoding |= flat.nv ? 1 << 23 : 0; 5557ec681f3Smrg out.push_back(encoding); 5567ec681f3Smrg break; 5577ec681f3Smrg } 5587ec681f3Smrg case Format::EXP: { 5597ec681f3Smrg Export_instruction& exp = instr->exp(); 5607ec681f3Smrg uint32_t encoding; 5617ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { 5627ec681f3Smrg encoding = (0b110001 << 26); 5637ec681f3Smrg } else { 5647ec681f3Smrg encoding = (0b111110 << 26); 5657ec681f3Smrg } 5667ec681f3Smrg 5677ec681f3Smrg encoding |= exp.valid_mask ? 0b1 << 12 : 0; 5687ec681f3Smrg encoding |= exp.done ? 0b1 << 11 : 0; 5697ec681f3Smrg encoding |= exp.compressed ? 
0b1 << 10 : 0; 5707ec681f3Smrg encoding |= exp.dest << 4; 5717ec681f3Smrg encoding |= exp.enabled_mask; 5727ec681f3Smrg out.push_back(encoding); 5737ec681f3Smrg encoding = 0xFF & exp.operands[0].physReg(); 5747ec681f3Smrg encoding |= (0xFF & exp.operands[1].physReg()) << 8; 5757ec681f3Smrg encoding |= (0xFF & exp.operands[2].physReg()) << 16; 5767ec681f3Smrg encoding |= (0xFF & exp.operands[3].physReg()) << 24; 5777ec681f3Smrg out.push_back(encoding); 5787ec681f3Smrg break; 5797ec681f3Smrg } 5807ec681f3Smrg case Format::PSEUDO: 5817ec681f3Smrg case Format::PSEUDO_BARRIER: 5827ec681f3Smrg if (instr->opcode != aco_opcode::p_unit_test) 5837ec681f3Smrg unreachable("Pseudo instructions should be lowered before assembly."); 5847ec681f3Smrg break; 5857ec681f3Smrg default: 5867ec681f3Smrg if (instr->isVOP3()) { 5877ec681f3Smrg VOP3_instruction& vop3 = instr->vop3(); 5887ec681f3Smrg 5897ec681f3Smrg if (instr->isVOP2()) { 5907ec681f3Smrg opcode = opcode + 0x100; 5917ec681f3Smrg } else if (instr->isVOP1()) { 5927ec681f3Smrg if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) 5937ec681f3Smrg opcode = opcode + 0x140; 5947ec681f3Smrg else 5957ec681f3Smrg opcode = opcode + 0x180; 5967ec681f3Smrg } else if (instr->isVOPC()) { 5977ec681f3Smrg opcode = opcode + 0x0; 5987ec681f3Smrg } else if (instr->isVINTRP()) { 5997ec681f3Smrg opcode = opcode + 0x270; 6007ec681f3Smrg } 6017ec681f3Smrg 6027ec681f3Smrg uint32_t encoding; 6037ec681f3Smrg if (ctx.chip_class <= GFX9) { 6047ec681f3Smrg encoding = (0b110100 << 26); 6057ec681f3Smrg } else if (ctx.chip_class >= GFX10) { 6067ec681f3Smrg encoding = (0b110101 << 26); 6077ec681f3Smrg } else { 6087ec681f3Smrg unreachable("Unknown chip_class."); 6097ec681f3Smrg } 6107ec681f3Smrg 6117ec681f3Smrg if (ctx.chip_class <= GFX7) { 6127ec681f3Smrg encoding |= opcode << 17; 6137ec681f3Smrg encoding |= (vop3.clamp ? 1 : 0) << 11; 6147ec681f3Smrg } else { 6157ec681f3Smrg encoding |= opcode << 16; 6167ec681f3Smrg encoding |= (vop3.clamp ? 
1 : 0) << 15; 6177ec681f3Smrg } 6187ec681f3Smrg encoding |= vop3.opsel << 11; 6197ec681f3Smrg for (unsigned i = 0; i < 3; i++) 6207ec681f3Smrg encoding |= vop3.abs[i] << (8 + i); 6217ec681f3Smrg if (instr->definitions.size() == 2) 6227ec681f3Smrg encoding |= instr->definitions[1].physReg() << 8; 6237ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()); 6247ec681f3Smrg out.push_back(encoding); 6257ec681f3Smrg encoding = 0; 6267ec681f3Smrg if (instr->opcode == aco_opcode::v_interp_mov_f32) { 6277ec681f3Smrg encoding = 0x3 & instr->operands[0].constantValue(); 6287ec681f3Smrg } else if (instr->opcode == aco_opcode::v_writelane_b32_e64) { 6297ec681f3Smrg encoding |= instr->operands[0].physReg() << 0; 6307ec681f3Smrg encoding |= instr->operands[1].physReg() << 9; 6317ec681f3Smrg /* Encoding src2 works fine with hardware but breaks some disassemblers. */ 6327ec681f3Smrg } else { 6337ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) 6347ec681f3Smrg encoding |= instr->operands[i].physReg() << (i * 9); 6357ec681f3Smrg } 6367ec681f3Smrg encoding |= vop3.omod << 27; 6377ec681f3Smrg for (unsigned i = 0; i < 3; i++) 6387ec681f3Smrg encoding |= vop3.neg[i] << (29 + i); 6397ec681f3Smrg out.push_back(encoding); 6407ec681f3Smrg 6417ec681f3Smrg } else if (instr->isVOP3P()) { 6427ec681f3Smrg VOP3P_instruction& vop3 = instr->vop3p(); 6437ec681f3Smrg 6447ec681f3Smrg uint32_t encoding; 6457ec681f3Smrg if (ctx.chip_class == GFX9) { 6467ec681f3Smrg encoding = (0b110100111 << 23); 6477ec681f3Smrg } else if (ctx.chip_class >= GFX10) { 6487ec681f3Smrg encoding = (0b110011 << 26); 6497ec681f3Smrg } else { 6507ec681f3Smrg unreachable("Unknown chip_class."); 6517ec681f3Smrg } 6527ec681f3Smrg 6537ec681f3Smrg encoding |= opcode << 16; 6547ec681f3Smrg encoding |= (vop3.clamp ? 1 : 0) << 15; 6557ec681f3Smrg encoding |= vop3.opsel_lo << 11; 6567ec681f3Smrg encoding |= ((vop3.opsel_hi & 0x4) ? 
1 : 0) << 14; 6577ec681f3Smrg for (unsigned i = 0; i < 3; i++) 6587ec681f3Smrg encoding |= vop3.neg_hi[i] << (8 + i); 6597ec681f3Smrg encoding |= (0xFF & instr->definitions[0].physReg()); 6607ec681f3Smrg out.push_back(encoding); 6617ec681f3Smrg encoding = 0; 6627ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) 6637ec681f3Smrg encoding |= instr->operands[i].physReg() << (i * 9); 6647ec681f3Smrg encoding |= (vop3.opsel_hi & 0x3) << 27; 6657ec681f3Smrg for (unsigned i = 0; i < 3; i++) 6667ec681f3Smrg encoding |= vop3.neg_lo[i] << (29 + i); 6677ec681f3Smrg out.push_back(encoding); 6687ec681f3Smrg 6697ec681f3Smrg } else if (instr->isDPP()) { 6707ec681f3Smrg assert(ctx.chip_class >= GFX8); 6717ec681f3Smrg DPP_instruction& dpp = instr->dpp(); 6727ec681f3Smrg 6737ec681f3Smrg /* first emit the instruction without the DPP operand */ 6747ec681f3Smrg Operand dpp_op = instr->operands[0]; 6757ec681f3Smrg instr->operands[0] = Operand(PhysReg{250}, v1); 6767ec681f3Smrg instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP); 6777ec681f3Smrg emit_instruction(ctx, out, instr); 6787ec681f3Smrg uint32_t encoding = (0xF & dpp.row_mask) << 28; 6797ec681f3Smrg encoding |= (0xF & dpp.bank_mask) << 24; 6807ec681f3Smrg encoding |= dpp.abs[1] << 23; 6817ec681f3Smrg encoding |= dpp.neg[1] << 22; 6827ec681f3Smrg encoding |= dpp.abs[0] << 21; 6837ec681f3Smrg encoding |= dpp.neg[0] << 20; 6847ec681f3Smrg if (ctx.chip_class >= GFX10) 6857ec681f3Smrg encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */ 6867ec681f3Smrg encoding |= dpp.bound_ctrl << 19; 6877ec681f3Smrg encoding |= dpp.dpp_ctrl << 8; 6887ec681f3Smrg encoding |= (0xFF) & dpp_op.physReg(); 6897ec681f3Smrg out.push_back(encoding); 6907ec681f3Smrg return; 6917ec681f3Smrg } else if (instr->isSDWA()) { 6927ec681f3Smrg SDWA_instruction& sdwa = instr->sdwa(); 6937ec681f3Smrg 6947ec681f3Smrg /* first emit the instruction without the SDWA operand */ 6957ec681f3Smrg Operand sdwa_op = 
instr->operands[0]; 6967ec681f3Smrg instr->operands[0] = Operand(PhysReg{249}, v1); 6977ec681f3Smrg instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA); 6987ec681f3Smrg emit_instruction(ctx, out, instr); 6997ec681f3Smrg 7007ec681f3Smrg uint32_t encoding = 0; 7017ec681f3Smrg 7027ec681f3Smrg if (instr->isVOPC()) { 7037ec681f3Smrg if (instr->definitions[0].physReg() != vcc) { 7047ec681f3Smrg encoding |= instr->definitions[0].physReg() << 8; 7057ec681f3Smrg encoding |= 1 << 15; 7067ec681f3Smrg } 7077ec681f3Smrg encoding |= (sdwa.clamp ? 1 : 0) << 13; 7087ec681f3Smrg } else { 7097ec681f3Smrg encoding |= sdwa.dst_sel.to_sdwa_sel(instr->definitions[0].physReg().byte()) << 8; 7107ec681f3Smrg uint32_t dst_u = sdwa.dst_sel.sign_extend() ? 1 : 0; 7117ec681f3Smrg if (instr->definitions[0].bytes() < 4) /* dst_preserve */ 7127ec681f3Smrg dst_u = 2; 7137ec681f3Smrg encoding |= dst_u << 11; 7147ec681f3Smrg encoding |= (sdwa.clamp ? 1 : 0) << 13; 7157ec681f3Smrg encoding |= sdwa.omod << 14; 7167ec681f3Smrg } 7177ec681f3Smrg 7187ec681f3Smrg encoding |= sdwa.sel[0].to_sdwa_sel(sdwa_op.physReg().byte()) << 16; 7197ec681f3Smrg encoding |= sdwa.sel[0].sign_extend() ? 1 << 19 : 0; 7207ec681f3Smrg encoding |= sdwa.abs[0] << 21; 7217ec681f3Smrg encoding |= sdwa.neg[0] << 20; 7227ec681f3Smrg 7237ec681f3Smrg if (instr->operands.size() >= 2) { 7247ec681f3Smrg encoding |= sdwa.sel[1].to_sdwa_sel(instr->operands[1].physReg().byte()) << 24; 7257ec681f3Smrg encoding |= sdwa.sel[1].sign_extend() ? 
1 << 27 : 0; 7267ec681f3Smrg encoding |= sdwa.abs[1] << 29; 7277ec681f3Smrg encoding |= sdwa.neg[1] << 28; 7287ec681f3Smrg } 7297ec681f3Smrg 7307ec681f3Smrg encoding |= 0xFF & sdwa_op.physReg(); 7317ec681f3Smrg encoding |= (sdwa_op.physReg() < 256) << 23; 7327ec681f3Smrg if (instr->operands.size() >= 2) 7337ec681f3Smrg encoding |= (instr->operands[1].physReg() < 256) << 31; 7347ec681f3Smrg out.push_back(encoding); 7357ec681f3Smrg } else { 7367ec681f3Smrg unreachable("unimplemented instruction format"); 7377ec681f3Smrg } 7387ec681f3Smrg break; 7397ec681f3Smrg } 7407ec681f3Smrg 7417ec681f3Smrg /* append literal dword */ 7427ec681f3Smrg for (const Operand& op : instr->operands) { 7437ec681f3Smrg if (op.isLiteral()) { 7447ec681f3Smrg out.push_back(op.constantValue()); 7457ec681f3Smrg break; 7467ec681f3Smrg } 7477ec681f3Smrg } 7487ec681f3Smrg} 7497ec681f3Smrg 7507ec681f3Smrgvoid 7517ec681f3Smrgemit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block) 7527ec681f3Smrg{ 7537ec681f3Smrg for (aco_ptr<Instruction>& instr : block.instructions) { 7547ec681f3Smrg#if 0 7557ec681f3Smrg int start_idx = out.size(); 7567ec681f3Smrg std::cerr << "Encoding:\t" << std::endl; 7577ec681f3Smrg aco_print_instr(&*instr, stderr); 7587ec681f3Smrg std::cerr << std::endl; 7597ec681f3Smrg#endif 7607ec681f3Smrg emit_instruction(ctx, out, instr.get()); 7617ec681f3Smrg#if 0 7627ec681f3Smrg for (int i = start_idx; i < out.size(); i++) 7637ec681f3Smrg std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl; 7647ec681f3Smrg#endif 7657ec681f3Smrg } 7667ec681f3Smrg} 7677ec681f3Smrg 7687ec681f3Smrgvoid 7697ec681f3Smrgfix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program) 7707ec681f3Smrg{ 7717ec681f3Smrg bool exported = false; 7727ec681f3Smrg for (Block& block : program->blocks) { 7737ec681f3Smrg if (!(block.kind & block_kind_export_end)) 7747ec681f3Smrg continue; 7757ec681f3Smrg 
std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin(); 7767ec681f3Smrg while (it != block.instructions.rend()) { 7777ec681f3Smrg if ((*it)->isEXP()) { 7787ec681f3Smrg Export_instruction& exp = (*it)->exp(); 7797ec681f3Smrg if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) { 7807ec681f3Smrg if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) { 7817ec681f3Smrg exp.done = true; 7827ec681f3Smrg exported = true; 7837ec681f3Smrg break; 7847ec681f3Smrg } 7857ec681f3Smrg } else { 7867ec681f3Smrg exp.done = true; 7877ec681f3Smrg exp.valid_mask = true; 7887ec681f3Smrg exported = true; 7897ec681f3Smrg break; 7907ec681f3Smrg } 7917ec681f3Smrg } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec) 7927ec681f3Smrg break; 7937ec681f3Smrg ++it; 7947ec681f3Smrg } 7957ec681f3Smrg } 7967ec681f3Smrg 7977ec681f3Smrg if (!exported) { 7987ec681f3Smrg /* Abort in order to avoid a GPU hang. */ 7997ec681f3Smrg bool is_vertex_or_ngg = 8007ec681f3Smrg (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); 8017ec681f3Smrg aco_err(program, 8027ec681f3Smrg "Missing export in %s shader:", is_vertex_or_ngg ? 
"vertex or NGG" : "fragment"); 8037ec681f3Smrg aco_print_program(program, stderr); 8047ec681f3Smrg abort(); 8057ec681f3Smrg } 8067ec681f3Smrg} 8077ec681f3Smrg 8087ec681f3Smrgstatic void 8097ec681f3Smrginsert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before, 8107ec681f3Smrg unsigned insert_count, const uint32_t* insert_data) 8117ec681f3Smrg{ 8127ec681f3Smrg out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count); 8137ec681f3Smrg 8147ec681f3Smrg /* Update the offset of each affected block */ 8157ec681f3Smrg for (Block& block : ctx.program->blocks) { 8167ec681f3Smrg if (block.offset >= insert_before) 8177ec681f3Smrg block.offset += insert_count; 8187ec681f3Smrg } 8197ec681f3Smrg 8207ec681f3Smrg /* Find first branch after the inserted code */ 8217ec681f3Smrg auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), 8227ec681f3Smrg [insert_before](const auto& branch) -> bool 8237ec681f3Smrg { return (unsigned)branch.first >= insert_before; }); 8247ec681f3Smrg 8257ec681f3Smrg /* Update the locations of branches */ 8267ec681f3Smrg for (; branch_it != ctx.branches.end(); ++branch_it) 8277ec681f3Smrg branch_it->first += insert_count; 8287ec681f3Smrg 8297ec681f3Smrg /* Update the locations of p_constaddr instructions */ 8307ec681f3Smrg for (auto& constaddr : ctx.constaddrs) { 8317ec681f3Smrg constaddr_info& info = constaddr.second; 8327ec681f3Smrg if (info.getpc_end >= insert_before) 8337ec681f3Smrg info.getpc_end += insert_count; 8347ec681f3Smrg if (info.add_literal >= insert_before) 8357ec681f3Smrg info.add_literal += insert_count; 8367ec681f3Smrg } 8377ec681f3Smrg} 8387ec681f3Smrg 8397ec681f3Smrgstatic void 8407ec681f3Smrgfix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out) 8417ec681f3Smrg{ 8427ec681f3Smrg /* Branches with an offset of 0x3f are buggy on GFX10, 8437ec681f3Smrg * we workaround by inserting NOPs if needed. 
8447ec681f3Smrg */ 8457ec681f3Smrg bool gfx10_3f_bug = false; 8467ec681f3Smrg 8477ec681f3Smrg do { 8487ec681f3Smrg auto buggy_branch_it = std::find_if( 8497ec681f3Smrg ctx.branches.begin(), ctx.branches.end(), 8507ec681f3Smrg [&ctx](const auto& branch) -> bool { 8517ec681f3Smrg return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 8527ec681f3Smrg 0x3f; 8537ec681f3Smrg }); 8547ec681f3Smrg 8557ec681f3Smrg gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); 8567ec681f3Smrg 8577ec681f3Smrg if (gfx10_3f_bug) { 8587ec681f3Smrg /* Insert an s_nop after the branch */ 8597ec681f3Smrg constexpr uint32_t s_nop_0 = 0xbf800000u; 8607ec681f3Smrg insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0); 8617ec681f3Smrg } 8627ec681f3Smrg } while (gfx10_3f_bug); 8637ec681f3Smrg} 8647ec681f3Smrg 8657ec681f3Smrgvoid 8667ec681f3Smrgemit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, 8677ec681f3Smrg std::vector<uint32_t>& out) 8687ec681f3Smrg{ 8697ec681f3Smrg Builder bld(ctx.program); 8707ec681f3Smrg 8717ec681f3Smrg Definition def_tmp_lo(branch->definitions[0].physReg(), s1); 8727ec681f3Smrg Operand op_tmp_lo(branch->definitions[0].physReg(), s1); 8737ec681f3Smrg Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1); 8747ec681f3Smrg Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1); 8757ec681f3Smrg 8767ec681f3Smrg aco_ptr<Instruction> instr; 8777ec681f3Smrg 8787ec681f3Smrg if (branch->opcode != aco_opcode::s_branch) { 8797ec681f3Smrg /* for conditional branches, skip the long jump if the condition is false */ 8807ec681f3Smrg aco_opcode inv; 8817ec681f3Smrg switch (branch->opcode) { 8827ec681f3Smrg case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break; 8837ec681f3Smrg case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break; 8847ec681f3Smrg case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break; 8857ec681f3Smrg case 
aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break; 8867ec681f3Smrg case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break; 8877ec681f3Smrg case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break; 8887ec681f3Smrg default: unreachable("Unhandled long jump."); 8897ec681f3Smrg } 8907ec681f3Smrg instr.reset(bld.sopp(inv, -1, 7)); 8917ec681f3Smrg emit_instruction(ctx, out, instr.get()); 8927ec681f3Smrg } 8937ec681f3Smrg 8947ec681f3Smrg /* create the new PC and stash SCC in the LSB */ 8957ec681f3Smrg instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr); 8967ec681f3Smrg emit_instruction(ctx, out, instr.get()); 8977ec681f3Smrg 8987ec681f3Smrg instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr); 8997ec681f3Smrg instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */ 9007ec681f3Smrg emit_instruction(ctx, out, instr.get()); 9017ec681f3Smrg branch->pass_flags = out.size(); 9027ec681f3Smrg 9037ec681f3Smrg instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, 9047ec681f3Smrg Operand::c32(backwards ? 
UINT32_MAX : 0u)) 9057ec681f3Smrg .instr); 9067ec681f3Smrg emit_instruction(ctx, out, instr.get()); 9077ec681f3Smrg 9087ec681f3Smrg /* restore SCC and clear the LSB of the new PC */ 9097ec681f3Smrg instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr); 9107ec681f3Smrg emit_instruction(ctx, out, instr.get()); 9117ec681f3Smrg instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr); 9127ec681f3Smrg emit_instruction(ctx, out, instr.get()); 9137ec681f3Smrg 9147ec681f3Smrg /* create the s_setpc_b64 to jump */ 9157ec681f3Smrg instr.reset( 9167ec681f3Smrg bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); 9177ec681f3Smrg emit_instruction(ctx, out, instr.get()); 9187ec681f3Smrg} 9197ec681f3Smrg 9207ec681f3Smrgvoid 9217ec681f3Smrgfix_branches(asm_context& ctx, std::vector<uint32_t>& out) 9227ec681f3Smrg{ 9237ec681f3Smrg bool repeat = false; 9247ec681f3Smrg do { 9257ec681f3Smrg repeat = false; 9267ec681f3Smrg 9277ec681f3Smrg if (ctx.chip_class == GFX10) 9287ec681f3Smrg fix_branches_gfx10(ctx, out); 9297ec681f3Smrg 9307ec681f3Smrg for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) { 9317ec681f3Smrg int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; 9327ec681f3Smrg if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) { 9337ec681f3Smrg std::vector<uint32_t> long_jump; 9347ec681f3Smrg bool backwards = 9357ec681f3Smrg ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; 9367ec681f3Smrg emit_long_jump(ctx, branch.second, backwards, long_jump); 9377ec681f3Smrg 9387ec681f3Smrg out[branch.first] = long_jump[0]; 9397ec681f3Smrg insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1); 9407ec681f3Smrg 9417ec681f3Smrg repeat = true; 9427ec681f3Smrg break; 9437ec681f3Smrg } 9447ec681f3Smrg 9457ec681f3Smrg if (branch.second->pass_flags) { 9467ec681f3Smrg int 
after_getpc = branch.first + branch.second->pass_flags - 2; 9477ec681f3Smrg offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc; 9487ec681f3Smrg out[branch.first + branch.second->pass_flags - 1] = offset * 4; 9497ec681f3Smrg } else { 9507ec681f3Smrg out[branch.first] &= 0xffff0000u; 9517ec681f3Smrg out[branch.first] |= (uint16_t)offset; 9527ec681f3Smrg } 9537ec681f3Smrg } 9547ec681f3Smrg } while (repeat); 9557ec681f3Smrg} 9567ec681f3Smrg 9577ec681f3Smrgvoid 9587ec681f3Smrgfix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out) 9597ec681f3Smrg{ 9607ec681f3Smrg for (auto& constaddr : ctx.constaddrs) { 9617ec681f3Smrg constaddr_info& info = constaddr.second; 9627ec681f3Smrg out[info.add_literal] += (out.size() - info.getpc_end) * 4u; 9637ec681f3Smrg } 9647ec681f3Smrg} 9657ec681f3Smrg 9667ec681f3Smrgunsigned 9677ec681f3Smrgemit_program(Program* program, std::vector<uint32_t>& code) 9687ec681f3Smrg{ 9697ec681f3Smrg asm_context ctx(program); 9707ec681f3Smrg 9717ec681f3Smrg if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS || 9727ec681f3Smrg program->stage.hw == HWStage::NGG) 9737ec681f3Smrg fix_exports(ctx, code, program); 9747ec681f3Smrg 9757ec681f3Smrg for (Block& block : program->blocks) { 9767ec681f3Smrg block.offset = code.size(); 9777ec681f3Smrg emit_block(ctx, code, block); 9787ec681f3Smrg } 9797ec681f3Smrg 9807ec681f3Smrg fix_branches(ctx, code); 9817ec681f3Smrg 9827ec681f3Smrg unsigned exec_size = code.size() * sizeof(uint32_t); 9837ec681f3Smrg 9847ec681f3Smrg if (program->chip_class >= GFX10) { 9857ec681f3Smrg /* Pad output with s_code_end so instruction prefetching doesn't cause 9867ec681f3Smrg * page faults */ 9877ec681f3Smrg unsigned final_size = align(code.size() + 3 * 16, 16); 9887ec681f3Smrg while (code.size() < final_size) 9897ec681f3Smrg code.push_back(0xbf9f0000u); 9907ec681f3Smrg } 9917ec681f3Smrg 9927ec681f3Smrg fix_constaddrs(ctx, code); 9937ec681f3Smrg 9947ec681f3Smrg while 
(program->constant_data.size() % 4u) 9957ec681f3Smrg program->constant_data.push_back(0); 9967ec681f3Smrg /* Copy constant data */ 9977ec681f3Smrg code.insert(code.end(), (uint32_t*)program->constant_data.data(), 9987ec681f3Smrg (uint32_t*)(program->constant_data.data() + program->constant_data.size())); 9997ec681f3Smrg 10007ec681f3Smrg return exec_size; 10017ec681f3Smrg} 10027ec681f3Smrg 10037ec681f3Smrg} // namespace aco 1004