aco_assembler.cpp revision 7ec681f3
1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 *
 */

/* ACO assembler: lowers the final IR of an aco::Program into raw GCN/RDNA
 * machine-code dwords (GFX6 through GFX10), then patches up branches,
 * constant-address literals and export requirements in later passes. */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include "util/memstream.h"

#include <algorithm>
#include <map>
#include <vector>

namespace aco {

/* Dword positions recorded while emitting a p_constaddr pair, later used by
 * fix_constaddrs() to patch the literal of the s_add_u32 with the distance
 * from the end of the s_getpc_b64 to the constant data. */
struct constaddr_info {
   unsigned getpc_end;   /* dword index just past the emitted s_getpc_b64 */
   unsigned add_literal; /* dword index of the s_add_u32's literal operand */
};

/* Transient state for assembling one Program. */
struct asm_context {
   Program* program;
   enum chip_class chip_class;
   /* (code dword index, branch instruction) pairs, resolved by fix_branches() */
   std::vector<std::pair<int, SOPP_instruction*>> branches;
   /* keyed by the p_constaddr id (the pseudo-instruction's constant operand) */
   std::map<unsigned, constaddr_info> constaddrs;
   /* per-generation opcode translation table; -1 marks unsupported opcodes */
   const int16_t* opcode;
   // TODO: keep track of branch instructions referring blocks
   // and, when emitting the block, correct the offset in instr
   asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
   {
      if (chip_class <= GFX7)
         opcode = &instr_info.opcode_gfx7[0];
      else if (chip_class <= GFX9)
         opcode = &instr_info.opcode_gfx9[0];
      else if (chip_class >= GFX10)
         opcode = &instr_info.opcode_gfx10[0];
   }

   /* dword index of an open s_subvector_loop_begin, or -1 when none is open */
   int subvector_begin_pos = -1;
};

/* Number of extra NSA (non-sequential address) dwords a MIMG instruction
 * needs: 0 if all address operands (operands[3..]) are register-consecutive,
 * otherwise one dword per 4 extra address registers. */
unsigned
get_mimg_nsa_dwords(const Instruction* instr)
{
   unsigned addr_dwords = instr->operands.size() - 3;
   for (unsigned i = 1; i < addr_dwords; i++) {
      if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
         return DIV_ROUND_UP(addr_dwords - 1, 4);
   }
   return 0;
}

/* Encode one instruction and append its dwords to |out|.
 *
 * The remaining p_constaddr_* pseudo-instructions are lowered here in place
 * (mutating |instr|) and their code positions are recorded in ctx.constaddrs
 * so fix_constaddrs() can patch the literal afterwards. Aborts with a
 * diagnostic if the opcode has no encoding on the current chip. */
void
emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
{
   /* lower remaining pseudo-instructions */
   if (instr->opcode == aco_opcode::p_constaddr_getpc) {
      ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;

      instr->opcode = aco_opcode::s_getpc_b64;
      instr->operands.pop_back();
   } else if (instr->opcode == aco_opcode::p_constaddr_addlo) {
      ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;

      instr->opcode = aco_opcode::s_add_u32;
      instr->operands[1] = Operand::zero();
      /* 255 selects the literal-constant source (SQ_SRC_LITERAL) */
      instr->operands[1].setFixed(PhysReg(255));
   }

   uint32_t opcode = ctx.opcode[(int)instr->opcode];
   if (opcode == (uint32_t)-1) {
      /* print the offending instruction into a memory stream so it can be
       * reported through aco_err before aborting */
      char* outmem;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &outmem, &outsize);
      FILE* const memf = u_memstream_get(&mem);

      fprintf(memf, "Unsupported opcode: ");
      aco_print_instr(instr, memf);
      u_memstream_close(&mem);

      aco_err(ctx.program, outmem);
      free(outmem);

      abort();
   }

   switch (instr->format) {
   case Format::SOP2: {
      uint32_t encoding = (0b10 << 30);
      encoding |= opcode << 23;
      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
      encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPK: {
      SOPK_instruction& sopk = instr->sopk();

      if (instr->opcode == aco_opcode::s_subvector_loop_begin) {
         assert(ctx.chip_class >= GFX10);
         assert(ctx.subvector_begin_pos == -1);
         ctx.subvector_begin_pos = out.size();
      } else if (instr->opcode == aco_opcode::s_subvector_loop_end) {
         assert(ctx.chip_class >= GFX10);
         assert(ctx.subvector_begin_pos != -1);
         /* Adjust s_subvector_loop_begin instruction to the address after the end */
         out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos);
         /* Adjust s_subvector_loop_end instruction to the address after the beginning */
         sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size());
         ctx.subvector_begin_pos = -1;
      }

      uint32_t encoding = (0b1011 << 28);
      encoding |= opcode << 23;
      /* SDST: prefer the definition (unless it is scc), otherwise a low
       * SGPR operand, otherwise 0 */
      encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
                     ? instr->definitions[0].physReg() << 16
                  : !instr->operands.empty() && instr->operands[0].physReg() <= 127
                     ? instr->operands[0].physReg() << 16
                     : 0;
      encoding |= sopk.imm;
      out.push_back(encoding);
      break;
   }
   case Format::SOP1: {
      uint32_t encoding = (0b101111101 << 23);
      /* table opcodes >= 55 are shifted down by 4 on GFX9 (the GFX9 table
       * shares numbering that GFX9 hardware packs more tightly) */
      if (opcode >= 55 && ctx.chip_class <= GFX9) {
         assert(ctx.chip_class == GFX9 && opcode < 60);
         opcode = opcode - 4;
      }
      encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
      encoding |= opcode << 8;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPC: {
      uint32_t encoding = (0b101111110 << 23);
      encoding |= opcode << 16;
      encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
      encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
      out.push_back(encoding);
      break;
   }
   case Format::SOPP: {
      SOPP_instruction& sopp = instr->sopp();
      uint32_t encoding = (0b101111111 << 23);
      encoding |= opcode << 16;
      encoding |= (uint16_t)sopp.imm;
      /* block-targeted branches get their immediate patched in fix_branches();
       * pass_flags doubles as the "converted to long jump" marker there */
      if (sopp.block != -1) {
         sopp.pass_flags = 0;
         ctx.branches.emplace_back(out.size(), &sopp);
      }
      out.push_back(encoding);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      /* SOE (soffset enable): an extra trailing SGPR offset operand exists */
      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
      bool is_load = !instr->definitions.empty();
      uint32_t encoding = 0;

      if (ctx.chip_class <= GFX7) {
         /* GFX6/7 SMRD encoding */
         encoding = (0b11000 << 27);
         encoding |= opcode << 22;
         encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0;
         encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;
         if (instr->operands.size() >= 2) {
            if (!instr->operands[1].isConstant()) {
               encoding |= instr->operands[1].physReg().reg();
            } else if (instr->operands[1].constantValue() >= 1024) {
               encoding |= 255; /* SQ_SRC_LITERAL */
            } else {
               encoding |= instr->operands[1].constantValue() >> 2;
               encoding |= 1 << 8;
            }
         }
         out.push_back(encoding);
         /* SMRD instructions can take a literal on GFX7 */
         if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
             instr->operands[1].constantValue() >= 1024)
            out.push_back(instr->operands[1].constantValue() >> 2);
         return;
      }

      if (ctx.chip_class <= GFX9) {
         encoding = (0b110000 << 26);
         assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         encoding |= smem.nv ? 1 << 15 : 0;
      } else {
         encoding = (0b111101 << 26);
         assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
         encoding |= smem.dlc ? 1 << 14 : 0;
      }

      encoding |= opcode << 18;
      encoding |= smem.glc ? 1 << 16 : 0;

      if (ctx.chip_class <= GFX9) {
         if (instr->operands.size() >= 2)
            encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
      }
      if (ctx.chip_class == GFX9) {
         encoding |= soe ? 1 << 14 : 0;
      }

      if (is_load || instr->operands.size() >= 3) { /* SDATA */
         encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
                     << 6;
      }
      if (instr->operands.size() >= 1) { /* SBASE */
         encoding |= instr->operands[0].physReg() >> 1;
      }

      out.push_back(encoding);
      encoding = 0;

      int32_t offset = 0;
      uint32_t soffset = ctx.chip_class >= GFX10
                            ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
                            : 0;        /* On GFX9, it is disabled by the SOE bit (and it's not present on
                                           GFX8 and below) */
      if (instr->operands.size() >= 2) {
         const Operand& op_off1 = instr->operands[1];
         if (ctx.chip_class <= GFX9) {
            offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
         } else {
            /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
             * SGPR */
            if (op_off1.isConstant()) {
               offset = op_off1.constantValue();
            } else {
               soffset = op_off1.physReg();
               assert(!soe); /* There is no place to put the other SGPR offset, if any */
            }
         }

         if (soe) {
            const Operand& op_off2 = instr->operands.back();
            assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
                                               and an SGPR at the same time */
            assert(!op_off2.isConstant());
            soffset = op_off2.physReg();
         }
      }
      encoding |= offset;
      encoding |= soffset << 25;

      out.push_back(encoding);
      return;
   }
   case Format::VOP2: {
      uint32_t encoding = 0;
      encoding |= opcode << 25;
      encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
      encoding |= (0xFF & instr->operands[1].physReg()) << 9;
      encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VOP1: {
      uint32_t encoding = (0b0111111 << 25);
      if (!instr->definitions.empty())
         encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
      encoding |= opcode << 9;
      if (!instr->operands.empty())
         encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VOPC: {
      uint32_t encoding = (0b0111110 << 25);
      encoding |= opcode << 17;
      encoding |= (0xFF & instr->operands[1].physReg()) << 9;
      encoding |= instr->operands[0].physReg();
      out.push_back(encoding);
      break;
   }
   case Format::VINTRP: {
      Interp_instruction& interp = instr->vintrp();
      uint32_t encoding = 0;

      if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
          instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
          instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
          instr->opcode == aco_opcode::v_interp_p2_f16) {
         /* f16 interpolation uses the two-dword VOP3-style interp encoding */
         if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
            encoding = (0b110100 << 26);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110101 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         encoding |= opcode << 16;
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);

         encoding = 0;
         encoding |= interp.attribute;
         encoding |= interp.component << 6;
         encoding |= instr->operands[0].physReg() << 9;
         if (instr->opcode == aco_opcode::v_interp_p2_f16 ||
             instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
             instr->opcode == aco_opcode::v_interp_p1lv_f16) {
            encoding |= instr->operands[2].physReg() << 18;
         }
         out.push_back(encoding);
      } else {
         if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
            encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
         } else {
            encoding = (0b110010 << 26);
         }

         assert(encoding);
         encoding |= (0xFF & instr->definitions[0].physReg()) << 18;
         encoding |= opcode << 16;
         encoding |= interp.attribute << 10;
         encoding |= interp.component << 8;
         if (instr->opcode == aco_opcode::v_interp_mov_f32)
            encoding |= (0x3 & instr->operands[0].constantValue());
         else
            encoding |= (0xFF & instr->operands[0].physReg());
         out.push_back(encoding);
      }
      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      uint32_t encoding = (0b110110 << 26);
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding |= opcode << 17;
         encoding |= (ds.gds ? 1 : 0) << 16;
      } else {
         encoding |= opcode << 18;
         encoding |= (ds.gds ? 1 : 0) << 17;
      }
      encoding |= ((0xFF & ds.offset1) << 8);
      encoding |= (0xFFFF & ds.offset0);
      out.push_back(encoding);
      encoding = 0;
      unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
      encoding |= (0xFF & reg) << 24;
      /* data operands that are m0 are placeholders, encoded as 0 */
      reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
               ? instr->operands[2].physReg()
               : 0;
      encoding |= (0xFF & reg) << 16;
      reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
               ? instr->operands[1].physReg()
               : 0;
      encoding |= (0xFF & reg) << 8;
      encoding |= (0xFF & instr->operands[0].physReg());
      out.push_back(encoding);
      break;
   }
   case Format::MUBUF: {
      MUBUF_instruction& mubuf = instr->mubuf();
      uint32_t encoding = (0b111000 << 26);
      encoding |= opcode << 18;
      encoding |= (mubuf.lds ? 1 : 0) << 16;
      encoding |= (mubuf.glc ? 1 : 0) << 14;
      encoding |= (mubuf.idxen ? 1 : 0) << 13;
      assert(!mubuf.addr64 || ctx.chip_class <= GFX7);
      if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7)
         encoding |= (mubuf.addr64 ? 1 : 0) << 15;
      encoding |= (mubuf.offen ? 1 : 0) << 12;
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         encoding |= (mubuf.slc ? 1 : 0) << 17;
      } else if (ctx.chip_class >= GFX10) {
         encoding |= (mubuf.dlc ? 1 : 0) << 15;
      }
      encoding |= 0x0FFF & mubuf.offset;
      out.push_back(encoding);
      encoding = 0;
      /* SLC moved to the second dword outside the GFX8/9 window */
      if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) {
         encoding |= (mubuf.slc ? 1 : 0) << 22;
      }
      encoding |= instr->operands[2].physReg() << 24;
      encoding |= (mubuf.tfe ? 1 : 0) << 23;
      encoding |= (instr->operands[0].physReg() >> 2) << 16;
      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
                                                : instr->definitions[0].physReg();
      encoding |= (0xFF & reg) << 8;
      encoding |= (0xFF & instr->operands[1].physReg());
      out.push_back(encoding);
      break;
   }
   case Format::MTBUF: {
      MTBUF_instruction& mtbuf = instr->mtbuf();

      uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf.dfmt, mtbuf.nfmt);
      uint32_t encoding = (0b111010 << 26);
      assert(img_format <= 0x7F);
      assert(!mtbuf.dlc || ctx.chip_class >= GFX10);
      encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
      encoding |= (mtbuf.glc ? 1 : 0) << 14;
      encoding |= (mtbuf.idxen ? 1 : 0) << 13;
      encoding |= (mtbuf.offen ? 1 : 0) << 12;
      encoding |= 0x0FFF & mtbuf.offset;
      encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */

      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding |= opcode << 15;
      } else {
         encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
      }

      out.push_back(encoding);
      encoding = 0;

      encoding |= instr->operands[2].physReg() << 24;
      encoding |= (mtbuf.tfe ? 1 : 0) << 23;
      encoding |= (mtbuf.slc ? 1 : 0) << 22;
      encoding |= (instr->operands[0].physReg() >> 2) << 16;
      unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
                                                : instr->definitions[0].physReg();
      encoding |= (0xFF & reg) << 8;
      encoding |= (0xFF & instr->operands[1].physReg());

      if (ctx.chip_class >= GFX10) {
         encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */
      }

      out.push_back(encoding);
      break;
   }
   case Format::MIMG: {
      unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
      assert(!nsa_dwords || ctx.chip_class >= GFX10);

      MIMG_instruction& mimg = instr->mimg();
      uint32_t encoding = (0b111100 << 26);
      encoding |= mimg.slc ? 1 << 25 : 0;
      encoding |= (opcode & 0x7f) << 18;
      encoding |= (opcode >> 7) & 1;
      encoding |= mimg.lwe ? 1 << 17 : 0;
      encoding |= mimg.tfe ? 1 << 16 : 0;
      encoding |= mimg.glc ? 1 << 13 : 0;
      encoding |= mimg.unrm ? 1 << 12 : 0;
      if (ctx.chip_class <= GFX9) {
         assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
         assert(!mimg.r128);
         encoding |= mimg.a16 ? 1 << 15 : 0;
         encoding |= mimg.da ? 1 << 14 : 0;
      } else {
         encoding |= mimg.r128 ? 1 << 15
                               : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
         encoding |= nsa_dwords << 1;
         encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
         encoding |= mimg.dlc ? 1 << 7 : 0;
      }
      encoding |= (0xF & mimg.dmask) << 8;
      out.push_back(encoding);
      encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */
      if (!instr->definitions.empty()) {
         encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */
      } else if (!instr->operands[2].isUndefined()) {
         encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */
      }
      encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */
      if (!instr->operands[1].isUndefined())
         encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */

      assert(!mimg.d16 || ctx.chip_class >= GFX9);
      encoding |= mimg.d16 ? 1 << 31 : 0;
      if (ctx.chip_class >= GFX10) {
         /* GFX10: A16 still exists, but is in a different place */
         encoding |= mimg.a16 ? 1 << 30 : 0;
      }

      out.push_back(encoding);

      if (nsa_dwords) {
         /* pack the extra (non-sequential) address registers, 4 per dword */
         out.resize(out.size() + nsa_dwords);
         std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
         for (unsigned i = 0; i < instr->operands.size() - 4u; i++)
            nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
      }
      break;
   }
   case Format::FLAT:
   case Format::SCRATCH:
   case Format::GLOBAL: {
      FLAT_instruction& flat = instr->flatlike();
      uint32_t encoding = (0b110111 << 26);
      encoding |= opcode << 18;
      if (ctx.chip_class <= GFX9) {
         assert(flat.offset <= 0x1fff);
         encoding |= flat.offset & 0x1fff;
      } else if (instr->isFlat()) {
         /* GFX10 has a 12-bit immediate OFFSET field,
          * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug
          */
         assert(flat.offset == 0);
      } else {
         assert(flat.offset <= 0xfff);
         encoding |= flat.offset & 0xfff;
      }
      if (instr->isScratch())
         encoding |= 1 << 14;
      else if (instr->isGlobal())
         encoding |= 2 << 14;
      encoding |= flat.lds ? 1 << 13 : 0;
      encoding |= flat.glc ? 1 << 16 : 0;
      encoding |= flat.slc ? 1 << 17 : 0;
      if (ctx.chip_class >= GFX10) {
         assert(!flat.nv);
         encoding |= flat.dlc ? 1 << 12 : 0;
      } else {
         assert(!flat.dlc);
      }
      out.push_back(encoding);
      encoding = (0xFF & instr->operands[0].physReg());
      if (!instr->definitions.empty())
         encoding |= (0xFF & instr->definitions[0].physReg()) << 24;
      if (instr->operands.size() >= 3)
         encoding |= (0xFF & instr->operands[2].physReg()) << 8;
      if (!instr->operands[1].isUndefined()) {
         assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
         assert(instr->format != Format::FLAT);
         encoding |= instr->operands[1].physReg() << 16;
      } else if (instr->format != Format::FLAT ||
                 ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
         /* "no SADDR": 0x7F pre-GFX10, SGPR_NULL from GFX10 on */
         if (ctx.chip_class <= GFX9)
            encoding |= 0x7F << 16;
         else
            encoding |= sgpr_null << 16;
      }
      encoding |= flat.nv ? 1 << 23 : 0;
      out.push_back(encoding);
      break;
   }
   case Format::EXP: {
      Export_instruction& exp = instr->exp();
      uint32_t encoding;
      if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
         encoding = (0b110001 << 26);
      } else {
         encoding = (0b111110 << 26);
      }

      encoding |= exp.valid_mask ? 0b1 << 12 : 0;
      encoding |= exp.done ? 0b1 << 11 : 0;
      encoding |= exp.compressed ? 0b1 << 10 : 0;
      encoding |= exp.dest << 4;
      encoding |= exp.enabled_mask;
      out.push_back(encoding);
      encoding = 0xFF & exp.operands[0].physReg();
      encoding |= (0xFF & exp.operands[1].physReg()) << 8;
      encoding |= (0xFF & exp.operands[2].physReg()) << 16;
      encoding |= (0xFF & exp.operands[3].physReg()) << 24;
      out.push_back(encoding);
      break;
   }
   case Format::PSEUDO:
   case Format::PSEUDO_BARRIER:
      if (instr->opcode != aco_opcode::p_unit_test)
         unreachable("Pseudo instructions should be lowered before assembly.");
      break;
   default:
      if (instr->isVOP3()) {
         VOP3_instruction& vop3 = instr->vop3();

         /* VOP3-encoded VOP2/VOP1/VOPC/VINTRP opcodes live at fixed offsets
          * in the VOP3 opcode space */
         if (instr->isVOP2()) {
            opcode = opcode + 0x100;
         } else if (instr->isVOP1()) {
            if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9)
               opcode = opcode + 0x140;
            else
               opcode = opcode + 0x180;
         } else if (instr->isVOPC()) {
            opcode = opcode + 0x0;
         } else if (instr->isVINTRP()) {
            opcode = opcode + 0x270;
         }

         uint32_t encoding;
         if (ctx.chip_class <= GFX9) {
            encoding = (0b110100 << 26);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110101 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         if (ctx.chip_class <= GFX7) {
            encoding |= opcode << 17;
            encoding |= (vop3.clamp ? 1 : 0) << 11;
         } else {
            encoding |= opcode << 16;
            encoding |= (vop3.clamp ? 1 : 0) << 15;
         }
         encoding |= vop3.opsel << 11;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.abs[i] << (8 + i);
         if (instr->definitions.size() == 2)
            encoding |= instr->definitions[1].physReg() << 8;
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);
         encoding = 0;
         if (instr->opcode == aco_opcode::v_interp_mov_f32) {
            encoding = 0x3 & instr->operands[0].constantValue();
         } else if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
            encoding |= instr->operands[0].physReg() << 0;
            encoding |= instr->operands[1].physReg() << 9;
            /* Encoding src2 works fine with hardware but breaks some disassemblers. */
         } else {
            for (unsigned i = 0; i < instr->operands.size(); i++)
               encoding |= instr->operands[i].physReg() << (i * 9);
         }
         encoding |= vop3.omod << 27;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg[i] << (29 + i);
         out.push_back(encoding);

      } else if (instr->isVOP3P()) {
         VOP3P_instruction& vop3 = instr->vop3p();

         uint32_t encoding;
         if (ctx.chip_class == GFX9) {
            encoding = (0b110100111 << 23);
         } else if (ctx.chip_class >= GFX10) {
            encoding = (0b110011 << 26);
         } else {
            unreachable("Unknown chip_class.");
         }

         encoding |= opcode << 16;
         encoding |= (vop3.clamp ? 1 : 0) << 15;
         encoding |= vop3.opsel_lo << 11;
         /* opsel_hi is split: bit 2 goes in dword 0, bits 0-1 in dword 1 */
         encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg_hi[i] << (8 + i);
         encoding |= (0xFF & instr->definitions[0].physReg());
         out.push_back(encoding);
         encoding = 0;
         for (unsigned i = 0; i < instr->operands.size(); i++)
            encoding |= instr->operands[i].physReg() << (i * 9);
         encoding |= (vop3.opsel_hi & 0x3) << 27;
         for (unsigned i = 0; i < 3; i++)
            encoding |= vop3.neg_lo[i] << (29 + i);
         out.push_back(encoding);

      } else if (instr->isDPP()) {
         assert(ctx.chip_class >= GFX8);
         DPP_instruction& dpp = instr->dpp();

         /* first emit the instruction without the DPP operand */
         Operand dpp_op = instr->operands[0];
         instr->operands[0] = Operand(PhysReg{250}, v1); /* 250 = DPP source selector */
         instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
         emit_instruction(ctx, out, instr);
         uint32_t encoding = (0xF & dpp.row_mask) << 28;
         encoding |= (0xF & dpp.bank_mask) << 24;
         encoding |= dpp.abs[1] << 23;
         encoding |= dpp.neg[1] << 22;
         encoding |= dpp.abs[0] << 21;
         encoding |= dpp.neg[0] << 20;
         if (ctx.chip_class >= GFX10)
            encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
         encoding |= dpp.bound_ctrl << 19;
         encoding |= dpp.dpp_ctrl << 8;
         encoding |= (0xFF) & dpp_op.physReg();
         out.push_back(encoding);
         return;
      } else if (instr->isSDWA()) {
         SDWA_instruction& sdwa = instr->sdwa();

         /* first emit the instruction without the SDWA operand */
         Operand sdwa_op = instr->operands[0];
         instr->operands[0] = Operand(PhysReg{249}, v1); /* 249 = SDWA source selector */
         instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
         emit_instruction(ctx, out, instr);

         uint32_t encoding = 0;

         if (instr->isVOPC()) {
            if (instr->definitions[0].physReg() != vcc) {
               encoding |= instr->definitions[0].physReg() << 8;
               encoding |= 1 << 15;
            }
            encoding |= (sdwa.clamp ? 1 : 0) << 13;
         } else {
            encoding |= sdwa.dst_sel.to_sdwa_sel(instr->definitions[0].physReg().byte()) << 8;
            uint32_t dst_u = sdwa.dst_sel.sign_extend() ? 1 : 0;
            if (instr->definitions[0].bytes() < 4) /* dst_preserve */
               dst_u = 2;
            encoding |= dst_u << 11;
            encoding |= (sdwa.clamp ? 1 : 0) << 13;
            encoding |= sdwa.omod << 14;
         }

         encoding |= sdwa.sel[0].to_sdwa_sel(sdwa_op.physReg().byte()) << 16;
         encoding |= sdwa.sel[0].sign_extend() ? 1 << 19 : 0;
         encoding |= sdwa.abs[0] << 21;
         encoding |= sdwa.neg[0] << 20;

         if (instr->operands.size() >= 2) {
            encoding |= sdwa.sel[1].to_sdwa_sel(instr->operands[1].physReg().byte()) << 24;
            encoding |= sdwa.sel[1].sign_extend() ? 1 << 27 : 0;
            encoding |= sdwa.abs[1] << 29;
            encoding |= sdwa.neg[1] << 28;
         }

         encoding |= 0xFF & sdwa_op.physReg();
         /* regs < 256 are SGPRs; the S bits mark SGPR sources */
         encoding |= (sdwa_op.physReg() < 256) << 23;
         if (instr->operands.size() >= 2)
            encoding |= (instr->operands[1].physReg() < 256) << 31;
         out.push_back(encoding);
      } else {
         unreachable("unimplemented instruction format");
      }
      break;
   }

   /* append literal dword */
   for (const Operand& op : instr->operands) {
      if (op.isLiteral()) {
         out.push_back(op.constantValue());
         break;
      }
   }
}

/* Emit all instructions of one block in order. */
void
emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
{
   for (aco_ptr<Instruction>& instr : block.instructions) {
#if 0
      int start_idx = out.size();
      std::cerr << "Encoding:\t" << std::endl;
      aco_print_instr(&*instr, stderr);
      std::cerr << std::endl;
#endif
      emit_instruction(ctx, out, instr.get());
#if 0
      for (int i = start_idx; i < out.size(); i++)
         std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
#endif
   }
}

/* Set the DONE flag (and VM for non-VS/NGG stages) on the last relevant
 * export of each export-end block; abort if the shader exports nothing,
 * since that would hang the GPU. */
void
fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
{
   bool exported = false;
   for (Block& block : program->blocks) {
      if (!(block.kind & block_kind_export_end))
         continue;
      /* scan backwards for the last export; stop early at an exec write */
      std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
      while (it != block.instructions.rend()) {
         if ((*it)->isEXP()) {
            Export_instruction& exp = (*it)->exp();
            if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
               if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) {
                  exp.done = true;
                  exported = true;
                  break;
               }
            } else {
               exp.done = true;
               exp.valid_mask = true;
               exported = true;
               break;
            }
         } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
            break;
         ++it;
      }
   }

   if (!exported) {
      /* Abort in order to avoid a GPU hang. */
      bool is_vertex_or_ngg =
         (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
      aco_err(program,
              "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
      aco_print_program(program, stderr);
      abort();
   }
}

/* Insert |insert_count| dwords at |insert_before| and shift every recorded
 * position (block offsets, branch locations, constaddr fixups) that lies at
 * or after the insertion point. */
static void
insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
            unsigned insert_count, const uint32_t* insert_data)
{
   out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);

   /* Update the offset of each affected block */
   for (Block& block : ctx.program->blocks) {
      if (block.offset >= insert_before)
         block.offset += insert_count;
   }

   /* Find first branch after the inserted code */
   auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
                                 [insert_before](const auto& branch) -> bool
                                 { return (unsigned)branch.first >= insert_before; });

   /* Update the locations of branches */
   for (; branch_it != ctx.branches.end(); ++branch_it)
      branch_it->first += insert_count;

   /* Update the locations of p_constaddr instructions */
   for (auto& constaddr : ctx.constaddrs) {
      constaddr_info& info = constaddr.second;
      if (info.getpc_end >= insert_before)
         info.getpc_end += insert_count;
      if (info.add_literal >= insert_before)
         info.add_literal += insert_count;
   }
}

static void
fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
{
   /* Branches with an offset of 0x3f are buggy on GFX10,
    * we workaround by inserting NOPs if needed.
    */
   bool gfx10_3f_bug = false;

   do {
      auto buggy_branch_it = std::find_if(
         ctx.branches.begin(), ctx.branches.end(),
         [&ctx](const auto& branch) -> bool {
            return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
                   0x3f;
         });

      gfx10_3f_bug = buggy_branch_it != ctx.branches.end();

      if (gfx10_3f_bug) {
         /* Insert an s_nop after the branch */
         constexpr uint32_t s_nop_0 = 0xbf800000u;
         insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);
      }
   } while (gfx10_3f_bug);
}

/* Emit a "long jump" sequence replacing a branch whose target is out of the
 * signed 16-bit SOPP range: (optionally skip on the inverted condition,)
 * s_getpc_b64, add a literal displacement (patched in fix_branches(), with
 * SCC stashed in the PC's LSB), restore SCC, then s_setpc_b64. */
void
emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
               std::vector<uint32_t>& out)
{
   Builder bld(ctx.program);

   Definition def_tmp_lo(branch->definitions[0].physReg(), s1);
   Operand op_tmp_lo(branch->definitions[0].physReg(), s1);
   Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
   Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);

   aco_ptr<Instruction> instr;

   if (branch->opcode != aco_opcode::s_branch) {
      /* for conditional branches, skip the long jump if the condition is false */
      aco_opcode inv;
      switch (branch->opcode) {
      case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
      case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
      case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
      case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
      case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
      case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
      default: unreachable("Unhandled long jump.");
      }
      instr.reset(bld.sopp(inv, -1, 7));
      emit_instruction(ctx, out, instr.get());
   }

   /* create the new PC and stash SCC in the LSB */
   instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);
   emit_instruction(ctx, out, instr.get());

   instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
   instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */
   emit_instruction(ctx, out, instr.get());
   /* pass_flags records where the literal is so fix_branches() can patch it */
   branch->pass_flags = out.size();

   instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi,
                        Operand::c32(backwards ? UINT32_MAX : 0u))
                  .instr);
   emit_instruction(ctx, out, instr.get());

   /* restore SCC and clear the LSB of the new PC */
   instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
   emit_instruction(ctx, out, instr.get());
   instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr);
   emit_instruction(ctx, out, instr.get());

   /* create the s_setpc_b64 to jump */
   instr.reset(
      bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
   emit_instruction(ctx, out, instr.get());
}

/* Resolve every recorded branch to its block's final offset; branches whose
 * displacement overflows the signed 16-bit field are rewritten as long jumps
 * (which shifts code, so the loop restarts until no more overflow). */
void
fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
{
   bool repeat = false;
   do {
      repeat = false;

      if (ctx.chip_class == GFX10)
         fix_branches_gfx10(ctx, out);

      for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
         int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
         if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
            std::vector<uint32_t> long_jump;
            bool backwards =
               ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
            emit_long_jump(ctx, branch.second, backwards, long_jump);

            out[branch.first] = long_jump[0];
            insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);

            repeat = true;
            break;
         }

         if (branch.second->pass_flags) {
            /* already a long jump: patch the s_addc_u32 literal (byte offset
             * relative to the PC after the s_getpc_b64) */
            int after_getpc = branch.first + branch.second->pass_flags - 2;
            offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;
            out[branch.first + branch.second->pass_flags - 1] = offset * 4;
         } else {
            /* regular branch: rewrite the 16-bit immediate */
            out[branch.first] &= 0xffff0000u;
            out[branch.first] |= (uint16_t)offset;
         }
      }
   } while (repeat);
}

/* Patch each p_constaddr literal with the byte distance from the end of its
 * s_getpc_b64 to the constant data appended at the end of the code. */
void
fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
{
   for (auto& constaddr : ctx.constaddrs) {
      constaddr_info& info = constaddr.second;
      out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
   }
}

/* Assemble the whole program into |code| and return the executable size in
 * bytes (excluding the GFX10 s_code_end padding and the constant data that
 * are appended afterwards). */
unsigned
emit_program(Program* program, std::vector<uint32_t>& code)
{
   asm_context ctx(program);

   if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
       program->stage.hw == HWStage::NGG)
      fix_exports(ctx, code, program);

   for (Block& block : program->blocks) {
      block.offset = code.size();
      emit_block(ctx, code, block);
   }

   fix_branches(ctx, code);

   unsigned exec_size = code.size() * sizeof(uint32_t);

   if (program->chip_class >= GFX10) {
      /* Pad output with s_code_end so instruction prefetching doesn't cause
       * page faults */
      unsigned final_size = align(code.size() + 3 * 16, 16);
      while (code.size() < final_size)
         code.push_back(0xbf9f0000u);
   }

   fix_constaddrs(ctx, code);

   /* pad constant data to a dword boundary before appending it */
   while (program->constant_data.size() % 4u)
      program->constant_data.push_back(0);
   /* Copy constant data */
   code.insert(code.end(), (uint32_t*)program->constant_data.data(),
               (uint32_t*)(program->constant_data.data() + program->constant_data.size()));

   return exec_size;
}

} // namespace aco