disassemble.c revision b8e80941
1/* 2 * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com> 3 * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com> 4 * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 24 */ 25 26#include <stdbool.h> 27#include <stdio.h> 28#include <stdint.h> 29#include <assert.h> 30#include <inttypes.h> 31#include <string.h> 32 33#include "bifrost.h" 34#include "disassemble.h" 35#include "util/macros.h" 36 37// return bits (high, lo] 38static uint64_t bits(uint32_t word, unsigned lo, unsigned high) 39{ 40 if (high == 32) 41 return word >> lo; 42 return (word & ((1 << high) - 1)) >> lo; 43} 44 45// each of these structs represents an instruction that's dispatched in one 46// cycle. Note that these instructions are packed in funny ways within the 47// clause, hence the need for a separate struct. 48struct bifrost_alu_inst { 49 uint32_t fma_bits; 50 uint32_t add_bits; 51 uint64_t reg_bits; 52}; 53 54struct bifrost_regs { 55 unsigned uniform_const : 8; 56 unsigned reg2 : 6; 57 unsigned reg3 : 6; 58 unsigned reg0 : 5; 59 unsigned reg1 : 6; 60 unsigned ctrl : 4; 61}; 62 63static unsigned get_reg0(struct bifrost_regs regs) 64{ 65 if (regs.ctrl == 0) 66 return regs.reg0 | ((regs.reg1 & 0x1) << 5); 67 68 return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0; 69} 70 71static unsigned get_reg1(struct bifrost_regs regs) 72{ 73 return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; 74} 75 76enum bifrost_reg_write_unit { 77 REG_WRITE_NONE = 0, // don't write 78 REG_WRITE_TWO, // write using reg2 79 REG_WRITE_THREE, // write using reg3 80}; 81 82// this represents the decoded version of the ctrl register field. 83struct bifrost_reg_ctrl{ 84 bool read_reg0; 85 bool read_reg1; 86 bool read_reg3; 87 enum bifrost_reg_write_unit fma_write_unit; 88 enum bifrost_reg_write_unit add_write_unit; 89 bool clause_start; 90}; 91 92enum fma_src_type { 93 FMA_ONE_SRC, 94 FMA_TWO_SRC, 95 FMA_FADD, 96 FMA_FMINMAX, 97 FMA_FADD16, 98 FMA_FMINMAX16, 99 FMA_FCMP, 100 FMA_FCMP16, 101 FMA_THREE_SRC, 102 FMA_FMA, 103 FMA_FMA16, 104 FMA_FOUR_SRC, 105 FMA_FMA_MSCALE, 106 FMA_SHIFT_ADD64, 107}; 108 109struct fma_op_info { 110 unsigned op; 111 char name[30]; 112 enum fma_src_type src_type; 113}; 114 115enum add_src_type { 116 ADD_ONE_SRC, 117 ADD_TWO_SRC, 118 ADD_FADD, 119 ADD_FMINMAX, 120 ADD_FADD16, 121 ADD_FMINMAX16, 122 ADD_THREE_SRC, 123 ADD_FADDMscale, 124 ADD_FCMP, 125 ADD_FCMP16, 126 ADD_TEX_COMPACT, // texture instruction with embedded sampler 127 ADD_TEX, // texture instruction with sampler/etc. in uniform port 128 ADD_VARYING_INTERP, 129 ADD_BLENDING, 130 ADD_LOAD_ATTR, 131 ADD_VARYING_ADDRESS, 132 ADD_BRANCH, 133}; 134 135struct add_op_info { 136 unsigned op; 137 char name[30]; 138 enum add_src_type src_type; 139 bool has_data_reg; 140}; 141 142struct bifrost_tex_ctrl { 143 unsigned sampler_index : 4; // also used to signal indirects 144 unsigned tex_index : 7; 145 bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices 146 bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather) 147 unsigned unk0 : 2; 148 bool texel_offset : 1; // *Offset() 149 bool is_shadow : 1; 150 bool is_array : 1; 151 unsigned tex_type : 2; // 2D, 3D, Cube, Buffer 152 bool compute_lod : 1; // 0 for *Lod() 153 bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied 154 bool calc_gradients : 1; // 0 for *Grad() 155 unsigned unk1 : 1; 156 unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits? 157 unsigned unk2 : 4; 158}; 159 160struct bifrost_dual_tex_ctrl { 161 unsigned sampler_index0 : 2; 162 unsigned unk0 : 2; 163 unsigned tex_index0 : 2; 164 unsigned sampler_index1 : 2; 165 unsigned tex_index1 : 2; 166 unsigned unk1 : 22; 167}; 168 169enum branch_cond { 170 BR_COND_LT = 0, 171 BR_COND_LE = 1, 172 BR_COND_GE = 2, 173 BR_COND_GT = 3, 174 // Equal vs. not-equal determined by src0/src1 comparison 175 BR_COND_EQ = 4, 176 // floating-point comparisons 177 // Becomes UNE when you flip the arguments 178 BR_COND_OEQ = 5, 179 // TODO what happens when you flip the arguments? 180 BR_COND_OGT = 6, 181 BR_COND_OLT = 7, 182}; 183 184enum branch_bit_size { 185 BR_SIZE_32 = 0, 186 BR_SIZE_16XX = 1, 187 BR_SIZE_16YY = 2, 188 // For the above combinations of bitsize and location, an extra bit is 189 // encoded via comparing the sources. The only possible source of ambiguity 190 // would be if the sources were the same, but then the branch condition 191 // would be always true or always false anyways, so we can ignore it. But 192 // this no longer works when comparing the y component to the x component, 193 // since it's valid to compare the y component of a source against its own 194 // x component. Instead, the extra bit is encoded via an extra bitsize. 195 BR_SIZE_16YX0 = 3, 196 BR_SIZE_16YX1 = 4, 197 BR_SIZE_32_AND_16X = 5, 198 BR_SIZE_32_AND_16Y = 6, 199 // Used for comparisons with zero and always-true, see below. I think this 200 // only works for integer comparisons. 201 BR_SIZE_ZERO = 7, 202}; 203 204enum branch_code { 205 BR_ALWAYS = 63, 206}; 207 208void dump_header(struct bifrost_header header, bool verbose); 209void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, 210 unsigned data_reg, unsigned offset, bool verbose); 211bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose); 212 213void dump_header(struct bifrost_header header, bool verbose) { 214 if (header.clause_type != 0) { 215 printf("id(%du) ", header.scoreboard_index); 216 } 217 218 if (header.scoreboard_deps != 0) { 219 printf("next-wait("); 220 bool first = true; 221 for (unsigned i = 0; i < 8; i++) { 222 if (header.scoreboard_deps & (1 << i)) { 223 if (!first) { 224 printf(", "); 225 } 226 printf("%d", i); 227 first = false; 228 } 229 } 230 printf(") "); 231 } 232 233 if (header.datareg_writebarrier) 234 printf("data-reg-barrier "); 235 236 if (!header.no_end_of_shader) 237 printf("eos "); 238 239 if (!header.back_to_back) { 240 printf("nbb "); 241 if (header.branch_cond) 242 printf("branch-cond "); 243 else 244 printf("branch-uncond "); 245 } 246 247 if (header.elide_writes) 248 printf("we "); 249 250 if (header.suppress_inf) 251 printf("suppress-inf "); 252 if (header.suppress_nan) 253 printf("suppress-nan "); 254 255 if (header.unk0) 256 printf("unk0 "); 257 if (header.unk1) 258 printf("unk1 "); 259 if (header.unk2) 260 printf("unk2 "); 261 if (header.unk3) 262 printf("unk3 "); 263 if (header.unk4) 264 printf("unk4 "); 265 266 printf("\n"); 267 268 if (verbose) { 269 printf("# clause type %d, next clause type %d\n", 270 header.clause_type, header.next_clause_type); 271 } 272} 273 274static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) 275{ 276 struct bifrost_reg_ctrl decoded = {}; 277 unsigned ctrl; 278 if (regs.ctrl == 0) { 279 ctrl = regs.reg1 >> 2; 280 decoded.read_reg0 = !(regs.reg1 & 0x2); 281 decoded.read_reg1 = false; 282 } else { 283 ctrl = regs.ctrl; 284 decoded.read_reg0 = decoded.read_reg1 = true; 285 } 286 switch (ctrl) { 287 case 1: 288 decoded.fma_write_unit = REG_WRITE_TWO; 289 break; 290 case 3: 291 decoded.fma_write_unit = REG_WRITE_TWO; 292 decoded.read_reg3 = true; 293 break; 294 case 4: 295 decoded.read_reg3 = true; 296 break; 297 case 5: 298 decoded.add_write_unit = REG_WRITE_TWO; 299 break; 300 case 6: 301 decoded.add_write_unit = REG_WRITE_TWO; 302 decoded.read_reg3 = true; 303 break; 304 case 8: 305 decoded.clause_start = true; 306 break; 307 case 9: 308 decoded.fma_write_unit = REG_WRITE_TWO; 309 decoded.clause_start = true; 310 break; 311 case 11: 312 break; 313 case 12: 314 decoded.read_reg3 = true; 315 decoded.clause_start = true; 316 break; 317 case 13: 318 decoded.add_write_unit = REG_WRITE_TWO; 319 decoded.clause_start = true; 320 break; 321 case 15: 322 decoded.fma_write_unit = REG_WRITE_THREE; 323 decoded.add_write_unit = REG_WRITE_TWO; 324 break; 325 default: 326 printf("# unknown reg ctrl %d\n", ctrl); 327 } 328 329 return decoded; 330} 331 332// Pass in the add_write_unit or fma_write_unit, and this returns which register 333// the ADD/FMA units are writing to 334static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs) 335{ 336 switch (unit) { 337 case REG_WRITE_TWO: 338 return regs.reg2; 339 case REG_WRITE_THREE: 340 return regs.reg3; 341 default: /* REG_WRITE_NONE */ 342 assert(0); 343 return 0; 344 } 345} 346 347static void dump_regs(struct bifrost_regs srcs) 348{ 349 struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs); 350 printf("# "); 351 if (ctrl.read_reg0) 352 printf("port 0: R%d ", get_reg0(srcs)); 353 if (ctrl.read_reg1) 354 printf("port 1: R%d ", get_reg1(srcs)); 355 356 if (ctrl.fma_write_unit == REG_WRITE_TWO) 357 printf("port 2: R%d (write FMA) ", srcs.reg2); 358 else if (ctrl.add_write_unit == REG_WRITE_TWO) 359 printf("port 2: R%d (write ADD) ", srcs.reg2); 360 361 if (ctrl.fma_write_unit == REG_WRITE_THREE) 362 printf("port 3: R%d (write FMA) ", srcs.reg3); 363 else if (ctrl.add_write_unit == REG_WRITE_THREE) 364 printf("port 3: R%d (write ADD) ", srcs.reg3); 365 else if (ctrl.read_reg3) 366 printf("port 3: R%d (read) ", srcs.reg3); 367 368 if (srcs.uniform_const) { 369 if (srcs.uniform_const & 0x80) { 370 printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2); 371 } 372 } 373 374 printf("\n"); 375} 376static void dump_const_imm(uint32_t imm) 377{ 378 union { 379 float f; 380 uint32_t i; 381 } fi; 382 fi.i = imm; 383 printf("0x%08x /* %f */", imm, fi.f); 384} 385 386static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs) 387{ 388 unsigned low_bits = srcs.uniform_const & 0xf; 389 uint64_t imm; 390 switch (srcs.uniform_const >> 4) { 391 case 4: imm = consts[0]; break; 392 case 5: imm = consts[1]; break; 393 case 6: imm = consts[2]; break; 394 case 7: imm = consts[3]; break; 395 case 2: imm = consts[4]; break; 396 case 3: imm = consts[5]; break; 397 default: assert(0); break; 398 } 399 return imm | low_bits; 400} 401 402static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32) 403{ 404 if (srcs.uniform_const & 0x80) { 405 unsigned uniform = (srcs.uniform_const & 0x7f) * 2; 406 printf("U%d", uniform + (high32 ? 1 : 0)); 407 } else if (srcs.uniform_const >= 0x20) { 408 uint64_t imm = get_const(consts, srcs); 409 if (high32) 410 dump_const_imm(imm >> 32); 411 else 412 dump_const_imm(imm); 413 } else { 414 switch (srcs.uniform_const) { 415 case 0: printf("0"); break; 416 case 5: printf("atest-data"); break; 417 case 6: printf("sample-ptr"); break; 418 case 8: 419 case 9: 420 case 10: 421 case 11: 422 case 12: 423 case 13: 424 case 14: 425 case 15: 426 printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); 427 break; 428 default: 429 printf("unkConst%u", (unsigned) srcs.uniform_const); 430 break; 431 } 432 433 if (high32) 434 printf(".y"); 435 else 436 printf(".x"); 437 } 438} 439 440static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) 441{ 442 switch (src) { 443 case 0: printf("R%d", get_reg0(srcs)); break; 444 case 1: printf("R%d", get_reg1(srcs)); break; 445 case 2: printf("R%d", srcs.reg3); break; 446 case 3: 447 if (isFMA) 448 printf("0"); 449 else 450 printf("T"); // i.e. the output of FMA this cycle 451 break; 452 case 4: 453 dump_uniform_const_src(srcs, consts, false); 454 break; 455 case 5: 456 dump_uniform_const_src(srcs, consts, true); 457 break; 458 case 6: printf("T0"); break; 459 case 7: printf("T1"); break; 460 } 461} 462 463static void dump_output_mod(unsigned mod) 464{ 465 switch (mod) { 466 case 0: 467 break; 468 case 1: 469 printf(".clamp_0_inf"); break; // max(out, 0) 470 case 2: 471 printf(".clamp_m1_1"); break; // clamp(out, -1, 1) 472 case 3: 473 printf(".clamp_0_1"); break; // clamp(out, 0, 1) 474 default: 475 break; 476 } 477} 478 479static void dump_minmax_mode(unsigned mod) 480{ 481 switch (mod) { 482 case 0: 483 /* Same as fmax() and fmin() -- return the other number if any 484 * number is NaN. Also always return +0 if one argument is +0 and 485 * the other is -0. 486 */ 487 break; 488 case 1: 489 /* Instead of never returning a NaN, always return one. The 490 * "greater"/"lesser" NaN is always returned, first by checking the 491 * sign and then the mantissa bits. 492 */ 493 printf(".nan_wins"); break; 494 case 2: 495 /* For max, implement src0 > src1 ? src0 : src1 496 * For min, implement src0 < src1 ? src0 : src1 497 * 498 * This includes handling NaN's and signedness of 0 differently 499 * from above, since +0 and -0 compare equal and comparisons always 500 * return false for NaN's. As a result, this mode is *not* 501 * commutative. 502 */ 503 printf(".src1_wins"); break; 504 case 3: 505 /* For max, implement src0 < src1 ? src1 : src0 506 * For min, implement src0 > src1 ? src1 : src0 507 */ 508 printf(".src0_wins"); break; 509 default: 510 break; 511 } 512} 513 514static void dump_round_mode(unsigned mod) 515{ 516 switch (mod) { 517 case 0: 518 /* roundTiesToEven, the IEEE default. */ 519 break; 520 case 1: 521 /* roundTowardPositive in the IEEE spec. */ 522 printf(".round_pos"); break; 523 case 2: 524 /* roundTowardNegative in the IEEE spec. */ 525 printf(".round_neg"); break; 526 case 3: 527 /* roundTowardZero in the IEEE spec. */ 528 printf(".round_zero"); break; 529 default: 530 break; 531 } 532} 533 534static const struct fma_op_info FMAOpInfos[] = { 535 { 0x00000, "FMA.f32", FMA_FMA }, 536 { 0x40000, "MAX.f32", FMA_FMINMAX }, 537 { 0x44000, "MIN.f32", FMA_FMINMAX }, 538 { 0x48000, "FCMP.GL", FMA_FCMP }, 539 { 0x4c000, "FCMP.D3D", FMA_FCMP }, 540 { 0x4ff98, "ADD.i32", FMA_TWO_SRC }, 541 { 0x4ffd8, "SUB.i32", FMA_TWO_SRC }, 542 { 0x4fff0, "SUBB.i32", FMA_TWO_SRC }, 543 { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE }, 544 { 0x58000, "ADD.f32", FMA_FADD }, 545 { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC }, 546 { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC }, 547 { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC }, 548 { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC }, 549 { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC }, 550 { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC }, 551 { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC }, 552 { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC }, 553 { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC }, 554 { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC }, 555 { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC }, 556 { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC }, 557 { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC }, 558 { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0 559 { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC }, 560 { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC }, 561 { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC }, 562 { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC }, 563 { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0 564 { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC }, 565 { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC }, 566 { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC }, 567 { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC }, 568 { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC }, 569 { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC }, 570 { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC }, 571 { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC }, 572 { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC }, 573 { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC }, 574 { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1) 575 { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1) 576 { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC }, 577 { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC }, 578 { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1 579 { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1 580 { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1 581 { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC }, 582 { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC }, 583 { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC }, 584 { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC }, 585 { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC }, 586 { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) 587 { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) 588 { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC }, 589 { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC }, 590 { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) 591 { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) 592 { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC }, 593 { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1 594 { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2) 595 { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC }, 596 { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC }, 597 { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC }, 598 { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC }, 599 { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC }, 600 { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC }, 601 { 0x80000, "FMA.v2f16", FMA_FMA16 }, 602 { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 }, 603 { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 }, 604 { 0xc8000, "FCMP.GL", FMA_FCMP16 }, 605 { 0xcc000, "FCMP.D3D", FMA_FCMP16 }, 606 { 0xcf900, "ADD.v2i16", FMA_TWO_SRC }, 607 { 0xcfc10, "ADDC.i32", FMA_TWO_SRC }, 608 { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC }, 609 { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC }, 610 { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC }, 611 { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC }, 612 { 0xd8000, "ADD.v2f16", FMA_FADD16 }, 613 { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC }, 614 { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC }, 615 { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC }, 616 { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC }, 617 { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC }, 618 { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC }, 619 { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC }, 620 { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC }, 621 { 0xdd000, "F32_TO_F16", FMA_TWO_SRC }, 622 { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC }, 623 { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC }, 624 { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC }, 625 { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC }, 626 { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC }, 627 { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC }, 628 { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC }, 629 { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC }, 630 { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC }, 631 { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC }, 632 { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC }, 633 { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC }, 634 { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC }, 635 { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC }, 636 { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC }, 637 { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC }, 638 { 0xe0136, "F32_TO_I32", FMA_ONE_SRC }, 639 { 0xe0137, "F32_TO_U32", FMA_ONE_SRC }, 640 { 0xe0178, "I32_TO_F32", FMA_ONE_SRC }, 641 { 0xe0179, "U32_TO_F32", FMA_ONE_SRC }, 642 { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC }, 643 { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC }, 644 { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC }, 645 { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC }, 646 { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC }, 647 { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC }, 648 { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC }, 649 { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC }, 650 { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC }, 651 { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC }, 652 { 0xe032c, "NOP", FMA_ONE_SRC }, 653 { 0xe032d, "MOV", FMA_ONE_SRC }, 654 { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC }, 655 // From the ARM patent US20160364209A1: 656 // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, 657 // and x1 is a floating point value in a predetermined range where the 658 // value 1 is within the range and not at one extremity of the range (e.g. 659 // choose a range where 1 is towards middle of range)." 660 // 661 // This computes x1. 662 { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC }, 663 // Given a floating point number m * 2^e, returns m * 2^{-1}. This is 664 // exactly the same as the mantissa part of frexp(). 665 { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC }, 666 // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even, 667 // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until 668 // within the range [0.25, 1). Used for square-root and reciprocal 669 // square-root. 670 { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC }, 671 // Given a floating point number m * 2^e, computes -e - 1 as an integer. 672 // Zero and infinity/NaN return 0. 673 { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC }, 674 // Computes floor(e/2) + 1. 675 { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC }, 676 // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an 677 // integer. 678 { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC }, 679 { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC }, 680 { 0xe0b80, "IMAX3", FMA_THREE_SRC }, 681 { 0xe0bc0, "UMAX3", FMA_THREE_SRC }, 682 { 0xe0c00, "IMIN3", FMA_THREE_SRC }, 683 { 0xe0c40, "UMIN3", FMA_THREE_SRC }, 684 { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0 685 { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment 686 { 0xe1845, "CEIL", FMA_ONE_SRC }, 687 { 0xe1885, "FLOOR", FMA_ONE_SRC }, 688 { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC }, 689 { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC }, 690 // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32 691 // in the ADD slot, allow one to do a 64-bit addition with an extra small 692 // shift on one of the sources. There are three possible scenarios: 693 // 694 // 1) Full 64-bit addition. Do: 695 // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift 696 // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y 697 // 698 // The shift amount is applied to src2 before adding. The shift amount, and 699 // any extra bits from src2 plus the overflow bit, are sent directly from 700 // FMA to ADD instead of being passed explicitly. Hence, these two must be 701 // bundled together into the same instruction. 702 // 703 // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. Do: 704 // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift 705 // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 706 // 707 // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is 708 // ignored, so it can actually be anything. As before, the shift is applied 709 // to src2 before adding. 710 // 711 // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do: 712 // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift 713 // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0 714 // 715 // The only difference is the .i32 instead of .u32. Otherwise, this is 716 // exactly the same as before. 717 // 718 // In all these instructions, the shift amount is stored where the third 719 // source would be, so the shift has to be a small immediate from 0 to 7. 720 // This is fine for the expected use-case of these instructions, which is 721 // manipulating 64-bit pointers. 722 // 723 // These instructions can also be combined with various load/store 724 // instructions which normally take a 64-bit pointer in order to add a 725 // 32-bit or 64-bit offset to the pointer before doing the operation, 726 // optionally shifting the offset. The load/store op implicity does 727 // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset 728 // the desired offset, the cases go as follows: 729 // 730 // 1) Add a 64-bit offset: 731 // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift 732 // ld_st_op ptr.y, offset.y, ... 733 // 734 // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being 735 // implicitly sent to the load/store op to serve as the low 32 bits of the 736 // pointer. 737 // 738 // 2) Add a 32-bit unsigned offset: 739 // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift 740 // ld_st_op temp, ptr.y, ... 741 // 742 // Now, the low 32 bits of offset << shift + ptr are passed explicitly to 743 // the ld_st_op, to match the case where there is no offset and ld_st_op is 744 // called directly. 745 // 746 // 3) Add a 32-bit signed offset: 747 // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift 748 // ld_st_op temp, ptr.y, ... 749 // 750 // Again, the same as the unsigned case except for the offset. 751 { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 }, 752 { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 }, 753 { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 }, 754 { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC }, 755 { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC }, 756 { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC }, 757 { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC }, 758 { 0xe7800, "IMAD", FMA_THREE_SRC }, 759 { 0xe78db, "POPCNT", FMA_ONE_SRC }, 760}; 761 762static struct fma_op_info find_fma_op_info(unsigned op) 763{ 764 for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) { 765 unsigned opCmp = ~0; 766 switch (FMAOpInfos[i].src_type) { 767 case FMA_ONE_SRC: 768 opCmp = op; 769 break; 770 case FMA_TWO_SRC: 771 opCmp = op & ~0x7; 772 break; 773 case FMA_FCMP: 774 case FMA_FCMP16: 775 opCmp = op & ~0x1fff; 776 break; 777 case FMA_THREE_SRC: 778 case FMA_SHIFT_ADD64: 779 opCmp = op & ~0x3f; 780 break; 781 case FMA_FADD: 782 case FMA_FMINMAX: 783 case FMA_FADD16: 784 case FMA_FMINMAX16: 785 opCmp = op & ~0x3fff; 786 break; 787 case FMA_FMA: 788 case FMA_FMA16: 789 opCmp = op & ~0x3ffff; 790 break; 791 case FMA_FOUR_SRC: 792 opCmp = op & ~0x1ff; 793 break; 794 case FMA_FMA_MSCALE: 795 opCmp = op & ~0x7fff; 796 break; 797 default: 798 opCmp = ~0; 799 break; 800 } 801 if (FMAOpInfos[i].op == opCmp) 802 return FMAOpInfos[i]; 803 } 804 805 struct fma_op_info info; 806 snprintf(info.name, sizeof(info.name), "op%04x", op); 807 info.op = op; 808 info.src_type = FMA_THREE_SRC; 809 return info; 810} 811 812static void dump_fcmp(unsigned op) 813{ 814 switch (op) { 815 case 0: 816 printf(".OEQ"); 817 break; 818 case 1: 819 printf(".OGT"); 820 break; 821 case 2: 822 printf(".OGE"); 823 break; 824 case 3: 825 printf(".UNE"); 826 break; 827 case 4: 828 printf(".OLT"); 829 break; 830 case 5: 831 printf(".OLE"); 832 break; 833 default: 834 printf(".unk%d", op); 835 break; 836 } 837} 838 839static void dump_16swizzle(unsigned swiz) 840{ 841 if (swiz == 2) 842 return; 843 printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]); 844} 845 846static void dump_fma_expand_src0(unsigned ctrl) 847{ 848 switch (ctrl) { 849 case 3: 850 case 4: 851 case 6: 852 printf(".x"); 853 break; 854 case 5: 855 case 7: 856 printf(".y"); 857 break; 858 case 0: 859 case 1: 860 case 2: 861 break; 862 default: 863 printf(".unk"); 864 break; 865 } 866} 867 868static void dump_fma_expand_src1(unsigned ctrl) 869{ 870 switch (ctrl) { 871 case 1: 872 case 3: 873 printf(".x"); 874 break; 875 case 2: 876 case 4: 877 case 5: 878 printf(".y"); 879 break; 880 case 0: 881 case 6: 882 case 7: 883 break; 884 default: 885 printf(".unk"); 886 break; 887 } 888} 889 890static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose) 891{ 892 if (verbose) { 893 printf("# FMA: %016" PRIx64 "\n", word); 894 } 895 struct bifrost_fma_inst FMA; 896 memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst)); 897 struct fma_op_info info = find_fma_op_info(FMA.op); 898 899 printf("%s", info.name); 900 if (info.src_type == FMA_FADD || 901 info.src_type == FMA_FMINMAX || 902 info.src_type == FMA_FMA || 903 info.src_type == FMA_FADD16 || 904 info.src_type == FMA_FMINMAX16 || 905 info.src_type == FMA_FMA16) { 906 dump_output_mod(bits(FMA.op, 12, 14)); 907 switch (info.src_type) { 908 case FMA_FADD: 909 case FMA_FMA: 910 case FMA_FADD16: 911 case FMA_FMA16: 912 dump_round_mode(bits(FMA.op, 10, 12)); 913 break; 914 case FMA_FMINMAX: 915 case FMA_FMINMAX16: 916 dump_minmax_mode(bits(FMA.op, 10, 12)); 917 break; 918 default: 919 assert(0); 920 } 921 } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) { 922 dump_fcmp(bits(FMA.op, 10, 13)); 923 if (info.src_type == FMA_FCMP) 924 printf(".f32"); 925 else 926 printf(".v2f16"); 927 } else if (info.src_type == FMA_FMA_MSCALE) { 928 if (FMA.op & (1 << 11)) { 929 switch ((FMA.op >> 9) & 0x3) { 930 case 0: 931 /* This mode seems to do a few things: 932 * - Makes 0 * infinity (and incidentally 0 * nan) return 0, 933 * since generating a nan would poison the result of 934 * 1/infinity and 1/0. 935 * - Fiddles with which nan is returned in nan * nan, 936 * presumably to make sure that the same exact nan is 937 * returned for 1/nan. 938 */ 939 printf(".rcp_mode"); 940 break; 941 case 3: 942 /* Similar to the above, but src0 always wins when multiplying 943 * 0 by infinity. 944 */ 945 printf(".sqrt_mode"); 946 break; 947 default: 948 printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); 949 } 950 } else { 951 dump_output_mod(bits(FMA.op, 9, 11)); 952 } 953 } 954 955 printf(" "); 956 957 struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); 958 if (next_ctrl.fma_write_unit != REG_WRITE_NONE) { 959 printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs)); 960 } else { 961 printf("T0, "); 962 } 963 964 switch (info.src_type) { 965 case FMA_ONE_SRC: 966 dump_src(FMA.src0, regs, consts, true); 967 break; 968 case FMA_TWO_SRC: 969 dump_src(FMA.src0, regs, consts, true); 970 printf(", "); 971 dump_src(FMA.op & 0x7, regs, consts, true); 972 break; 973 case FMA_FADD: 974 case FMA_FMINMAX: 975 if (FMA.op & 0x10) 976 printf("-"); 977 if (FMA.op & 0x200) 978 printf("abs("); 979 dump_src(FMA.src0, regs, consts, true); 980 dump_fma_expand_src0((FMA.op >> 6) & 0x7); 981 if (FMA.op & 0x200) 982 printf(")"); 983 printf(", "); 984 if (FMA.op & 0x20) 985 printf("-"); 986 if (FMA.op & 0x8) 987 printf("abs("); 988 dump_src(FMA.op & 0x7, regs, consts, true); 989 dump_fma_expand_src1((FMA.op >> 6) & 0x7); 990 if (FMA.op & 0x8) 991 printf(")"); 992 break; 993 case FMA_FADD16: 994 case FMA_FMINMAX16: { 995 bool abs1 = FMA.op & 0x8; 996 bool abs2 = (FMA.op & 0x7) < FMA.src0; 997 if (FMA.op & 0x10) 998 printf("-"); 999 if (abs1 || abs2) 1000 printf("abs("); 1001 dump_src(FMA.src0, regs, consts, true); 1002 dump_16swizzle((FMA.op >> 6) & 0x3); 1003 if (abs1 || abs2) 1004 printf(")"); 1005 printf(", "); 1006 if (FMA.op & 0x20) 1007 printf("-"); 1008 if (abs1 && abs2) 1009 printf("abs("); 1010 dump_src(FMA.op & 0x7, regs, consts, true); 1011 dump_16swizzle((FMA.op >> 8) & 0x3); 1012 if (abs1 && abs2) 1013 printf(")"); 1014 break; 1015 } 1016 case FMA_FCMP: 1017 if (FMA.op & 0x200) 1018 printf("abs("); 1019 dump_src(FMA.src0, regs, consts, true); 1020 dump_fma_expand_src0((FMA.op >> 6) & 0x7); 1021 if (FMA.op & 0x200) 1022 printf(")"); 1023 printf(", "); 1024 if (FMA.op & 0x20) 1025 printf("-"); 1026 if (FMA.op & 0x8) 1027 printf("abs("); 1028 dump_src(FMA.op & 0x7, regs, consts, true); 1029 dump_fma_expand_src1((FMA.op >> 6) & 0x7); 1030 if (FMA.op & 0x8) 1031 printf(")"); 1032 break; 1033 case FMA_FCMP16: 1034 dump_src(FMA.src0, regs, consts, true); 1035 // Note: this is kinda a guess, I haven't seen the blob set this to 1036 // anything other than the identity, but it matches FMA_TWO_SRCFmod16 1037 dump_16swizzle((FMA.op >> 6) & 0x3); 1038 printf(", "); 1039 dump_src(FMA.op & 0x7, regs, consts, true); 1040 dump_16swizzle((FMA.op >> 8) & 0x3); 1041 break; 1042 case FMA_SHIFT_ADD64: 1043 dump_src(FMA.src0, regs, consts, true); 1044 printf(", "); 1045 dump_src(FMA.op & 0x7, regs, consts, true); 1046 printf(", "); 1047 printf("shift:%u", (FMA.op >> 3) & 0x7); 1048 break; 1049 case FMA_THREE_SRC: 1050 dump_src(FMA.src0, regs, consts, true); 1051 printf(", "); 1052 dump_src(FMA.op & 0x7, regs, consts, true); 1053 printf(", "); 1054 dump_src((FMA.op >> 3) & 0x7, regs, consts, true); 1055 break; 1056 case FMA_FMA: 1057 if (FMA.op & (1 << 14)) 1058 printf("-"); 1059 if (FMA.op & (1 << 9)) 1060 printf("abs("); 1061 dump_src(FMA.src0, regs, consts, true); 1062 dump_fma_expand_src0((FMA.op >> 6) & 0x7); 1063 if (FMA.op & (1 << 9)) 1064 printf(")"); 1065 printf(", "); 1066 if (FMA.op & (1 << 16)) 1067 printf("abs("); 1068 dump_src(FMA.op & 0x7, regs, consts, true); 1069 dump_fma_expand_src1((FMA.op >> 6) & 0x7); 1070 if (FMA.op & (1 << 16)) 1071 printf(")"); 1072 printf(", "); 1073 if (FMA.op & (1 << 15)) 1074 printf("-"); 1075 if (FMA.op & (1 << 17)) 1076 printf("abs("); 1077 dump_src((FMA.op >> 3) & 0x7, regs, consts, true); 1078 if (FMA.op & (1 << 17)) 1079 printf(")"); 1080 break; 1081 case FMA_FMA16: 1082 if (FMA.op & (1 << 14)) 1083 printf("-"); 1084 dump_src(FMA.src0, regs, consts, true); 1085 dump_16swizzle((FMA.op >> 6) & 0x3); 1086 printf(", "); 1087 dump_src(FMA.op & 0x7, regs, consts, true); 1088 dump_16swizzle((FMA.op >> 8) & 0x3); 1089 printf(", "); 1090 if (FMA.op & (1 << 15)) 1091 printf("-"); 1092 dump_src((FMA.op >> 3) & 0x7, regs, consts, true); 1093 dump_16swizzle((FMA.op >> 16) & 0x3); 1094 break; 1095 case FMA_FOUR_SRC: 1096 dump_src(FMA.src0, regs, consts, true); 1097 printf(", "); 1098 dump_src(FMA.op & 0x7, regs, consts, true); 1099 printf(", "); 1100 dump_src((FMA.op >> 3) & 0x7, regs, consts, true); 1101 printf(", "); 1102 dump_src((FMA.op >> 6) & 0x7, regs, consts, true); 1103 break; 1104 case FMA_FMA_MSCALE: 1105 if (FMA.op & (1 << 12)) 1106 printf("abs("); 1107 dump_src(FMA.src0, regs, consts, true); 1108 if (FMA.op & (1 << 12)) 1109 printf(")"); 1110 printf(", "); 1111 if (FMA.op & (1 << 13)) 1112 printf("-"); 1113 dump_src(FMA.op & 0x7, regs, consts, true); 1114 printf(", "); 1115 if (FMA.op & (1 << 14)) 1116 printf("-"); 1117 dump_src((FMA.op >> 3) & 0x7, regs, consts, true); 1118 printf(", "); 1119 dump_src((FMA.op >> 6) & 0x7, regs, consts, true); 1120 break; 1121 } 1122 printf("\n"); 1123} 1124 1125static const struct add_op_info add_op_infos[] = { 1126 { 0x00000, "MAX.f32", ADD_FMINMAX }, 1127 { 0x02000, "MIN.f32", ADD_FMINMAX }, 1128 { 0x04000, "ADD.f32", ADD_FADD }, 1129 { 0x06000, "FCMP.GL", ADD_FCMP }, 1130 { 0x07000, "FCMP.D3D", ADD_FCMP }, 1131 { 0x07856, "F16_TO_I16", ADD_ONE_SRC }, 1132 { 0x07857, "F16_TO_U16", ADD_ONE_SRC }, 1133 { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC }, 1134 { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC }, 1135 { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC }, 1136 { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC }, 1137 { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC }, 1138 { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC }, 1139 { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC }, 1140 { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC }, 1141 { 0x07936, "F32_TO_I32", ADD_ONE_SRC }, 1142 { 0x07937, "F32_TO_U32", ADD_ONE_SRC }, 1143 { 0x07978, "I32_TO_F32", ADD_ONE_SRC }, 1144 { 0x07979, "U32_TO_F32", ADD_ONE_SRC }, 1145 { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC }, 1146 { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC }, 1147 { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC }, 1148 { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC }, 1149 { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC }, 1150 { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC }, 1151 { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC }, 1152 { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC }, 1153 // take the low 16 bits, and expand it to a 32-bit float 1154 { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC }, 1155 // take the high 16 bits, ... 1156 { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC }, 1157 { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC }, 1158 { 0x07b2c, "NOP", ADD_ONE_SRC }, 1159 { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC }, 1160 // Logically, this should be SWZ.XY, but that's equivalent to a move, and 1161 // this seems to be the canonical way the blob generates a MOV. 1162 { 0x07b2d, "MOV", ADD_ONE_SRC }, 1163 { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC }, 1164 // Given a floating point number m * 2^e, returns m ^ 2^{-1}. 1165 { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC }, 1166 { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC }, 1167 { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC }, 1168 { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC }, 1169 { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC }, 1170 // From the ARM patent US20160364209A1: 1171 // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, 1172 // and x1 is a floating point value in a predetermined range where the 1173 // value 1 is within the range and not at one extremity of the range (e.g. 1174 // choose a range where 1 is towards middle of range)." 1175 // 1176 // This computes s. 1177 { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC }, 1178 { 0x07d45, "CEIL", ADD_ONE_SRC }, 1179 { 0x07d85, "FLOOR", ADD_ONE_SRC }, 1180 { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC }, 1181 { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true }, 1182 { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true }, 1183 { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true }, 1184 { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true }, 1185 { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true }, 1186 { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, 1187 { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, 1188 { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true }, 1189 { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true }, 1190 { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, 1191 { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, 1192 { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true }, 1193 { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true }, 1194 { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, 1195 { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, 1196 { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true }, 1197 { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true }, 1198 { 0x0b000, "TEX", ADD_TEX_COMPACT, true }, 1199 { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true }, 1200 { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true }, 1201 { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, 1202 { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true }, 1203 { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true }, 1204 { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, 1205 { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true }, 1206 // src0 = offset, src1 = binding 1207 { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true }, 1208 { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true }, 1209 { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true }, 1210 { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true }, 1211 { 0x0c588, "STORE.i32", ADD_TWO_SRC, true }, 1212 { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true }, 1213 { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true }, 1214 { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true }, 1215 { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends 1216 { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true }, 1217 { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true }, 1218 { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true }, 1219 { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true }, 1220 { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true }, 1221 // *_FAST does not exist on G71 (added to G51, G72, and everything after) 1222 { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC }, 1223 { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC }, 1224 // Given a floating point number m * 2^e, produces a table-based 1225 // approximation of 2/m using the top 17 bits. Includes special cases for 1226 // infinity, NaN, and zero, and copies the sign bit. 1227 { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC }, 1228 // Exists on G71 1229 { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC }, 1230 // A similar table for inverse square root, using the high 17 bits of the 1231 // mantissa as well as the low bit of the exponent. 1232 { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC }, 1233 { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC }, 1234 { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC }, 1235 // Used in the argument reduction for log. Given a floating-point number 1236 // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m 1237 // with the exponent forced to 0 and only the top 5 bits are nonzero. 0, 1238 // infinity, and NaN all return 1.0. 1239 // See the ARM patent for more information. 1240 { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC }, 1241 { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC }, 1242 { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC }, 1243 { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC }, 1244 { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC }, 1245 { 0x0cf51, "COS_TABLE", ADD_ONE_SRC }, 1246 { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC }, 1247 { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC }, 1248 { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC }, 1249 { 0x0d000, "BRANCH", ADD_BRANCH }, 1250 // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this 1251 // is the same as (src2 & src0) | (~src2 & src1). 1252 { 0x0e8c0, "MUX", ADD_THREE_SRC }, 1253 { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC }, 1254 { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC }, 1255 { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC }, 1256 { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC }, 1257 { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC }, 1258 { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC }, 1259 { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC }, 1260 { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0 1261 { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC }, 1262 { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC }, 1263 { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC }, 1264 { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC }, 1265 { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0 1266 { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC }, 1267 { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC }, 1268 { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC }, 1269 { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC }, 1270 { 0x10000, "MAX.v2f16", ADD_FMINMAX16 }, 1271 { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale }, 1272 { 0x12000, "MIN.v2f16", ADD_FMINMAX16 }, 1273 { 0x14000, "ADD.v2f16", ADD_FADD16 }, 1274 { 0x17000, "FCMP.D3D", ADD_FCMP16 }, 1275 { 0x178c0, "ADD.i32", ADD_TWO_SRC }, 1276 { 0x17900, "ADD.v2i16", ADD_TWO_SRC }, 1277 { 0x17ac0, "SUB.i32", ADD_TWO_SRC }, 1278 { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1 1279 { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC }, 1280 { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC }, 1281 { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC }, 1282 { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC }, 1283 // Compute varying address and datatype (for storing in the vertex shader), 1284 // and store the vec3 result in the data register. The result is passed as 1285 // the 3 normal arguments to ST_VAR. 1286 { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true }, 1287 { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true }, 1288 { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true }, 1289 { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true }, 1290 // Implements alpha-to-coverage, as well as possibly the late depth and 1291 // stencil tests. The first source is the existing sample mask in R60 1292 // (possibly modified by gl_SampleMask), and the second source is the alpha 1293 // value. The sample mask is written right away based on the 1294 // alpha-to-coverage result using the normal register write mechanism, 1295 // since that doesn't need to read from any memory, and then written again 1296 // later based on the result of the stencil and depth tests using the 1297 // special register. 1298 { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true }, 1299 { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true }, 1300 { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true }, 1301 // store a varying given the address and datatype from LD_VAR_ADDR 1302 { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true }, 1303 { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true }, 1304 { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true }, 1305 { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true }, 1306 // This takes the sample coverage mask (computed by ATEST above) as a 1307 // regular argument, in addition to the vec4 color in the special register. 1308 { 0x1952c, "BLEND", ADD_BLENDING, true }, 1309 { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true }, 1310 { 0x1ae60, "TEX", ADD_TEX, true }, 1311 { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC }, 1312 { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC }, 1313 { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC }, 1314 { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC }, 1315 { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC }, 1316 { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC }, 1317 { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC }, 1318 { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC }, 1319 { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC }, 1320 { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC }, 1321 { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC }, 1322 { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC }, 1323 { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC }, 1324 { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC }, 1325 { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC }, 1326 { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC }, 1327 { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC }, 1328 { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC }, 1329 { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC }, 1330 { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC }, 1331 { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC }, 1332 { 0x1dd18, "OR.i32", ADD_TWO_SRC }, 1333 { 0x1dd20, "AND.i32", ADD_TWO_SRC }, 1334 { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC }, 1335 { 0x1dd50, "XOR.i32", ADD_TWO_SRC }, 1336 { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC }, 1337 { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC }, 1338}; 1339 1340static struct add_op_info find_add_op_info(unsigned op) 1341{ 1342 for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) { 1343 unsigned opCmp = ~0; 1344 switch (add_op_infos[i].src_type) { 1345 case ADD_ONE_SRC: 1346 case ADD_BLENDING: 1347 opCmp = op; 1348 break; 1349 case ADD_TWO_SRC: 1350 opCmp = op & ~0x7; 1351 break; 1352 case ADD_THREE_SRC: 1353 opCmp = op & ~0x3f; 1354 break; 1355 case ADD_TEX: 1356 opCmp = op & ~0xf; 1357 break; 1358 case ADD_FADD: 1359 case ADD_FMINMAX: 1360 case ADD_FADD16: 1361 opCmp = op & ~0x1fff; 1362 break; 1363 case ADD_FMINMAX16: 1364 case ADD_FADDMscale: 1365 opCmp = op & ~0xfff; 1366 break; 1367 case ADD_FCMP: 1368 case ADD_FCMP16: 1369 opCmp = op & ~0x7ff; 1370 break; 1371 case ADD_TEX_COMPACT: 1372 opCmp = op & ~0x3ff; 1373 break; 1374 case ADD_VARYING_INTERP: 1375 opCmp = op & ~0x7ff; 1376 break; 1377 case ADD_VARYING_ADDRESS: 1378 opCmp = op & ~0xff; 1379 break; 1380 case ADD_LOAD_ATTR: 1381 opCmp = op & ~0x7f; 1382 break; 1383 case ADD_BRANCH: 1384 opCmp = op & ~0xfff; 1385 break; 1386 default: 1387 opCmp = ~0; 1388 break; 1389 } 1390 if (add_op_infos[i].op == opCmp) 1391 return add_op_infos[i]; 1392 } 1393 1394 struct add_op_info info; 1395 snprintf(info.name, sizeof(info.name), "op%04x", op); 1396 info.op = op; 1397 info.src_type = ADD_TWO_SRC; 1398 info.has_data_reg = true; 1399 return info; 1400} 1401 1402static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, 1403 unsigned data_reg, unsigned offset, bool verbose) 1404{ 1405 if (verbose) { 1406 printf("# ADD: %016" PRIx64 "\n", word); 1407 } 1408 struct bifrost_add_inst ADD; 1409 memcpy((char *) &ADD, (char *) &word, sizeof(ADD)); 1410 struct add_op_info info = find_add_op_info(ADD.op); 1411 1412 printf("%s", info.name); 1413 1414 // float16 seems like it doesn't support output modifiers 1415 if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) { 1416 // output modifiers 1417 dump_output_mod(bits(ADD.op, 8, 10)); 1418 if (info.src_type == ADD_FADD) 1419 dump_round_mode(bits(ADD.op, 10, 12)); 1420 else 1421 dump_minmax_mode(bits(ADD.op, 10, 12)); 1422 } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) { 1423 dump_fcmp(bits(ADD.op, 3, 6)); 1424 if (info.src_type == ADD_FCMP) 1425 printf(".f32"); 1426 else 1427 printf(".v2f16"); 1428 } else if (info.src_type == ADD_FADDMscale) { 1429 switch ((ADD.op >> 6) & 0x7) { 1430 case 0: break; 1431 // causes GPU hangs on G71 1432 case 1: printf(".invalid"); break; 1433 // Same as usual outmod value. 1434 case 2: printf(".clamp_0_1"); break; 1435 // If src0 is infinite or NaN, flush it to zero so that the other 1436 // source is passed through unmodified. 1437 case 3: printf(".flush_src0_inf_nan"); break; 1438 // Vice versa. 1439 case 4: printf(".flush_src1_inf_nan"); break; 1440 // Every other case seems to behave the same as the above? 1441 default: printf(".unk%d", (ADD.op >> 6) & 0x7); break; 1442 } 1443 } else if (info.src_type == ADD_VARYING_INTERP) { 1444 if (ADD.op & 0x200) 1445 printf(".reuse"); 1446 if (ADD.op & 0x400) 1447 printf(".flat"); 1448 switch ((ADD.op >> 7) & 0x3) { 1449 case 0: printf(".per_frag"); break; 1450 case 1: printf(".centroid"); break; 1451 case 2: break; 1452 case 3: printf(".explicit"); break; 1453 } 1454 printf(".v%d", ((ADD.op >> 5) & 0x3) + 1); 1455 } else if (info.src_type == ADD_BRANCH) { 1456 enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f); 1457 if (branchCode == BR_ALWAYS) { 1458 // unconditional branch 1459 } else { 1460 enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7); 1461 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); 1462 bool portSwapped = (ADD.op & 0x7) < ADD.src0; 1463 // See the comment in branch_bit_size 1464 if (size == BR_SIZE_16YX0) 1465 portSwapped = true; 1466 if (size == BR_SIZE_16YX1) 1467 portSwapped = false; 1468 // These sizes are only for floating point comparisons, so the 1469 // non-floating-point comparisons are reused to encode the flipped 1470 // versions. 1471 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) 1472 portSwapped = false; 1473 // There's only one argument, so we reuse the extra argument to 1474 // encode this. 1475 if (size == BR_SIZE_ZERO) 1476 portSwapped = !(ADD.op & 1); 1477 1478 switch (cond) { 1479 case BR_COND_LT: 1480 if (portSwapped) 1481 printf(".LT.u"); 1482 else 1483 printf(".LT.i"); 1484 break; 1485 case BR_COND_LE: 1486 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { 1487 printf(".UNE.f"); 1488 } else { 1489 if (portSwapped) 1490 printf(".LE.u"); 1491 else 1492 printf(".LE.i"); 1493 } 1494 break; 1495 case BR_COND_GT: 1496 if (portSwapped) 1497 printf(".GT.u"); 1498 else 1499 printf(".GT.i"); 1500 break; 1501 case BR_COND_GE: 1502 if (portSwapped) 1503 printf(".GE.u"); 1504 else 1505 printf(".GE.i"); 1506 break; 1507 case BR_COND_EQ: 1508 if (portSwapped) 1509 printf(".NE.i"); 1510 else 1511 printf(".EQ.i"); 1512 break; 1513 case BR_COND_OEQ: 1514 if (portSwapped) 1515 printf(".UNE.f"); 1516 else 1517 printf(".OEQ.f"); 1518 break; 1519 case BR_COND_OGT: 1520 if (portSwapped) 1521 printf(".OGT.unk.f"); 1522 else 1523 printf(".OGT.f"); 1524 break; 1525 case BR_COND_OLT: 1526 if (portSwapped) 1527 printf(".OLT.unk.f"); 1528 else 1529 printf(".OLT.f"); 1530 break; 1531 } 1532 switch (size) { 1533 case BR_SIZE_32: 1534 case BR_SIZE_32_AND_16X: 1535 case BR_SIZE_32_AND_16Y: 1536 printf("32"); 1537 break; 1538 case BR_SIZE_16XX: 1539 case BR_SIZE_16YY: 1540 case BR_SIZE_16YX0: 1541 case BR_SIZE_16YX1: 1542 printf("16"); 1543 break; 1544 case BR_SIZE_ZERO: { 1545 unsigned ctrl = (ADD.op >> 1) & 0x3; 1546 if (ctrl == 0) 1547 printf("32.Z"); 1548 else 1549 printf("16.Z"); 1550 break; 1551 } 1552 } 1553 } 1554 } 1555 printf(" "); 1556 1557 struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); 1558 if (next_ctrl.add_write_unit != REG_WRITE_NONE) { 1559 printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs)); 1560 } else { 1561 printf("T1, "); 1562 } 1563 1564 switch (info.src_type) { 1565 case ADD_BLENDING: 1566 // Note: in this case, regs.uniform_const == location | 0x8 1567 // This probably means we can't load uniforms or immediates in the 1568 // same instruction. This re-uses the encoding that normally means 1569 // "disabled", where the low 4 bits are ignored. Perhaps the extra 1570 // 0x8 or'd in indicates this is happening. 1571 printf("location:%d, ", regs.uniform_const & 0x7); 1572 // fallthrough 1573 case ADD_ONE_SRC: 1574 dump_src(ADD.src0, regs, consts, false); 1575 break; 1576 case ADD_TEX: 1577 case ADD_TEX_COMPACT: { 1578 int tex_index; 1579 int sampler_index; 1580 bool dualTex = false; 1581 if (info.src_type == ADD_TEX_COMPACT) { 1582 tex_index = (ADD.op >> 3) & 0x7; 1583 sampler_index = (ADD.op >> 7) & 0x7; 1584 bool unknown = (ADD.op & 0x40); 1585 // TODO: figure out if the unknown bit is ever 0 1586 if (!unknown) 1587 printf("unknown "); 1588 } else { 1589 uint64_t constVal = get_const(consts, regs); 1590 uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal; 1591 struct bifrost_tex_ctrl ctrl; 1592 memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); 1593 1594 // TODO: figure out what actually triggers dual-tex 1595 if (ctrl.result_type == 9) { 1596 struct bifrost_dual_tex_ctrl dualCtrl; 1597 memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); 1598 printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", 1599 dualCtrl.tex_index0, dualCtrl.sampler_index0, 1600 dualCtrl.tex_index1, dualCtrl.sampler_index1); 1601 if (dualCtrl.unk0 != 3) 1602 printf("unk:%d ", dualCtrl.unk0); 1603 dualTex = true; 1604 } else { 1605 if (ctrl.no_merge_index) { 1606 tex_index = ctrl.tex_index; 1607 sampler_index = ctrl.sampler_index; 1608 } else { 1609 tex_index = sampler_index = ctrl.tex_index; 1610 unsigned unk = ctrl.sampler_index >> 2; 1611 if (unk != 3) 1612 printf("unk:%d ", unk); 1613 if (ctrl.sampler_index & 1) 1614 tex_index = -1; 1615 if (ctrl.sampler_index & 2) 1616 sampler_index = -1; 1617 } 1618 1619 if (ctrl.unk0 != 3) 1620 printf("unk0:%d ", ctrl.unk0); 1621 if (ctrl.unk1) 1622 printf("unk1 "); 1623 if (ctrl.unk2 != 0xf) 1624 printf("unk2:%x ", ctrl.unk2); 1625 1626 switch (ctrl.result_type) { 1627 case 0x4: 1628 printf("f32 "); break; 1629 case 0xe: 1630 printf("i32 "); break; 1631 case 0xf: 1632 printf("u32 "); break; 1633 default: 1634 printf("unktype(%x) ", ctrl.result_type); 1635 } 1636 1637 switch (ctrl.tex_type) { 1638 case 0: 1639 printf("cube "); break; 1640 case 1: 1641 printf("buffer "); break; 1642 case 2: 1643 printf("2D "); break; 1644 case 3: 1645 printf("3D "); break; 1646 } 1647 1648 if (ctrl.is_shadow) 1649 printf("shadow "); 1650 if (ctrl.is_array) 1651 printf("array "); 1652 1653 if (!ctrl.filter) { 1654 if (ctrl.calc_gradients) { 1655 int comp = (controlBits >> 20) & 0x3; 1656 printf("txg comp:%d ", comp); 1657 } else { 1658 printf("txf "); 1659 } 1660 } else { 1661 if (!ctrl.not_supply_lod) { 1662 if (ctrl.compute_lod) 1663 printf("lod_bias "); 1664 else 1665 printf("lod "); 1666 } 1667 1668 if (!ctrl.calc_gradients) 1669 printf("grad "); 1670 } 1671 1672 if (ctrl.texel_offset) 1673 printf("offset "); 1674 } 1675 } 1676 1677 if (!dualTex) { 1678 if (tex_index == -1) 1679 printf("tex:indirect "); 1680 else 1681 printf("tex:%d ", tex_index); 1682 1683 if (sampler_index == -1) 1684 printf("samp:indirect "); 1685 else 1686 printf("samp:%d ", sampler_index); 1687 } 1688 break; 1689 } 1690 case ADD_VARYING_INTERP: { 1691 unsigned addr = ADD.op & 0x1f; 1692 if (addr < 0b10100) { 1693 // direct addr 1694 printf("%d", addr); 1695 } else if (addr < 0b11000) { 1696 if (addr == 22) 1697 printf("fragw"); 1698 else if (addr == 23) 1699 printf("fragz"); 1700 else 1701 printf("unk%d", addr); 1702 } else { 1703 dump_src(ADD.op & 0x7, regs, consts, false); 1704 } 1705 printf(", "); 1706 dump_src(ADD.src0, regs, consts, false); 1707 break; 1708 } 1709 case ADD_VARYING_ADDRESS: { 1710 dump_src(ADD.src0, regs, consts, false); 1711 printf(", "); 1712 dump_src(ADD.op & 0x7, regs, consts, false); 1713 printf(", "); 1714 unsigned location = (ADD.op >> 3) & 0x1f; 1715 if (location < 16) { 1716 printf("location:%d", location); 1717 } else if (location == 20) { 1718 printf("location:%u", (uint32_t) get_const(consts, regs)); 1719 } else if (location == 21) { 1720 printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); 1721 } else { 1722 printf("location:%d(unk)", location); 1723 } 1724 break; 1725 } 1726 case ADD_LOAD_ATTR: 1727 printf("location:%d, ", (ADD.op >> 3) & 0xf); 1728 case ADD_TWO_SRC: 1729 dump_src(ADD.src0, regs, consts, false); 1730 printf(", "); 1731 dump_src(ADD.op & 0x7, regs, consts, false); 1732 break; 1733 case ADD_THREE_SRC: 1734 dump_src(ADD.src0, regs, consts, false); 1735 printf(", "); 1736 dump_src(ADD.op & 0x7, regs, consts, false); 1737 printf(", "); 1738 dump_src((ADD.op >> 3) & 0x7, regs, consts, false); 1739 break; 1740 case ADD_FADD: 1741 case ADD_FMINMAX: 1742 if (ADD.op & 0x10) 1743 printf("-"); 1744 if (ADD.op & 0x1000) 1745 printf("abs("); 1746 dump_src(ADD.src0, regs, consts, false); 1747 switch ((ADD.op >> 6) & 0x3) { 1748 case 3: 1749 printf(".x"); 1750 break; 1751 default: 1752 break; 1753 } 1754 if (ADD.op & 0x1000) 1755 printf(")"); 1756 printf(", "); 1757 if (ADD.op & 0x20) 1758 printf("-"); 1759 if (ADD.op & 0x8) 1760 printf("abs("); 1761 dump_src(ADD.op & 0x7, regs, consts, false); 1762 switch ((ADD.op >> 6) & 0x3) { 1763 case 1: 1764 case 3: 1765 printf(".x"); 1766 break; 1767 case 2: 1768 printf(".y"); 1769 break; 1770 case 0: 1771 break; 1772 default: 1773 printf(".unk"); 1774 break; 1775 } 1776 if (ADD.op & 0x8) 1777 printf(")"); 1778 break; 1779 case ADD_FADD16: 1780 if (ADD.op & 0x10) 1781 printf("-"); 1782 if (ADD.op & 0x1000) 1783 printf("abs("); 1784 dump_src(ADD.src0, regs, consts, false); 1785 if (ADD.op & 0x1000) 1786 printf(")"); 1787 dump_16swizzle((ADD.op >> 6) & 0x3); 1788 printf(", "); 1789 if (ADD.op & 0x20) 1790 printf("-"); 1791 if (ADD.op & 0x8) 1792 printf("abs("); 1793 dump_src(ADD.op & 0x7, regs, consts, false); 1794 dump_16swizzle((ADD.op >> 8) & 0x3); 1795 if (ADD.op & 0x8) 1796 printf(")"); 1797 break; 1798 case ADD_FMINMAX16: { 1799 bool abs1 = ADD.op & 0x8; 1800 bool abs2 = (ADD.op & 0x7) < ADD.src0; 1801 if (ADD.op & 0x10) 1802 printf("-"); 1803 if (abs1 || abs2) 1804 printf("abs("); 1805 dump_src(ADD.src0, regs, consts, false); 1806 dump_16swizzle((ADD.op >> 6) & 0x3); 1807 if (abs1 || abs2) 1808 printf(")"); 1809 printf(", "); 1810 if (ADD.op & 0x20) 1811 printf("-"); 1812 if (abs1 && abs2) 1813 printf("abs("); 1814 dump_src(ADD.op & 0x7, regs, consts, false); 1815 dump_16swizzle((ADD.op >> 8) & 0x3); 1816 if (abs1 && abs2) 1817 printf(")"); 1818 break; 1819 } 1820 case ADD_FADDMscale: { 1821 if (ADD.op & 0x400) 1822 printf("-"); 1823 if (ADD.op & 0x200) 1824 printf("abs("); 1825 dump_src(ADD.src0, regs, consts, false); 1826 if (ADD.op & 0x200) 1827 printf(")"); 1828 1829 printf(", "); 1830 1831 if (ADD.op & 0x800) 1832 printf("-"); 1833 dump_src(ADD.op & 0x7, regs, consts, false); 1834 1835 printf(", "); 1836 1837 dump_src((ADD.op >> 3) & 0x7, regs, consts, false); 1838 break; 1839 } 1840 case ADD_FCMP: 1841 if (ADD.op & 0x400) { 1842 printf("-"); 1843 } 1844 if (ADD.op & 0x100) { 1845 printf("abs("); 1846 } 1847 dump_src(ADD.src0, regs, consts, false); 1848 switch ((ADD.op >> 6) & 0x3) { 1849 case 3: 1850 printf(".x"); 1851 break; 1852 default: 1853 break; 1854 } 1855 if (ADD.op & 0x100) { 1856 printf(")"); 1857 } 1858 printf(", "); 1859 if (ADD.op & 0x200) { 1860 printf("abs("); 1861 } 1862 dump_src(ADD.op & 0x7, regs, consts, false); 1863 switch ((ADD.op >> 6) & 0x3) { 1864 case 1: 1865 case 3: 1866 printf(".x"); 1867 break; 1868 case 2: 1869 printf(".y"); 1870 break; 1871 case 0: 1872 break; 1873 default: 1874 printf(".unk"); 1875 break; 1876 } 1877 if (ADD.op & 0x200) { 1878 printf(")"); 1879 } 1880 break; 1881 case ADD_FCMP16: 1882 dump_src(ADD.src0, regs, consts, false); 1883 dump_16swizzle((ADD.op >> 6) & 0x3); 1884 printf(", "); 1885 dump_src(ADD.op & 0x7, regs, consts, false); 1886 dump_16swizzle((ADD.op >> 8) & 0x3); 1887 break; 1888 case ADD_BRANCH: { 1889 enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); 1890 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); 1891 if (code != BR_ALWAYS) { 1892 dump_src(ADD.src0, regs, consts, false); 1893 switch (size) { 1894 case BR_SIZE_16XX: 1895 printf(".x"); 1896 break; 1897 case BR_SIZE_16YY: 1898 case BR_SIZE_16YX0: 1899 case BR_SIZE_16YX1: 1900 printf(".y"); 1901 break; 1902 case BR_SIZE_ZERO: { 1903 unsigned ctrl = (ADD.op >> 1) & 0x3; 1904 switch (ctrl) { 1905 case 1: 1906 printf(".y"); 1907 break; 1908 case 2: 1909 printf(".x"); 1910 break; 1911 default: 1912 break; 1913 } 1914 } 1915 default: 1916 break; 1917 } 1918 printf(", "); 1919 } 1920 if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { 1921 dump_src(ADD.op & 0x7, regs, consts, false); 1922 switch (size) { 1923 case BR_SIZE_16XX: 1924 case BR_SIZE_16YX0: 1925 case BR_SIZE_16YX1: 1926 case BR_SIZE_32_AND_16X: 1927 printf(".x"); 1928 break; 1929 case BR_SIZE_16YY: 1930 case BR_SIZE_32_AND_16Y: 1931 printf(".y"); 1932 break; 1933 default: 1934 break; 1935 } 1936 printf(", "); 1937 } 1938 // I haven't had the chance to test if this actually specifies the 1939 // branch offset, since I couldn't get it to produce values other 1940 // than 5 (uniform/const high), but these three bits are always 1941 // consistent across branch instructions, so it makes sense... 1942 int offsetSrc = (ADD.op >> 3) & 0x7; 1943 if (offsetSrc == 4 || offsetSrc == 5) { 1944 // If the offset is known/constant, we can decode it 1945 uint32_t raw_offset; 1946 if (offsetSrc == 4) 1947 raw_offset = get_const(consts, regs); 1948 else 1949 raw_offset = get_const(consts, regs) >> 32; 1950 // The high 4 bits are flags, while the rest is the 1951 // twos-complement offset in bytes (here we convert to 1952 // clauses). 1953 int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8; 1954 1955 // If high4 is the high 4 bits of the last 64-bit constant, 1956 // this is calculated as (high4 + 4) & 0xf, or 0 if the branch 1957 // offset itself is the last constant. Not sure if this is 1958 // actually used, or just garbage in unused bits, but in any 1959 // case, we can just ignore it here since it's redundant. Note 1960 // that if there is any padding, this will be 4 since the 1961 // padding counts as the last constant. 1962 unsigned flags = raw_offset >> 28; 1963 (void) flags; 1964 1965 // Note: the offset is in bytes, relative to the beginning of the 1966 // current clause, so a zero offset would be a loop back to the 1967 // same clause (annoyingly different from Midgard). 1968 printf("clause_%d", offset + branch_offset); 1969 } else { 1970 dump_src(offsetSrc, regs, consts, false); 1971 } 1972 } 1973 } 1974 if (info.has_data_reg) { 1975 printf(", R%d", data_reg); 1976 } 1977 printf("\n"); 1978} 1979 1980void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, 1981 unsigned data_reg, unsigned offset, bool verbose) 1982{ 1983 struct bifrost_regs regs; 1984 memcpy((char *) ®s, (char *) &instr->reg_bits, sizeof(regs)); 1985 1986 if (verbose) { 1987 printf("# regs: %016" PRIx64 "\n", instr->reg_bits); 1988 dump_regs(regs); 1989 } 1990 dump_fma(instr->fma_bits, regs, next_regs, consts, verbose); 1991 dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); 1992} 1993 1994bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) { 1995 // State for a decoded clause 1996 struct bifrost_alu_inst instrs[8] = {}; 1997 uint64_t consts[6] = {}; 1998 unsigned num_instrs = 0; 1999 unsigned num_consts = 0; 2000 uint64_t header_bits = 0; 2001 bool stopbit = false; 2002 2003 unsigned i; 2004 for (i = 0; ; i++, words += 4) { 2005 if (verbose) { 2006 printf("# "); 2007 for (int j = 0; j < 4; j++) 2008 printf("%08x ", words[3 - j]); // low bit on the right 2009 printf("\n"); 2010 } 2011 unsigned tag = bits(words[0], 0, 8); 2012 2013 // speculatively decode some things that are common between many formats, so we can share some code 2014 struct bifrost_alu_inst main_instr = {}; 2015 // 20 bits 2016 main_instr.add_bits = bits(words[2], 2, 32 - 13); 2017 // 23 bits 2018 main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11); 2019 // 35 bits 2020 main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32); 2021 2022 uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60; 2023 uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32; 2024 2025 bool stop = tag & 0x40; 2026 2027 if (verbose) { 2028 printf("# tag: 0x%02x\n", tag); 2029 } 2030 if (tag & 0x80) { 2031 unsigned idx = stop ? 5 : 2; 2032 main_instr.add_bits |= ((tag >> 3) & 0x7) << 17; 2033 instrs[idx + 1] = main_instr; 2034 instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17); 2035 instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10; 2036 consts[0] = bits(words[3], 17, 32) << 4; 2037 } else { 2038 bool done = false; 2039 switch ((tag >> 3) & 0x7) { 2040 case 0x0: 2041 switch (tag & 0x7) { 2042 case 0x3: 2043 main_instr.add_bits |= bits(words[3], 29, 32) << 17; 2044 instrs[1] = main_instr; 2045 num_instrs = 2; 2046 done = stop; 2047 break; 2048 case 0x4: 2049 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; 2050 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; 2051 consts[0] = const0; 2052 num_instrs = 3; 2053 num_consts = 1; 2054 done = stop; 2055 break; 2056 case 0x1: 2057 case 0x5: 2058 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; 2059 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; 2060 main_instr.add_bits |= bits(words[3], 26, 29) << 17; 2061 instrs[3] = main_instr; 2062 if ((tag & 0x7) == 0x5) { 2063 num_instrs = 4; 2064 done = stop; 2065 } 2066 break; 2067 case 0x6: 2068 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; 2069 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; 2070 consts[0] = const0; 2071 num_instrs = 6; 2072 num_consts = 1; 2073 done = stop; 2074 break; 2075 case 0x7: 2076 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; 2077 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; 2078 main_instr.add_bits |= bits(words[3], 26, 29) << 17; 2079 instrs[6] = main_instr; 2080 num_instrs = 7; 2081 done = stop; 2082 break; 2083 default: 2084 printf("unknown tag bits 0x%02x\n", tag); 2085 } 2086 break; 2087 case 0x2: 2088 case 0x3: { 2089 unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; 2090 main_instr.add_bits |= (tag & 0x7) << 17; 2091 instrs[idx] = main_instr; 2092 consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; 2093 num_consts = 1; 2094 num_instrs = idx + 1; 2095 done = stop; 2096 break; 2097 } 2098 case 0x4: { 2099 unsigned idx = stop ? 4 : 1; 2100 main_instr.add_bits |= (tag & 0x7) << 17; 2101 instrs[idx] = main_instr; 2102 instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); 2103 instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); 2104 break; 2105 } 2106 case 0x1: 2107 // only constants can come after this 2108 num_instrs = 1; 2109 done = stop; 2110 case 0x5: 2111 header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); 2112 main_instr.add_bits |= (tag & 0x7) << 17; 2113 instrs[0] = main_instr; 2114 break; 2115 case 0x6: 2116 case 0x7: { 2117 unsigned pos = tag & 0xf; 2118 // note that `pos' encodes both the total number of 2119 // instructions and the position in the constant stream, 2120 // presumably because decoded constants and instructions 2121 // share a buffer in the decoder, but we only care about 2122 // the position in the constant stream; the total number of 2123 // instructions is redundant. 2124 unsigned const_idx = 7; 2125 switch (pos) { 2126 case 0: 2127 case 1: 2128 case 2: 2129 case 6: 2130 const_idx = 0; 2131 break; 2132 case 3: 2133 case 4: 2134 case 7: 2135 case 9: 2136 const_idx = 1; 2137 break; 2138 case 5: 2139 case 0xa: 2140 const_idx = 2; 2141 break; 2142 case 8: 2143 case 0xb: 2144 case 0xc: 2145 const_idx = 3; 2146 break; 2147 case 0xd: 2148 const_idx = 4; 2149 break; 2150 default: 2151 printf("# unknown pos 0x%x\n", pos); 2152 } 2153 if (num_consts < const_idx + 2) 2154 num_consts = const_idx + 2; 2155 consts[const_idx] = const0; 2156 consts[const_idx + 1] = const1; 2157 done = stop; 2158 break; 2159 } 2160 default: 2161 break; 2162 } 2163 2164 if (done) 2165 break; 2166 } 2167 } 2168 2169 *size = i + 1; 2170 2171 if (verbose) { 2172 printf("# header: %012" PRIx64 "\n", header_bits); 2173 } 2174 2175 struct bifrost_header header; 2176 memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header)); 2177 dump_header(header, verbose); 2178 if (!header.no_end_of_shader) 2179 stopbit = true; 2180 2181 printf("{\n"); 2182 for (i = 0; i < num_instrs; i++) { 2183 struct bifrost_regs next_regs; 2184 if (i + 1 == num_instrs) { 2185 memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits, 2186 sizeof(next_regs)); 2187 } else { 2188 memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits, 2189 sizeof(next_regs)); 2190 } 2191 2192 dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose); 2193 } 2194 printf("}\n"); 2195 2196 if (verbose) { 2197 for (unsigned i = 0; i < num_consts; i++) { 2198 printf("# const%d: %08lx\n", 2 * i, consts[i] & 0xffffffff); 2199 printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32); 2200 } 2201 } 2202 return stopbit; 2203} 2204 2205void disassemble_bifrost(uint8_t *code, size_t size, bool verbose) 2206{ 2207 uint32_t *words = (uint32_t *) code; 2208 uint32_t *words_end = words + (size / 4); 2209 // used for displaying branch targets 2210 unsigned offset = 0; 2211 while (words != words_end) 2212 { 2213 // we don't know what the program-end bit is quite yet, so for now just 2214 // assume that an all-0 quadword is padding 2215 uint32_t zero[4] = {}; 2216 if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0) 2217 break; 2218 printf("clause_%d:\n", offset); 2219 unsigned size; 2220 if (dump_clause(words, &size, offset, verbose) == true) { 2221 break; 2222 } 2223 words += size * 4; 2224 offset += size; 2225 } 2226} 2227 2228