1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 *
 */

#ifndef ACO_IR_H
#define ACO_IR_H

#include "aco_opcodes.h"
#include "aco_util.h"

#include "vulkan/radv_shader.h"

#include "nir.h"

#include <bitset>
#include <memory>
#include <vector>

struct radv_shader_args;
struct radv_shader_info;
struct radv_vs_prolog_key;

namespace aco {

/* Global bitmask of enabled debug options (values are the DEBUG_* flags below). */
extern uint64_t debug_flags;

enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};

/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* These formats occupy single bits so they can be or'd together
    * (see the combination note in the comment above). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};

/* Coarse instruction categories; presumably used for scheduling/performance
 * modeling elsewhere in the compiler — the consumers are not visible here. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};

/* Bitmask of memory storage kinds an access can touch (one bit per kind). */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8, /* number of distinct storage bits above */
};

/* Bitmask describing the ordering/aliasing semantics of a memory access. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is a atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this is instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

/* Scope of synchronization, from a single invocation up to the whole device. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

/* Compact (3-byte, see static_assert below) description of a memory access:
 * which storage it touches, with what semantics, at what scope. */
struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
   {}

   storage_class storage : 8;
   memory_semantics semantics : 8;
   sync_scope scope : 8;

   bool operator==(const memory_sync_info& rhs) const
   {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   /* Whether this access may be reordered with other accesses: never across
    * acquire/release semantics and never if volatile. */
   bool can_reorder() const
   {
      if (semantics & semantic_acqrel)
         return false;
      /* Also check storage so that zero-initialized memory_sync_info can be
       * reordered. */
      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");

/* Float rounding modes; values match the hardware MODE register encoding
 * (see float_mode below). */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};

struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      /* same bits viewed as two 4-bit groups */
      struct {
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead.
    */
   bool canReplace(float_mode other) const noexcept
   {
      /* Identical MODE bits, and every "preserve/flush/care" requirement of
       * "other" must also be required by this mode. */
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};

/* Counter values for s_waitcnt; unset_counter marks a counter as not waited on. */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   wait_imm(enum chip_class chip, uint16_t packed);

   /* Pack the counters into the chip-specific s_waitcnt immediate encoding. */
   uint16_t pack(enum chip_class chip) const;

   bool combine(const wait_imm& other);

   bool empty() const;
};

/* Add the VOP3 encoding bit to a VOP1/VOP2/VOPC format. */
constexpr Format
asVOP3(Format format)
{
   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
};

/* Add the SDWA encoding bit; only valid for VOP1/VOP2/VOPC. */
constexpr Format
asSDWA(Format format)
{
   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
}

constexpr Format
withoutDPP(Format format)
{
   return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
}

enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};

/* Register class: type + size of a virtual register, packed into one byte.
 * Bit layout (see the accessors below): bits 0-4 = size (in dwords, or in
 * bytes if bit 7 is set), bit 5 = vgpr, bit 6 = linear vgpr, bit 7 = subdword. */
struct RegClass {

   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* Size field counts bytes for subdword classes, dwords otherwise. */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Build a RegClass from a byte count; non-dword-multiple vgpr sizes become
    * subdword classes, sgpr sizes are rounded up to whole dwords. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same-type class with a different byte size; preserves linearity. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};

/* transitional helper expressions */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};

/**
 * Temp Class
 * Each temporary virtual register has a
 * register class (i.e. size and type)
 * and SSA id.
 */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* Temps compare by SSA id only; the register class is ignored. */
   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed into 32 bits: 24-bit SSA id + 8-bit register class */
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};

/**
 * PhysReg
 * Represents the physical register for each
 * Operand and Definition.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* reg_b stores a byte address: register index * 4 + byte offset,
    * so subdword positions can be represented. */
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* Register advanced by a signed number of bytes. */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   uint16_t reg_b = 0;
};

/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};

/**
 * Operand Class
 * Initially, each Operand refers to either
 * a temporary virtual register
 * or to a constant value
 * Temporary registers get mapped to physical register during RA
 * Constant values
are inlined into the instruction sequence.
 */
class Operand final {
public:
   /* Default: an undefined operand fixed to encoding 128 (inline constant 0). */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         /* id 0 means no temporary: treat as undefined */
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant.
    * The fixed register encodes the operand field: 128+v for small integers,
    * 193..208 for -1..-16, 240..248 for inline floats, 255 for a literal. */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         /* unsigned wrap-around yields 193..208 for -1..-16 */
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* Undefined operand with a known register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Fixed physical register operand without a temporary (id 0). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a chip class
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether val fits in a constant operand of the given size; zext/sext allow
    * a 64-bit value representable as a zero-/sign-extended 32-bit constant. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 || /* 0.5 */
             val == 0xBFE0000000000000 || /* -0.5 */
             val == 0x3FF0000000000000 || /* 1.0 */
             val == 0xBFF0000000000000 || /* -1.0 */
             val == 0x4000000000000000 || /* 2.0 */
             val == 0xC000000000000000 || /* -2.0 */
             val == 0x4010000000000000 || /* 4.0 */
             val == 0xC010000000000000; /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* Size in bytes: 1 << constSize for constants, else the temp's size. */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }
   /* Size in dwords (constants of <= 4 bytes count as one dword). */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* Encoding 255 means the constant is a literal, not an inline constant. */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Decode a 64-bit constant; for inline constants the value is recovered
    * from the operand encoding in reg_, for literals from data_.i + signext. */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis.
    */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* setting firstKill implies kill (see setKill/isKill above) */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         /* inline constants compare by their operand encoding */
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   /* either the temporary or the raw constant bits */
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};

/**
 * Definition Class
 * Definitions are the results of Instructions
 * and refer to temporary virtual registers
 * which are later mapped to physical registers
 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg)
noexcept 928 { 929 isFixed_ = 1; 930 reg_ = reg; 931 } 932 933 constexpr void setHint(PhysReg reg) noexcept 934 { 935 hasHint_ = 1; 936 reg_ = reg; 937 } 938 939 constexpr bool hasHint() const noexcept { return hasHint_; } 940 941 constexpr void setKill(bool flag) noexcept { isKill_ = flag; } 942 943 constexpr bool isKill() const noexcept { return isKill_; } 944 945 constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; } 946 947 constexpr bool isPrecise() const noexcept { return isPrecise_; } 948 949 /* No Unsigned Wrap */ 950 constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; } 951 952 constexpr bool isNUW() const noexcept { return isNUW_; } 953 954 constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; } 955 956 constexpr bool isNoCSE() const noexcept { return isNoCSE_; } 957 958private: 959 Temp temp = Temp(0, s1); 960 PhysReg reg_; 961 union { 962 struct { 963 uint8_t isFixed_ : 1; 964 uint8_t hasHint_ : 1; 965 uint8_t isKill_ : 1; 966 uint8_t isPrecise_ : 1; 967 uint8_t isNUW_ : 1; 968 uint8_t isNoCSE_ : 1; 969 }; 970 /* can't initialize bit-fields in c++11, so work around using a union */ 971 uint8_t control_ = 0; 972 }; 973}; 974 975struct Block; 976struct Instruction; 977struct Pseudo_instruction; 978struct SOP1_instruction; 979struct SOP2_instruction; 980struct SOPK_instruction; 981struct SOPP_instruction; 982struct SOPC_instruction; 983struct SMEM_instruction; 984struct DS_instruction; 985struct MTBUF_instruction; 986struct MUBUF_instruction; 987struct MIMG_instruction; 988struct Export_instruction; 989struct FLAT_instruction; 990struct Pseudo_branch_instruction; 991struct Pseudo_barrier_instruction; 992struct Pseudo_reduction_instruction; 993struct VOP3P_instruction; 994struct VOP1_instruction; 995struct VOP2_instruction; 996struct VOPC_instruction; 997struct VOP3_instruction; 998struct Interp_instruction; 999struct DPP_instruction; 1000struct SDWA_instruction; 1001 1002struct Instruction { 1003 aco_opcode opcode; 1004 
   Format format;
   uint32_t pass_flags;

   /* Trailing-allocated arrays (see create_instruction()); the spans store
    * offsets relative to their own address. */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Returns true if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Typed down-cast helpers: each asserts that the format matches before
    * reinterpreting this instruction as the corresponding sub-struct. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* flat()/global()/scratch() all reinterpret as FLAT_instruction; only the
    * asserted format differs. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* The VALU format predicates below test a format *bit* rather than
    * equality, because VOP1/VOP2/VOPC can be combined with VOP3/DPP/SDWA
    * (see the Format documentation at the top of this file). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP_instruction& dpp() noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   const DPP_instruction& dpp() const noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast for FLAT/GLOBAL/SCRATCH, which share the same layout. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");

struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/* Sub-dword byte selection, packed into a single byte:
 * bits [1:0] = byte offset, bits [4:2] = selection size in bytes,
 * bit 5 = sign extend (see the accessors below). */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   explicit operator bool() const { return sel != 0; }

   constexpr unsigned size() const { return (sel >> 2) & 0x7; }
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return sel & sext; }
   /* Translates this selection (plus a register byte offset) into the
    * hardware SDWA sel encoding: 0-3 = byte0-3, 4-5 = word0-1, 6 = dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      if (size() == 1)
         return reg_byte_offset;
      else if (size() == 2)
         return 4 + (reg_byte_offset >> 1);
      else
         return 6;
   }

private:
   sdwa_sel sel;
};

/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];
   SubdwordSel dst_sel;
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/* Reduction operations, grouped by operation and keyed by bit width. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");

/* Instructions are allocated with calloc() (see create_instruction()),
 * so they must be released with free() rather than delete. */
struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

/* Allocates an instruction of sub-format T together with its operand and
 * definition arrays in a single zero-initialized chunk: [T | Operands... |
 * Definitions...]. The spans store offsets relative to their own address. */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}

/* Returns true if the instruction carries any input/output modifier
 * (DPP/SDWA always do; VOP3/VOP3P depending on their modifier fields). */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}

constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool instr_is_16bit(chip_class chip, aco_opcode op);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr);
bool needs_exec_mask(const Instruction* instr);

aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

bool should_form_clause(const Instruction* a, const Instruction* b);

/* Flags describing a basic block's role in the CFG; stored in Block::kind. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};

/* A (vgpr, sgpr) register-pressure pair with component-wise arithmetic. */
struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   /* True if either component is strictly greater than the other demand's. */
   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   /* Component-wise maximum with another demand. */
   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};

/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* Separate predecessor/successor lists for the logical (divergent
    * control flow) and linear (actual branch) CFGs. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0; /* mask of block_kind flags */
   int logical_idom = -1; /* immediate dominator index, -1 if unset */
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block() : index(0) {}
};

/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint8_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   GSCopy = 1 << 6, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

constexpr SWStage
operator|(SWStage a, SWStage b)
{
   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
}

/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};

/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
1950 */ 1951struct Stage { 1952 constexpr Stage() = default; 1953 1954 explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {} 1955 1956 /* Check if the given SWStage is included */ 1957 constexpr bool has(SWStage stage) const 1958 { 1959 return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage)); 1960 } 1961 1962 unsigned num_sw_stages() const { return util_bitcount(static_cast<uint8_t>(sw)); } 1963 1964 constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; } 1965 1966 constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; } 1967 1968 /* Mask of merged software stages */ 1969 SWStage sw = SWStage::None; 1970 1971 /* Active hardware stage */ 1972 HWStage hw{}; 1973}; 1974 1975/* possible settings of Program::stage */ 1976static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS); 1977static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS); 1978static constexpr Stage compute_cs(HWStage::CS, SWStage::CS); 1979static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES); 1980static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy); 1981/* GFX10/NGG */ 1982static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS); 1983static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS); 1984static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES); 1985static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS); 1986/* GFX9 (and GFX10 if NGG isn't used) */ 1987static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS); 1988static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS); 1989static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS); 1990/* pre-GFX9 */ 1991static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */ 1992static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */ 1993static constexpr 
Stage tess_control_hs(HWStage::HS, SWStage::TCS); 1994static constexpr Stage tess_eval_es(HWStage::ES, 1995 SWStage::TES); /* tesselation evaluation before geometry */ 1996static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS); 1997 1998enum statistic { 1999 statistic_hash, 2000 statistic_instructions, 2001 statistic_copies, 2002 statistic_branches, 2003 statistic_latency, 2004 statistic_inv_throughput, 2005 statistic_vmem_clauses, 2006 statistic_smem_clauses, 2007 statistic_sgpr_presched, 2008 statistic_vgpr_presched, 2009 num_statistics 2010}; 2011 2012struct DeviceInfo { 2013 uint16_t lds_encoding_granule; 2014 uint16_t lds_alloc_granule; 2015 uint32_t lds_limit; /* in bytes */ 2016 bool has_16bank_lds; 2017 uint16_t physical_sgprs; 2018 uint16_t physical_vgprs; 2019 uint16_t vgpr_limit; 2020 uint16_t sgpr_limit; 2021 uint16_t sgpr_alloc_granule; 2022 uint16_t vgpr_alloc_granule; /* must be power of two */ 2023 unsigned max_wave64_per_simd; 2024 unsigned simd_per_cu; 2025 bool has_fast_fma32 = false; 2026 bool xnack_enabled = false; 2027 bool sram_ecc_enabled = false; 2028}; 2029 2030enum class CompilationProgress { 2031 after_isel, 2032 after_spilling, 2033 after_ra, 2034}; 2035 2036class Program final { 2037public: 2038 std::vector<Block> blocks; 2039 std::vector<RegClass> temp_rc = {s1}; 2040 RegisterDemand max_reg_demand = RegisterDemand(); 2041 uint16_t num_waves = 0; 2042 uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */ 2043 ac_shader_config* config; 2044 const struct radv_shader_info* info; 2045 enum chip_class chip_class; 2046 enum radeon_family family; 2047 DeviceInfo dev; 2048 unsigned wave_size; 2049 RegClass lane_mask; 2050 Stage stage; 2051 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ 2052 bool needs_wqm = false; /* there exists a p_wqm instruction */ 2053 2054 std::vector<uint8_t> constant_data; 2055 Temp private_segment_buffer; 2056 Temp scratch_offset; 2057 2058 
uint16_t min_waves = 0; 2059 unsigned workgroup_size; /* if known; otherwise UINT_MAX */ 2060 bool wgp_mode; 2061 bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */ 2062 2063 bool needs_vcc = false; 2064 bool needs_flat_scr = false; 2065 2066 CompilationProgress progress; 2067 2068 bool collect_statistics = false; 2069 uint32_t statistics[num_statistics]; 2070 2071 float_mode next_fp_mode; 2072 unsigned next_loop_depth = 0; 2073 unsigned next_divergent_if_logical_depth = 0; 2074 unsigned next_uniform_if_depth = 0; 2075 2076 std::vector<Definition> vs_inputs; 2077 2078 struct { 2079 FILE* output = stderr; 2080 bool shorten_messages = false; 2081 void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message); 2082 void* private_data; 2083 } debug; 2084 2085 uint32_t allocateId(RegClass rc) 2086 { 2087 assert(allocationID <= 16777215); 2088 temp_rc.push_back(rc); 2089 return allocationID++; 2090 } 2091 2092 void allocateRange(unsigned amount) 2093 { 2094 assert(allocationID + amount <= 16777216); 2095 temp_rc.resize(temp_rc.size() + amount); 2096 allocationID += amount; 2097 } 2098 2099 Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); } 2100 2101 uint32_t peekAllocationId() { return allocationID; } 2102 2103 friend void reindex_ssa(Program* program); 2104 friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out); 2105 2106 Block* create_and_insert_block() 2107 { 2108 Block block; 2109 return insert_block(std::move(block)); 2110 } 2111 2112 Block* insert_block(Block&& block) 2113 { 2114 block.index = blocks.size(); 2115 block.fp_mode = next_fp_mode; 2116 block.loop_nest_depth = next_loop_depth; 2117 block.divergent_if_logical_depth = next_divergent_if_logical_depth; 2118 block.uniform_if_depth = next_uniform_if_depth; 2119 blocks.emplace_back(std::move(block)); 2120 return &blocks.back(); 2121 } 2122 2123private: 2124 uint32_t allocationID = 1; 2125}; 2126 
/* Result of liveness analysis. */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};

void init();

void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection entry points (NIR -> ACO IR). */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config, const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config, const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

/* Compiler passes; each operates on the whole Program. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Encode the program to machine code; returns the size of the emitted code. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
/* IR/RA validation; enabled via the DEBUG_VALIDATE_* debug flags. */
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* no-op in release builds */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

/* Statistics collection at the various compilation phases (see enum statistic). */
void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

/* Flags controlling the output of the aco_print_* functions below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* Implementation helpers for the aco_perfwarn/aco_err macros below. */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

/* Per-opcode tables generated at build time, indexed by aco_opcode. */
typedef struct {
   /* hardware opcode per gfx generation; presumably negative means unsupported — TODO confirm */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
2249} // namespace aco 2250 2251#endif /* ACO_IR_H */ 2252