1/*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#ifndef ACO_IR_H
26#define ACO_IR_H
27
28#include "aco_opcodes.h"
29#include "aco_util.h"
30
31#include "vulkan/radv_shader.h"
32
33#include "nir.h"
34
35#include <bitset>
36#include <memory>
37#include <vector>
38
/* Forward declarations of RADV driver types (full definitions live in the
 * radv headers). */
struct radv_shader_args;
struct radv_shader_info;
struct radv_vs_prolog_key;
42
43namespace aco {
44
/* Process-wide debug flag bitmask; the bits are defined in the enum below.
 * NOTE(review): presumably set from a debug environment option — defined in a
 * .cpp file, not visible here. */
extern uint64_t debug_flags;

/* Bit values for debug_flags. */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
58
59/**
60 * Representation of the instruction's microcode encoding format
61 * Note: Some Vector ALU Formats can be combined, such that:
62 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
63 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65 *
66 * (*) The same is applicable for VOP1 and VOPC instructions.
67 */
68enum class Format : std::uint16_t {
69   /* Pseudo Instruction Format */
70   PSEUDO = 0,
71   /* Scalar ALU & Control Formats */
72   SOP1 = 1,
73   SOP2 = 2,
74   SOPK = 3,
75   SOPP = 4,
76   SOPC = 5,
77   /* Scalar Memory Format */
78   SMEM = 6,
79   /* LDS/GDS Format */
80   DS = 8,
81   /* Vector Memory Buffer Formats */
82   MTBUF = 9,
83   MUBUF = 10,
84   /* Vector Memory Image Format */
85   MIMG = 11,
86   /* Export Format */
87   EXP = 12,
88   /* Flat Formats */
89   FLAT = 13,
90   GLOBAL = 14,
91   SCRATCH = 15,
92
93   PSEUDO_BRANCH = 16,
94   PSEUDO_BARRIER = 17,
95   PSEUDO_REDUCTION = 18,
96
97   /* Vector ALU Formats */
98   VOP3P = 19,
99   VOP1 = 1 << 8,
100   VOP2 = 1 << 9,
101   VOPC = 1 << 10,
102   VOP3 = 1 << 11,
103   /* Vector Parameter Interpolation Format */
104   VINTRP = 1 << 12,
105   DPP = 1 << 13,
106   SDWA = 1 << 14,
107};
108
/* Coarse classification of instructions — NOTE(review): presumably used for
 * performance modeling/scheduling; confirm against the users of this enum. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};
132
/* Bitmask of the kinds of memory an instruction may access; used together
 * with memory_semantics/sync_scope in memory_sync_info below. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   /* NOTE(review): keep in sync when adding/removing storage classes above */
   storage_count = 8,
};
144
/* Ordering/visibility semantics of a memory access or barrier (bitmask). */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is a atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this is instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* common combinations */
   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
173
/* Scope of synchronization, ordered from narrowest to widest. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
181
182struct memory_sync_info {
183   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
184   memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
185       : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
186   {}
187
188   storage_class storage : 8;
189   memory_semantics semantics : 8;
190   sync_scope scope : 8;
191
192   bool operator==(const memory_sync_info& rhs) const
193   {
194      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
195   }
196
197   bool can_reorder() const
198   {
199      if (semantics & semantic_acqrel)
200         return false;
201      /* Also check storage so that zero-initialized memory_sync_info can be
202       * reordered. */
203      return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
204   }
205};
206static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
207
/* IEEE rounding modes, as encoded in the 2-bit fields of float_mode below. */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
214
/* Denormal handling; bit 0 keeps denorms on inputs, bit 1 on outputs. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3, /* keep_in | keep_out */
};
223
224struct float_mode {
225   /* matches encoding of the MODE register */
226   union {
227      struct {
228         fp_round round32 : 2;
229         fp_round round16_64 : 2;
230         unsigned denorm32 : 2;
231         unsigned denorm16_64 : 2;
232      };
233      struct {
234         uint8_t round : 4;
235         uint8_t denorm : 4;
236      };
237      uint8_t val = 0;
238   };
239   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
240   bool preserve_signed_zero_inf_nan32 : 1;
241   bool preserve_signed_zero_inf_nan16_64 : 1;
242   /* if false, optimizations which may remove denormal flushing can be done */
243   bool must_flush_denorms32 : 1;
244   bool must_flush_denorms16_64 : 1;
245   bool care_about_round32 : 1;
246   bool care_about_round16_64 : 1;
247
248   /* Returns true if instructions using the mode "other" can safely use the
249    * current one instead. */
250   bool canReplace(float_mode other) const noexcept
251   {
252      return val == other.val &&
253             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
254             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
255             (must_flush_denorms32 || !other.must_flush_denorms32) &&
256             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
257             (care_about_round32 || !other.care_about_round32) &&
258             (care_about_round16_64 || !other.care_about_round16_64);
259   }
260};
261
/* The immediate of a wait instruction, split into its separate counters.
 * A counter set to unset_counter means "do not wait on this counter". */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;   /* vmcnt */
   uint8_t exp;  /* expcnt */
   uint8_t lgkm; /* lgkmcnt */
   uint8_t vs;   /* vscnt */

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* Decode from the packed hardware immediate for the given chip. */
   wait_imm(enum chip_class chip, uint16_t packed);

   /* Encode into the packed hardware immediate for the given chip. */
   uint16_t pack(enum chip_class chip) const;

   /* Merge another wait_imm into this one; presumably returns whether this
    * was changed — implementation not in this header, confirm there. */
   bool combine(const wait_imm& other);

   /* True if every counter is unset. */
   bool empty() const;
};
280
281constexpr Format
282asVOP3(Format format)
283{
284   return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
285};
286
287constexpr Format
288asSDWA(Format format)
289{
290   assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
291   return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
292}
293
294constexpr Format
295withoutDPP(Format format)
296{
297   return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
298}
299
/* Type of a register: scalar (SGPR), vector (VGPR) or linear VGPR (used for
 * WWM and spills to VGPRs — see RegClass below). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
306
307struct RegClass {
308
309   enum RC : uint8_t {
310      s1 = 1,
311      s2 = 2,
312      s3 = 3,
313      s4 = 4,
314      s6 = 6,
315      s8 = 8,
316      s16 = 16,
317      v1 = s1 | (1 << 5),
318      v2 = s2 | (1 << 5),
319      v3 = s3 | (1 << 5),
320      v4 = s4 | (1 << 5),
321      v5 = 5 | (1 << 5),
322      v6 = 6 | (1 << 5),
323      v7 = 7 | (1 << 5),
324      v8 = 8 | (1 << 5),
325      /* byte-sized register class */
326      v1b = v1 | (1 << 7),
327      v2b = v2 | (1 << 7),
328      v3b = v3 | (1 << 7),
329      v4b = v4 | (1 << 7),
330      v6b = v6 | (1 << 7),
331      v8b = v8 | (1 << 7),
332      /* these are used for WWM and spills to vgpr */
333      v1_linear = v1 | (1 << 6),
334      v2_linear = v2 | (1 << 6),
335   };
336
337   RegClass() = default;
338   constexpr RegClass(RC rc_) : rc(rc_) {}
339   constexpr RegClass(RegType type, unsigned size)
340       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
341   {}
342
343   constexpr operator RC() const { return rc; }
344   explicit operator bool() = delete;
345
346   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
347   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
348   constexpr bool is_subdword() const { return rc & (1 << 7); }
349   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
350   // TODO: use size() less in favor of bytes()
351   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
352   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
353   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
354   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }
355
356   static constexpr RegClass get(RegType type, unsigned bytes)
357   {
358      if (type == RegType::sgpr) {
359         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
360      } else {
361         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
362      }
363   }
364
365   constexpr RegClass resize(unsigned bytes) const
366   {
367      if (is_linear_vgpr()) {
368         assert(bytes % 4u == 0);
369         return get(RegType::vgpr, bytes).as_linear();
370      }
371      return get(type(), bytes);
372   }
373
374private:
375   RC rc;
376};
377
/* transitional helper expressions */
/* Shorthand aliases so code can write e.g. `v2` instead of
 * `RegClass(RegClass::v2)`. */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
399
400/**
401 * Temp Class
402 * Each temporary virtual register has a
403 * register class (i.e. size and type)
404 * and SSA id.
405 */
406struct Temp {
407   Temp() noexcept : id_(0), reg_class(0) {}
408   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
409
410   constexpr uint32_t id() const noexcept { return id_; }
411   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
412
413   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
414   constexpr unsigned size() const noexcept { return regClass().size(); }
415   constexpr RegType type() const noexcept { return regClass().type(); }
416   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
417
418   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
419   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
420   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
421
422private:
423   uint32_t id_ : 24;
424   uint32_t reg_class : 8;
425};
426
427/**
428 * PhysReg
429 * Represents the physical register for each
430 * Operand and Definition.
431 */
432struct PhysReg {
433   constexpr PhysReg() = default;
434   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
435   constexpr unsigned reg() const { return reg_b >> 2; }
436   constexpr unsigned byte() const { return reg_b & 0x3; }
437   constexpr operator unsigned() const { return reg(); }
438   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
439   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
440   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
441   constexpr PhysReg advance(int bytes) const
442   {
443      PhysReg res = *this;
444      res.reg_b += bytes;
445      return res;
446   }
447
448   uint16_t reg_b = 0;
449};
450
/* helper expressions for special registers */
/* The values are the hardware's scalar register/operand numbers. */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
476
477/**
478 * Operand Class
479 * Initially, each Operand refers to either
480 * a temporary virtual register
481 * or to a constant value
482 * Temporary registers get mapped to physical register during RA
483 * Constant values are inlined into the instruction sequence.
484 */
485class Operand final {
486public:
487   constexpr Operand()
488       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
489         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
490         is24bit_(false), signext(false)
491   {}
492
493   explicit Operand(Temp r) noexcept
494   {
495      data_.temp = r;
496      if (r.id()) {
497         isTemp_ = true;
498      } else {
499         isUndef_ = true;
500         setFixed(PhysReg{128});
501      }
502   };
503   explicit Operand(Temp r, PhysReg reg) noexcept
504   {
505      assert(r.id()); /* Don't allow fixing an undef to a register */
506      data_.temp = r;
507      isTemp_ = true;
508      setFixed(reg);
509   };
510
511   /* 8-bit constant */
512   static Operand c8(uint8_t v) noexcept
513   {
514      /* 8-bit constants are only used for copies and copies from any 8-bit
515       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
516       * to be inline constants. */
517      Operand op;
518      op.control_ = 0;
519      op.data_.i = v;
520      op.isConstant_ = true;
521      op.constSize = 0;
522      op.setFixed(PhysReg{0u});
523      return op;
524   };
525
526   /* 16-bit constant */
527   static Operand c16(uint16_t v) noexcept
528   {
529      Operand op;
530      op.control_ = 0;
531      op.data_.i = v;
532      op.isConstant_ = true;
533      op.constSize = 1;
534      if (v <= 64)
535         op.setFixed(PhysReg{128u + v});
536      else if (v >= 0xFFF0) /* [-16 .. -1] */
537         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
538      else if (v == 0x3800) /* 0.5 */
539         op.setFixed(PhysReg{240});
540      else if (v == 0xB800) /* -0.5 */
541         op.setFixed(PhysReg{241});
542      else if (v == 0x3C00) /* 1.0 */
543         op.setFixed(PhysReg{242});
544      else if (v == 0xBC00) /* -1.0 */
545         op.setFixed(PhysReg{243});
546      else if (v == 0x4000) /* 2.0 */
547         op.setFixed(PhysReg{244});
548      else if (v == 0xC000) /* -2.0 */
549         op.setFixed(PhysReg{245});
550      else if (v == 0x4400) /* 4.0 */
551         op.setFixed(PhysReg{246});
552      else if (v == 0xC400) /* -4.0 */
553         op.setFixed(PhysReg{247});
554      else if (v == 0x3118) /* 1/2 PI */
555         op.setFixed(PhysReg{248});
556      else /* Literal Constant */
557         op.setFixed(PhysReg{255});
558      return op;
559   }
560
561   /* 32-bit constant */
562   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
563
564   /* 64-bit constant */
565   static Operand c64(uint64_t v) noexcept
566   {
567      Operand op;
568      op.control_ = 0;
569      op.isConstant_ = true;
570      op.constSize = 3;
571      if (v <= 64) {
572         op.data_.i = (uint32_t)v;
573         op.setFixed(PhysReg{128 + (uint32_t)v});
574      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
575         op.data_.i = (uint32_t)v;
576         op.setFixed(PhysReg{192 - (uint32_t)v});
577      } else if (v == 0x3FE0000000000000) { /* 0.5 */
578         op.data_.i = 0x3f000000;
579         op.setFixed(PhysReg{240});
580      } else if (v == 0xBFE0000000000000) { /* -0.5 */
581         op.data_.i = 0xbf000000;
582         op.setFixed(PhysReg{241});
583      } else if (v == 0x3FF0000000000000) { /* 1.0 */
584         op.data_.i = 0x3f800000;
585         op.setFixed(PhysReg{242});
586      } else if (v == 0xBFF0000000000000) { /* -1.0 */
587         op.data_.i = 0xbf800000;
588         op.setFixed(PhysReg{243});
589      } else if (v == 0x4000000000000000) { /* 2.0 */
590         op.data_.i = 0x40000000;
591         op.setFixed(PhysReg{244});
592      } else if (v == 0xC000000000000000) { /* -2.0 */
593         op.data_.i = 0xc0000000;
594         op.setFixed(PhysReg{245});
595      } else if (v == 0x4010000000000000) { /* 4.0 */
596         op.data_.i = 0x40800000;
597         op.setFixed(PhysReg{246});
598      } else if (v == 0xC010000000000000) { /* -4.0 */
599         op.data_.i = 0xc0800000;
600         op.setFixed(PhysReg{247});
601      } else { /* Literal Constant: we don't know if it is a long or double.*/
602         op.signext = v >> 63;
603         op.data_.i = v & 0xffffffffu;
604         op.setFixed(PhysReg{255});
605         assert(op.constantValue64() == v &&
606                "attempt to create a unrepresentable 64-bit literal constant");
607      }
608      return op;
609   }
610
611   /* 32-bit constant stored as a 32-bit or 64-bit operand */
612   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
613   {
614      Operand op;
615      op.control_ = 0;
616      op.data_.i = v;
617      op.isConstant_ = true;
618      op.constSize = is64bit ? 3 : 2;
619      if (v <= 64)
620         op.setFixed(PhysReg{128 + v});
621      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
622         op.setFixed(PhysReg{192 - v});
623      else if (v == 0x3f000000) /* 0.5 */
624         op.setFixed(PhysReg{240});
625      else if (v == 0xbf000000) /* -0.5 */
626         op.setFixed(PhysReg{241});
627      else if (v == 0x3f800000) /* 1.0 */
628         op.setFixed(PhysReg{242});
629      else if (v == 0xbf800000) /* -1.0 */
630         op.setFixed(PhysReg{243});
631      else if (v == 0x40000000) /* 2.0 */
632         op.setFixed(PhysReg{244});
633      else if (v == 0xc0000000) /* -2.0 */
634         op.setFixed(PhysReg{245});
635      else if (v == 0x40800000) /* 4.0 */
636         op.setFixed(PhysReg{246});
637      else if (v == 0xc0800000) /* -4.0 */
638         op.setFixed(PhysReg{247});
639      else { /* Literal Constant */
640         assert(!is64bit && "attempt to create a 64-bit literal constant");
641         op.setFixed(PhysReg{255});
642      }
643      return op;
644   }
645
646   explicit Operand(RegClass type) noexcept
647   {
648      isUndef_ = true;
649      data_.temp = Temp(0, type);
650      setFixed(PhysReg{128});
651   };
652   explicit Operand(PhysReg reg, RegClass type) noexcept
653   {
654      data_.temp = Temp(0, type);
655      setFixed(reg);
656   }
657
658   static Operand zero(unsigned bytes = 4) noexcept
659   {
660      if (bytes == 8)
661         return Operand::c64(0);
662      else if (bytes == 4)
663         return Operand::c32(0);
664      else if (bytes == 2)
665         return Operand::c16(0);
666      assert(bytes == 1);
667      return Operand::c8(0);
668   }
669
670   /* This is useful over the constructors when you want to take a chip class
671    * for 1/2 PI or an unknown operand size.
672    */
673   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
674   {
675      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
676         /* 1/2 PI can be an inline constant on GFX8+ */
677         Operand op = Operand::c32(val);
678         op.setFixed(PhysReg{248});
679         return op;
680      }
681
682      if (bytes == 8)
683         return Operand::c64(val);
684      else if (bytes == 4)
685         return Operand::c32(val);
686      else if (bytes == 2)
687         return Operand::c16(val);
688      assert(bytes == 1);
689      return Operand::c8(val);
690   }
691
692   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
693                                         bool sext = false)
694   {
695      if (bytes <= 4)
696         return true;
697
698      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
699         return true;
700      uint64_t upper33 = val & 0xFFFFFFFF80000000;
701      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
702         return true;
703
704      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
705             val == 0x3FE0000000000000 ||              /* 0.5 */
706             val == 0xBFE0000000000000 ||              /* -0.5 */
707             val == 0x3FF0000000000000 ||              /* 1.0 */
708             val == 0xBFF0000000000000 ||              /* -1.0 */
709             val == 0x4000000000000000 ||              /* 2.0 */
710             val == 0xC000000000000000 ||              /* -2.0 */
711             val == 0x4010000000000000 ||              /* 4.0 */
712             val == 0xC010000000000000;                /* -4.0 */
713   }
714
715   constexpr bool isTemp() const noexcept { return isTemp_; }
716
717   constexpr void setTemp(Temp t) noexcept
718   {
719      assert(!isConstant_);
720      isTemp_ = true;
721      data_.temp = t;
722   }
723
724   constexpr Temp getTemp() const noexcept { return data_.temp; }
725
726   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
727
728   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }
729
730   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
731
732   constexpr unsigned bytes() const noexcept
733   {
734      if (isConstant())
735         return 1 << constSize;
736      else
737         return data_.temp.bytes();
738   }
739
740   constexpr unsigned size() const noexcept
741   {
742      if (isConstant())
743         return constSize > 2 ? 2 : 1;
744      else
745         return data_.temp.size();
746   }
747
748   constexpr bool isFixed() const noexcept { return isFixed_; }
749
750   constexpr PhysReg physReg() const noexcept { return reg_; }
751
752   constexpr void setFixed(PhysReg reg) noexcept
753   {
754      isFixed_ = reg != unsigned(-1);
755      reg_ = reg;
756   }
757
758   constexpr bool isConstant() const noexcept { return isConstant_; }
759
760   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
761
762   constexpr bool isUndefined() const noexcept { return isUndef_; }
763
764   constexpr uint32_t constantValue() const noexcept { return data_.i; }
765
766   constexpr bool constantEquals(uint32_t cmp) const noexcept
767   {
768      return isConstant() && constantValue() == cmp;
769   }
770
771   constexpr uint64_t constantValue64() const noexcept
772   {
773      if (constSize == 3) {
774         if (reg_ <= 192)
775            return reg_ - 128;
776         else if (reg_ <= 208)
777            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
778
779         switch (reg_) {
780         case 240: return 0x3FE0000000000000;
781         case 241: return 0xBFE0000000000000;
782         case 242: return 0x3FF0000000000000;
783         case 243: return 0xBFF0000000000000;
784         case 244: return 0x4000000000000000;
785         case 245: return 0xC000000000000000;
786         case 246: return 0x4010000000000000;
787         case 247: return 0xC010000000000000;
788         case 255:
789            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
790         }
791         unreachable("invalid register for 64-bit constant");
792      } else {
793         return data_.i;
794      }
795   }
796
797   constexpr bool isOfType(RegType type) const noexcept
798   {
799      return hasRegClass() && regClass().type() == type;
800   }
801
802   /* Indicates that the killed operand's live range intersects with the
803    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
804    * not set by liveness analysis. */
805   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
806
807   constexpr bool isLateKill() const noexcept { return isLateKill_; }
808
809   constexpr void setKill(bool flag) noexcept
810   {
811      isKill_ = flag;
812      if (!flag)
813         setFirstKill(false);
814   }
815
816   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
817
818   constexpr void setFirstKill(bool flag) noexcept
819   {
820      isFirstKill_ = flag;
821      if (flag)
822         setKill(flag);
823   }
824
825   /* When there are multiple operands killing the same temporary,
826    * isFirstKill() is only returns true for the first one. */
827   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
828
829   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
830
831   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
832
833   constexpr bool operator==(Operand other) const noexcept
834   {
835      if (other.size() != size())
836         return false;
837      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
838         return false;
839      if (isFixed() && other.isFixed() && physReg() != other.physReg())
840         return false;
841      if (isLiteral())
842         return other.isLiteral() && other.constantValue() == constantValue();
843      else if (isConstant())
844         return other.isConstant() && other.physReg() == physReg();
845      else if (isUndefined())
846         return other.isUndefined() && other.regClass() == regClass();
847      else
848         return other.isTemp() && other.getTemp() == getTemp();
849   }
850
851   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }
852
853   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }
854
855   constexpr bool is16bit() const noexcept { return is16bit_; }
856
857   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }
858
859   constexpr bool is24bit() const noexcept { return is24bit_; }
860
861private:
862   union {
863      Temp temp;
864      uint32_t i;
865      float f;
866   } data_ = {Temp(0, s1)};
867   PhysReg reg_;
868   union {
869      struct {
870         uint8_t isTemp_ : 1;
871         uint8_t isFixed_ : 1;
872         uint8_t isConstant_ : 1;
873         uint8_t isKill_ : 1;
874         uint8_t isUndef_ : 1;
875         uint8_t isFirstKill_ : 1;
876         uint8_t constSize : 2;
877         uint8_t isLateKill_ : 1;
878         uint8_t is16bit_ : 1;
879         uint8_t is24bit_ : 1;
880         uint8_t signext : 1;
881      };
882      /* can't initialize bit-fields in c++11, so work around using a union */
883      uint16_t control_ = 0;
884   };
885};
886
887/**
888 * Definition Class
889 * Definitions are the results of Instructions
890 * and refer to temporary virtual registers
891 * which are later mapped to physical registers
892 */
893class Definition final {
894public:
895   constexpr Definition()
896       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
897         isNoCSE_(0)
898   {}
899   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
900   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
901   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
902   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
903   {
904      setFixed(reg);
905   }
906
907   constexpr bool isTemp() const noexcept { return tempId() > 0; }
908
909   constexpr Temp getTemp() const noexcept { return temp; }
910
911   constexpr uint32_t tempId() const noexcept { return temp.id(); }
912
913   constexpr void setTemp(Temp t) noexcept { temp = t; }
914
915   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }
916
917   constexpr RegClass regClass() const noexcept { return temp.regClass(); }
918
919   constexpr unsigned bytes() const noexcept { return temp.bytes(); }
920
921   constexpr unsigned size() const noexcept { return temp.size(); }
922
923   constexpr bool isFixed() const noexcept { return isFixed_; }
924
925   constexpr PhysReg physReg() const noexcept { return reg_; }
926
927   constexpr void setFixed(PhysReg reg) noexcept
928   {
929      isFixed_ = 1;
930      reg_ = reg;
931   }
932
933   constexpr void setHint(PhysReg reg) noexcept
934   {
935      hasHint_ = 1;
936      reg_ = reg;
937   }
938
939   constexpr bool hasHint() const noexcept { return hasHint_; }
940
941   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }
942
943   constexpr bool isKill() const noexcept { return isKill_; }
944
945   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }
946
947   constexpr bool isPrecise() const noexcept { return isPrecise_; }
948
949   /* No Unsigned Wrap */
950   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }
951
952   constexpr bool isNUW() const noexcept { return isNUW_; }
953
954   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }
955
956   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }
957
958private:
959   Temp temp = Temp(0, s1);
960   PhysReg reg_;
961   union {
962      struct {
963         uint8_t isFixed_ : 1;
964         uint8_t hasHint_ : 1;
965         uint8_t isKill_ : 1;
966         uint8_t isPrecise_ : 1;
967         uint8_t isNUW_ : 1;
968         uint8_t isNoCSE_ : 1;
969      };
970      /* can't initialize bit-fields in c++11, so work around using a union */
971      uint8_t control_ = 0;
972   };
973};
974
975struct Block;
976struct Instruction;
977struct Pseudo_instruction;
978struct SOP1_instruction;
979struct SOP2_instruction;
980struct SOPK_instruction;
981struct SOPP_instruction;
982struct SOPC_instruction;
983struct SMEM_instruction;
984struct DS_instruction;
985struct MTBUF_instruction;
986struct MUBUF_instruction;
987struct MIMG_instruction;
988struct Export_instruction;
989struct FLAT_instruction;
990struct Pseudo_branch_instruction;
991struct Pseudo_barrier_instruction;
992struct Pseudo_reduction_instruction;
993struct VOP3P_instruction;
994struct VOP1_instruction;
995struct VOP2_instruction;
996struct VOPC_instruction;
997struct VOP3_instruction;
998struct Interp_instruction;
999struct DPP_instruction;
1000struct SDWA_instruction;
1001
/**
 * Common base of all IR instructions.
 *
 * The operand and definition arrays live in the same allocation as the
 * instruction itself and are referenced through offset-based spans
 * (see create_instruction()).
 */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags; /* scratch storage; meaning is pass-specific */

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Checked downcasts: each accessor pair reinterprets this instruction as
    * the respective subclass and asserts that the format matches. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   /* GLOBAL and SCRATCH instructions reuse the FLAT_instruction layout. */
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* The following predicates test with a bitwise AND instead of equality
    * because these formats can be combined with other encodings, e.g.
    * VOP2 | VOP3 is a VOP2 instruction in VOP3 encoding (see the Format
    * documentation at the top of this file). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP_instruction& dpp() noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   const DPP_instruction& dpp() const noexcept
   {
      assert(isDPP());
      return *(DPP_instruction*)this;
   }
   constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast for FLAT/GLOBAL/SCRATCH instructions, which all share
    * the FLAT_instruction layout (see isFlatLike()). */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   /* Vector ALU: any VOP1/VOP2/VOPC/VOP3/VOP3P (possibly combined) format. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   /* Scalar ALU instruction. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Buffer/image vector memory instruction (FLAT-like not included). */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1327
/* SOPK format: scalar ALU instruction with a 16-bit immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm; /* 16-bit immediate constant */
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1333
/* SOPP format: scalar program-control instruction with an immediate. */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* index of a referenced CFG block — presumably negative when
               * no block is referenced; TODO confirm with users */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1339
/* SOPC format: scalar compare instruction; no fields beyond the common ones. */
struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1342
/* SOP1 format: scalar instruction with one source; no extra fields. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1345
/* SOP2 format: scalar instruction with two sources; no extra fields. */
struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1348
1349/**
1350 * Scalar Memory Format:
1351 * For s_(buffer_)load_dword*:
1352 * Operand(0): SBASE - SGPR-pair which provides base address
1353 * Operand(1): Offset - immediate (un)signed offset or SGPR
1354 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1355 * Operand(n-1): SOffset - SGPR offset (Vega only)
1356 *
1357 * Having no operands is also valid for instructions such as s_dcache_inv.
1358 *
1359 */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1370
/* VOP1 format: vector instruction with one source; no extra fields. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1373
/* VOP2 format: vector instruction with two sources; no extra fields. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1376
/* VOPC format: vector compare instruction; no extra fields. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1379
/* VOP3 format: vector ALU instruction with source/output modifiers. */
struct VOP3_instruction : public Instruction {
   bool abs[3];       /* take the absolute value of the respective source */
   bool neg[3];       /* negate the respective source */
   uint8_t opsel : 4; /* operand half-selection bits — TODO confirm bit mapping */
   uint8_t omod : 2;  /* output modifier */
   bool clamp : 1;    /* clamp the result */
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1390
/* VOP3P format: packed vector ALU instruction. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];       /* negate the low half of the respective source */
   bool neg_hi[3];       /* negate the high half of the respective source */
   uint8_t opsel_lo : 3; /* half selection per source — see usesModifiers() */
   uint8_t opsel_hi : 3; /* must be all-ones to count as "no modifier" */
   bool clamp : 1;       /* clamp the result */
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1401
1402/**
1403 * Data Parallel Primitives Format:
1404 * This format can be used for VOP1, VOP2 or VOPC instructions.
1405 * The swizzle applies to the src0 operand.
1406 *
1407 */
struct DPP_instruction : public Instruction {
   bool abs[2];       /* absolute-value modifier per source */
   bool neg[2];       /* negation modifier per source */
   uint16_t dpp_ctrl; /* control value selecting the lane swizzle */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1418
/**
 * Describes which bytes of a dword register an access reads or writes:
 * bits [1:0] store the byte offset, bits [4:2] the access size in bytes
 * (1, 2 or 4), and bit 5 marks sign-extension.
 */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   /* Default: no selection (converts to false). */
   SubdwordSel() : sel(sdwa_sel(0)) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   /* Builds a selection from its three components. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel(sdwa_sel((size << 2) | offset | (sign_extend ? sext : 0)))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   explicit operator bool() const { return sel != 0; }

   /* Access size in bytes (1, 2 or 4). */
   constexpr unsigned size() const { return (sel & 0x1c) >> 2; }
   /* Byte offset within the dword. */
   constexpr unsigned offset() const { return sel & 0x3; }
   constexpr bool sign_extend() const { return (sel & sext) != 0; }
   /* Translates this selection into the numeric SDWA sel value
    * (bytes map to 0..3, words to 4..5, a full dword to 6) for a value
    * located at the given byte offset within its register. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;
      case 2: return 4 + byte / 2;
      default: return 6;
      }
   }

private:
   sdwa_sel sel;
};
1467
1468/**
1469 * Sub-Dword Addressing Format:
1470 * This format can be used for VOP1, VOP2 or VOPC instructions.
1471 *
1472 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1473 * the definition doesn't have to be VCC on GFX9+.
1474 *
1475 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];  /* sub-dword selection for each source operand */
   SubdwordSel dst_sel; /* sub-dword selection for the destination */
   bool neg[2];         /* negation modifier per source */
   bool abs[2];         /* absolute-value modifier per source */
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1488
/* VINTRP format: references an input attribute and a component within it. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* attribute index */
   uint8_t component; /* component within the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1495
1496/**
1497 * Local and Global Data Sharing instructions
1498 * Operand(0): ADDR - VGPR which supplies the address.
1499 * Operand(1): DATA0 - First data VGPR.
1500 * Operand(2): DATA1 - Second data VGPR.
1501 * Operand(n-1): M0 - LDS size.
1502 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1503 *
1504 */
struct DS_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics of this access */
   bool gds;              /* access GDS instead of LDS — TODO confirm */
   int16_t offset0;       /* first immediate offset */
   int8_t offset1;        /* second immediate offset */
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1513
1514/**
1515 * Vector Memory Untyped-buffer Instructions
1516 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1517 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1518 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1519 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1520 *
1521 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;    /* memory synchronization semantics */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1540
1541/**
1542 * Vector Memory Typed-buffer Instructions
1543 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1544 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1545 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1546 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1547 *
1548 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;    /* memory synchronization semantics */
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1565
1566/**
1567 * Vector Memory Image Instructions
1568 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1569 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1570 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1571 * Operand(3): VADDR - Address source. Can carry an offset or an index.
1572 * Definition(0): VDATA - Vector GPR for read result.
1573 *
1574 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics */
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1595
1596/**
1597 * Flat/Scratch/Global Instructions
1598 * Operand(0): ADDR
1599 * Operand(1): SADDR
1600 * Operand(2) / Definition(0): DATA/VDST
1601 *
1602 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics */
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1; /* return read-data to LDS — NOTE(review): mirrors MUBUF::lds, confirm */
   bool nv : 1;  /* non-volatile — NOTE(review): mirrors SMEM::nv, confirm */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1616
/* EXP format instruction. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* mask of enabled components */
   uint8_t dest;         /* export target */
   bool compressed : 1;  /* data is compressed (16-bit) — TODO confirm */
   bool done : 1;        /* final export of its kind — TODO confirm */
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1627
/* PSEUDO format instruction. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* a temporary lives in scc — NOTE(review): confirm with lowering */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1634
/* PSEUDO_BRANCH format instruction. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1643
/* PSEUDO_BARRIER format instruction. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* memory synchronization semantics */
   sync_scope exec_scope; /* execution scope of the barrier — TODO confirm semantics */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1649
/* Operations usable by Pseudo_reduction_instruction; the numeric suffix
 * denotes the operand bit width. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1668
1669/**
1670 * Subgroup Reduction Instructions, everything except for the data to be
1671 * reduced and the result as inserted by setup_reduce_temp().
1672 * Operand(0): data to be reduced
1673 * Operand(1): reduce temporary
1674 * Operand(2): vector temporary
1675 * Definition(0): result
1676 * Definition(1): scalar temporary
1677 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1678 * Definition(3): scc clobber
1679 * Definition(4): vcc clobber
1680 *
1681 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;    /* the reduction operation to perform */
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1688
/* Deleter used by aco_ptr: instructions are allocated with calloc() in
 * create_instruction(), so they must be released with free(). */
struct instr_deleter_functor {
   void operator()(void* ptr) { free(ptr); }
};
1692
/* Owning pointer for instructions allocated by create_instruction(). */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

/**
 * Allocates a zero-initialized instruction of type T together with trailing
 * storage for its operands and definitions in a single calloc() block, and
 * points the instruction's spans at that storage. Release with free() or
 * aco_ptr (see instr_deleter_functor).
 */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* aco::span is constructed from a 16-bit offset relative to the address
    * of the span member itself (see aco_util.h), so compute the offsets of
    * the trailing arrays relative to the span members. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1715
/* Whether the instruction uses any source/destination modifiers:
 * DPP or SDWA encodings always count; VOP3/VOP3P count when any
 * abs/neg/opsel/omod/clamp bit is set. */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}
1743
1744constexpr bool
1745is_phi(Instruction* instr)
1746{
1747   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1748}
1749
/* Convenience overload of is_phi() for aco_ptr-managed instructions. */
static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
1755
1756memory_sync_info get_sync_info(const Instruction* instr);
1757
1758bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);
1759
1760bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
1761bool instr_is_16bit(chip_class chip, aco_opcode op);
1762bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
1763bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra);
1764/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1765aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
1766aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr);
1767bool needs_exec_mask(const Instruction* instr);
1768
1769aco_opcode get_ordered(aco_opcode op);
1770aco_opcode get_unordered(aco_opcode op);
1771aco_opcode get_inverse(aco_opcode op);
1772aco_opcode get_f32_cmp(aco_opcode op);
1773unsigned get_cmp_bitsize(aco_opcode op);
1774bool is_cmp(aco_opcode op);
1775
1776bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);
1777
1778uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1779
1780unsigned get_mimg_nsa_dwords(const Instruction* instr);
1781
1782bool should_form_clause(const Instruction* a, const Instruction* b);
1783
/* Bitmask flags classifying basic blocks (combined in Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
1804
1805struct RegisterDemand {
1806   constexpr RegisterDemand() = default;
1807   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1808   int16_t vgpr = 0;
1809   int16_t sgpr = 0;
1810
1811   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1812   {
1813      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1814   }
1815
1816   constexpr bool exceeds(const RegisterDemand other) const noexcept
1817   {
1818      return vgpr > other.vgpr || sgpr > other.sgpr;
1819   }
1820
1821   constexpr RegisterDemand operator+(const Temp t) const noexcept
1822   {
1823      if (t.type() == RegType::sgpr)
1824         return RegisterDemand(vgpr, sgpr + t.size());
1825      else
1826         return RegisterDemand(vgpr + t.size(), sgpr);
1827   }
1828
1829   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1830   {
1831      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1832   }
1833
1834   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1835   {
1836      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1837   }
1838
1839   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1840   {
1841      vgpr += other.vgpr;
1842      sgpr += other.sgpr;
1843      return *this;
1844   }
1845
1846   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1847   {
1848      vgpr -= other.vgpr;
1849      sgpr -= other.sgpr;
1850      return *this;
1851   }
1852
1853   constexpr RegisterDemand& operator+=(const Temp t) noexcept
1854   {
1855      if (t.type() == RegType::sgpr)
1856         sgpr += t.size();
1857      else
1858         vgpr += t.size();
1859      return *this;
1860   }
1861
1862   constexpr RegisterDemand& operator-=(const Temp t) noexcept
1863   {
1864      if (t.type() == RegType::sgpr)
1865         sgpr -= t.size();
1866      else
1867         vgpr -= t.size();
1868      return *this;
1869   }
1870
1871   constexpr void update(const RegisterDemand other) noexcept
1872   {
1873      vgpr = std::max(vgpr, other.vgpr);
1874      sgpr = std::max(sgpr, other.sgpr);
1875   }
1876};
1877
/* A basic block of the CFG. */
struct Block {
   float_mode fp_mode;  /* floating-point mode in effect for this block's instructions */
   unsigned index;      /* position of this block within Program::blocks */
   unsigned offset = 0; /* code offset after assembly — TODO(review): confirm unit (bytes vs dwords) */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0;            /* number of enclosing loops */
   uint16_t divergent_if_logical_depth = 0; /* number of enclosing divergent ifs */
   uint16_t uniform_if_depth = 0;           /* number of enclosing uniform ifs */
   uint16_t kind = 0;                       /* mask of block_kind flags */
   int logical_idom = -1; /* immediate dominator in the logical CFG; -1 if unset */
   int linear_idom = -1;  /* immediate dominator in the linear CFG; -1 if unset */

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block() : index(0) {}
};
1903
1904/*
1905 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1906 */
1907enum class SWStage : uint8_t {
1908   None = 0,
1909   VS = 1 << 0,     /* Vertex Shader */
1910   GS = 1 << 1,     /* Geometry Shader */
1911   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
1912   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
1913   FS = 1 << 4,     /* Fragment aka Pixel Shader */
1914   CS = 1 << 5,     /* Compute Shader */
1915   GSCopy = 1 << 6, /* GS Copy Shader (internal) */
1916
1917   /* Stage combinations merged to run on a single HWStage */
1918   VS_GS = VS | GS,
1919   VS_TCS = VS | TCS,
1920   TES_GS = TES | GS,
1921};
1922
1923constexpr SWStage
1924operator|(SWStage a, SWStage b)
1925{
1926   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
1927}
1928
1929/*
1930 * Shader stages as running on the AMD GPU.
1931 *
1932 * The relation between HWStages and SWStages is not a one-to-one mapping:
1933 * Some SWStages are merged by ACO to run on a single HWStage.
1934 * See README.md for details.
1935 */
1936enum class HWStage : uint8_t {
1937   VS,
1938   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1939   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
1940   NGG, /* Primitive shader, used to implement VS, TES, GS. */
1941   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1942   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1943   FS,
1944   CS,
1945};
1946
1947/*
1948 * Set of SWStages to be merged into a single shader paired with the
1949 * HWStage it will run on.
1950 */
1951struct Stage {
1952   constexpr Stage() = default;
1953
1954   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1955
1956   /* Check if the given SWStage is included */
1957   constexpr bool has(SWStage stage) const
1958   {
1959      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
1960   }
1961
1962   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint8_t>(sw)); }
1963
1964   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
1965
1966   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
1967
1968   /* Mask of merged software stages */
1969   SWStage sw = SWStage::None;
1970
1971   /* Active hardware stage */
1972   HWStage hw{};
1973};
1974
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
1997
/* Indices into Program::statistics (see the collect_*_stats declarations below). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics /* number of statistics; keep last */
};
2011
/* Properties of the target device that are relevant for compilation.
 * All members carry zero/false default initializers so that a default-constructed
 * DeviceInfo is in a defined state instead of holding indeterminate values
 * (previously only the last three members were initialized). The real values
 * are filled in during program setup. */
struct DeviceInfo {
   uint16_t lds_encoding_granule = 0;
   uint16_t lds_alloc_granule = 0;
   uint32_t lds_limit = 0; /* in bytes */
   bool has_16bank_lds = false;
   uint16_t physical_sgprs = 0;
   uint16_t physical_vgprs = 0;
   uint16_t vgpr_limit = 0;
   uint16_t sgpr_limit = 0;
   uint16_t sgpr_alloc_granule = 0;
   uint16_t vgpr_alloc_granule = 0; /* must be power of two */
   unsigned max_wave64_per_simd = 0;
   unsigned simd_per_cu = 0;
   bool has_fast_fma32 = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};
2029
/* Marks how far compilation of a Program has progressed. */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2035
/* A shader program under compilation: the CFG plus all per-compilation state. */
class Program final {
public:
   std::vector<Block> blocks;            /* the CFG; Block::index is the position in this vector */
   std::vector<RegClass> temp_rc = {s1}; /* register class of each temporary, indexed by temp id */
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum register demand of the program */
   uint16_t num_waves = 0; /* waves per SIMD given the register demand — TODO(review): confirm */
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;  /* number of lanes per wave */
   RegClass lane_mask;  /* register class used for per-lane bool masks */
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;  /* lower bound on waves (see calc_min_waves()) */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress; /* how far compilation has advanced */

   bool collect_statistics = false;
   uint32_t statistics[num_statistics]; /* indexed by enum statistic */

   /* State inherited by blocks created via insert_block(). */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* Destination and callback for compiler diagnostics. */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocates a fresh temporary id and records its register class in temp_rc.
    * Ids are limited to 24 bits (see the assert). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserves `amount` consecutive temporary ids; their register classes are
    * left default-constructed and expected to be set later. */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   /* Convenience wrapper: allocates an id and returns it wrapped in a Temp. */
   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Returns the id the next allocation would receive, without allocating. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   /* Creates and appends a fresh block (see insert_block()). */
   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Appends `block` to the CFG, assigning its index and the pending
    * fp-mode/nesting-depth state from the next_* members. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next temporary id; id 0 is never handed out */
};
2126
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block, indexed by block index */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2133
/* Knobs for testing the register allocator (see register_allocation()). */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2138
/* One-time global initialization of the compiler. */
void init();

/* Sets up target/device information and the initial state of `program`. */
void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection: translates the given NIR shaders into ACO IR. */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_shader_args* args);
/* Instruction selection for the (internal) GS copy shader. */
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_shader_args* args);
/* Instruction selection for the trap handler shader. */
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config, const struct radv_shader_args* args);
/* Instruction selection for the vertex shader prolog. */
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config, const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);
2154
/* Compiler passes and analyses, listed roughly in pipeline order. */
void lower_phis(Program* program);
void calc_min_waves(Program* program); /* computes Program::min_waves — TODO(review): confirm */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program); /* liveness + per-instruction register demand */
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program); /* presumably fills Block::logical_idom/linear_idom */
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Assembles `program` into `code`; returns a size suitable as print_asm's
 * exec_size — TODO(review): confirm the exact meaning of the return value. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
2177/**
2178 * Returns true if print_asm can disassemble the given program for the current build/runtime
2179 * configuration
2180 */
2181bool check_print_asm_support(Program* program);
2182bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
2183bool validate_ir(Program* program);
2184bool validate_ra(Program* program);
2185#ifndef NDEBUG
2186void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
2187#else
2188#define perfwarn(program, cond, msg, ...)                                                          \
2189   do {                                                                                            \
2190   } while (0)
2191#endif
2192
2193void collect_presched_stats(Program* program);
2194void collect_preasm_stats(Program* program);
2195void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2196
/* Bitmask flags for the aco_print_* functions below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};
2203
/* Pretty-printers for debugging; `flags` is a mask of print_flags. */
void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* Implementation helpers for the aco_perfwarn/aco_err macros below; use the macros instead. */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2215
/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get the number of sgprs/vgprs that must be allocated in order to address
 * the given number of addressable sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2232
/* Static per-opcode information tables; all arrays are indexed by aco_opcode. */
typedef struct {
   /* hardware encodings per gfx generation — presumably a negative value marks
    * an unsupported opcode; TODO(review): confirm */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single global instance of the opcode info tables. */
extern const Info instr_info;
2248
2249} // namespace aco
2250
2251#endif /* ACO_IR_H */
2252