1/*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26#include "aco_instruction_selection.h"
27
28#include "aco_builder.h"
29#include "aco_ir.h"
30
31#include "common/ac_exp_param.h"
32#include "common/sid.h"
33#include "vulkan/radv_descriptor_set.h"
34
35#include "util/fast_idiv_by_const.h"
36#include "util/memstream.h"
37
38#include <array>
39#include <functional>
40#include <map>
41#include <numeric>
42#include <stack>
43#include <utility>
44#include <vector>
45
46namespace aco {
47namespace {
48
49#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
50
51static void
52_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
53          const char* msg)
54{
55   char* out;
56   size_t outsize;
57   struct u_memstream mem;
58   u_memstream_open(&mem, &out, &outsize);
59   FILE* const memf = u_memstream_get(&mem);
60
61   fprintf(memf, "%s: ", msg);
62   nir_print_instr(instr, memf);
63   u_memstream_close(&mem);
64
65   _aco_err(ctx->program, file, line, out);
66   free(out);
67}
68
69struct if_context {
70   Temp cond;
71
72   bool divergent_old;
73   bool exec_potentially_empty_discard_old;
74   bool exec_potentially_empty_break_old;
75   uint16_t exec_potentially_empty_break_depth_old;
76
77   unsigned BB_if_idx;
78   unsigned invert_idx;
79   bool uniform_has_then_branch;
80   bool then_branch_divergent;
81   Block BB_invert;
82   Block BB_endif;
83};
84
85struct loop_context {
86   Block loop_exit;
87
88   unsigned header_idx_old;
89   Block* exit_old;
90   bool divergent_cont_old;
91   bool divergent_branch_old;
92   bool divergent_if_old;
93};
94
95static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
96
97static void
98add_logical_edge(unsigned pred_idx, Block* succ)
99{
100   succ->logical_preds.emplace_back(pred_idx);
101}
102
103static void
104add_linear_edge(unsigned pred_idx, Block* succ)
105{
106   succ->linear_preds.emplace_back(pred_idx);
107}
108
109static void
110add_edge(unsigned pred_idx, Block* succ)
111{
112   add_logical_edge(pred_idx, succ);
113   add_linear_edge(pred_idx, succ);
114}
115
116static void
117append_logical_start(Block* b)
118{
119   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
120}
121
122static void
123append_logical_end(Block* b)
124{
125   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
126}
127
128Temp
129get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
130{
131   uint32_t id = ctx->first_temp_id + def->index;
132   return Temp(id, ctx->program->temp_rc[id]);
133}
134
135Temp
136emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
137{
138   Builder bld(ctx->program, ctx->block);
139   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
140   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
141
142   if (ctx->program->wave_size == 32) {
143      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
144      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
145   }
146
147   Operand mask_lo = Operand::c32(-1u);
148   Operand mask_hi = Operand::c32(-1u);
149
150   if (mask.isTemp()) {
151      RegClass rc = RegClass(mask.regClass().type(), 1);
152      Builder::Result mask_split =
153         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
154      mask_lo = Operand(mask_split.def(0).getTemp());
155      mask_hi = Operand(mask_split.def(1).getTemp());
156   } else if (mask.physReg() == exec) {
157      mask_lo = Operand(exec_lo, s1);
158      mask_hi = Operand(exec_hi, s1);
159   }
160
161   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
162
163   if (ctx->program->chip_class <= GFX7)
164      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
165   else
166      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
167}
168
169Temp
170emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
171{
172   if (!dst.id())
173      dst = bld.tmp(src.regClass());
174
175   assert(src.size() == dst.size());
176
177   if (bld.program->stage != fragment_fs) {
178      if (!dst.id())
179         return src;
180
181      bld.copy(Definition(dst), src);
182      return dst;
183   }
184
185   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
186   bld.program->needs_wqm |= program_needs_wqm;
187   return dst;
188}
189
190static Temp
191emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
192{
193   if (index.regClass() == s1)
194      return bld.readlane(bld.def(s1), data, index);
195
196   if (ctx->options->chip_class <= GFX7) {
197      /* GFX6-7: there is no bpermute instruction */
198      Operand index_op(index);
199      Operand input_data(data);
200      index_op.setLateKill(true);
201      input_data.setLateKill(true);
202
203      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
204                        index_op, input_data);
205   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
206
207      /* GFX10 wave64 mode: emulate full-wave bpermute */
208      Temp index_is_lo =
209         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
210      Builder::Result index_is_lo_split =
211         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
212      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
213                                     index_is_lo_split.def(1).getTemp());
214      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
215                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
216      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
217      Operand input_data(data);
218
219      index_x4.setLateKill(true);
220      input_data.setLateKill(true);
221      same_half.setLateKill(true);
222
223      /* We need one pair of shared VGPRs:
224       * Note, that these have twice the allocation granularity of normal VGPRs */
225      ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
226
227      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
228                        index_x4, input_data, same_half);
229   } else {
230      /* GFX8-9 or GFX10 wave32: bpermute works normally */
231      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
232      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
233   }
234}
235
236static Temp
237emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
238{
239   if (ctx->options->chip_class >= GFX8) {
240      unsigned and_mask = mask & 0x1f;
241      unsigned or_mask = (mask >> 5) & 0x1f;
242      unsigned xor_mask = (mask >> 10) & 0x1f;
243
244      uint16_t dpp_ctrl = 0xffff;
245
246      // TODO: we could use DPP8 for some swizzles
247      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
248         unsigned res[4] = {0, 1, 2, 3};
249         for (unsigned i = 0; i < 4; i++)
250            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
251         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
252      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
253         dpp_ctrl = dpp_row_rr(8);
254      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
255         dpp_ctrl = dpp_row_mirror;
256      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
257         dpp_ctrl = dpp_row_half_mirror;
258      }
259
260      if (dpp_ctrl != 0xffff)
261         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
262   }
263
264   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
265}
266
267Temp
268as_vgpr(isel_context* ctx, Temp val)
269{
270   if (val.type() == RegType::sgpr) {
271      Builder bld(ctx->program, ctx->block);
272      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
273   }
274   assert(val.type() == RegType::vgpr);
275   return val;
276}
277
278// assumes a != 0xffffffff
279void
280emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
281{
282   assert(b != 0);
283   Builder bld(ctx->program, ctx->block);
284
285   if (util_is_power_of_two_or_zero(b)) {
286      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
287      return;
288   }
289
290   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
291
292   assert(info.multiplier <= 0xffffffff);
293
294   bool pre_shift = info.pre_shift != 0;
295   bool increment = info.increment != 0;
296   bool multiply = true;
297   bool post_shift = info.post_shift != 0;
298
299   if (!pre_shift && !increment && !multiply && !post_shift) {
300      bld.copy(Definition(dst), a);
301      return;
302   }
303
304   Temp pre_shift_dst = a;
305   if (pre_shift) {
306      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
307      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
308               a);
309   }
310
311   Temp increment_dst = pre_shift_dst;
312   if (increment) {
313      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
314      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
315   }
316
317   Temp multiply_dst = increment_dst;
318   if (multiply) {
319      multiply_dst = post_shift ? bld.tmp(v1) : dst;
320      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
321               bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
322   }
323
324   if (post_shift) {
325      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
326               multiply_dst);
327   }
328}
329
330void
331emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
332{
333   Builder bld(ctx->program, ctx->block);
334   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
335}
336
337Temp
338emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
339{
340   /* no need to extract the whole vector */
341   if (src.regClass() == dst_rc) {
342      assert(idx == 0);
343      return src;
344   }
345
346   assert(src.bytes() > (idx * dst_rc.bytes()));
347   Builder bld(ctx->program, ctx->block);
348   auto it = ctx->allocated_vec.find(src.id());
349   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
350      if (it->second[idx].regClass() == dst_rc) {
351         return it->second[idx];
352      } else {
353         assert(!dst_rc.is_subdword());
354         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
355         return bld.copy(bld.def(dst_rc), it->second[idx]);
356      }
357   }
358
359   if (dst_rc.is_subdword())
360      src = as_vgpr(ctx, src);
361
362   if (src.bytes() == dst_rc.bytes()) {
363      assert(idx == 0);
364      return bld.copy(bld.def(dst_rc), src);
365   } else {
366      Temp dst = bld.tmp(dst_rc);
367      emit_extract_vector(ctx, src, idx, dst);
368      return dst;
369   }
370}
371
372void
373emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
374{
375   if (num_components == 1)
376      return;
377   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
378      return;
379   RegClass rc;
380   if (num_components > vec_src.size()) {
381      if (vec_src.type() == RegType::sgpr) {
382         /* should still help get_alu_src() */
383         emit_split_vector(ctx, vec_src, vec_src.size());
384         return;
385      }
386      /* sub-dword split */
387      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
388   } else {
389      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
390   }
391   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
392      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
393   split->operands[0] = Operand(vec_src);
394   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
395   for (unsigned i = 0; i < num_components; i++) {
396      elems[i] = ctx->program->allocateTmp(rc);
397      split->definitions[i] = Definition(elems[i]);
398   }
399   ctx->block->instructions.emplace_back(std::move(split));
400   ctx->allocated_vec.emplace(vec_src.id(), elems);
401}
402
403/* This vector expansion uses a mask to determine which elements in the new vector
404 * come from the original vector. The other elements are undefined. */
405void
406expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
407{
408   emit_split_vector(ctx, vec_src, util_bitcount(mask));
409
410   if (vec_src == dst)
411      return;
412
413   Builder bld(ctx->program, ctx->block);
414   if (num_components == 1) {
415      if (dst.type() == RegType::sgpr)
416         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
417      else
418         bld.copy(Definition(dst), vec_src);
419      return;
420   }
421
422   unsigned component_size = dst.size() / num_components;
423   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
424
425   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
426      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
427   vec->definitions[0] = Definition(dst);
428   unsigned k = 0;
429   for (unsigned i = 0; i < num_components; i++) {
430      if (mask & (1 << i)) {
431         Temp src =
432            emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
433         if (dst.type() == RegType::sgpr)
434            src = bld.as_uniform(src);
435         vec->operands[i] = Operand(src);
436      } else {
437         vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
438      }
439      elems[i] = vec->operands[i].getTemp();
440   }
441   ctx->block->instructions.emplace_back(std::move(vec));
442   ctx->allocated_vec.emplace(dst.id(), elems);
443}
444
445/* adjust misaligned small bit size loads */
446void
447byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
448{
449   Builder bld(ctx->program, ctx->block);
450   Operand shift;
451   Temp select = Temp();
452   if (offset.isConstant()) {
453      assert(offset.constantValue() && offset.constantValue() < 4);
454      shift = Operand::c32(offset.constantValue() * 8);
455   } else {
456      /* bit_offset = 8 * (offset & 0x3) */
457      Temp tmp =
458         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
459      select = bld.tmp(s1);
460      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
461                       Operand::c32(3u));
462   }
463
464   if (vec.size() == 1) {
465      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
466   } else if (vec.size() == 2) {
467      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
468      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
469      if (tmp == dst)
470         emit_split_vector(ctx, dst, 2);
471      else
472         emit_extract_vector(ctx, tmp, 0, dst);
473   } else if (vec.size() == 3 || vec.size() == 4) {
474      Temp lo = bld.tmp(s2), hi;
475      if (vec.size() == 3) {
476         /* this can happen if we use VMEM for a uniform load */
477         hi = bld.tmp(s1);
478         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
479      } else {
480         hi = bld.tmp(s2);
481         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
482         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
483      }
484      if (select != Temp())
485         hi =
486            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
487      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
488      Temp mid = bld.tmp(s1);
489      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
490      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
491      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
492      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
493      emit_split_vector(ctx, dst, 2);
494   }
495}
496
497void
498byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
499{
500   Builder bld(ctx->program, ctx->block);
501   if (offset.isTemp()) {
502      Temp tmp[4] = {vec, vec, vec, vec};
503
504      if (vec.size() == 4) {
505         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
506         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
507                    Definition(tmp[2]), Definition(tmp[3]), vec);
508      } else if (vec.size() == 3) {
509         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
510         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
511                    Definition(tmp[2]), vec);
512      } else if (vec.size() == 2) {
513         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
514         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
515      }
516      for (unsigned i = 0; i < dst.size(); i++)
517         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
518
519      vec = tmp[0];
520      if (dst.size() == 2)
521         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
522
523      offset = Operand::zero();
524   }
525
526   unsigned num_components = vec.bytes() / component_size;
527   if (vec.regClass() == dst.regClass()) {
528      assert(offset.constantValue() == 0);
529      bld.copy(Definition(dst), vec);
530      emit_split_vector(ctx, dst, num_components);
531      return;
532   }
533
534   emit_split_vector(ctx, vec, num_components);
535   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
536   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
537
538   assert(offset.constantValue() % component_size == 0);
539   unsigned skip = offset.constantValue() / component_size;
540   for (unsigned i = skip; i < num_components; i++)
541      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
542
543   if (dst.type() == RegType::vgpr) {
544      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
545      num_components = dst.bytes() / component_size;
546      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
547         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
548      for (unsigned i = 0; i < num_components; i++)
549         create_vec->operands[i] = Operand(elems[i]);
550      create_vec->definitions[0] = Definition(dst);
551      bld.insert(std::move(create_vec));
552
553   } else if (skip) {
554      /* if dst is sgpr - split the src, but move the original to sgpr. */
555      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
556      byte_align_scalar(ctx, vec, offset, dst);
557   } else {
558      assert(dst.size() == vec.size());
559      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
560   }
561
562   ctx->allocated_vec.emplace(dst.id(), elems);
563}
564
565Temp
566bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
567{
568   Builder bld(ctx->program, ctx->block);
569   if (!dst.id())
570      dst = bld.tmp(bld.lm);
571
572   assert(val.regClass() == s1);
573   assert(dst.regClass() == bld.lm);
574
575   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
576                   bld.scc(val));
577}
578
579Temp
580bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
581{
582   Builder bld(ctx->program, ctx->block);
583   if (!dst.id())
584      dst = bld.tmp(s1);
585
586   assert(val.regClass() == bld.lm);
587   assert(dst.regClass() == s1);
588
589   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
590   Temp tmp = bld.tmp(s1);
591   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
592   return emit_wqm(bld, tmp, dst);
593}
594
595/**
596 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
597 * src_bits and dst_bits are truncated.
598 *
599 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
600 * bit is indicated by src_bits in this case.
601 *
602 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
603 */
604Temp
605convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
606            bool sign_extend, Temp dst = Temp())
607{
608   assert(!(sign_extend && dst_bits < src_bits) &&
609          "Shrinking integers is not supported for signed inputs");
610
611   if (!dst.id()) {
612      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
613         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
614      else
615         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
616   }
617
618   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
619   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
620
621   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
622      /* Copy the raw value, leaving an undefined value in the upper bits for
623       * the caller to handle appropriately */
624      return bld.copy(Definition(dst), src);
625   } else if (dst.bytes() < src.bytes()) {
626      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
627   }
628
629   Temp tmp = dst;
630   if (dst_bits == 64)
631      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
632
633   if (tmp == src) {
634   } else if (src.regClass() == s1) {
635      assert(src_bits < 32);
636      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
637                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
638   } else {
639      assert(src_bits < 32);
640      bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits),
641                 Operand::c32((unsigned)sign_extend));
642   }
643
644   if (dst_bits == 64) {
645      if (sign_extend && dst.regClass() == s2) {
646         Temp high =
647            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
648         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
649      } else if (sign_extend && dst.regClass() == v2) {
650         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
651         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
652      } else {
653         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
654      }
655   }
656
657   return dst;
658}
659
660enum sgpr_extract_mode {
661   sgpr_extract_sext,
662   sgpr_extract_zext,
663   sgpr_extract_undef,
664};
665
666Temp
667extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
668{
669   Temp vec = get_ssa_temp(ctx, src->src.ssa);
670   unsigned src_size = src->src.ssa->bit_size;
671   unsigned swizzle = src->swizzle[0];
672
673   if (vec.size() > 1) {
674      assert(src_size == 16);
675      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
676      swizzle = swizzle & 1;
677   }
678
679   Builder bld(ctx->program, ctx->block);
680   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
681
682   if (mode == sgpr_extract_undef && swizzle == 0)
683      bld.copy(Definition(tmp), vec);
684   else
685      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
686                 Operand::c32(swizzle), Operand::c32(src_size),
687                 Operand::c32((mode == sgpr_extract_sext)));
688
689   if (dst.regClass() == s2)
690      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
691
692   return dst;
693}
694
695Temp
696get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
697{
698   if (src.src.ssa->num_components == 1 && size == 1)
699      return get_ssa_temp(ctx, src.src.ssa);
700
701   Temp vec = get_ssa_temp(ctx, src.src.ssa);
702   unsigned elem_size = src.src.ssa->bit_size / 8u;
703   bool identity_swizzle = true;
704
705   for (unsigned i = 0; identity_swizzle && i < size; i++) {
706      if (src.swizzle[i] != i)
707         identity_swizzle = false;
708   }
709   if (identity_swizzle)
710      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
711
712   assert(elem_size > 0);
713   assert(vec.bytes() % elem_size == 0);
714
715   if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
716      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
717      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
718                                           sgpr_extract_undef);
719   }
720
721   bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
722   if (as_uniform)
723      vec = as_vgpr(ctx, vec);
724
725   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
726                                    : RegClass(vec.type(), elem_size / 4);
727   if (size == 1) {
728      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
729   } else {
730      assert(size <= 4);
731      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
732      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
733         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
734      for (unsigned i = 0; i < size; ++i) {
735         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
736         vec_instr->operands[i] = Operand{elems[i]};
737      }
738      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
739      vec_instr->definitions[0] = Definition(dst);
740      ctx->block->instructions.emplace_back(std::move(vec_instr));
741      ctx->allocated_vec.emplace(dst.id(), elems);
742      return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
743   }
744}
745
746Temp
747get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
748{
749   /* returns v2b or v1 for vop3p usage.
750    * The source expects exactly 2 16bit components
751    * which are within the same dword
752    */
753   assert(src.src.ssa->bit_size == 16);
754   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
755
756   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
757   if (tmp.size() == 1)
758      return tmp;
759
760   /* the size is larger than 1 dword: check the swizzle */
761   unsigned dword = src.swizzle[0] >> 1;
762
763   /* extract a full dword if possible */
764   if (tmp.bytes() >= (dword + 1) * 4) {
765      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
766   } else {
767      /* This must be a swizzled access to %a.zz where %a is v6b */
768      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
769      assert(tmp.regClass() == v6b && dword == 1);
770      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
771   }
772}
773
774uint32_t
775get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
776{
777   nir_ssa_scalar scalar =
778      nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
779   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
780}
781
782Temp
783convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
784{
785   if (ptr.size() == 2)
786      return ptr;
787   Builder bld(ctx->program, ctx->block);
788   if (ptr.type() == RegType::vgpr && !non_uniform)
789      ptr = bld.as_uniform(ptr);
790   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
791                     Operand::c32((unsigned)ctx->options->address32_hi));
792}
793
794void
795emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
796                      bool writes_scc, uint8_t uses_ub = 0)
797{
798   aco_ptr<SOP2_instruction> sop2{
799      create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
800   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
801   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
802   sop2->definitions[0] = Definition(dst);
803   if (instr->no_unsigned_wrap)
804      sop2->definitions[0].setNUW(true);
805   if (writes_scc)
806      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
807
808   for (int i = 0; i < 2; i++) {
809      if (uses_ub & (1 << i)) {
810         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
811         if (src_ub <= 0xffff)
812            sop2->operands[i].set16bit(true);
813         else if (src_ub <= 0xffffff)
814            sop2->operands[i].set24bit(true);
815      }
816   }
817
818   ctx->block->instructions.emplace_back(std::move(sop2));
819}
820
821void
822emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
823                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
824                      bool nuw = false, uint8_t uses_ub = 0)
825{
826   Builder bld(ctx->program, ctx->block);
827   bld.is_precise = instr->exact;
828
829   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
830   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
831   if (src1.type() == RegType::sgpr) {
832      if (commutative && src0.type() == RegType::vgpr) {
833         Temp t = src0;
834         src0 = src1;
835         src1 = t;
836      } else {
837         src1 = as_vgpr(ctx, src1);
838      }
839   }
840
841   Operand op[2] = {Operand(src0), Operand(src1)};
842
843   for (int i = 0; i < 2; i++) {
844      if (uses_ub & (1 << i)) {
845         uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
846         if (src_ub <= 0xffff)
847            op[i].set16bit(true);
848         else if (src_ub <= 0xffffff)
849            op[i].set24bit(true);
850      }
851   }
852
853   if (flush_denorms && ctx->program->chip_class < GFX9) {
854      assert(dst.size() == 1);
855      Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
856      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
857   } else {
858      if (nuw) {
859         bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
860      } else {
861         bld.vop2(opc, Definition(dst), op[0], op[1]);
862      }
863   }
864}
865
866void
867emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
868{
869   Builder bld(ctx->program, ctx->block);
870   bld.is_precise = instr->exact;
871
872   Temp src0 = get_alu_src(ctx, instr->src[0]);
873   Temp src1 = get_alu_src(ctx, instr->src[1]);
874
875   if (src1.type() == RegType::sgpr) {
876      assert(src0.type() == RegType::vgpr);
877      std::swap(src0, src1);
878   }
879
880   Temp src00 = bld.tmp(src0.type(), 1);
881   Temp src01 = bld.tmp(src0.type(), 1);
882   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
883   Temp src10 = bld.tmp(v1);
884   Temp src11 = bld.tmp(v1);
885   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
886   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
887   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
888   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
889}
890
891void
892emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
893                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
894{
895   assert(num_sources == 2 || num_sources == 3);
896   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
897   bool has_sgpr = false;
898   for (unsigned i = 0; i < num_sources; i++) {
899      src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
900      if (has_sgpr)
901         src[i] = as_vgpr(ctx, src[i]);
902      else
903         has_sgpr = src[i].type() == RegType::sgpr;
904   }
905
906   Builder bld(ctx->program, ctx->block);
907   bld.is_precise = instr->exact;
908   if (flush_denorms && ctx->program->chip_class < GFX9) {
909      Temp tmp;
910      if (num_sources == 3)
911         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
912      else
913         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
914      if (dst.size() == 1)
915         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
916      else
917         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
918   } else if (num_sources == 3) {
919      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
920   } else {
921      bld.vop3(op, Definition(dst), src[0], src[1]);
922   }
923}
924
925Builder::Result
926emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
927                       bool swap_srcs = false)
928{
929   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
930   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
931   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
932      src1 = as_vgpr(ctx, src1);
933   assert(instr->dest.dest.ssa.num_components == 2);
934
935   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
936   unsigned opsel_lo =
937      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
938   unsigned opsel_hi =
939      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
940
941   Builder bld(ctx->program, ctx->block);
942   bld.is_precise = instr->exact;
943   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
944   emit_split_vector(ctx, dst, 2);
945   return res;
946}
947
948void
949emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp)
950{
951   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
952   bool has_sgpr = false;
953   for (unsigned i = 0; i < 3; i++) {
954      src[i] = get_alu_src(ctx, instr->src[i]);
955      if (has_sgpr)
956         src[i] = as_vgpr(ctx, src[i]);
957      else
958         has_sgpr = src[i].type() == RegType::sgpr;
959   }
960
961   Builder bld(ctx->program, ctx->block);
962   bld.is_precise = instr->exact;
963   bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp;
964}
965
966void
967emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
968{
969   Builder bld(ctx->program, ctx->block);
970   bld.is_precise = instr->exact;
971   if (dst.type() == RegType::sgpr)
972      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
973                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
974   else
975      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
976}
977
978void
979emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
980{
981   Temp src0 = get_alu_src(ctx, instr->src[0]);
982   Temp src1 = get_alu_src(ctx, instr->src[1]);
983   assert(src0.size() == src1.size());
984
985   aco_ptr<Instruction> vopc;
986   if (src1.type() == RegType::sgpr) {
987      if (src0.type() == RegType::vgpr) {
988         /* to swap the operands, we might also have to change the opcode */
989         switch (op) {
990         case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
991         case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
992         case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
993         case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
994         case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
995         case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
996         case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
997         case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
998         case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
999         case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1000         case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1001         case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1002         case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1003         case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1004         case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1005         case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1006         case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1007         case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1008         default: /* eq and ne are commutative */ break;
1009         }
1010         Temp t = src0;
1011         src0 = src1;
1012         src1 = t;
1013      } else {
1014         src1 = as_vgpr(ctx, src1);
1015      }
1016   }
1017
1018   Builder bld(ctx->program, ctx->block);
1019   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
1020}
1021
1022void
1023emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1024{
1025   Temp src0 = get_alu_src(ctx, instr->src[0]);
1026   Temp src1 = get_alu_src(ctx, instr->src[1]);
1027   Builder bld(ctx->program, ctx->block);
1028
1029   assert(dst.regClass() == bld.lm);
1030   assert(src0.type() == RegType::sgpr);
1031   assert(src1.type() == RegType::sgpr);
1032   assert(src0.regClass() == src1.regClass());
1033
1034   /* Emit the SALU comparison instruction */
1035   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1036   /* Turn the result into a per-lane bool */
1037   bool_to_vector_condition(ctx, cmp, dst);
1038}
1039
1040void
1041emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1042                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1043                aco_opcode s64_op = aco_opcode::num_opcodes)
1044{
1045   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
1046                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1047                                                             : aco_opcode::num_opcodes;
1048   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
1049                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1050                                                             : v16_op;
1051   bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
1052                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1053                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1054   aco_opcode op = use_valu ? v_op : s_op;
1055   assert(op != aco_opcode::num_opcodes);
1056   assert(dst.regClass() == ctx->program->lane_mask);
1057
1058   if (use_valu)
1059      emit_vopc_instruction(ctx, instr, op, dst);
1060   else
1061      emit_sopc_instruction(ctx, instr, op, dst);
1062}
1063
1064void
1065emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1066                   Temp dst)
1067{
1068   Builder bld(ctx->program, ctx->block);
1069   Temp src0 = get_alu_src(ctx, instr->src[0]);
1070   Temp src1 = get_alu_src(ctx, instr->src[1]);
1071
1072   assert(dst.regClass() == bld.lm);
1073   assert(src0.regClass() == bld.lm);
1074   assert(src1.regClass() == bld.lm);
1075
1076   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1077}
1078
1079void
1080emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1081{
1082   Builder bld(ctx->program, ctx->block);
1083   Temp cond = get_alu_src(ctx, instr->src[0]);
1084   Temp then = get_alu_src(ctx, instr->src[1]);
1085   Temp els = get_alu_src(ctx, instr->src[2]);
1086
1087   assert(cond.regClass() == bld.lm);
1088
1089   if (dst.type() == RegType::vgpr) {
1090      aco_ptr<Instruction> bcsel;
1091      if (dst.size() == 1) {
1092         then = as_vgpr(ctx, then);
1093         els = as_vgpr(ctx, els);
1094
1095         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1096      } else if (dst.size() == 2) {
1097         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1098         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1099         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1100         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1101
1102         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1103         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1104
1105         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1106      } else {
1107         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1108      }
1109      return;
1110   }
1111
1112   if (instr->dest.dest.ssa.bit_size == 1) {
1113      assert(dst.regClass() == bld.lm);
1114      assert(then.regClass() == bld.lm);
1115      assert(els.regClass() == bld.lm);
1116   }
1117
1118   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1119      if (dst.regClass() == s1 || dst.regClass() == s2) {
1120         assert((then.regClass() == s1 || then.regClass() == s2) &&
1121                els.regClass() == then.regClass());
1122         assert(dst.size() == then.size());
1123         aco_opcode op =
1124            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1125         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1126      } else {
1127         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1128      }
1129      return;
1130   }
1131
1132   /* divergent boolean bcsel
1133    * this implements bcsel on bools: dst = s0 ? s1 : s2
1134    * are going to be: dst = (s0 & s1) | (~s0 & s2) */
1135   assert(instr->dest.dest.ssa.bit_size == 1);
1136
1137   if (cond.id() != then.id())
1138      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1139
1140   if (cond.id() == els.id())
1141      bld.copy(Definition(dst), then);
1142   else
1143      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1144               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1145}
1146
1147void
1148emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1149               uint32_t undo)
1150{
1151   /* multiply by 16777216 to handle denormals */
1152   Temp is_denormal =
1153      bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
1154               bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
1155   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1156   scaled = bld.vop1(op, bld.def(v1), scaled);
1157   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1158
1159   Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1160
1161   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1162}
1163
1164void
1165emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1166{
1167   if (ctx->block->fp_mode.denorm32 == 0) {
1168      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1169      return;
1170   }
1171
1172   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1173}
1174
1175void
1176emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1177{
1178   if (ctx->block->fp_mode.denorm32 == 0) {
1179      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1180      return;
1181   }
1182
1183   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1184}
1185
1186void
1187emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1188{
1189   if (ctx->block->fp_mode.denorm32 == 0) {
1190      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1191      return;
1192   }
1193
1194   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1195}
1196
1197void
1198emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1199{
1200   if (ctx->block->fp_mode.denorm32 == 0) {
1201      bld.vop1(aco_opcode::v_log_f32, dst, val);
1202      return;
1203   }
1204
1205   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1206}
1207
1208Temp
1209emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1210{
1211   if (ctx->options->chip_class >= GFX7)
1212      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1213
1214   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1215   /* TODO: create more efficient code! */
1216   if (val.type() == RegType::sgpr)
1217      val = as_vgpr(ctx, val);
1218
1219   /* Split the input value. */
1220   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1221   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1222
1223   /* Extract the exponent and compute the unbiased value. */
1224   Temp exponent =
1225      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1226   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1227
1228   /* Extract the fractional part. */
1229   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1230                                Operand::c32(0x000fffffu));
1231   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1232
1233   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1234   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1235              fract_mask);
1236
1237   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1238   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1239   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1240   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1241   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1242
1243   /* Get the sign bit. */
1244   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1245
1246   /* Decide the operation to apply depending on the unbiased exponent. */
1247   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
1248                               Operand::zero());
1249   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1250                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1251   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1252   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1253   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1254   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1255
1256   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1257}
1258
1259Temp
1260emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1261{
1262   if (ctx->options->chip_class >= GFX7)
1263      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1264
1265   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1266    * lowered at NIR level for precision reasons). */
1267   Temp src0 = as_vgpr(ctx, val);
1268
1269   Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
1270   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1271                             Operand::c32(0x3fefffffu));
1272
1273   Temp isnan =
1274      bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1275   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1276   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1277
1278   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1279   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1280   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1281   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1282
1283   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1284   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1285
1286   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1287
1288   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1289   add->vop3().neg[1] = true;
1290
1291   return add->definitions[0].getTemp();
1292}
1293
1294Temp
1295uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1296{
1297   if (bld.program->chip_class < GFX8) {
1298      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1299      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1300                          add.def(1).getTemp());
1301   }
1302
1303   Builder::Result add(NULL);
1304   if (bld.program->chip_class >= GFX9) {
1305      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1306   } else {
1307      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
1308   }
1309   add.instr->vop3().clamp = 1;
1310   return dst.getTemp();
1311}
1312
1313void
1314visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1315{
1316   if (!instr->dest.dest.is_ssa) {
1317      isel_err(&instr->instr, "nir alu dst not in ssa");
1318      abort();
1319   }
1320   Builder bld(ctx->program, ctx->block);
1321   bld.is_precise = instr->exact;
1322   Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1323   switch (instr->op) {
1324   case nir_op_vec2:
1325   case nir_op_vec3:
1326   case nir_op_vec4:
1327   case nir_op_vec5: {
1328      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1329      unsigned num = instr->dest.dest.ssa.num_components;
1330      for (unsigned i = 0; i < num; ++i)
1331         elems[i] = get_alu_src(ctx, instr->src[i]);
1332
1333      if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1334         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1335            aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1336         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1337         for (unsigned i = 0; i < num; ++i) {
1338            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1339               elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1340            vec->operands[i] = Operand{elems[i]};
1341         }
1342         vec->definitions[0] = Definition(dst);
1343         ctx->block->instructions.emplace_back(std::move(vec));
1344         ctx->allocated_vec.emplace(dst.id(), elems);
1345      } else {
1346         bool use_s_pack = ctx->program->chip_class >= GFX9;
1347         Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));
1348
1349         std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1350         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1351         for (unsigned i = 0; i < num; i++) {
1352            unsigned packed_size = use_s_pack ? 16 : 32;
1353            unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
1354            unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
1355            if (nir_src_is_const(instr->src[i].src)) {
1356               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1357               continue;
1358            }
1359
1360            if (offset != packed_size - instr->dest.dest.ssa.bit_size)
1361               elems[i] =
1362                  bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1363
1364            if (offset)
1365               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1366                                   Operand::c32(offset));
1367
1368            if (packed[idx].id())
1369               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1370                                      packed[idx]);
1371            else
1372               packed[idx] = elems[i];
1373         }
1374
1375         if (use_s_pack) {
1376            for (unsigned i = 0; i < dst.size(); i++) {
1377               bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1378
1379               if (packed[i * 2].id() && packed[i * 2 + 1].id())
1380                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1381                                       packed[i * 2 + 1]);
1382               else if (packed[i * 2 + 1].id())
1383                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1384                                       Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1385               else if (packed[i * 2].id())
1386                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1387                                       Operand::c32(const_vals[i * 2 + 1]));
1388
1389               if (same)
1390                  const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1391               else
1392                  const_vals[i] = 0;
1393            }
1394         }
1395
1396         for (unsigned i = 0; i < dst.size(); i++) {
1397            if (const_vals[i] && packed[i].id())
1398               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1399                                    Operand::c32(const_vals[i]), packed[i]);
1400            else if (!packed[i].id())
1401               packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1402         }
1403
1404         if (dst.size() == 1)
1405            bld.copy(Definition(dst), packed[0]);
1406         else if (dst.size() == 2)
1407            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
1408         else
1409            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
1410                       packed[2]);
1411      }
1412      break;
1413   }
1414   case nir_op_mov: {
1415      Temp src = get_alu_src(ctx, instr->src[0]);
1416      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1417         /* use size() instead of bytes() for 8/16-bit */
1418         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1419         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1420      } else {
1421         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1422         bld.copy(Definition(dst), src);
1423      }
1424      break;
1425   }
1426   case nir_op_inot: {
1427      Temp src = get_alu_src(ctx, instr->src[0]);
1428      if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1429         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1430      } else if (dst.regClass() == v2) {
1431         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1432         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1433         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1434         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1435         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1436      } else if (dst.type() == RegType::sgpr) {
1437         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1438         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1439      } else {
1440         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1441      }
1442      break;
1443   }
1444   case nir_op_iabs: {
1445      Temp src = get_alu_src(ctx, instr->src[0]);
1446      if (dst.regClass() == s1) {
1447         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1448      } else if (dst.regClass() == v1) {
1449         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1450                  bld.vsub32(bld.def(v1), Operand::zero(), src));
1451      } else {
1452         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1453      }
1454      break;
1455   }
1456   case nir_op_isign: {
1457      Temp src = get_alu_src(ctx, instr->src[0]);
1458      if (dst.regClass() == s1) {
1459         Temp tmp =
1460            bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1461         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1462      } else if (dst.regClass() == s2) {
1463         Temp neg =
1464            bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1465         Temp neqz;
1466         if (ctx->program->chip_class >= GFX8)
1467            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1468         else
1469            neqz =
1470               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1471                  .def(1)
1472                  .getTemp();
1473         /* SCC gets zero-extended to 64 bit */
1474         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1475      } else if (dst.regClass() == v1) {
1476         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1477      } else if (dst.regClass() == v2) {
1478         Temp upper = emit_extract_vector(ctx, src, 1, v1);
1479         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1480         Temp gtz =
1481            bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
1482         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1483         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1484         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1485      } else {
1486         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487      }
1488      break;
1489   }
1490   case nir_op_imax: {
1491      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1492         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1493      } else if (dst.regClass() == v2b) {
1494         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1495      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1496         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1497      } else if (dst.regClass() == v1) {
1498         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1499      } else if (dst.regClass() == s1) {
1500         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1501      } else {
1502         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503      }
1504      break;
1505   }
1506   case nir_op_umax: {
1507      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1508         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1509      } else if (dst.regClass() == v2b) {
1510         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1511      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1512         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1513      } else if (dst.regClass() == v1) {
1514         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1515      } else if (dst.regClass() == s1) {
1516         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1517      } else {
1518         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519      }
1520      break;
1521   }
1522   case nir_op_imin: {
1523      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1524         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1525      } else if (dst.regClass() == v2b) {
1526         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1527      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1528         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1529      } else if (dst.regClass() == v1) {
1530         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1531      } else if (dst.regClass() == s1) {
1532         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1533      } else {
1534         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535      }
1536      break;
1537   }
1538   case nir_op_umin: {
1539      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1540         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1541      } else if (dst.regClass() == v2b) {
1542         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1543      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1544         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1545      } else if (dst.regClass() == v1) {
1546         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1547      } else if (dst.regClass() == s1) {
1548         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1549      } else {
1550         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551      }
1552      break;
1553   }
1554   case nir_op_ior: {
1555      if (instr->dest.dest.ssa.bit_size == 1) {
1556         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1557      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1558         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1559      } else if (dst.regClass() == v2) {
1560         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1561      } else if (dst.regClass() == s1) {
1562         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1563      } else if (dst.regClass() == s2) {
1564         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1565      } else {
1566         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1567      }
1568      break;
1569   }
1570   case nir_op_iand: {
1571      if (instr->dest.dest.ssa.bit_size == 1) {
1572         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1573      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1574         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1575      } else if (dst.regClass() == v2) {
1576         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1577      } else if (dst.regClass() == s1) {
1578         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1579      } else if (dst.regClass() == s2) {
1580         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1581      } else {
1582         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1583      }
1584      break;
1585   }
1586   case nir_op_ixor: {
1587      if (instr->dest.dest.ssa.bit_size == 1) {
1588         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1589      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1590         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1591      } else if (dst.regClass() == v2) {
1592         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1593      } else if (dst.regClass() == s1) {
1594         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1595      } else if (dst.regClass() == s2) {
1596         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1597      } else {
1598         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1599      }
1600      break;
1601   }
1602   case nir_op_ushr: {
1603      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1604         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1605      } else if (dst.regClass() == v2b) {
1606         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1607      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1608         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1609      } else if (dst.regClass() == v1) {
1610         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1611      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1612         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1613                  get_alu_src(ctx, instr->src[0]));
1614      } else if (dst.regClass() == v2) {
1615         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1616      } else if (dst.regClass() == s2) {
1617         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1618      } else if (dst.regClass() == s1) {
1619         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1620      } else {
1621         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1622      }
1623      break;
1624   }
1625   case nir_op_ishl: {
1626      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1627         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1628      } else if (dst.regClass() == v2b) {
1629         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1630      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1631         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1632      } else if (dst.regClass() == v1) {
1633         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1634                               false, 2);
1635      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1636         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1637                  get_alu_src(ctx, instr->src[0]));
1638      } else if (dst.regClass() == v2) {
1639         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1640      } else if (dst.regClass() == s1) {
1641         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1642      } else if (dst.regClass() == s2) {
1643         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1644      } else {
1645         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1646      }
1647      break;
1648   }
1649   case nir_op_ishr: {
1650      if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1651         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1652      } else if (dst.regClass() == v2b) {
1653         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1654      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1655         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1656      } else if (dst.regClass() == v1) {
1657         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1658      } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1659         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1660                  get_alu_src(ctx, instr->src[0]));
1661      } else if (dst.regClass() == v2) {
1662         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1663      } else if (dst.regClass() == s1) {
1664         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1665      } else if (dst.regClass() == s2) {
1666         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1667      } else {
1668         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1669      }
1670      break;
1671   }
1672   case nir_op_find_lsb: {
1673      Temp src = get_alu_src(ctx, instr->src[0]);
1674      if (src.regClass() == s1) {
1675         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1676      } else if (src.regClass() == v1) {
1677         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1678      } else if (src.regClass() == s2) {
1679         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1680      } else {
1681         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682      }
1683      break;
1684   }
1685   case nir_op_ufind_msb:
1686   case nir_op_ifind_msb: {
1687      Temp src = get_alu_src(ctx, instr->src[0]);
1688      if (src.regClass() == s1 || src.regClass() == s2) {
1689         aco_opcode op = src.regClass() == s2
1690                            ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1691                                                             : aco_opcode::s_flbit_i32_i64)
1692                            : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1693                                                             : aco_opcode::s_flbit_i32);
1694         Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1695
1696         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1697                                        Operand::c32(src.size() * 32u - 1u), msb_rev);
1698         Temp msb = sub.def(0).getTemp();
1699         Temp carry = sub.def(1).getTemp();
1700
1701         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1702                  bld.scc(carry));
1703      } else if (src.regClass() == v1) {
1704         aco_opcode op =
1705            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1706         Temp msb_rev = bld.tmp(v1);
1707         emit_vop1_instruction(ctx, instr, op, msb_rev);
1708         Temp msb = bld.tmp(v1);
1709         Temp carry =
1710            bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1711         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1712      } else if (src.regClass() == v2) {
1713         aco_opcode op =
1714            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1715
1716         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1717         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1718
1719         lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1720                         bld.vop1(op, bld.def(v1), lo));
1721         hi = bld.vop1(op, bld.def(v1), hi);
1722         Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1723
1724         Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1725
1726         Temp msb = bld.tmp(v1);
1727         Temp carry =
1728            bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1729         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1730      } else {
1731         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1732      }
1733      break;
1734   }
1735   case nir_op_bitfield_reverse: {
1736      if (dst.regClass() == s1) {
1737         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1738      } else if (dst.regClass() == v1) {
1739         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740      } else {
1741         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1742      }
1743      break;
1744   }
1745   case nir_op_iadd: {
1746      if (dst.regClass() == s1) {
1747         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1748         break;
1749      } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1750         emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1751         break;
1752      } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1753         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1754         break;
1755      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1756         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1757         break;
1758      }
1759
1760      Temp src0 = get_alu_src(ctx, instr->src[0]);
1761      Temp src1 = get_alu_src(ctx, instr->src[1]);
1762      if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1763         bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1764         break;
1765      }
1766
1767      assert(src0.size() == 2 && src1.size() == 2);
1768      Temp src00 = bld.tmp(src0.type(), 1);
1769      Temp src01 = bld.tmp(dst.type(), 1);
1770      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1771      Temp src10 = bld.tmp(src1.type(), 1);
1772      Temp src11 = bld.tmp(dst.type(), 1);
1773      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1774
1775      if (dst.regClass() == s2) {
1776         Temp carry = bld.tmp(s1);
1777         Temp dst0 =
1778            bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1779         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1780                              bld.scc(carry));
1781         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1782      } else if (dst.regClass() == v2) {
1783         Temp dst0 = bld.tmp(v1);
1784         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1785         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1786         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1787      } else {
1788         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1789      }
1790      break;
1791   }
1792   case nir_op_uadd_sat: {
1793      Temp src0 = get_alu_src(ctx, instr->src[0]);
1794      Temp src1 = get_alu_src(ctx, instr->src[1]);
1795      if (dst.regClass() == s1) {
1796         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1797         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1798         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1799                  bld.scc(carry));
1800      } else if (dst.regClass() == v2b) {
1801         Instruction* add_instr;
1802         if (ctx->program->chip_class >= GFX10) {
1803            add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1804         } else {
1805            if (src1.type() == RegType::sgpr)
1806               std::swap(src0, src1);
1807            add_instr =
1808               bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1809         }
1810         add_instr->vop3().clamp = 1;
1811      } else if (dst.regClass() == v1) {
1812         uadd32_sat(bld, Definition(dst), src0, src1);
1813      } else {
1814         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815      }
1816      break;
1817   }
1818   case nir_op_iadd_sat: {
1819      Temp src0 = get_alu_src(ctx, instr->src[0]);
1820      Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1821      if (dst.regClass() == v2b) {
1822         Instruction* add_instr =
1823            bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1824         add_instr->vop3().clamp = 1;
1825      } else if (dst.regClass() == v1) {
1826         Instruction* add_instr =
1827            bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1828         add_instr->vop3().clamp = 1;
1829      } else {
1830         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831      }
1832      break;
1833   }
1834   case nir_op_uadd_carry: {
1835      Temp src0 = get_alu_src(ctx, instr->src[0]);
1836      Temp src1 = get_alu_src(ctx, instr->src[1]);
1837      if (dst.regClass() == s1) {
1838         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1839         break;
1840      }
1841      if (dst.regClass() == v1) {
1842         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1843         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1844                      carry);
1845         break;
1846      }
1847
1848      Temp src00 = bld.tmp(src0.type(), 1);
1849      Temp src01 = bld.tmp(dst.type(), 1);
1850      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1851      Temp src10 = bld.tmp(src1.type(), 1);
1852      Temp src11 = bld.tmp(dst.type(), 1);
1853      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1854      if (dst.regClass() == s2) {
1855         Temp carry = bld.tmp(s1);
1856         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1857         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1858                          bld.scc(carry))
1859                    .def(1)
1860                    .getTemp();
1861         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1862      } else if (dst.regClass() == v2) {
1863         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1864         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1865         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1866                              Operand::c32(1u), carry);
1867         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1868      } else {
1869         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1870      }
1871      break;
1872   }
1873   case nir_op_isub: {
1874      if (dst.regClass() == s1) {
1875         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1876         break;
1877      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1878         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1879         break;
1880      }
1881
1882      Temp src0 = get_alu_src(ctx, instr->src[0]);
1883      Temp src1 = get_alu_src(ctx, instr->src[1]);
1884      if (dst.regClass() == v1) {
1885         bld.vsub32(Definition(dst), src0, src1);
1886         break;
1887      } else if (dst.bytes() <= 2) {
1888         if (ctx->program->chip_class >= GFX10)
1889            bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1890         else if (src1.type() == RegType::sgpr)
1891            bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1892         else if (ctx->program->chip_class >= GFX8)
1893            bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1894         else
1895            bld.vsub32(Definition(dst), src0, src1);
1896         break;
1897      }
1898
1899      Temp src00 = bld.tmp(src0.type(), 1);
1900      Temp src01 = bld.tmp(dst.type(), 1);
1901      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1902      Temp src10 = bld.tmp(src1.type(), 1);
1903      Temp src11 = bld.tmp(dst.type(), 1);
1904      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1905      if (dst.regClass() == s2) {
1906         Temp borrow = bld.tmp(s1);
1907         Temp dst0 =
1908            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1909         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1910                              bld.scc(borrow));
1911         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1912      } else if (dst.regClass() == v2) {
1913         Temp lower = bld.tmp(v1);
1914         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1915         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1916         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1917      } else {
1918         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1919      }
1920      break;
1921   }
1922   case nir_op_usub_borrow: {
1923      Temp src0 = get_alu_src(ctx, instr->src[0]);
1924      Temp src1 = get_alu_src(ctx, instr->src[1]);
1925      if (dst.regClass() == s1) {
1926         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1927         break;
1928      } else if (dst.regClass() == v1) {
1929         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1930         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1931                      borrow);
1932         break;
1933      }
1934
1935      Temp src00 = bld.tmp(src0.type(), 1);
1936      Temp src01 = bld.tmp(dst.type(), 1);
1937      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1938      Temp src10 = bld.tmp(src1.type(), 1);
1939      Temp src11 = bld.tmp(dst.type(), 1);
1940      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1941      if (dst.regClass() == s2) {
1942         Temp borrow = bld.tmp(s1);
1943         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1944         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1945                           bld.scc(borrow))
1946                     .def(1)
1947                     .getTemp();
1948         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1949      } else if (dst.regClass() == v2) {
1950         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1951         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1952         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1953                               Operand::c32(1u), borrow);
1954         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1955      } else {
1956         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1957      }
1958      break;
1959   }
1960   case nir_op_imul: {
1961      if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1962         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1963      } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1964         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1965      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1966         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1967      } else if (dst.type() == RegType::vgpr) {
1968         uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1969         uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1970
1971         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1972            bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
1973            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
1974                                  true /* commutative */, false, false, nuw_16bit);
1975         } else if (nir_src_is_const(instr->src[0].src)) {
1976            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1977                          nir_src_as_uint(instr->src[0].src), false);
1978         } else if (nir_src_is_const(instr->src[1].src)) {
1979            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1980                          nir_src_as_uint(instr->src[1].src), false);
1981         } else {
1982            emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1983         }
1984      } else if (dst.regClass() == s1) {
1985         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1986      } else {
1987         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1988      }
1989      break;
1990   }
1991   case nir_op_umul_high: {
1992      if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1993         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1994      } else if (dst.bytes() == 4) {
1995         uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1996         uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1997
1998         Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
1999         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2000            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2001         } else {
2002            emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2003         }
2004
2005         if (dst.regClass() == s1)
2006            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2007      } else {
2008         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2009      }
2010      break;
2011   }
2012   case nir_op_imul_high: {
2013      if (dst.regClass() == v1) {
2014         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2015      } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2016         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2017      } else if (dst.regClass() == s1) {
2018         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2019                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2020         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2021      } else {
2022         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2023      }
2024      break;
2025   }
2026   case nir_op_fmul: {
2027      if (dst.regClass() == v2b) {
2028         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2029      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2030         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2031      } else if (dst.regClass() == v1) {
2032         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2033      } else if (dst.regClass() == v2) {
2034         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2035      } else {
2036         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2037      }
2038      break;
2039   }
2040   case nir_op_fadd: {
2041      if (dst.regClass() == v2b) {
2042         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2043      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2044         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2045      } else if (dst.regClass() == v1) {
2046         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2047      } else if (dst.regClass() == v2) {
2048         emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2049      } else {
2050         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2051      }
2052      break;
2053   }
2054   case nir_op_fsub: {
2055      if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2056         Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2057         VOP3P_instruction& sub = add->vop3p();
2058         sub.neg_lo[1] = true;
2059         sub.neg_hi[1] = true;
2060         break;
2061      }
2062
2063      Temp src0 = get_alu_src(ctx, instr->src[0]);
2064      Temp src1 = get_alu_src(ctx, instr->src[1]);
2065      if (dst.regClass() == v2b) {
2066         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2067            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2068         else
2069            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2070      } else if (dst.regClass() == v1) {
2071         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2072            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2073         else
2074            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2075      } else if (dst.regClass() == v2) {
2076         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2077                                     as_vgpr(ctx, src1));
2078         add->vop3().neg[1] = true;
2079      } else {
2080         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2081      }
2082      break;
2083   }
2084   case nir_op_fmax: {
2085      if (dst.regClass() == v2b) {
2086         // TODO: check fp_mode.must_flush_denorms16_64
2087         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2088      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2089         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2090      } else if (dst.regClass() == v1) {
2091         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2092                               ctx->block->fp_mode.must_flush_denorms32);
2093      } else if (dst.regClass() == v2) {
2094         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2095                                ctx->block->fp_mode.must_flush_denorms16_64);
2096      } else {
2097         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2098      }
2099      break;
2100   }
2101   case nir_op_fmin: {
2102      if (dst.regClass() == v2b) {
2103         // TODO: check fp_mode.must_flush_denorms16_64
2104         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2105      } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2106         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2107      } else if (dst.regClass() == v1) {
2108         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2109                               ctx->block->fp_mode.must_flush_denorms32);
2110      } else if (dst.regClass() == v2) {
2111         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2112                                ctx->block->fp_mode.must_flush_denorms16_64);
2113      } else {
2114         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2115      }
2116      break;
2117   }
2118   case nir_op_sdot_4x8_iadd: {
2119      emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2120      break;
2121   }
2122   case nir_op_sdot_4x8_iadd_sat: {
2123      emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2124      break;
2125   }
2126   case nir_op_udot_4x8_uadd: {
2127      emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2128      break;
2129   }
2130   case nir_op_udot_4x8_uadd_sat: {
2131      emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2132      break;
2133   }
2134   case nir_op_sdot_2x16_iadd: {
2135      emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2136      break;
2137   }
2138   case nir_op_sdot_2x16_iadd_sat: {
2139      emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2140      break;
2141   }
2142   case nir_op_udot_2x16_uadd: {
2143      emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2144      break;
2145   }
2146   case nir_op_udot_2x16_uadd_sat: {
2147      emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2148      break;
2149   }
2150   case nir_op_cube_face_coord_amd: {
2151      Temp in = get_alu_src(ctx, instr->src[0], 3);
2152      Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2153                     emit_extract_vector(ctx, in, 2, v1)};
2154      Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2155      ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
2156      Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2157      Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2158      sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2159                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2160      tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2161                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2162      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2163      break;
2164   }
2165   case nir_op_cube_face_index_amd: {
2166      Temp in = get_alu_src(ctx, instr->src[0], 3);
2167      Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2168                     emit_extract_vector(ctx, in, 2, v1)};
2169      bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2170      break;
2171   }
2172   case nir_op_bcsel: {
2173      emit_bcsel(ctx, instr, dst);
2174      break;
2175   }
2176   case nir_op_frsq: {
2177      if (dst.regClass() == v2b) {
2178         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2179      } else if (dst.regClass() == v1) {
2180         Temp src = get_alu_src(ctx, instr->src[0]);
2181         emit_rsq(ctx, bld, Definition(dst), src);
2182      } else if (dst.regClass() == v2) {
2183         /* Lowered at NIR level for precision reasons. */
2184         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2185      } else {
2186         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2187      }
2188      break;
2189   }
2190   case nir_op_fneg: {
2191      if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2192         Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2193         bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2194                   instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2195         emit_split_vector(ctx, dst, 2);
2196         break;
2197      }
2198      Temp src = get_alu_src(ctx, instr->src[0]);
2199      if (dst.regClass() == v2b) {
2200         bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2201      } else if (dst.regClass() == v1) {
2202         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2203                  as_vgpr(ctx, src));
2204      } else if (dst.regClass() == v2) {
2205         if (ctx->block->fp_mode.must_flush_denorms16_64)
2206            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2207                           as_vgpr(ctx, src));
2208         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2209         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2210         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2211         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2212      } else {
2213         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2214      }
2215      break;
2216   }
2217   case nir_op_fabs: {
2218      Temp src = get_alu_src(ctx, instr->src[0]);
2219      if (dst.regClass() == v2b) {
2220         Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2221                                         Operand::c16(0x3c00), as_vgpr(ctx, src))
2222                               .instr;
2223         mul->vop3().abs[1] = true;
2224      } else if (dst.regClass() == v1) {
2225         Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2226                                         Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2227                               .instr;
2228         mul->vop3().abs[1] = true;
2229      } else if (dst.regClass() == v2) {
2230         if (ctx->block->fp_mode.must_flush_denorms16_64)
2231            src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2232                           as_vgpr(ctx, src));
2233         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2234         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2235         upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2236         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2237      } else {
2238         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2239      }
2240      break;
2241   }
2242   case nir_op_fsat: {
2243      if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2244         Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2245         Instruction* vop3p =
2246            bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2247                      instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2248         vop3p->vop3p().clamp = true;
2249         emit_split_vector(ctx, dst, 2);
2250         break;
2251      }
2252      Temp src = get_alu_src(ctx, instr->src[0]);
2253      if (dst.regClass() == v2b) {
2254         bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2255                  src);
2256      } else if (dst.regClass() == v1) {
2257         bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2258                  Operand::c32(0x3f800000u), src);
2259         /* apparently, it is not necessary to flush denorms if this instruction is used with these
2260          * operands */
2261         // TODO: confirm that this holds under any circumstances
2262      } else if (dst.regClass() == v2) {
2263         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2264         add->vop3().clamp = true;
2265      } else {
2266         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267      }
2268      break;
2269   }
2270   case nir_op_flog2: {
2271      if (dst.regClass() == v2b) {
2272         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2273      } else if (dst.regClass() == v1) {
2274         Temp src = get_alu_src(ctx, instr->src[0]);
2275         emit_log2(ctx, bld, Definition(dst), src);
2276      } else {
2277         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2278      }
2279      break;
2280   }
2281   case nir_op_frcp: {
2282      if (dst.regClass() == v2b) {
2283         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2284      } else if (dst.regClass() == v1) {
2285         Temp src = get_alu_src(ctx, instr->src[0]);
2286         emit_rcp(ctx, bld, Definition(dst), src);
2287      } else if (dst.regClass() == v2) {
2288         /* Lowered at NIR level for precision reasons. */
2289         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2290      } else {
2291         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2292      }
2293      break;
2294   }
2295   case nir_op_fexp2: {
2296      if (dst.regClass() == v2b) {
2297         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2298      } else if (dst.regClass() == v1) {
2299         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2300      } else {
2301         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302      }
2303      break;
2304   }
2305   case nir_op_fsqrt: {
2306      if (dst.regClass() == v2b) {
2307         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2308      } else if (dst.regClass() == v1) {
2309         Temp src = get_alu_src(ctx, instr->src[0]);
2310         emit_sqrt(ctx, bld, Definition(dst), src);
2311      } else if (dst.regClass() == v2) {
2312         /* Lowered at NIR level for precision reasons. */
2313         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2314      } else {
2315         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2316      }
2317      break;
2318   }
2319   case nir_op_ffract: {
2320      if (dst.regClass() == v2b) {
2321         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2322      } else if (dst.regClass() == v1) {
2323         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2324      } else if (dst.regClass() == v2) {
2325         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2326      } else {
2327         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2328      }
2329      break;
2330   }
2331   case nir_op_ffloor: {
2332      if (dst.regClass() == v2b) {
2333         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2334      } else if (dst.regClass() == v1) {
2335         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2336      } else if (dst.regClass() == v2) {
2337         Temp src = get_alu_src(ctx, instr->src[0]);
2338         emit_floor_f64(ctx, bld, Definition(dst), src);
2339      } else {
2340         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2341      }
2342      break;
2343   }
2344   case nir_op_fceil: {
2345      if (dst.regClass() == v2b) {
2346         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2347      } else if (dst.regClass() == v1) {
2348         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2349      } else if (dst.regClass() == v2) {
2350         if (ctx->options->chip_class >= GFX7) {
2351            emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2352         } else {
2353            /* GFX6 doesn't support V_CEIL_F64, lower it. */
2354            /* trunc = trunc(src0)
2355             * if (src0 > 0.0 && src0 != trunc)
2356             *    trunc += 1.0
2357             */
2358            Temp src0 = get_alu_src(ctx, instr->src[0]);
2359            Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2360            Temp tmp0 =
2361               bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2362            Temp tmp1 =
2363               bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2364            Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2365                                 tmp0, tmp1);
2366            Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2367                                bld.copy(bld.def(v1), Operand::zero()),
2368                                bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2369            add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2370                             bld.copy(bld.def(v1), Operand::zero()), add);
2371            bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2372         }
2373      } else {
2374         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2375      }
2376      break;
2377   }
2378   case nir_op_ftrunc: {
2379      if (dst.regClass() == v2b) {
2380         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2381      } else if (dst.regClass() == v1) {
2382         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2383      } else if (dst.regClass() == v2) {
2384         Temp src = get_alu_src(ctx, instr->src[0]);
2385         emit_trunc_f64(ctx, bld, Definition(dst), src);
2386      } else {
2387         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2388      }
2389      break;
2390   }
2391   case nir_op_fround_even: {
2392      if (dst.regClass() == v2b) {
2393         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2394      } else if (dst.regClass() == v1) {
2395         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2396      } else if (dst.regClass() == v2) {
2397         if (ctx->options->chip_class >= GFX7) {
2398            emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2399         } else {
2400            /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2401            Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2402            Temp src0 = get_alu_src(ctx, instr->src[0]);
2403            bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2404
2405            Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2406                                    bld.copy(bld.def(s1), Operand::c32(-2u)));
2407            Temp bfi =
2408               bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2409                        bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2410            Temp tmp =
2411               bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2412                        bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2413            Instruction* sub =
2414               bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2415                        bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2416            sub->vop3().neg[1] = true;
2417            tmp = sub->definitions[0].getTemp();
2418
2419            Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2420                                Operand::c32(0x432fffffu));
2421            Instruction* vop3 =
2422               bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2423            vop3->vop3().abs[0] = true;
2424            Temp cond = vop3->definitions[0].getTemp();
2425
2426            Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2427            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2428            Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2429                                     as_vgpr(ctx, src0_lo), cond);
2430            Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2431                                     as_vgpr(ctx, src0_hi), cond);
2432
2433            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2434         }
2435      } else {
2436         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2437      }
2438      break;
2439   }
2440   case nir_op_fsin:
2441   case nir_op_fcos: {
2442      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2443      aco_ptr<Instruction> norm;
2444      if (dst.regClass() == v2b) {
2445         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
2446         Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2447         aco_opcode opcode =
2448            instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2449         bld.vop1(opcode, Definition(dst), tmp);
2450      } else if (dst.regClass() == v1) {
2451         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
2452         Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2453
2454         /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2455         if (ctx->options->chip_class < GFX9)
2456            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2457
2458         aco_opcode opcode =
2459            instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2460         bld.vop1(opcode, Definition(dst), tmp);
2461      } else {
2462         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2463      }
2464      break;
2465   }
2466   case nir_op_ldexp: {
2467      if (dst.regClass() == v2b) {
2468         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2469      } else if (dst.regClass() == v1) {
2470         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2471      } else if (dst.regClass() == v2) {
2472         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2473      } else {
2474         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2475      }
2476      break;
2477   }
2478   case nir_op_frexp_sig: {
2479      if (dst.regClass() == v2b) {
2480         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2481      } else if (dst.regClass() == v1) {
2482         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2483      } else if (dst.regClass() == v2) {
2484         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2485      } else {
2486         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2487      }
2488      break;
2489   }
2490   case nir_op_frexp_exp: {
2491      if (instr->src[0].src.ssa->bit_size == 16) {
2492         Temp src = get_alu_src(ctx, instr->src[0]);
2493         Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2494         tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2495         convert_int(ctx, bld, tmp, 8, 32, true, dst);
2496      } else if (instr->src[0].src.ssa->bit_size == 32) {
2497         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2498      } else if (instr->src[0].src.ssa->bit_size == 64) {
2499         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2500      } else {
2501         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2502      }
2503      break;
2504   }
2505   case nir_op_fsign: {
2506      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2507      if (dst.regClass() == v2b) {
2508         assert(ctx->program->chip_class >= GFX9);
2509         /* replace negative zero with positive zero */
2510         src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2511         src =
2512            bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2513         bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2514      } else if (dst.regClass() == v1) {
2515         src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2516         src =
2517            bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2518         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2519      } else if (dst.regClass() == v2) {
2520         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
2521                              Operand::zero(), src);
2522         Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2523         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2524                                   emit_extract_vector(ctx, src, 1, v1), cond);
2525
2526         cond =
2527            bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
2528         tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2529         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2530
2531         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2532      } else {
2533         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2534      }
2535      break;
2536   }
2537   case nir_op_f2f16:
2538   case nir_op_f2f16_rtne: {
2539      Temp src = get_alu_src(ctx, instr->src[0]);
2540      if (instr->src[0].src.ssa->bit_size == 64)
2541         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2542      if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2543         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2544          * keep value numbering and the scheduler simpler.
2545          */
2546         bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2547      else
2548         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2549      break;
2550   }
2551   case nir_op_f2f16_rtz: {
2552      Temp src = get_alu_src(ctx, instr->src[0]);
2553      if (instr->src[0].src.ssa->bit_size == 64)
2554         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2555      if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2556         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2557      else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
2558         bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2559      else
2560         bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2561      break;
2562   }
2563   case nir_op_f2f32: {
2564      if (instr->src[0].src.ssa->bit_size == 16) {
2565         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2566      } else if (instr->src[0].src.ssa->bit_size == 64) {
2567         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2568      } else {
2569         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2570      }
2571      break;
2572   }
2573   case nir_op_f2f64: {
2574      Temp src = get_alu_src(ctx, instr->src[0]);
2575      if (instr->src[0].src.ssa->bit_size == 16)
2576         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2577      bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2578      break;
2579   }
2580   case nir_op_i2f16: {
2581      assert(dst.regClass() == v2b);
2582      Temp src = get_alu_src(ctx, instr->src[0]);
2583      const unsigned input_size = instr->src[0].src.ssa->bit_size;
2584      if (input_size <= 16) {
2585         /* Expand integer to the size expected by the uint→float converter used below */
2586         unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2587         if (input_size != target_size) {
2588            src = convert_int(ctx, bld, src, input_size, target_size, true);
2589         }
2590      } else if (input_size == 64) {
2591         /* Truncate down to 32 bits; if any of the upper bits are relevant,
2592          * the value does not fall into the single-precision float range
2593          * anyway. SPIR-V does not mandate any specific behavior for such
2594          * large inputs.
2595          */
2596         src = convert_int(ctx, bld, src, 64, 32, false);
2597      }
2598
2599      if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
2600         bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2601      } else {
2602         /* Convert to f32 and then down to f16. This is needed to handle
2603          * inputs slightly outside the range [INT16_MIN, INT16_MAX],
2604          * which are representable via f16 but wouldn't be converted
2605          * correctly by v_cvt_f16_i16.
2606          *
2607          * This is also the fallback-path taken on GFX7 and earlier, which
2608          * do not support direct f16⟷i16 conversions.
2609          */
2610         src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2611         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2612      }
2613      break;
2614   }
2615   case nir_op_i2f32: {
2616      assert(dst.size() == 1);
2617      Temp src = get_alu_src(ctx, instr->src[0]);
2618      const unsigned input_size = instr->src[0].src.ssa->bit_size;
2619      if (input_size <= 32) {
2620         if (input_size <= 16) {
2621            /* Sign-extend to 32-bits */
2622            src = convert_int(ctx, bld, src, input_size, 32, true);
2623         }
2624         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2625      } else {
2626         assert(input_size == 64);
2627         RegClass rc = RegClass(src.type(), 1);
2628         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2629         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2630         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2631         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2632         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2633         upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2634         bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2635      }
2636
2637      break;
2638   }
2639   case nir_op_i2f64: {
2640      if (instr->src[0].src.ssa->bit_size <= 32) {
2641         Temp src = get_alu_src(ctx, instr->src[0]);
2642         if (instr->src[0].src.ssa->bit_size <= 16)
2643            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2644         bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2645      } else if (instr->src[0].src.ssa->bit_size == 64) {
2646         Temp src = get_alu_src(ctx, instr->src[0]);
2647         RegClass rc = RegClass(src.type(), 1);
2648         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2649         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2650         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2651         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2652         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2653         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2654
2655      } else {
2656         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2657      }
2658      break;
2659   }
2660   case nir_op_u2f16: {
2661      assert(dst.regClass() == v2b);
2662      Temp src = get_alu_src(ctx, instr->src[0]);
2663      const unsigned input_size = instr->src[0].src.ssa->bit_size;
2664      if (input_size <= 16) {
2665         /* Expand integer to the size expected by the uint→float converter used below */
2666         unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2667         if (input_size != target_size) {
2668            src = convert_int(ctx, bld, src, input_size, target_size, false);
2669         }
2670      } else if (input_size == 64) {
2671         /* Truncate down to 32 bits; if any of the upper bits are non-zero,
2672          * the value does not fall into the single-precision float range
2673          * anyway. SPIR-V does not mandate any specific behavior for such
2674          * large inputs.
2675          */
2676         src = convert_int(ctx, bld, src, 64, 32, false);
2677      }
2678
2679      if (ctx->program->chip_class >= GFX8) {
2680         /* float16 has a range of [0, 65519]. Converting from larger
2681          * inputs is UB, so we just need to consider the lower 16 bits */
2682         bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2683      } else {
2684         /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2685         src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2686         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2687      }
2688      break;
2689   }
2690   case nir_op_u2f32: {
2691      assert(dst.size() == 1);
2692      Temp src = get_alu_src(ctx, instr->src[0]);
2693      const unsigned input_size = instr->src[0].src.ssa->bit_size;
2694      if (input_size == 8) {
2695         bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2696      } else if (input_size <= 32) {
2697         if (input_size == 16)
2698            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2699         bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2700      } else {
2701         assert(input_size == 64);
2702         RegClass rc = RegClass(src.type(), 1);
2703         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2704         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2705         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2706         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2707         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2708         upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2709         bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2710      }
2711      break;
2712   }
2713   case nir_op_u2f64: {
2714      if (instr->src[0].src.ssa->bit_size <= 32) {
2715         Temp src = get_alu_src(ctx, instr->src[0]);
2716         if (instr->src[0].src.ssa->bit_size <= 16)
2717            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2718         bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2719      } else if (instr->src[0].src.ssa->bit_size == 64) {
2720         Temp src = get_alu_src(ctx, instr->src[0]);
2721         RegClass rc = RegClass(src.type(), 1);
2722         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2723         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2724         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2725         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2726         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2727         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2728      } else {
2729         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2730      }
2731      break;
2732   }
2733   case nir_op_f2i8:
2734   case nir_op_f2i16: {
2735      if (instr->src[0].src.ssa->bit_size == 16) {
2736         if (ctx->program->chip_class >= GFX8) {
2737            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2738         } else {
2739            /* GFX7 and earlier do not support direct f16⟷i16 conversions */
2740            Temp tmp = bld.tmp(v1);
2741            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2742            tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
2743            tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2744                              (dst.type() == RegType::sgpr) ? Temp() : dst);
2745            if (dst.type() == RegType::sgpr) {
2746               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2747            }
2748         }
2749      } else if (instr->src[0].src.ssa->bit_size == 32) {
2750         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2751      } else {
2752         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2753      }
2754      break;
2755   }
2756   case nir_op_f2u8:
2757   case nir_op_f2u16: {
2758      if (instr->src[0].src.ssa->bit_size == 16) {
2759         if (ctx->program->chip_class >= GFX8) {
2760            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2761         } else {
2762            /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2763            Temp tmp = bld.tmp(v1);
2764            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2765            tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
2766            tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2767                              (dst.type() == RegType::sgpr) ? Temp() : dst);
2768            if (dst.type() == RegType::sgpr) {
2769               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2770            }
2771         }
2772      } else if (instr->src[0].src.ssa->bit_size == 32) {
2773         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2774      } else {
2775         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2776      }
2777      break;
2778   }
2779   case nir_op_f2i32: {
2780      Temp src = get_alu_src(ctx, instr->src[0]);
2781      if (instr->src[0].src.ssa->bit_size == 16) {
2782         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2783         if (dst.type() == RegType::vgpr) {
2784            bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2785         } else {
2786            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2787                       bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2788         }
2789      } else if (instr->src[0].src.ssa->bit_size == 32) {
2790         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2791      } else if (instr->src[0].src.ssa->bit_size == 64) {
2792         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2793      } else {
2794         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2795      }
2796      break;
2797   }
2798   case nir_op_f2u32: {
2799      Temp src = get_alu_src(ctx, instr->src[0]);
2800      if (instr->src[0].src.ssa->bit_size == 16) {
2801         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2802         if (dst.type() == RegType::vgpr) {
2803            bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2804         } else {
2805            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2806                       bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2807         }
2808      } else if (instr->src[0].src.ssa->bit_size == 32) {
2809         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2810      } else if (instr->src[0].src.ssa->bit_size == 64) {
2811         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2812      } else {
2813         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2814      }
2815      break;
2816   }
2817   case nir_op_f2i64: {
2818      Temp src = get_alu_src(ctx, instr->src[0]);
2819      if (instr->src[0].src.ssa->bit_size == 16)
2820         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2821
2822      if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2823         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2824         exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
2825                             Operand::c32(64u));
2826         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2827         Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
2828         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2829         mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
2830         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2831         Temp new_exponent = bld.tmp(v1);
2832         Temp borrow =
2833            bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
2834         if (ctx->program->chip_class >= GFX8)
2835            mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2836         else
2837            mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2838         Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
2839         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2840         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2841         lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
2842                              Operand::c32(0xffffffffu), borrow);
2843         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2844         lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2845         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2846         Temp new_lower = bld.tmp(v1);
2847         borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2848         Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2849         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2850
2851      } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2852         if (src.type() == RegType::vgpr)
2853            src = bld.as_uniform(src);
2854         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2855                                  Operand::c32(0x80017u));
2856         exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2857                             Operand::c32(126u));
2858         exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2859                             exponent);
2860         exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
2861                             Operand::c32(64u), exponent);
2862         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2863                                  Operand::c32(0x7fffffu), src);
2864         Temp sign =
2865            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
2866         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2867                             Operand::c32(0x800000u), mantissa);
2868         mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2869                             Operand::c32(7u));
2870         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2871         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2872                             Operand::c32(63u), exponent);
2873         mantissa =
2874            bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2875         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
2876                              Operand::c32(0xffffffffu)); // exp >= 64
2877         Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
2878         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2879         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2880         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2881         lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2882         upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2883         Temp borrow = bld.tmp(s1);
2884         lower =
2885            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2886         upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
2887                          bld.scc(borrow));
2888         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2889
2890      } else if (instr->src[0].src.ssa->bit_size == 64) {
2891         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2892                               Operand::c32(0x3df00000u));
2893         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2894         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2895         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2896                          Operand::c32(0xc1f00000u));
2897         Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2898         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2899         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2900         Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2901         if (dst.type() == RegType::sgpr) {
2902            lower = bld.as_uniform(lower);
2903            upper = bld.as_uniform(upper);
2904         }
2905         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2906
2907      } else {
2908         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2909      }
2910      break;
2911   }
2912   case nir_op_f2u64: {
2913      Temp src = get_alu_src(ctx, instr->src[0]);
2914      if (instr->src[0].src.ssa->bit_size == 16)
2915         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2916
2917      if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2918         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2919         Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
2920                                           Operand::c32(64u), exponent);
2921         exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
2922         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2923         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2924         Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
2925         Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2926         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2927         Temp new_exponent = bld.tmp(v1);
2928         Temp cond_small =
2929            bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
2930         if (ctx->program->chip_class >= GFX8)
2931            mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2932         else
2933            mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2934         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2935         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2936         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2937         upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
2938                              cond_small);
2939         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
2940                          exponent_in_range);
2941         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
2942                          exponent_in_range);
2943         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2944
2945      } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2946         if (src.type() == RegType::vgpr)
2947            src = bld.as_uniform(src);
2948         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2949                                  Operand::c32(0x80017u));
2950         exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2951                             Operand::c32(126u));
2952         exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2953                             exponent);
2954         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2955                                  Operand::c32(0x7fffffu), src);
2956         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2957                             Operand::c32(0x800000u), mantissa);
2958         Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2959                                        Operand::c32(24u), exponent);
2960         Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2961                               exponent_small);
2962         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2963         Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2964                                        exponent, Operand::c32(24u));
2965         mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
2966                             exponent_large);
2967         Temp cond =
2968            bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
2969         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
2970                             Operand::c32(0xffffffffu), cond);
2971         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2972         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2973         Temp cond_small =
2974            bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
2975         lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2976         upper =
2977            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
2978         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2979
2980      } else if (instr->src[0].src.ssa->bit_size == 64) {
2981         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2982                               Operand::c32(0x3df00000u));
2983         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2984         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2985         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2986                          Operand::c32(0xc1f00000u));
2987         Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2988         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2989         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2990         Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2991         if (dst.type() == RegType::sgpr) {
2992            lower = bld.as_uniform(lower);
2993            upper = bld.as_uniform(upper);
2994         }
2995         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2996
2997      } else {
2998         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2999      }
3000      break;
3001   }
3002   case nir_op_b2f16: {
3003      Temp src = get_alu_src(ctx, instr->src[0]);
3004      assert(src.regClass() == bld.lm);
3005
3006      if (dst.regClass() == s1) {
3007         src = bool_to_scalar_condition(ctx, src);
3008         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3009      } else if (dst.regClass() == v2b) {
3010         Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3011         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3012      } else {
3013         unreachable("Wrong destination register class for nir_op_b2f16.");
3014      }
3015      break;
3016   }
3017   case nir_op_b2f32: {
3018      Temp src = get_alu_src(ctx, instr->src[0]);
3019      assert(src.regClass() == bld.lm);
3020
3021      if (dst.regClass() == s1) {
3022         src = bool_to_scalar_condition(ctx, src);
3023         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3024      } else if (dst.regClass() == v1) {
3025         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3026                      Operand::c32(0x3f800000u), src);
3027      } else {
3028         unreachable("Wrong destination register class for nir_op_b2f32.");
3029      }
3030      break;
3031   }
3032   case nir_op_b2f64: {
3033      Temp src = get_alu_src(ctx, instr->src[0]);
3034      assert(src.regClass() == bld.lm);
3035
3036      if (dst.regClass() == s2) {
3037         src = bool_to_scalar_condition(ctx, src);
3038         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3039                  Operand::zero(), bld.scc(src));
3040      } else if (dst.regClass() == v2) {
3041         Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3042         Temp upper =
3043            bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3044         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3045      } else {
3046         unreachable("Wrong destination register class for nir_op_b2f64.");
3047      }
3048      break;
3049   }
3050   case nir_op_i2i8:
3051   case nir_op_i2i16:
3052   case nir_op_i2i32:
3053   case nir_op_i2i64: {
3054      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3055         /* no need to do the extract in get_alu_src() */
3056         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3057                                     ? sgpr_extract_sext
3058                                     : sgpr_extract_undef;
3059         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3060      } else {
3061         const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3062         const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
3063         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3064                     output_bitsize > input_bitsize, dst);
3065      }
3066      break;
3067   }
3068   case nir_op_u2u8:
3069   case nir_op_u2u16:
3070   case nir_op_u2u32:
3071   case nir_op_u2u64: {
3072      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3073         /* no need to do the extract in get_alu_src() */
3074         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3075                                     ? sgpr_extract_zext
3076                                     : sgpr_extract_undef;
3077         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3078      } else {
3079         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3080                     instr->dest.dest.ssa.bit_size, false, dst);
3081      }
3082      break;
3083   }
3084   case nir_op_b2b32:
3085   case nir_op_b2i8:
3086   case nir_op_b2i16:
3087   case nir_op_b2i32:
3088   case nir_op_b2i64: {
3089      Temp src = get_alu_src(ctx, instr->src[0]);
3090      assert(src.regClass() == bld.lm);
3091
3092      Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
3093      if (tmp.regClass() == s1) {
3094         bool_to_scalar_condition(ctx, src, tmp);
3095      } else if (tmp.type() == RegType::vgpr) {
3096         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
3097                      src);
3098      } else {
3099         unreachable("Invalid register class for b2i32");
3100      }
3101
3102      if (tmp != dst)
3103         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
3104      break;
3105   }
3106   case nir_op_b2b1:
3107   case nir_op_i2b1: {
3108      Temp src = get_alu_src(ctx, instr->src[0]);
3109      assert(dst.regClass() == bld.lm);
3110
3111      if (src.type() == RegType::vgpr) {
3112         assert(src.regClass() == v1 || src.regClass() == v2);
3113         assert(dst.regClass() == bld.lm);
3114         bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3115                  Definition(dst), Operand::zero(), src)
3116            .def(0)
3117            .setHint(vcc);
3118      } else {
3119         assert(src.regClass() == s1 || src.regClass() == s2);
3120         Temp tmp;
3121         if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
3122            tmp =
3123               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3124                  .def(1)
3125                  .getTemp();
3126         } else {
3127            tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3128                           bld.scc(bld.def(s1)), Operand::zero(), src);
3129         }
3130         bool_to_vector_condition(ctx, tmp, dst);
3131      }
3132      break;
3133   }
3134   case nir_op_unpack_64_2x32:
3135   case nir_op_unpack_32_2x16:
3136   case nir_op_unpack_64_4x16:
3137      bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3138      emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3139      break;
3140   case nir_op_pack_64_2x32_split: {
3141      Temp src0 = get_alu_src(ctx, instr->src[0]);
3142      Temp src1 = get_alu_src(ctx, instr->src[1]);
3143
3144      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3145      break;
3146   }
3147   case nir_op_unpack_64_2x32_split_x:
3148      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3149                 get_alu_src(ctx, instr->src[0]));
3150      break;
3151   case nir_op_unpack_64_2x32_split_y:
3152      bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3153                 get_alu_src(ctx, instr->src[0]));
3154      break;
3155   case nir_op_unpack_32_2x16_split_x:
3156      if (dst.type() == RegType::vgpr) {
3157         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3158                    get_alu_src(ctx, instr->src[0]));
3159      } else {
3160         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3161      }
3162      break;
3163   case nir_op_unpack_32_2x16_split_y:
3164      if (dst.type() == RegType::vgpr) {
3165         bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3166                    get_alu_src(ctx, instr->src[0]));
3167      } else {
3168         bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3169                    get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3170                    Operand::zero());
3171      }
3172      break;
3173   case nir_op_pack_32_2x16_split: {
3174      Temp src0 = get_alu_src(ctx, instr->src[0]);
3175      Temp src1 = get_alu_src(ctx, instr->src[1]);
3176      if (dst.regClass() == v1) {
3177         src0 = emit_extract_vector(ctx, src0, 0, v2b);
3178         src1 = emit_extract_vector(ctx, src1, 0, v2b);
3179         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3180      } else {
3181         src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3182                         Operand::c32(0xFFFFu));
3183         src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3184                         Operand::c32(16u));
3185         bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3186      }
3187      break;
3188   }
3189   case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3190   case nir_op_pack_half_2x16_split: {
3191      if (dst.regClass() == v1) {
3192         if (!ctx->block->fp_mode.care_about_round16_64 ||
3193             ctx->block->fp_mode.round16_64 == fp_round_tz) {
3194            if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
3195               emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3196            else
3197               emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3198         } else {
3199            Temp src0 =
3200               bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
3201            Temp src1 =
3202               bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
3203            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3204         }
3205      } else {
3206         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3207      }
3208      break;
3209   }
3210   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3211   case nir_op_unpack_half_2x16_split_x: {
3212      Temp src = get_alu_src(ctx, instr->src[0]);
3213      if (src.regClass() == v1)
3214         src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3215      if (dst.regClass() == v1) {
3216         assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3217                (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3218         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3219      } else {
3220         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3221      }
3222      break;
3223   }
3224   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3225   case nir_op_unpack_half_2x16_split_y: {
3226      Temp src = get_alu_src(ctx, instr->src[0]);
3227      if (src.regClass() == s1)
3228         src =
3229            bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
3230      else
3231         src =
3232            bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3233      if (dst.regClass() == v1) {
3234         assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3235                (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3236         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3237      } else {
3238         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3239      }
3240      break;
3241   }
3242   case nir_op_sad_u8x4: {
3243      assert(dst.regClass() == v1);
3244      emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3245      break;
3246   }
3247   case nir_op_fquantize2f16: {
3248      Temp src = get_alu_src(ctx, instr->src[0]);
3249      Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
3250      Temp f32, cmp_res;
3251
3252      if (ctx->program->chip_class >= GFX8) {
3253         Temp mask = bld.copy(
3254            bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3255         cmp_res =
3256            bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
3257         f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3258      } else {
3259         /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
3260          * so compare the result and flush to 0 if it's smaller.
3261          */
3262         f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3263         Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3264         Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3265         tmp0->vop3().abs[0] = true;
3266         Temp tmp1 =
3267            bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
3268         cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3269                            tmp0->definitions[0].getTemp(), tmp1);
3270      }
3271
3272      if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3273         Temp copysign_0 =
3274            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3275         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3276      } else {
3277         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3278      }
3279      break;
3280   }
3281   case nir_op_bfm: {
3282      Temp bits = get_alu_src(ctx, instr->src[0]);
3283      Temp offset = get_alu_src(ctx, instr->src[1]);
3284
3285      if (dst.regClass() == s1) {
3286         bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3287      } else if (dst.regClass() == v1) {
3288         bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3289      } else {
3290         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3291      }
3292      break;
3293   }
3294   case nir_op_bitfield_select: {
3295
3296      /* dst = (insert & bitmask) | (base & ~bitmask) */
3297      if (dst.regClass() == s1) {
3298         Temp bitmask = get_alu_src(ctx, instr->src[0]);
3299         Temp insert = get_alu_src(ctx, instr->src[1]);
3300         Temp base = get_alu_src(ctx, instr->src[2]);
3301         aco_ptr<Instruction> sop2;
3302         nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3303         nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3304         Operand lhs;
3305         if (const_insert && const_bitmask) {
3306            lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3307         } else {
3308            insert =
3309               bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3310            lhs = Operand(insert);
3311         }
3312
3313         Operand rhs;
3314         nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3315         if (const_base && const_bitmask) {
3316            rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3317         } else {
3318            base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3319            rhs = Operand(base);
3320         }
3321
3322         bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3323
3324      } else if (dst.regClass() == v1) {
3325         emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3326      } else {
3327         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3328      }
3329      break;
3330   }
3331   case nir_op_ubfe:
3332   case nir_op_ibfe: {
3333      if (dst.bytes() != 4)
3334         unreachable("Unsupported BFE bit size");
3335
3336      if (dst.type() == RegType::sgpr) {
3337         Temp base = get_alu_src(ctx, instr->src[0]);
3338
3339         nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3340         nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3341         if (const_offset && const_bits) {
3342            uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3343            aco_opcode opcode =
3344               instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3345            bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3346            break;
3347         }
3348
3349         Temp offset = get_alu_src(ctx, instr->src[1]);
3350         Temp bits = get_alu_src(ctx, instr->src[2]);
3351         if (instr->op == nir_op_ubfe) {
3352            Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3353            Temp masked =
3354               bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3355            bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3356         } else {
3357            Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3358                                         : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3359                                                    bld.def(s1, scc), bits, Operand::c32(16u));
3360            Operand offset_op = const_offset
3361                                   ? Operand::c32(const_offset->u32 & 0x1fu)
3362                                   : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3363                                              offset, Operand::c32(0x1fu));
3364
3365            Temp extract =
3366               bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3367            bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3368         }
3369
3370      } else {
3371         aco_opcode opcode =
3372            instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3373         emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3374      }
3375      break;
3376   }
3377   case nir_op_extract_u8:
3378   case nir_op_extract_i8:
3379   case nir_op_extract_u16:
3380   case nir_op_extract_i16: {
3381      bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3382      unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3383      uint32_t bits = comp == 4 ? 8 : 16;
3384      unsigned index = nir_src_as_uint(instr->src[1].src);
3385      if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3386         assert(index == 0);
3387         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3388      } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
3389         Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3390         unsigned swizzle = instr->src[0].swizzle[0];
3391         if (vec.size() > 1) {
3392            vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3393            swizzle = swizzle & 1;
3394         }
3395         index += swizzle * instr->dest.dest.ssa.bit_size / bits;
3396         bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3397                    Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3398      } else {
3399         Temp src = get_alu_src(ctx, instr->src[0]);
3400         Definition def(dst);
3401         if (dst.bytes() == 8) {
3402            src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3403            index %= comp;
3404            def = bld.def(src.type(), 1);
3405         }
3406         assert(def.bytes() <= 4);
3407         if (def.regClass() == s1) {
3408            bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3409                       Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3410         } else {
3411            src = emit_extract_vector(ctx, src, 0, def.regClass());
3412            bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3413                       Operand::c32(bits), Operand::c32(is_signed));
3414         }
3415         if (dst.size() == 2)
3416            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3417                       Operand::zero());
3418      }
3419      break;
3420   }
3421   case nir_op_insert_u8:
3422   case nir_op_insert_u16: {
3423      unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3424      uint32_t bits = comp == 4 ? 8 : 16;
3425      unsigned index = nir_src_as_uint(instr->src[1].src);
3426      if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3427         assert(index == 0);
3428         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3429      } else {
3430         Temp src = get_alu_src(ctx, instr->src[0]);
3431         Definition def(dst);
3432         bool swap = false;
3433         if (dst.bytes() == 8) {
3434            src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3435            swap = index >= comp;
3436            index %= comp;
3437            def = bld.def(src.type(), 1);
3438         }
3439         if (def.regClass() == s1) {
3440            bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3441                       Operand::c32(index), Operand::c32(bits));
3442         } else {
3443            src = emit_extract_vector(ctx, src, 0, def.regClass());
3444            bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3445                       Operand::c32(bits));
3446         }
3447         if (dst.size() == 2 && swap)
3448            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3449                       def.getTemp());
3450         else if (dst.size() == 2)
3451            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3452                       Operand::zero());
3453      }
3454      break;
3455   }
3456   case nir_op_bit_count: {
3457      Temp src = get_alu_src(ctx, instr->src[0]);
3458      if (src.regClass() == s1) {
3459         bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3460      } else if (src.regClass() == v1) {
3461         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3462      } else if (src.regClass() == v2) {
3463         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3464                  bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3465                           emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3466      } else if (src.regClass() == s2) {
3467         bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3468      } else {
3469         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3470      }
3471      break;
3472   }
3473   case nir_op_flt: {
3474      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3475                      aco_opcode::v_cmp_lt_f64);
3476      break;
3477   }
3478   case nir_op_fge: {
3479      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3480                      aco_opcode::v_cmp_ge_f64);
3481      break;
3482   }
3483   case nir_op_feq: {
3484      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3485                      aco_opcode::v_cmp_eq_f64);
3486      break;
3487   }
3488   case nir_op_fneu: {
3489      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3490                      aco_opcode::v_cmp_neq_f64);
3491      break;
3492   }
3493   case nir_op_ilt: {
3494      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3495                      aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3496      break;
3497   }
3498   case nir_op_ige: {
3499      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3500                      aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3501      break;
3502   }
3503   case nir_op_ieq: {
3504      if (instr->src[0].src.ssa->bit_size == 1)
3505         emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3506      else
3507         emit_comparison(
3508            ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3509            aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3510            ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3511      break;
3512   }
3513   case nir_op_ine: {
3514      if (instr->src[0].src.ssa->bit_size == 1)
3515         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3516      else
3517         emit_comparison(
3518            ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3519            aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3520            ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3521      break;
3522   }
3523   case nir_op_ult: {
3524      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3525                      aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3526      break;
3527   }
3528   case nir_op_uge: {
3529      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3530                      aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3531      break;
3532   }
3533   case nir_op_fddx:
3534   case nir_op_fddy:
3535   case nir_op_fddx_fine:
3536   case nir_op_fddy_fine:
3537   case nir_op_fddx_coarse:
3538   case nir_op_fddy_coarse: {
3539      if (!nir_src_is_divergent(instr->src[0].src)) {
3540         /* Source is the same in all lanes, so the derivative is zero.
3541          * This also avoids emitting invalid IR.
3542          */
3543         bld.copy(Definition(dst), Operand::zero());
3544         break;
3545      }
3546
3547      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3548      uint16_t dpp_ctrl1, dpp_ctrl2;
3549      if (instr->op == nir_op_fddx_fine) {
3550         dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3551         dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3552      } else if (instr->op == nir_op_fddy_fine) {
3553         dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3554         dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3555      } else {
3556         dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3557         if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3558            dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3559         else
3560            dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3561      }
3562
3563      Temp tmp;
3564      if (ctx->program->chip_class >= GFX8) {
3565         Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3566         tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3567      } else {
3568         Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3569         Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3570         tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3571      }
3572      emit_wqm(bld, tmp, dst, true);
3573      break;
3574   }
3575   default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3576   }
3577}
3578
3579void
3580visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3581{
3582   Temp dst = get_ssa_temp(ctx, &instr->def);
3583
3584   // TODO: we really want to have the resulting type as this would allow for 64bit literals
3585   // which get truncated the lsb if double and msb if int
3586   // for now, we only use s_mov_b64 with 64bit inline constants
3587   assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3588   assert(dst.type() == RegType::sgpr);
3589
3590   Builder bld(ctx->program, ctx->block);
3591
3592   if (instr->def.bit_size == 1) {
3593      assert(dst.regClass() == bld.lm);
3594      int val = instr->value[0].b ? -1 : 0;
3595      Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3596      bld.copy(Definition(dst), op);
3597   } else if (instr->def.bit_size == 8) {
3598      bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3599   } else if (instr->def.bit_size == 16) {
3600      /* sign-extend to use s_movk_i32 instead of a literal */
3601      bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3602   } else if (dst.size() == 1) {
3603      bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3604   } else {
3605      assert(dst.size() != 1);
3606      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3607         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3608      if (instr->def.bit_size == 64)
3609         for (unsigned i = 0; i < dst.size(); i++)
3610            vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3611      else {
3612         for (unsigned i = 0; i < dst.size(); i++)
3613            vec->operands[i] = Operand::c32(instr->value[i].u32);
3614      }
3615      vec->definitions[0] = Definition(dst);
3616      ctx->block->instructions.emplace_back(std::move(vec));
3617   }
3618}
3619
3620uint32_t
3621widen_mask(uint32_t mask, unsigned multiplier)
3622{
3623   uint32_t new_mask = 0;
3624   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3625      if (mask & (1u << i))
3626         new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3627   return new_mask;
3628}
3629
3630struct LoadEmitInfo {
3631   Operand offset;
3632   Temp dst;
3633   unsigned num_components;
3634   unsigned component_size;
3635   Temp resource = Temp(0, s1);
3636   unsigned component_stride = 0;
3637   unsigned const_offset = 0;
3638   unsigned align_mul = 0;
3639   unsigned align_offset = 0;
3640
3641   bool glc = false;
3642   bool slc = false;
3643   unsigned swizzle_component_size = 0;
3644   memory_sync_info sync;
3645   Temp soffset = Temp(0, s1);
3646};
3647
3648struct EmitLoadParameters {
3649   using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3650                             unsigned bytes_needed, unsigned align, unsigned const_offset,
3651                             Temp dst_hint);
3652
3653   Callback callback;
3654   bool byte_align_loads;
3655   bool supports_8bit_16bit_loads;
3656   unsigned max_const_offset_plus_one;
3657};
3658
3659void
3660emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3661          const EmitLoadParameters& params)
3662{
3663   unsigned load_size = info.num_components * info.component_size;
3664   unsigned component_size = info.component_size;
3665
3666   unsigned num_vals = 0;
3667   Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3668
3669   unsigned const_offset = info.const_offset;
3670
3671   const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3672   unsigned align_offset = (info.align_offset + const_offset) % align_mul;
3673
3674   unsigned bytes_read = 0;
3675   while (bytes_read < load_size) {
3676      unsigned bytes_needed = load_size - bytes_read;
3677
3678      /* add buffer for unaligned loads */
3679      int byte_align = 0;
3680      if (params.byte_align_loads) {
3681         byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3682      }
3683
3684      if (byte_align) {
3685         if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3686             !params.supports_8bit_16bit_loads) {
3687            if (info.component_stride) {
3688               assert(params.supports_8bit_16bit_loads && "unimplemented");
3689               bytes_needed = 2;
3690               byte_align = 0;
3691            } else {
3692               bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3693               bytes_needed = align(bytes_needed, 4);
3694            }
3695         } else {
3696            byte_align = 0;
3697         }
3698      }
3699
3700      if (info.swizzle_component_size)
3701         bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3702      if (info.component_stride)
3703         bytes_needed = MIN2(bytes_needed, info.component_size);
3704
3705      bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3706
3707      /* reduce constant offset */
3708      Operand offset = info.offset;
3709      unsigned reduced_const_offset = const_offset;
3710      bool remove_const_offset_completely = need_to_align_offset;
3711      if (const_offset &&
3712          (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
3713         unsigned to_add = const_offset;
3714         if (remove_const_offset_completely) {
3715            reduced_const_offset = 0;
3716         } else {
3717            to_add =
3718               const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
3719            reduced_const_offset %= params.max_const_offset_plus_one;
3720         }
3721         Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3722         if (offset.isConstant()) {
3723            offset = Operand::c32(offset.constantValue() + to_add);
3724         } else if (offset_tmp.regClass() == s1) {
3725            offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
3726                              Operand::c32(to_add));
3727         } else if (offset_tmp.regClass() == v1) {
3728            offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
3729         } else {
3730            Temp lo = bld.tmp(offset_tmp.type(), 1);
3731            Temp hi = bld.tmp(offset_tmp.type(), 1);
3732            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3733
3734            if (offset_tmp.regClass() == s2) {
3735               Temp carry = bld.tmp(s1);
3736               lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
3737                             Operand::c32(to_add));
3738               hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3739               offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3740            } else {
3741               Temp new_lo = bld.tmp(v1);
3742               Temp carry =
3743                  bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
3744               hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
3745               offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3746            }
3747         }
3748      }
3749
3750      /* align offset down if needed */
3751      Operand aligned_offset = offset;
3752      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3753      if (need_to_align_offset) {
3754         align = 4;
3755         Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3756         if (offset.isConstant()) {
3757            aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
3758         } else if (offset_tmp.regClass() == s1) {
3759            aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3760                                      Operand::c32(0xfffffffcu), offset_tmp);
3761         } else if (offset_tmp.regClass() == s2) {
3762            aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
3763                                      Operand::c64(0xfffffffffffffffcllu), offset_tmp);
3764         } else if (offset_tmp.regClass() == v1) {
3765            aligned_offset =
3766               bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
3767         } else if (offset_tmp.regClass() == v2) {
3768            Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3769            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3770            lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
3771            aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3772         }
3773      }
3774      Temp aligned_offset_tmp =
3775         aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
3776
3777      Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3778                                 reduced_const_offset, byte_align ? Temp() : info.dst);
3779
3780      /* the callback wrote directly to dst */
3781      if (val == info.dst) {
3782         assert(num_vals == 0);
3783         emit_split_vector(ctx, info.dst, info.num_components);
3784         return;
3785      }
3786
3787      /* shift result right if needed */
3788      if (params.byte_align_loads && info.component_size < 4) {
3789         Operand byte_align_off = Operand::c32(byte_align);
3790         if (byte_align == -1) {
3791            if (offset.isConstant())
3792               byte_align_off = Operand::c32(offset.constantValue() % 4u);
3793            else if (offset.size() == 2)
3794               byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
3795                                                            RegClass(offset.getTemp().type(), 1)));
3796            else
3797               byte_align_off = offset;
3798         }
3799
3800         assert(val.bytes() >= load_size && "unimplemented");
3801         if (val.type() == RegType::sgpr)
3802            byte_align_scalar(ctx, val, byte_align_off, info.dst);
3803         else
3804            byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
3805         return;
3806      }
3807
3808      /* add result to list and advance */
3809      if (info.component_stride) {
3810         assert(val.bytes() == info.component_size && "unimplemented");
3811         const_offset += info.component_stride;
3812         align_offset = (align_offset + info.component_stride) % align_mul;
3813      } else {
3814         const_offset += val.bytes();
3815         align_offset = (align_offset + val.bytes()) % align_mul;
3816      }
3817      bytes_read += val.bytes();
3818      vals[num_vals++] = val;
3819   }
3820
3821   /* create array of components */
3822   unsigned components_split = 0;
3823   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3824   bool has_vgprs = false;
3825   for (unsigned i = 0; i < num_vals;) {
3826      Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
3827      unsigned num_tmps = 0;
3828      unsigned tmp_size = 0;
3829      RegType reg_type = RegType::sgpr;
3830      while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3831         if (vals[i].type() == RegType::vgpr)
3832            reg_type = RegType::vgpr;
3833         tmp_size += vals[i].bytes();
3834         tmp[num_tmps++] = vals[i++];
3835      }
3836      if (num_tmps > 1) {
3837         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3838            aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3839         for (unsigned j = 0; j < num_tmps; j++)
3840            vec->operands[j] = Operand(tmp[j]);
3841         tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3842         vec->definitions[0] = Definition(tmp[0]);
3843         bld.insert(std::move(vec));
3844      }
3845
3846      if (tmp[0].bytes() % component_size) {
3847         /* trim tmp[0] */
3848         assert(i == num_vals);
3849         RegClass new_rc =
3850            RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3851         tmp[0] =
3852            bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
3853      }
3854
3855      RegClass elem_rc = RegClass::get(reg_type, component_size);
3856
3857      unsigned start = components_split;
3858
3859      if (tmp_size == elem_rc.bytes()) {
3860         allocated_vec[components_split++] = tmp[0];
3861      } else {
3862         assert(tmp_size % elem_rc.bytes() == 0);
3863         aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3864            aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3865         for (auto& def : split->definitions) {
3866            Temp component = bld.tmp(elem_rc);
3867            allocated_vec[components_split++] = component;
3868            def = Definition(component);
3869         }
3870         split->operands[0] = Operand(tmp[0]);
3871         bld.insert(std::move(split));
3872      }
3873
3874      /* try to p_as_uniform early so we can create more optimizable code and
3875       * also update allocated_vec */
3876      for (unsigned j = start; j < components_split; j++) {
3877         if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
3878            allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3879         has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3880      }
3881   }
3882
3883   /* concatenate components and p_as_uniform() result if needed */
3884   if (info.dst.type() == RegType::vgpr || !has_vgprs)
3885      ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
3886
3887   int padding_bytes =
3888      MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
3889
3890   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3891      aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
3892   for (unsigned i = 0; i < info.num_components; i++)
3893      vec->operands[i] = Operand(allocated_vec[i]);
3894   if (padding_bytes)
3895      vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3896   if (info.dst.type() == RegType::sgpr && has_vgprs) {
3897      Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
3898      vec->definitions[0] = Definition(tmp);
3899      bld.insert(std::move(vec));
3900      bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
3901   } else {
3902      vec->definitions[0] = Definition(info.dst);
3903      bld.insert(std::move(vec));
3904   }
3905}
3906
3907Operand
3908load_lds_size_m0(Builder& bld)
3909{
3910   /* m0 does not need to be initialized on GFX9+ */
3911   if (bld.program->chip_class >= GFX9)
3912      return Operand(s1);
3913
3914   return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
3915}
3916
3917Temp
3918lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3919                  unsigned align, unsigned const_offset, Temp dst_hint)
3920{
3921   offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3922
3923   Operand m = load_lds_size_m0(bld);
3924
3925   bool large_ds_read = bld.program->chip_class >= GFX7;
3926   bool usable_read2 = bld.program->chip_class >= GFX7;
3927
3928   bool read2 = false;
3929   unsigned size = 0;
3930   aco_opcode op;
3931   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3932      size = 16;
3933      op = aco_opcode::ds_read_b128;
3934   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3935      size = 16;
3936      read2 = true;
3937      op = aco_opcode::ds_read2_b64;
3938   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3939      size = 12;
3940      op = aco_opcode::ds_read_b96;
3941   } else if (bytes_needed >= 8 && align % 8 == 0) {
3942      size = 8;
3943      op = aco_opcode::ds_read_b64;
3944   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
3945      size = 8;
3946      read2 = true;
3947      op = aco_opcode::ds_read2_b32;
3948   } else if (bytes_needed >= 4 && align % 4 == 0) {
3949      size = 4;
3950      op = aco_opcode::ds_read_b32;
3951   } else if (bytes_needed >= 2 && align % 2 == 0) {
3952      size = 2;
3953      op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
3954   } else {
3955      size = 1;
3956      op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
3957   }
3958
3959   unsigned const_offset_unit = read2 ? size / 2u : 1u;
3960   unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
3961
3962   if (const_offset > (const_offset_range - const_offset_unit)) {
3963      unsigned excess = const_offset - (const_offset % const_offset_range);
3964      offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
3965      const_offset -= excess;
3966   }
3967
3968   const_offset /= const_offset_unit;
3969
3970   RegClass rc = RegClass::get(RegType::vgpr, size);
3971   Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3972   Instruction* instr;
3973   if (read2)
3974      instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3975   else
3976      instr = bld.ds(op, Definition(val), offset, m, const_offset);
3977   instr->ds().sync = info.sync;
3978
3979   if (m.isUndefined())
3980      instr->operands.pop_back();
3981
3982   return val;
3983}
3984
3985const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
3986
3987Temp
3988smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3989                   unsigned align, unsigned const_offset, Temp dst_hint)
3990{
3991   unsigned size = 0;
3992   aco_opcode op;
3993   if (bytes_needed <= 4) {
3994      size = 1;
3995      op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3996   } else if (bytes_needed <= 8) {
3997      size = 2;
3998      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3999   } else if (bytes_needed <= 16) {
4000      size = 4;
4001      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4002   } else if (bytes_needed <= 32) {
4003      size = 8;
4004      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4005   } else {
4006      size = 16;
4007      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4008   }
4009   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4010   if (info.resource.id()) {
4011      load->operands[0] = Operand(info.resource);
4012      load->operands[1] = Operand(offset);
4013   } else {
4014      load->operands[0] = Operand(offset);
4015      load->operands[1] = Operand::zero();
4016   }
4017   RegClass rc(RegType::sgpr, size);
4018   Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4019   load->definitions[0] = Definition(val);
4020   load->glc = info.glc;
4021   load->dlc = info.glc && bld.program->chip_class >= GFX10;
4022   load->sync = info.sync;
4023   bld.insert(std::move(load));
4024   return val;
4025}
4026
4027const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4028
4029Temp
4030mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4031                    unsigned align_, unsigned const_offset, Temp dst_hint)
4032{
4033   Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4034   Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4035
4036   if (info.soffset.id()) {
4037      if (soffset.isTemp())
4038         vaddr = bld.copy(bld.def(v1), soffset);
4039      soffset = Operand(info.soffset);
4040   }
4041
4042   unsigned bytes_size = 0;
4043   aco_opcode op;
4044   if (bytes_needed == 1 || align_ % 2) {
4045      bytes_size = 1;
4046      op = aco_opcode::buffer_load_ubyte;
4047   } else if (bytes_needed == 2 || align_ % 4) {
4048      bytes_size = 2;
4049      op = aco_opcode::buffer_load_ushort;
4050   } else if (bytes_needed <= 4) {
4051      bytes_size = 4;
4052      op = aco_opcode::buffer_load_dword;
4053   } else if (bytes_needed <= 8) {
4054      bytes_size = 8;
4055      op = aco_opcode::buffer_load_dwordx2;
4056   } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
4057      bytes_size = 12;
4058      op = aco_opcode::buffer_load_dwordx3;
4059   } else {
4060      bytes_size = 16;
4061      op = aco_opcode::buffer_load_dwordx4;
4062   }
4063   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4064   mubuf->operands[0] = Operand(info.resource);
4065   mubuf->operands[1] = vaddr;
4066   mubuf->operands[2] = soffset;
4067   mubuf->offen = (offset.type() == RegType::vgpr);
4068   mubuf->glc = info.glc;
4069   mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
4070   mubuf->slc = info.slc;
4071   mubuf->sync = info.sync;
4072   mubuf->offset = const_offset;
4073   mubuf->swizzled = info.swizzle_component_size != 0;
4074   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4075   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4076   mubuf->definitions[0] = Definition(val);
4077   bld.insert(std::move(mubuf));
4078
4079   return val;
4080}
4081
4082const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4083const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
4084
4085Temp
4086get_gfx6_global_rsrc(Builder& bld, Temp addr)
4087{
4088   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4089                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4090
4091   if (addr.type() == RegType::vgpr)
4092      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4093                        Operand::c32(-1u), Operand::c32(rsrc_conf));
4094   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4095                     Operand::c32(rsrc_conf));
4096}
4097
4098Temp
4099global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4100                     unsigned align_, unsigned const_offset, Temp dst_hint)
4101{
4102   unsigned bytes_size = 0;
4103   bool use_mubuf = bld.program->chip_class == GFX6;
4104   bool global = bld.program->chip_class >= GFX9;
4105   aco_opcode op;
4106   if (bytes_needed == 1) {
4107      bytes_size = 1;
4108      op = use_mubuf ? aco_opcode::buffer_load_ubyte
4109           : global  ? aco_opcode::global_load_ubyte
4110                     : aco_opcode::flat_load_ubyte;
4111   } else if (bytes_needed == 2) {
4112      bytes_size = 2;
4113      op = use_mubuf ? aco_opcode::buffer_load_ushort
4114           : global  ? aco_opcode::global_load_ushort
4115                     : aco_opcode::flat_load_ushort;
4116   } else if (bytes_needed <= 4) {
4117      bytes_size = 4;
4118      op = use_mubuf ? aco_opcode::buffer_load_dword
4119           : global  ? aco_opcode::global_load_dword
4120                     : aco_opcode::flat_load_dword;
4121   } else if (bytes_needed <= 8) {
4122      bytes_size = 8;
4123      op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4124           : global  ? aco_opcode::global_load_dwordx2
4125                     : aco_opcode::flat_load_dwordx2;
4126   } else if (bytes_needed <= 12 && !use_mubuf) {
4127      bytes_size = 12;
4128      op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4129   } else {
4130      bytes_size = 16;
4131      op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4132           : global  ? aco_opcode::global_load_dwordx4
4133                     : aco_opcode::flat_load_dwordx4;
4134   }
4135   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
4136   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4137   if (use_mubuf) {
4138      aco_ptr<MUBUF_instruction> mubuf{
4139         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4140      mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
4141      mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4142      mubuf->operands[2] = Operand::zero();
4143      mubuf->glc = info.glc;
4144      mubuf->dlc = false;
4145      mubuf->offset = 0;
4146      mubuf->addr64 = offset.type() == RegType::vgpr;
4147      mubuf->disable_wqm = false;
4148      mubuf->sync = info.sync;
4149      mubuf->definitions[0] = Definition(val);
4150      bld.insert(std::move(mubuf));
4151   } else {
4152      offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
4153
4154      aco_ptr<FLAT_instruction> flat{
4155         create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4156      flat->operands[0] = Operand(offset);
4157      flat->operands[1] = Operand(s1);
4158      flat->glc = info.glc;
4159      flat->dlc = info.glc && bld.program->chip_class >= GFX10;
4160      flat->sync = info.sync;
4161      flat->offset = 0u;
4162      flat->definitions[0] = Definition(val);
4163      bld.insert(std::move(flat));
4164   }
4165
4166   return val;
4167}
4168
4169const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};
4170
4171Temp
4172load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4173         Temp address, unsigned base_offset, unsigned align)
4174{
4175   assert(util_is_power_of_two_nonzero(align));
4176
4177   Builder bld(ctx->program, ctx->block);
4178
4179   LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4180   info.align_mul = align;
4181   info.align_offset = 0;
4182   info.sync = memory_sync_info(storage_shared);
4183   info.const_offset = base_offset;
4184   emit_load(ctx, bld, info, lds_load_params);
4185
4186   return dst;
4187}
4188
4189void
4190split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4191                 Temp src)
4192{
4193   if (!count)
4194      return;
4195
4196   Builder bld(ctx->program, ctx->block);
4197
4198   /* count == 1 fast path */
4199   if (count == 1) {
4200      if (dst_type == RegType::sgpr)
4201         dst[0] = bld.as_uniform(src);
4202      else
4203         dst[0] = as_vgpr(ctx, src);
4204      return;
4205   }
4206
4207   /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4208   unsigned elem_size_bytes =
4209      1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4210
4211   ASSERTED bool is_subdword = elem_size_bytes < 4;
4212   assert(!is_subdword || dst_type == RegType::vgpr);
4213
4214   for (unsigned i = 0; i < count; i++)
4215      dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4216
4217   std::vector<Temp> temps;
4218   /* use allocated_vec if possible */
4219   auto it = ctx->allocated_vec.find(src.id());
4220   if (it != ctx->allocated_vec.end()) {
4221      if (!it->second[0].id())
4222         goto split;
4223      unsigned elem_size = it->second[0].bytes();
4224      assert(src.bytes() % elem_size == 0);
4225
4226      for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4227         if (!it->second[i].id())
4228            goto split;
4229      }
4230      if (elem_size_bytes % elem_size)
4231         goto split;
4232
4233      temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4234      elem_size_bytes = elem_size;
4235   }
4236
4237split:
4238   /* split src if necessary */
4239   if (temps.empty()) {
4240      if (is_subdword && src.type() == RegType::sgpr)
4241         src = as_vgpr(ctx, src);
4242      if (dst_type == RegType::sgpr)
4243         src = bld.as_uniform(src);
4244
4245      unsigned num_elems = src.bytes() / elem_size_bytes;
4246      aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4247         aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4248      split->operands[0] = Operand(src);
4249      for (unsigned i = 0; i < num_elems; i++) {
4250         temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4251         split->definitions[i] = Definition(temps.back());
4252      }
4253      bld.insert(std::move(split));
4254   }
4255
4256   unsigned idx = 0;
4257   for (unsigned i = 0; i < count; i++) {
4258      unsigned op_count = dst[i].bytes() / elem_size_bytes;
4259      if (op_count == 1) {
4260         if (dst_type == RegType::sgpr)
4261            dst[i] = bld.as_uniform(temps[idx++]);
4262         else
4263            dst[i] = as_vgpr(ctx, temps[idx++]);
4264         continue;
4265      }
4266
4267      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4268                                                                      Format::PSEUDO, op_count, 1)};
4269      for (unsigned j = 0; j < op_count; j++) {
4270         Temp tmp = temps[idx++];
4271         if (dst_type == RegType::sgpr)
4272            tmp = bld.as_uniform(tmp);
4273         vec->operands[j] = Operand(tmp);
4274      }
4275      vec->definitions[0] = Definition(dst[i]);
4276      bld.insert(std::move(vec));
4277   }
4278   return;
4279}
4280
4281bool
4282scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4283{
4284   unsigned start_elem = ffs(todo_mask) - 1;
4285   bool skip = !(mask & (1 << start_elem));
4286   if (skip)
4287      mask = ~mask & todo_mask;
4288
4289   mask &= todo_mask;
4290
4291   u_bit_scan_consecutive_range(&mask, start, count);
4292
4293   return !skip;
4294}
4295
4296void
4297advance_write_mask(uint32_t* todo_mask, int start, int count)
4298{
4299   *todo_mask &= ~u_bit_consecutive(0, count) << start;
4300}
4301
4302void
4303store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4304          unsigned base_offset, unsigned align)
4305{
4306   assert(util_is_power_of_two_nonzero(align));
4307   assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4308
4309   Builder bld(ctx->program, ctx->block);
4310   bool large_ds_write = ctx->options->chip_class >= GFX7;
4311   bool usable_write2 = ctx->options->chip_class >= GFX7;
4312
4313   unsigned write_count = 0;
4314   Temp write_datas[32];
4315   unsigned offsets[32];
4316   unsigned bytes[32];
4317   aco_opcode opcodes[32];
4318
4319   wrmask = widen_mask(wrmask, elem_size_bytes);
4320
4321   uint32_t todo = u_bit_consecutive(0, data.bytes());
4322   while (todo) {
4323      int offset, byte;
4324      if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4325         offsets[write_count] = offset;
4326         bytes[write_count] = byte;
4327         opcodes[write_count] = aco_opcode::num_opcodes;
4328         write_count++;
4329         advance_write_mask(&todo, offset, byte);
4330         continue;
4331      }
4332
4333      bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4334      bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4335      bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4336      bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4337
4338      // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4339      aco_opcode op = aco_opcode::num_opcodes;
4340      if (byte >= 16 && aligned16 && large_ds_write) {
4341         op = aco_opcode::ds_write_b128;
4342         byte = 16;
4343      } else if (byte >= 12 && aligned16 && large_ds_write) {
4344         op = aco_opcode::ds_write_b96;
4345         byte = 12;
4346      } else if (byte >= 8 && aligned8) {
4347         op = aco_opcode::ds_write_b64;
4348         byte = 8;
4349      } else if (byte >= 4 && aligned4) {
4350         op = aco_opcode::ds_write_b32;
4351         byte = 4;
4352      } else if (byte >= 2 && aligned2) {
4353         op = aco_opcode::ds_write_b16;
4354         byte = 2;
4355      } else if (byte >= 1) {
4356         op = aco_opcode::ds_write_b8;
4357         byte = 1;
4358      } else {
4359         assert(false);
4360      }
4361
4362      offsets[write_count] = offset;
4363      bytes[write_count] = byte;
4364      opcodes[write_count] = op;
4365      write_count++;
4366      advance_write_mask(&todo, offset, byte);
4367   }
4368
4369   Operand m = load_lds_size_m0(bld);
4370
4371   split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4372
4373   for (unsigned i = 0; i < write_count; i++) {
4374      aco_opcode op = opcodes[i];
4375      if (op == aco_opcode::num_opcodes)
4376         continue;
4377
4378      Temp split_data = write_datas[i];
4379
4380      unsigned second = write_count;
4381      if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4382         for (second = i + 1; second < write_count; second++) {
4383            if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4384               op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4385               opcodes[second] = aco_opcode::num_opcodes;
4386               break;
4387            }
4388         }
4389      }
4390
4391      bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4392      unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4393
4394      unsigned inline_offset = base_offset + offsets[i];
4395      unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4396      Temp address_offset = address;
4397      if (inline_offset > max_offset) {
4398         address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4399         inline_offset = offsets[i];
4400      }
4401
4402      /* offsets[i] shouldn't be large enough for this to happen */
4403      assert(inline_offset <= max_offset);
4404
4405      Instruction* instr;
4406      if (write2) {
4407         Temp second_data = write_datas[second];
4408         inline_offset /= split_data.bytes();
4409         instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4410                        inline_offset + write2_off);
4411      } else {
4412         instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4413      }
4414      instr->ds().sync = memory_sync_info(storage_shared);
4415
4416      if (m.isUndefined())
4417         instr->operands.pop_back();
4418   }
4419}
4420
4421aco_opcode
4422get_buffer_store_op(unsigned bytes)
4423{
4424   switch (bytes) {
4425   case 1: return aco_opcode::buffer_store_byte;
4426   case 2: return aco_opcode::buffer_store_short;
4427   case 4: return aco_opcode::buffer_store_dword;
4428   case 8: return aco_opcode::buffer_store_dwordx2;
4429   case 12: return aco_opcode::buffer_store_dwordx3;
4430   case 16: return aco_opcode::buffer_store_dwordx4;
4431   }
4432   unreachable("Unexpected store size");
4433   return aco_opcode::num_opcodes;
4434}
4435
4436void
4437split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4438                   Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4439                   Temp* write_datas, unsigned* offsets)
4440{
4441   unsigned write_count_with_skips = 0;
4442   bool skips[16];
4443   unsigned bytes[16];
4444
4445   /* determine how to split the data */
4446   unsigned todo = u_bit_consecutive(0, data.bytes());
4447   while (todo) {
4448      int offset, byte;
4449      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4450      offsets[write_count_with_skips] = offset;
4451      if (skips[write_count_with_skips]) {
4452         bytes[write_count_with_skips] = byte;
4453         advance_write_mask(&todo, offset, byte);
4454         write_count_with_skips++;
4455         continue;
4456      }
4457
4458      /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
4459       * larger than swizzle_element_size */
4460      byte = MIN2(byte, swizzle_element_size);
4461      if (byte % 4)
4462         byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
4463
4464      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
4465      if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
4466         byte = 8;
4467
4468      /* dword or larger stores have to be dword-aligned */
4469      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
4470      unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
4471      bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
4472      if (!dword_aligned)
4473         byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
4474
4475      bytes[write_count_with_skips] = byte;
4476      advance_write_mask(&todo, offset, byte);
4477      write_count_with_skips++;
4478   }
4479
4480   /* actually split data */
4481   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
4482
4483   /* remove skips */
4484   for (unsigned i = 0; i < write_count_with_skips; i++) {
4485      if (skips[i])
4486         continue;
4487      write_datas[*write_count] = write_datas[i];
4488      offsets[*write_count] = offsets[i];
4489      (*write_count)++;
4490   }
4491}
4492
4493Temp
4494create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
4495                      unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
4496{
4497   Builder bld(ctx->program, ctx->block);
4498   unsigned dword_size = elem_size_bytes / 4;
4499
4500   if (!dst.id())
4501      dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
4502
4503   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4504   aco_ptr<Pseudo_instruction> instr{
4505      create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
4506   instr->definitions[0] = Definition(dst);
4507
4508   for (unsigned i = 0; i < cnt; ++i) {
4509      if (arr[i].id()) {
4510         assert(arr[i].size() == dword_size);
4511         allocated_vec[i] = arr[i];
4512         instr->operands[i] = Operand(arr[i]);
4513      } else {
4514         Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
4515                              Operand::zero(dword_size == 2 ? 8 : 4));
4516         allocated_vec[i] = zero;
4517         instr->operands[i] = Operand(zero);
4518      }
4519   }
4520
4521   bld.insert(std::move(instr));
4522
4523   if (split_cnt)
4524      emit_split_vector(ctx, dst, split_cnt);
4525   else
4526      ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
4527
4528   return dst;
4529}
4530
4531inline unsigned
4532resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
4533{
4534   if (const_offset >= 4096) {
4535      unsigned excess_const_offset = const_offset / 4096u * 4096u;
4536      const_offset %= 4096u;
4537
4538      if (!voffset.id())
4539         voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
4540      else if (unlikely(voffset.regClass() == s1))
4541         voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
4542                            Operand::c32(excess_const_offset), Operand(voffset));
4543      else if (likely(voffset.regClass() == v1))
4544         voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
4545      else
4546         unreachable("Unsupported register class of voffset");
4547   }
4548
4549   return const_offset;
4550}
4551
4552void
4553emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
4554                        unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
4555                        bool slc = false, bool swizzled = false)
4556{
4557   assert(vdata.id());
4558   assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4559   assert(vdata.size() >= 1 && vdata.size() <= 4);
4560
4561   Builder bld(ctx->program, ctx->block);
4562   aco_opcode op = get_buffer_store_op(vdata.bytes());
4563   const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4564
4565   Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4566   Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
4567   Builder::Result r =
4568      bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4569                /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4570                /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4571                /* dlc*/ false, /* slc */ slc);
4572
4573   r.instr->mubuf().sync = sync;
4574}
4575
4576void
4577store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4578                 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4579                 bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
4580                 bool slc = false)
4581{
4582   Builder bld(ctx->program, ctx->block);
4583   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4584   assert(write_mask);
4585   write_mask = widen_mask(write_mask, elem_size_bytes);
4586
4587   unsigned write_count = 0;
4588   Temp write_datas[32];
4589   unsigned offsets[32];
4590   split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
4591                      &write_count, write_datas, offsets);
4592
4593   for (unsigned i = 0; i < write_count; i++) {
4594      unsigned const_offset = offsets[i] + base_const_offset;
4595      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
4596                              slc, !allow_combining);
4597   }
4598}
4599
4600void
4601load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4602                unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4603                unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
4604                bool slc = false)
4605{
4606   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4607   assert((num_components * elem_size_bytes) == dst.bytes());
4608   assert(!!stride != allow_combining);
4609
4610   Builder bld(ctx->program, ctx->block);
4611
4612   LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4613   info.component_stride = allow_combining ? 0 : stride;
4614   info.glc = true;
4615   info.slc = slc;
4616   info.swizzle_component_size = allow_combining ? 0 : 4;
4617   info.align_mul = MIN2(elem_size_bytes, 4);
4618   info.align_offset = 0;
4619   info.soffset = soffset;
4620   info.const_offset = base_const_offset;
4621   emit_load(ctx, bld, info, mubuf_load_params);
4622}
4623
4624Temp
4625wave_id_in_threadgroup(isel_context* ctx)
4626{
4627   Builder bld(ctx->program, ctx->block);
4628   return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4629                   get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4630}
4631
4632Temp
4633thread_id_in_threadgroup(isel_context* ctx)
4634{
4635   /* tid_in_tg = wave_id * wave_size + tid_in_wave */
4636
4637   Builder bld(ctx->program, ctx->block);
4638   Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4639
4640   if (ctx->program->workgroup_size <= ctx->program->wave_size)
4641      return tid_in_wave;
4642
4643   Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4644   Temp num_pre_threads =
4645      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4646               Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4647   return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4648}
4649
4650Temp
4651get_tess_rel_patch_id(isel_context* ctx)
4652{
4653   Builder bld(ctx->program, ctx->block);
4654
4655   switch (ctx->shader->info.stage) {
4656   case MESA_SHADER_TESS_CTRL:
4657      return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4658                        Operand::zero(), Operand::c32(8u), Operand::zero());
4659   case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4660   default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4661   }
4662}
4663
4664bool
4665store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4666{
4667   unsigned write_mask = nir_intrinsic_write_mask(instr);
4668   unsigned component = nir_intrinsic_component(instr);
4669   unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4670   nir_src offset = *nir_get_io_offset_src(instr);
4671
4672   if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4673      return false;
4674
4675   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4676
4677   if (instr->src[0].ssa->bit_size == 64)
4678      write_mask = widen_mask(write_mask, 2);
4679
4680   RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4681
4682   for (unsigned i = 0; i < 8; ++i) {
4683      if (write_mask & (1 << i)) {
4684         ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4685         ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4686      }
4687      idx++;
4688   }
4689
4690   return true;
4691}
4692
4693bool
4694load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4695{
4696   /* Only TCS per-vertex inputs are supported by this function.
4697    * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
4698    * is the same.
4699    */
4700   if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4701      return false;
4702
4703   nir_src* off_src = nir_get_io_offset_src(instr);
4704   nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4705   nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4706   bool can_use_temps =
4707      nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4708      nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4709
4710   if (!can_use_temps)
4711      return false;
4712
4713   unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4714                  4 * nir_src_as_uint(*off_src);
4715   Temp* src = &ctx->inputs.temps[idx];
4716   create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4717
4718   return true;
4719}
4720
4721static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4722
4723void
4724visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4725{
4726   if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4727       ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4728       (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4729       ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4730      bool stored_to_temps = store_output_to_temps(ctx, instr);
4731      if (!stored_to_temps) {
4732         isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4733         abort();
4734      }
4735   } else {
4736      unreachable("Shader stage not implemented");
4737   }
4738
4739   /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we
4740    * have to emit an exp here manually */
4741   if (ctx->stage.hw == HWStage::NGG &&
4742       (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4743       nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4744      export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4745}
4746
4747void
4748emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4749                  Temp prim_mask)
4750{
4751   Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4752   Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4753
4754   Builder bld(ctx->program, ctx->block);
4755
4756   if (dst.regClass() == v2b) {
4757      if (ctx->program->dev.has_16bank_lds) {
4758         assert(ctx->options->chip_class <= GFX8);
4759         Builder::Result interp_p1 =
4760            bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4761                       bld.m0(prim_mask), idx, component);
4762         interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4763                                bld.m0(prim_mask), interp_p1, idx, component);
4764         bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4765                    interp_p1, idx, component);
4766      } else {
4767         aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4768
4769         if (ctx->options->chip_class == GFX8)
4770            interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4771
4772         Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4773                                                bld.m0(prim_mask), idx, component);
4774         bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4775                    component);
4776      }
4777   } else {
4778      Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4779                                             bld.m0(prim_mask), idx, component);
4780
4781      if (ctx->program->dev.has_16bank_lds)
4782         interp_p1.instr->operands[0].setLateKill(true);
4783
4784      bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4785                 idx, component);
4786   }
4787}
4788
4789void
4790emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4791{
4792   Builder bld(ctx->program, ctx->block);
4793
4794   aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4795      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4796   for (unsigned i = 0; i < num_components; i++) {
4797      if (ctx->args->ac.frag_pos[i].used)
4798         vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4799      else
4800         vec->operands[i] = Operand(v1);
4801   }
4802   if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4803      assert(num_components == 4);
4804      vec->operands[3] =
4805         bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4806   }
4807
4808   if (ctx->options->adjust_frag_coord_z &&
4809       G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4810      /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4811      Operand frag_z = vec->operands[2];
4812      Temp adjusted_frag_z = bld.tmp(v1);
4813      Temp tmp;
4814
4815      /* dFdx fine */
4816      Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4817      tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4818      emit_wqm(bld, tmp, adjusted_frag_z, true);
4819
4820      /* adjusted_frag_z * 0.0625 + frag_z */
4821      adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4822                                 Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4823
4824      /* VRS Rate X = Ancillary[2:3] */
4825      Temp x_rate =
4826         bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4827                  Operand::c32(2u), Operand::c32(2u));
4828
4829      /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4830      Temp cond =
4831         bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4832      vec->operands[2] =
4833         bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4834   }
4835
4836   for (Operand& op : vec->operands)
4837      op = op.isUndefined() ? Operand::zero() : op;
4838
4839   vec->definitions[0] = Definition(dst);
4840   ctx->block->instructions.emplace_back(std::move(vec));
4841   emit_split_vector(ctx, dst, num_components);
4842   return;
4843}
4844
4845void
4846emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4847{
4848   Builder bld(ctx->program, ctx->block);
4849   Temp cond;
4850
4851   /* VRS Rate X = Ancillary[2:3]
4852    * VRS Rate Y = Ancillary[4:5]
4853    */
4854   Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4855                          Operand::c32(2u), Operand::c32(2u));
4856   Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4857                          Operand::c32(4u), Operand::c32(2u));
4858
4859   /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4860   cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4861   x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4862                     bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4863
4864   /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4865   cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4866   y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4867                     bld.copy(bld.def(v1), Operand::c32(1u)), cond);
4868
4869   bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4870}
4871
4872void
4873visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4874{
4875   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4876   Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4877   unsigned idx = nir_intrinsic_base(instr);
4878   unsigned component = nir_intrinsic_component(instr);
4879   Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4880
4881   assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4882
4883   if (instr->dest.ssa.num_components == 1) {
4884      emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4885   } else {
4886      aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4887         aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4888      for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4889         Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
4890         emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4891         vec->operands[i] = Operand(tmp);
4892      }
4893      vec->definitions[0] = Definition(dst);
4894      ctx->block->instructions.emplace_back(std::move(vec));
4895   }
4896}
4897
4898bool
4899check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4900                        unsigned binding_align, unsigned channels)
4901{
4902   unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4903   if (vtx_info->chan_byte_size != 4 && channels == 3)
4904      return false;
4905
4906   /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4907    * alignment issues that triggers memory violations and eventually a GPU
4908    * hang. This can happen if the stride (static or dynamic) is unaligned and
4909    * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
4910    * offset is 2 for R16G16B16A16_SNORM).
4911    */
4912   return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4913          (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4914}
4915
4916uint8_t
4917get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4918                      unsigned* channels, unsigned max_channels, unsigned binding_align)
4919{
4920   if (!vtx_info->chan_byte_size) {
4921      *channels = vtx_info->num_channels;
4922      return vtx_info->chan_format;
4923   }
4924
4925   unsigned num_channels = *channels;
4926   if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4927      unsigned new_channels = num_channels + 1;
4928      /* first, assume more loads is worse and try using a larger data format */
4929      while (new_channels <= max_channels &&
4930             !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4931         new_channels++;
4932      }
4933
4934      if (new_channels > max_channels) {
4935         /* then try decreasing load size (at the cost of more loads) */
4936         new_channels = *channels;
4937         while (new_channels > 1 &&
4938                !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4939            new_channels--;
4940      }
4941
4942      if (new_channels < *channels)
4943         *channels = new_channels;
4944      num_channels = new_channels;
4945   }
4946
4947   switch (vtx_info->chan_format) {
4948   case V_008F0C_BUF_DATA_FORMAT_8:
4949      return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4950                                    V_008F0C_BUF_DATA_FORMAT_INVALID,
4951                                    V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4952   case V_008F0C_BUF_DATA_FORMAT_16:
4953      return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4954                                    V_008F0C_BUF_DATA_FORMAT_INVALID,
4955                                    V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4956   case V_008F0C_BUF_DATA_FORMAT_32:
4957      return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4958                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
4959                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4960   }
4961   unreachable("shouldn't reach here");
4962   return V_008F0C_BUF_DATA_FORMAT_INVALID;
4963}
4964
4965/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
4966 * so we may need to fix it up. */
4967Temp
4968adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha)
4969{
4970   Builder bld(ctx->program, ctx->block);
4971
4972   if (adjustment == ALPHA_ADJUST_SSCALED)
4973      alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4974
4975   /* For the integer-like cases, do a natural sign extension.
4976    *
4977    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4978    * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4979    * exponent.
4980    */
4981   unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u;
4982   alpha =
4983      bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
4984
4985   /* Convert back to the right type. */
4986   if (adjustment == ALPHA_ADJUST_SNORM) {
4987      alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4988      alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4989   } else if (adjustment == ALPHA_ADJUST_SSCALED) {
4990      alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4991   }
4992
4993   return alpha;
4994}
4995
4996void
4997visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
4998{
4999   Builder bld(ctx->program, ctx->block);
5000   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5001   nir_src offset = *nir_get_io_offset_src(instr);
5002
5003   if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) {
5004      if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5005         isel_err(offset.ssa->parent_instr,
5006                  "Unimplemented non-zero nir_intrinsic_load_input offset");
5007
5008      unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5009      unsigned component = nir_intrinsic_component(instr);
5010      unsigned bitsize = instr->dest.ssa.bit_size;
5011      unsigned num_components = instr->dest.ssa.num_components;
5012
5013      Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
5014
5015      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5016         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5017      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5018      for (unsigned i = 0; i < num_components; i++) {
5019         elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
5020         if (bitsize == 16) {
5021            if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float)
5022               elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]);
5023            else
5024               elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i],
5025                                     Operand::c32(0u));
5026         }
5027         vec->operands[i] = Operand(elems[i]);
5028      }
5029      vec->definitions[0] = Definition(dst);
5030      ctx->block->instructions.emplace_back(std::move(vec));
5031      ctx->allocated_vec.emplace(dst.id(), elems);
5032   } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
5033
5034      if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5035         isel_err(offset.ssa->parent_instr,
5036                  "Unimplemented non-zero nir_intrinsic_load_input offset");
5037
5038      Temp vertex_buffers =
5039         convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
5040
5041      unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5042      unsigned component = nir_intrinsic_component(instr);
5043      unsigned bitsize = instr->dest.ssa.bit_size;
5044      unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
5045      uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
5046      uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
5047      unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
5048      unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
5049      enum radv_vs_input_alpha_adjust alpha_adjust =
5050         ctx->options->key.vs.vertex_alpha_adjust[location];
5051
5052      unsigned dfmt = attrib_format & 0xf;
5053      unsigned nfmt = (attrib_format >> 4) & 0x7;
5054      const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
5055
5056      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
5057      unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
5058      bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location);
5059      if (post_shuffle)
5060         num_channels = MAX2(num_channels, 3);
5061
5062      unsigned desc_index =
5063         ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
5064      desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
5065                                 u_bit_consecutive(0, desc_index));
5066      Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
5067      Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
5068
5069      Temp index;
5070      if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
5071         uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
5072         Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
5073         if (divisor) {
5074            Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
5075            if (divisor != 1) {
5076               Temp divided = bld.tmp(v1);
5077               emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5078               index = bld.vadd32(bld.def(v1), start_instance, divided);
5079            } else {
5080               index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5081            }
5082         } else {
5083            index = bld.copy(bld.def(v1), start_instance);
5084         }
5085      } else {
5086         index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5087                            get_arg(ctx, ctx->args->ac.vertex_id));
5088      }
5089
5090      Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5091      unsigned channel_start = 0;
5092      bool direct_fetch = false;
5093
5094      /* skip unused channels at the start */
5095      if (vtx_info->chan_byte_size && !post_shuffle) {
5096         channel_start = ffs(mask) - 1;
5097         for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5098            channels[i] = Temp(0, s1);
5099      } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5100         num_channels = 3 - (ffs(mask) - 1);
5101      }
5102
5103      /* load channels */
5104      while (channel_start < num_channels) {
5105         unsigned fetch_component = num_channels - channel_start;
5106         unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5107         bool expanded = false;
5108
5109         /* use MUBUF when possible to avoid possible alignment issues */
5110         /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5111         bool use_mubuf =
5112            (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5113             nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5114            vtx_info->chan_byte_size == 4;
5115         unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5116         if (!use_mubuf) {
5117            fetch_dfmt =
5118               get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5119                                     vtx_info->num_channels - channel_start, binding_align);
5120         } else {
5121            if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5122               /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
5123               fetch_component = 4;
5124               expanded = true;
5125            }
5126         }
5127
5128         unsigned fetch_bytes = fetch_component * bitsize / 8;
5129
5130         Temp fetch_index = index;
5131         if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5132            fetch_index =
5133               bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5134            fetch_offset = fetch_offset % attrib_stride;
5135         }
5136
5137         Operand soffset = Operand::zero();
5138         if (fetch_offset >= 4096) {
5139            soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5140            fetch_offset %= 4096;
5141         }
5142
5143         aco_opcode opcode;
5144         switch (fetch_bytes) {
5145         case 2:
5146            assert(!use_mubuf && bitsize == 16);
5147            opcode = aco_opcode::tbuffer_load_format_d16_x;
5148            break;
5149         case 4:
5150            if (bitsize == 16) {
5151               assert(!use_mubuf);
5152               opcode = aco_opcode::tbuffer_load_format_d16_xy;
5153            } else {
5154               opcode =
5155                  use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5156            }
5157            break;
5158         case 6:
5159            assert(!use_mubuf && bitsize == 16);
5160            opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5161            break;
5162         case 8:
5163            if (bitsize == 16) {
5164               assert(!use_mubuf);
5165               opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5166            } else {
5167               opcode =
5168                  use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5169            }
5170            break;
5171         case 12:
5172            assert(ctx->options->chip_class >= GFX7 ||
5173                   (!use_mubuf && ctx->options->chip_class == GFX6));
5174            opcode =
5175               use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5176            break;
5177         case 16:
5178            opcode =
5179               use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5180            break;
5181         default: unreachable("Unimplemented load_input vector size");
5182         }
5183
5184         Temp fetch_dst;
5185         if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5186             (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) {
5187            direct_fetch = true;
5188            fetch_dst = dst;
5189         } else {
5190            fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5191         }
5192
5193         if (use_mubuf) {
5194            Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5195                                           soffset, fetch_offset, false, false, true)
5196                                    .instr;
5197            mubuf->mubuf().vtx_binding = attrib_binding + 1;
5198         } else {
5199            Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5200                                           soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5201                                    .instr;
5202            mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5203         }
5204
5205         emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5206
5207         if (fetch_component == 1) {
5208            channels[channel_start] = fetch_dst;
5209         } else {
5210            for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5211               channels[channel_start + i] =
5212                  emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5213         }
5214
5215         channel_start += fetch_component;
5216      }
5217
5218      if (!direct_fetch) {
5219         bool is_float =
5220            nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5221
5222         static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5223         static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
5224         const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5225         unsigned num_components = instr->dest.ssa.num_components;
5226
5227         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5228            aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5229         std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5230         unsigned num_temp = 0;
5231         for (unsigned i = 0; i < num_components; i++) {
5232            unsigned idx = i + component;
5233            if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5234               Temp channel = channels[swizzle[idx]];
5235               if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE)
5236                  channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5237               vec->operands[i] = Operand(channel);
5238
5239               num_temp++;
5240               elems[i] = channel;
5241            } else if (is_float && idx == 3) {
5242               vec->operands[i] = Operand::c32(0x3f800000u);
5243            } else if (!is_float && idx == 3) {
5244               vec->operands[i] = Operand::c32(1u);
5245            } else {
5246               vec->operands[i] = Operand::zero();
5247            }
5248         }
5249         vec->definitions[0] = Definition(dst);
5250         ctx->block->instructions.emplace_back(std::move(vec));
5251         emit_split_vector(ctx, dst, num_components);
5252
5253         if (num_temp == num_components)
5254            ctx->allocated_vec.emplace(dst.id(), elems);
5255      }
5256   } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5257      if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5258         isel_err(offset.ssa->parent_instr,
5259                  "Unimplemented non-zero nir_intrinsic_load_input offset");
5260
5261      Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5262
5263      unsigned idx = nir_intrinsic_base(instr);
5264      unsigned component = nir_intrinsic_component(instr);
5265      unsigned vertex_id = 2; /* P0 */
5266
5267      if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5268         nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5269         switch (src0->u32) {
5270         case 0:
5271            vertex_id = 2; /* P0 */
5272            break;
5273         case 1:
5274            vertex_id = 0; /* P10 */
5275            break;
5276         case 2:
5277            vertex_id = 1; /* P20 */
5278            break;
5279         default: unreachable("invalid vertex index");
5280         }
5281      }
5282
5283      if (instr->dest.ssa.num_components == 1 &&
5284          instr->dest.ssa.bit_size != 64) {
5285         bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5286                    bld.m0(prim_mask), idx, component);
5287      } else {
5288         unsigned num_components = instr->dest.ssa.num_components;
5289         if (instr->dest.ssa.bit_size == 64)
5290            num_components *= 2;
5291         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5292            aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5293         for (unsigned i = 0; i < num_components; i++) {
5294            unsigned chan_component = (component + i) % 4;
5295            unsigned chan_idx = idx + (component + i) / 4;
5296            vec->operands[i] = bld.vintrp(
5297               aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? v2b : v1),
5298               Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component);
5299         }
5300         vec->definitions[0] = Definition(dst);
5301         bld.insert(std::move(vec));
5302      }
5303   } else {
5304      unreachable("Shader stage not implemented");
5305   }
5306}
5307
5308void
5309visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5310{
5311   assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5312
5313   Builder bld(ctx->program, ctx->block);
5314   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5315
5316   if (load_input_from_temps(ctx, instr, dst))
5317      return;
5318
5319   unreachable("LDS-based TCS input should have been lowered in NIR.");
5320}
5321
5322void
5323visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5324{
5325   switch (ctx->shader->info.stage) {
5326   case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5327   default: unreachable("Unimplemented shader stage");
5328   }
5329}
5330
5331void
5332visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5333{
5334   assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5335
5336   Builder bld(ctx->program, ctx->block);
5337   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5338
5339   Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5340   Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5341   Operand tes_w = Operand::zero();
5342
5343   if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5344      Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5345      tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5346      tes_w = Operand(tmp);
5347   }
5348
5349   Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5350   emit_split_vector(ctx, tess_coord, 3);
5351}
5352
5353Temp
5354load_desc_ptr(isel_context* ctx, unsigned desc_set)
5355{
5356   const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs;
5357
5358   if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
5359      Builder bld(ctx->program, ctx->block);
5360      Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5361      Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5362      return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false);
5363   }
5364
5365   return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5366}
5367
5368void
5369visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5370{
5371   Builder bld(ctx->program, ctx->block);
5372   Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5373   if (!nir_dest_is_divergent(instr->dest))
5374      index = bld.as_uniform(index);
5375   unsigned desc_set = nir_intrinsic_desc_set(instr);
5376   unsigned binding = nir_intrinsic_binding(instr);
5377
5378   Temp desc_ptr;
5379   radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5380   radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5381   unsigned offset = layout->binding[binding].offset;
5382   unsigned stride;
5383   if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5384       layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5385      unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5386                     layout->binding[binding].dynamic_offset_offset;
5387      desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5388      offset = pipeline_layout->push_constant_size + 16 * idx;
5389      stride = 16;
5390   } else {
5391      desc_ptr = load_desc_ptr(ctx, desc_set);
5392      stride = layout->binding[binding].size;
5393   }
5394
5395   if (nir_src_is_const(instr->src[0])) {
5396      index =
5397         bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5398   } else if (index.type() == RegType::vgpr) {
5399      if (stride != 1) {
5400         bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5401         index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5402      }
5403      if (offset)
5404         index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5405   } else {
5406      if (stride != 1)
5407         index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5408      if (offset)
5409         index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5410                          Operand::c32(offset), index);
5411   }
5412
5413   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5414   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5415   elems[0] = desc_ptr;
5416   elems[1] = index;
5417   ctx->allocated_vec.emplace(dst.id(), elems);
5418   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5419}
5420
5421void
5422load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5423            Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5424            bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5425{
5426   Builder bld(ctx->program, ctx->block);
5427
5428   bool use_smem =
5429      dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5430   if (use_smem)
5431      offset = bld.as_uniform(offset);
5432   else {
5433      /* GFX6-7 are affected by a hw bug that prevents address clamping to
5434       * work correctly when the SGPR offset is used.
5435       */
5436      if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5437         offset = as_vgpr(ctx, offset);
5438   }
5439
5440   LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5441   info.glc = glc;
5442   info.sync = sync;
5443   info.align_mul = align_mul;
5444   info.align_offset = align_offset;
5445   if (use_smem)
5446      emit_load(ctx, bld, info, smem_load_params);
5447   else
5448      emit_load(ctx, bld, info, mubuf_load_params);
5449}
5450
5451Temp
5452load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5453{
5454   Builder bld(ctx->program, ctx->block);
5455   Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5456   Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5457   set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5458   return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5459}
5460
5461bool
5462is_inline_ubo(isel_context* ctx, nir_src rsrc)
5463{
5464   nir_binding binding = nir_chase_binding(rsrc);
5465   if (!binding.success)
5466      return false;
5467
5468   radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5469   return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5470}
5471
5472void
5473visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5474{
5475   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5476   Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5477
5478   Builder bld(ctx->program, ctx->block);
5479
5480   if (is_inline_ubo(ctx, instr->src[0])) {
5481      Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5482      Temp binding_off =
5483         bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5484      rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
5485
5486      uint32_t desc_type =
5487         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5488         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5489      if (ctx->options->chip_class >= GFX10) {
5490         desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5491                      S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5492      } else {
5493         desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5494                      S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5495      }
5496      rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5497                        Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5498                        Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5499   } else {
5500      rsrc = load_buffer_rsrc(ctx, rsrc);
5501   }
5502   unsigned size = instr->dest.ssa.bit_size / 8;
5503   load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5504               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5505}
5506
5507void
5508visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5509{
5510   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5511   unsigned binding = nir_intrinsic_binding(instr);
5512
5513   Builder bld(ctx->program, ctx->block);
5514   Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5515   Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5516   bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off);
5517}
5518
5519void
5520visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5521{
5522   Builder bld(ctx->program, ctx->block);
5523   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5524   unsigned offset = nir_intrinsic_base(instr);
5525   unsigned count = instr->dest.ssa.num_components;
5526   nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5527
5528   if (index_cv && instr->dest.ssa.bit_size == 32) {
5529      struct radv_userdata_info *loc =
5530         &ctx->args->shader_info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
5531      unsigned start = (offset + index_cv->u32) / 4u;
5532      unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
5533
5534      start -= ctx->args->shader_info->min_push_constant_used / 4;
5535      if (start + count <= num_inline_push_consts) {
5536         std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5537         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5538            aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5539         for (unsigned i = 0; i < count; ++i) {
5540            elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5541            vec->operands[i] = Operand{elems[i]};
5542         }
5543         vec->definitions[0] = Definition(dst);
5544         ctx->block->instructions.emplace_back(std::move(vec));
5545         ctx->allocated_vec.emplace(dst.id(), elems);
5546         return;
5547      }
5548   }
5549
5550   Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5551   if (offset != 0) // TODO check if index != 0 as well
5552      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5553                             Operand::c32(offset), index);
5554   Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5555   Temp vec = dst;
5556   bool trim = false;
5557   bool aligned = true;
5558
5559   if (instr->dest.ssa.bit_size == 8) {
5560      aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5561      bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5562      if (!aligned)
5563         vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5564   } else if (instr->dest.ssa.bit_size == 16) {
5565      aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5566      if (!aligned)
5567         vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5568   }
5569
5570   aco_opcode op;
5571
5572   switch (vec.size()) {
5573   case 1: op = aco_opcode::s_load_dword; break;
5574   case 2: op = aco_opcode::s_load_dwordx2; break;
5575   case 3:
5576      vec = bld.tmp(s4);
5577      trim = true;
5578      FALLTHROUGH;
5579   case 4: op = aco_opcode::s_load_dwordx4; break;
5580   case 6:
5581      vec = bld.tmp(s8);
5582      trim = true;
5583      FALLTHROUGH;
5584   case 8: op = aco_opcode::s_load_dwordx8; break;
5585   default: unreachable("unimplemented or forbidden load_push_constant.");
5586   }
5587
5588   bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5589
5590   if (!aligned) {
5591      Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5592      byte_align_scalar(ctx, vec, byte_offset, dst);
5593      return;
5594   }
5595
5596   if (trim) {
5597      emit_split_vector(ctx, vec, 4);
5598      RegClass rc = dst.size() == 3 ? s1 : s2;
5599      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5600                 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5601   }
5602   emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5603}
5604
5605void
5606visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5607{
5608   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5609
5610   Builder bld(ctx->program, ctx->block);
5611
5612   uint32_t desc_type =
5613      S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5614      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5615   if (ctx->options->chip_class >= GFX10) {
5616      desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5617                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5618   } else {
5619      desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5620                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5621   }
5622
5623   unsigned base = nir_intrinsic_base(instr);
5624   unsigned range = nir_intrinsic_range(instr);
5625
5626   Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5627   if (base && offset.type() == RegType::sgpr)
5628      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5629                              Operand::c32(base));
5630   else if (base && offset.type() == RegType::vgpr)
5631      offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5632
5633   Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5634                          bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5635                                     Operand::c32(ctx->constant_data_offset)),
5636                          Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5637                          Operand::c32(desc_type));
5638   unsigned size = instr->dest.ssa.bit_size / 8;
5639   // TODO: get alignment information for subdword constants
5640   load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5641}
5642
5643void
5644visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
5645{
5646   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5647      ctx->cf_info.exec_potentially_empty_discard = true;
5648
5649   ctx->program->needs_exact = true;
5650
5651   // TODO: optimize uniform conditions
5652   Builder bld(ctx->program, ctx->block);
5653   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5654   assert(src.regClass() == bld.lm);
5655   src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5656   bld.pseudo(aco_opcode::p_discard_if, src);
5657   ctx->block->kind |= block_kind_uses_discard_if;
5658   return;
5659}
5660
5661void
5662visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
5663{
5664   Builder bld(ctx->program, ctx->block);
5665
5666   if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5667      ctx->cf_info.exec_potentially_empty_discard = true;
5668
5669   bool divergent =
5670      ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;
5671
5672   if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
5673      /* we handle discards the same way as jump instructions */
5674      append_logical_end(ctx->block);
5675
5676      /* in loops, discard behaves like break */
5677      Block* linear_target = ctx->cf_info.parent_loop.exit;
5678      ctx->block->kind |= block_kind_discard;
5679
5680      /* uniform discard - loop ends here */
5681      assert(nir_instr_is_last(&instr->instr));
5682      ctx->block->kind |= block_kind_uniform;
5683      ctx->cf_info.has_branch = true;
5684      bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
5685      add_linear_edge(ctx->block->index, linear_target);
5686      return;
5687   }
5688
5689   /* it can currently happen that NIR doesn't remove the unreachable code */
5690   if (!nir_instr_is_last(&instr->instr)) {
5691      ctx->program->needs_exact = true;
5692      /* save exec somewhere temporarily so that it doesn't get
5693       * overwritten before the discard from outer exec masks */
5694      Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
5695                           Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
5696      bld.pseudo(aco_opcode::p_discard_if, cond);
5697      ctx->block->kind |= block_kind_uses_discard_if;
5698      return;
5699   }
5700
5701   /* This condition is incorrect for uniformly branched discards in a loop
5702    * predicated by a divergent condition, but the above code catches that case
5703    * and the discard would end up turning into a discard_if.
5704    * For example:
5705    * if (divergent) {
5706    *    while (...) {
5707    *       if (uniform) {
5708    *          discard;
5709    *       }
5710    *    }
5711    * }
5712    */
5713   if (!ctx->cf_info.parent_if.is_divergent) {
5714      /* program just ends here */
5715      ctx->block->kind |= block_kind_uses_discard_if;
5716      bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
5717      // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5718   } else {
5719      ctx->block->kind |= block_kind_discard;
5720      /* branch and linear edge is added by visit_if() */
5721   }
5722}
5723
5724enum aco_descriptor_type {
5725   ACO_DESC_IMAGE,
5726   ACO_DESC_FMASK,
5727   ACO_DESC_SAMPLER,
5728   ACO_DESC_BUFFER,
5729   ACO_DESC_PLANE_0,
5730   ACO_DESC_PLANE_1,
5731   ACO_DESC_PLANE_2,
5732};
5733
5734static bool
5735should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
5736{
5737   if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5738      return false;
5739   ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5740   return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5741          dim == ac_image_2darraymsaa;
5742}
5743
5744Temp
5745get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5746                 enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5747{
5748   /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5749      std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
5750      32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
5751   */
5752   Temp index = Temp();
5753   bool index_set = false;
5754   unsigned constant_index = 0;
5755   unsigned descriptor_set;
5756   unsigned base_index;
5757   Builder bld(ctx->program, ctx->block);
5758
5759   if (!deref_instr) {
5760      assert(tex_instr);
5761      descriptor_set = 0;
5762      base_index = tex_instr->sampler_index;
5763   } else {
5764      while (deref_instr->deref_type != nir_deref_type_var) {
5765         unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5766         if (!array_size)
5767            array_size = 1;
5768
5769         assert(deref_instr->deref_type == nir_deref_type_array);
5770         nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5771         if (const_value) {
5772            constant_index += array_size * const_value->u32;
5773         } else {
5774            Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5775            if (indirect.type() == RegType::vgpr)
5776               indirect = bld.as_uniform(indirect);
5777
5778            if (array_size != 1)
5779               indirect =
5780                  bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5781
5782            if (!index_set) {
5783               index = indirect;
5784               index_set = true;
5785            } else {
5786               index =
5787                  bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5788            }
5789         }
5790
5791         deref_instr = nir_src_as_deref(deref_instr->parent);
5792      }
5793      descriptor_set = deref_instr->var->data.descriptor_set;
5794      base_index = deref_instr->var->data.binding;
5795   }
5796
5797   Temp list = load_desc_ptr(ctx, descriptor_set);
5798   list = convert_pointer_to_64_bit(ctx, list);
5799
5800   struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5801   struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5802   unsigned offset = binding->offset;
5803   unsigned stride = binding->size;
5804   aco_opcode opcode;
5805   RegClass type;
5806
5807   assert(base_index < layout->binding_count);
5808
5809   switch (desc_type) {
5810   case ACO_DESC_IMAGE:
5811      type = s8;
5812      opcode = aco_opcode::s_load_dwordx8;
5813      break;
5814   case ACO_DESC_FMASK:
5815      type = s8;
5816      opcode = aco_opcode::s_load_dwordx8;
5817      offset += 32;
5818      break;
5819   case ACO_DESC_SAMPLER:
5820      type = s4;
5821      opcode = aco_opcode::s_load_dwordx4;
5822      if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5823         offset += radv_combined_image_descriptor_sampler_offset(binding);
5824      break;
5825   case ACO_DESC_BUFFER:
5826      type = s4;
5827      opcode = aco_opcode::s_load_dwordx4;
5828      break;
5829   case ACO_DESC_PLANE_0:
5830   case ACO_DESC_PLANE_1:
5831      type = s8;
5832      opcode = aco_opcode::s_load_dwordx8;
5833      offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5834      break;
5835   case ACO_DESC_PLANE_2:
5836      type = s4;
5837      opcode = aco_opcode::s_load_dwordx4;
5838      offset += 64;
5839      break;
5840   default: unreachable("invalid desc_type\n");
5841   }
5842
5843   offset += constant_index * stride;
5844
5845   if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5846       (!index_set || binding->immutable_samplers_equal)) {
5847      if (binding->immutable_samplers_equal)
5848         constant_index = 0;
5849
5850      const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5851      uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5852      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5853                        Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5854                        Operand::c32(samplers[constant_index * 4 + 1]),
5855                        Operand::c32(samplers[constant_index * 4 + 2]),
5856                        Operand::c32(samplers[constant_index * 4 + 3]));
5857   }
5858
5859   Operand off;
5860   if (!index_set) {
5861      off = bld.copy(bld.def(s1), Operand::c32(offset));
5862   } else {
5863      off = Operand(
5864         (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5865                        bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5866   }
5867
5868   Temp res = bld.smem(opcode, bld.def(type), list, off);
5869
5870   if (desc_type == ACO_DESC_PLANE_2) {
5871      Temp components[8];
5872      for (unsigned i = 0; i < 8; i++)
5873         components[i] = bld.tmp(s1);
5874      bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5875                 Definition(components[2]), Definition(components[3]), res);
5876
5877      Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5878      bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5879                 Definition(components[4]), Definition(components[5]), Definition(components[6]),
5880                 Definition(components[7]), desc2);
5881
5882      res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5883                       components[2], components[3], components[4], components[5], components[6],
5884                       components[7]);
5885   } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5886              !write) {
5887      Temp components[8];
5888      for (unsigned i = 0; i < 8; i++)
5889         components[i] = bld.tmp(s1);
5890
5891      bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5892                 Definition(components[2]), Definition(components[3]), Definition(components[4]),
5893                 Definition(components[5]), Definition(components[6]), Definition(components[7]),
5894                 res);
5895
5896      /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a
5897       * hardware bug.
5898       */
5899      components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5900                               bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5901
5902      res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5903                       components[2], components[3], components[4], components[5], components[6],
5904                       components[7]);
5905   } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5906      Temp components[4];
5907      for (unsigned i = 0; i < 4; i++)
5908         components[i] = bld.tmp(s1);
5909
5910      bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5911                 Definition(components[2]), Definition(components[3]), res);
5912
5913      /* We want to always use the linear filtering truncation behaviour for
5914       * nir_texop_tg4, even if the sampler uses nearest/point filtering.
5915       */
5916      components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5917                               Operand::c32(C_008F30_TRUNC_COORD));
5918
5919      res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5920                       components[2], components[3]);
5921   }
5922
5923   return res;
5924}
5925
5926static int
5927image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5928{
5929   switch (dim) {
5930   case GLSL_SAMPLER_DIM_BUF: return 1;
5931   case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5932   case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5933   case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
5934   case GLSL_SAMPLER_DIM_3D:
5935   case GLSL_SAMPLER_DIM_CUBE: return 3;
5936   case GLSL_SAMPLER_DIM_RECT:
5937   case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5938   case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
5939   default: break;
5940   }
5941   return 0;
5942}
5943
5944static MIMG_instruction*
5945emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
5946          std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
5947{
5948   /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
5949   unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
5950   bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;
5951
5952   if (!use_nsa) {
5953      Temp coord = coords[0];
5954      if (coords.size() > 1) {
5955         coord = bld.tmp(RegType::vgpr, coords.size());
5956
5957         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5958            aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5959         for (unsigned i = 0; i < coords.size(); i++)
5960            vec->operands[i] = Operand(coords[i]);
5961         vec->definitions[0] = Definition(coord);
5962         bld.insert(std::move(vec));
5963      } else if (coord.type() == RegType::sgpr) {
5964         coord = bld.copy(bld.def(v1), coord);
5965      }
5966
5967      if (wqm_mask) {
5968         /* We don't need the bias, sample index, compare value or offset to be
5969          * computed in WQM but if the p_create_vector copies the coordinates, then it
5970          * needs to be in WQM. */
5971         coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
5972      }
5973
5974      coords[0] = coord;
5975      coords.resize(1);
5976   } else {
5977      for (unsigned i = 0; i < coords.size(); i++) {
5978         if (wqm_mask & (1u << i))
5979            coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
5980      }
5981
5982      for (Temp& coord : coords) {
5983         if (coord.type() == RegType::sgpr)
5984            coord = bld.copy(bld.def(v1), coord);
5985      }
5986   }
5987
5988   aco_ptr<MIMG_instruction> mimg{
5989      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
5990   if (dst.isTemp())
5991      mimg->definitions[0] = dst;
5992   mimg->operands[0] = Operand(rsrc);
5993   mimg->operands[1] = samp;
5994   mimg->operands[2] = vdata;
5995   for (unsigned i = 0; i < coords.size(); i++)
5996      mimg->operands[3 + i] = Operand(coords[i]);
5997
5998   MIMG_instruction* res = mimg.get();
5999   bld.insert(std::move(mimg));
6000   return res;
6001}
6002
6003void
6004visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6005{
6006   Builder bld(ctx->program, ctx->block);
6007   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6008   Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6009   Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6010   Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6011   Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6012   Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6013   Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6014
6015   std::vector<Temp> args;
6016   args.push_back(emit_extract_vector(ctx, node, 0, v1));
6017   args.push_back(emit_extract_vector(ctx, node, 1, v1));
6018   args.push_back(as_vgpr(ctx, tmax));
6019   args.push_back(emit_extract_vector(ctx, origin, 0, v1));
6020   args.push_back(emit_extract_vector(ctx, origin, 1, v1));
6021   args.push_back(emit_extract_vector(ctx, origin, 2, v1));
6022   args.push_back(emit_extract_vector(ctx, dir, 0, v1));
6023   args.push_back(emit_extract_vector(ctx, dir, 1, v1));
6024   args.push_back(emit_extract_vector(ctx, dir, 2, v1));
6025   args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
6026   args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
6027   args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
6028
6029   MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
6030                                      resource, Operand(s4), args);
6031   mimg->dim = ac_image_1d;
6032   mimg->dmask = 0xf;
6033   mimg->unrm = true;
6034   mimg->r128 = true;
6035}
6036
6037static std::vector<Temp>
6038get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6039{
6040
6041   Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6042   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6043   bool is_array = nir_intrinsic_image_array(instr);
6044   ASSERTED bool add_frag_pos =
6045      (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6046   assert(!add_frag_pos && "Input attachments should be lowered.");
6047   bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6048   bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6049   int count = image_type_to_components_count(dim, is_array);
6050   std::vector<Temp> coords(count);
6051   Builder bld(ctx->program, ctx->block);
6052
6053   if (is_ms)
6054      coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1);
6055
6056   if (gfx9_1d) {
6057      coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6058      coords.resize(coords.size() + 1);
6059      coords[1] = bld.copy(bld.def(v1), Operand::zero());
6060      if (is_array)
6061         coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6062   } else {
6063      for (int i = 0; i < count; i++)
6064         coords[i] = emit_extract_vector(ctx, src0, i, v1);
6065   }
6066
6067   if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6068       instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6069       instr->intrinsic == nir_intrinsic_image_deref_store) {
6070      int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6071      bool level_zero =
6072         nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6073
6074      if (!level_zero)
6075         coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6076   }
6077
6078   return coords;
6079}
6080
6081memory_sync_info
6082get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6083{
6084   /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6085   if (semantics & semantic_atomicrmw)
6086      return memory_sync_info(storage, semantics);
6087
6088   unsigned access = nir_intrinsic_access(instr);
6089
6090   if (access & ACCESS_VOLATILE)
6091      semantics |= semantic_volatile;
6092   if (access & ACCESS_CAN_REORDER)
6093      semantics |= semantic_can_reorder | semantic_private;
6094
6095   return memory_sync_info(storage, semantics);
6096}
6097
6098Operand
6099emit_tfe_init(Builder& bld, Temp dst)
6100{
6101   Temp tmp = bld.tmp(dst.regClass());
6102
6103   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6104      aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6105   for (unsigned i = 0; i < dst.size(); i++)
6106      vec->operands[i] = Operand::zero();
6107   vec->definitions[0] = Definition(tmp);
6108   /* Since this is fixed to an instruction's definition register, any CSE will
6109    * just create copies. Copying costs about the same as zero-initialization,
6110    * but these copies can break up clauses.
6111    */
6112   vec->definitions[0].setNoCSE(true);
6113   bld.insert(std::move(vec));
6114
6115   return Operand(tmp);
6116}
6117
6118void
6119visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6120{
6121   Builder bld(ctx->program, ctx->block);
6122   const nir_variable* var =
6123      nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6124   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6125   bool is_array = nir_intrinsic_image_array(instr);
6126   bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6127   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6128
6129   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6130   unsigned access = var->data.access | nir_intrinsic_access(instr);
6131
6132   unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6133   unsigned expand_mask =
6134      nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6135   expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6136   if (dim == GLSL_SAMPLER_DIM_BUF)
6137      expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6138   unsigned dmask = expand_mask;
6139   if (instr->dest.ssa.bit_size == 64) {
6140      expand_mask &= 0x9;
6141      /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6142      dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6143   }
6144   if (is_sparse)
6145      expand_mask |= 1 << result_size;
6146   unsigned num_components = util_bitcount(dmask) + is_sparse;
6147
6148   Temp tmp;
6149   if (num_components == dst.size() && dst.type() == RegType::vgpr)
6150      tmp = dst;
6151   else
6152      tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6153
6154   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6155                                    dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6156                                    nullptr, false);
6157
6158   if (dim == GLSL_SAMPLER_DIM_BUF) {
6159      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6160
6161      aco_opcode opcode;
6162      switch (util_bitcount(dmask)) {
6163      case 1: opcode = aco_opcode::buffer_load_format_x; break;
6164      case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6165      case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6166      case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6167      default: unreachable(">4 channel buffer image load");
6168      }
6169      aco_ptr<MUBUF_instruction> load{
6170         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6171      load->operands[0] = Operand(resource);
6172      load->operands[1] = Operand(vindex);
6173      load->operands[2] = Operand::c32(0);
6174      load->definitions[0] = Definition(tmp);
6175      load->idxen = true;
6176      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6177      load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6178      load->sync = sync;
6179      load->tfe = is_sparse;
6180      if (load->tfe)
6181         load->operands[3] = emit_tfe_init(bld, tmp);
6182      ctx->block->instructions.emplace_back(std::move(load));
6183   } else {
6184      std::vector<Temp> coords = get_image_coords(ctx, instr);
6185
6186      bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6187      aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6188
6189      Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6190      MIMG_instruction* load =
6191         emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6192      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6193      load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6194      load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6195      load->dmask = dmask;
6196      load->unrm = true;
6197      load->da = should_declare_array(ctx, dim, is_array);
6198      load->sync = sync;
6199      load->tfe = is_sparse;
6200   }
6201
6202   if (is_sparse && instr->dest.ssa.bit_size == 64) {
6203      /* The result components are 64-bit but the sparse residency code is
6204       * 32-bit. So add a zero to the end so expand_vector() works correctly.
6205       */
6206      tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6207                       Operand::zero());
6208   }
6209
6210   expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6211}
6212
6213void
6214visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6215{
6216   const nir_variable* var =
6217      nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6218   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6219   bool is_array = nir_intrinsic_image_array(instr);
6220   Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6221
6222   /* only R64_UINT and R64_SINT supported */
6223   if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6224      data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6225   data = as_vgpr(ctx, data);
6226
6227   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6228   unsigned access = var->data.access | nir_intrinsic_access(instr);
6229   bool glc = ctx->options->chip_class == GFX6 ||
6230                    access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
6231                 ? 1
6232                 : 0;
6233
6234   if (dim == GLSL_SAMPLER_DIM_BUF) {
6235      Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6236                                   ACO_DESC_BUFFER, nullptr, true);
6237      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6238      aco_opcode opcode;
6239      switch (data.size()) {
6240      case 1: opcode = aco_opcode::buffer_store_format_x; break;
6241      case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6242      case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6243      case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6244      default: unreachable(">4 channel buffer image store");
6245      }
6246      aco_ptr<MUBUF_instruction> store{
6247         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6248      store->operands[0] = Operand(rsrc);
6249      store->operands[1] = Operand(vindex);
6250      store->operands[2] = Operand::c32(0);
6251      store->operands[3] = Operand(data);
6252      store->idxen = true;
6253      store->glc = glc;
6254      store->dlc = false;
6255      store->disable_wqm = true;
6256      store->sync = sync;
6257      ctx->program->needs_exact = true;
6258      ctx->block->instructions.emplace_back(std::move(store));
6259      return;
6260   }
6261
6262   assert(data.type() == RegType::vgpr);
6263   std::vector<Temp> coords = get_image_coords(ctx, instr);
6264   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6265                                    ACO_DESC_IMAGE, nullptr, true);
6266
6267   bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6268   aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6269
6270   Builder bld(ctx->program, ctx->block);
6271   MIMG_instruction* store =
6272      emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6273   store->glc = glc;
6274   store->dlc = false;
6275   store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6276   store->dmask = (1 << data.size()) - 1;
6277   store->unrm = true;
6278   store->da = should_declare_array(ctx, dim, is_array);
6279   store->disable_wqm = true;
6280   store->sync = sync;
6281   ctx->program->needs_exact = true;
6282   return;
6283}
6284
6285void
6286visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6287{
6288   bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6289   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6290   bool is_array = nir_intrinsic_image_array(instr);
6291   Builder bld(ctx->program, ctx->block);
6292
6293   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6294   bool is_64bit = data.bytes() == 8;
6295   assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6296
6297   if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6298      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6299                        get_ssa_temp(ctx, instr->src[4].ssa), data);
6300
6301   aco_opcode buf_op, buf_op64, image_op;
6302   switch (instr->intrinsic) {
6303   case nir_intrinsic_image_deref_atomic_add:
6304      buf_op = aco_opcode::buffer_atomic_add;
6305      buf_op64 = aco_opcode::buffer_atomic_add_x2;
6306      image_op = aco_opcode::image_atomic_add;
6307      break;
6308   case nir_intrinsic_image_deref_atomic_umin:
6309      buf_op = aco_opcode::buffer_atomic_umin;
6310      buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6311      image_op = aco_opcode::image_atomic_umin;
6312      break;
6313   case nir_intrinsic_image_deref_atomic_imin:
6314      buf_op = aco_opcode::buffer_atomic_smin;
6315      buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6316      image_op = aco_opcode::image_atomic_smin;
6317      break;
6318   case nir_intrinsic_image_deref_atomic_umax:
6319      buf_op = aco_opcode::buffer_atomic_umax;
6320      buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6321      image_op = aco_opcode::image_atomic_umax;
6322      break;
6323   case nir_intrinsic_image_deref_atomic_imax:
6324      buf_op = aco_opcode::buffer_atomic_smax;
6325      buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6326      image_op = aco_opcode::image_atomic_smax;
6327      break;
6328   case nir_intrinsic_image_deref_atomic_and:
6329      buf_op = aco_opcode::buffer_atomic_and;
6330      buf_op64 = aco_opcode::buffer_atomic_and_x2;
6331      image_op = aco_opcode::image_atomic_and;
6332      break;
6333   case nir_intrinsic_image_deref_atomic_or:
6334      buf_op = aco_opcode::buffer_atomic_or;
6335      buf_op64 = aco_opcode::buffer_atomic_or_x2;
6336      image_op = aco_opcode::image_atomic_or;
6337      break;
6338   case nir_intrinsic_image_deref_atomic_xor:
6339      buf_op = aco_opcode::buffer_atomic_xor;
6340      buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6341      image_op = aco_opcode::image_atomic_xor;
6342      break;
6343   case nir_intrinsic_image_deref_atomic_exchange:
6344      buf_op = aco_opcode::buffer_atomic_swap;
6345      buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6346      image_op = aco_opcode::image_atomic_swap;
6347      break;
6348   case nir_intrinsic_image_deref_atomic_comp_swap:
6349      buf_op = aco_opcode::buffer_atomic_cmpswap;
6350      buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6351      image_op = aco_opcode::image_atomic_cmpswap;
6352      break;
6353   case nir_intrinsic_image_deref_atomic_fmin:
6354      buf_op = aco_opcode::buffer_atomic_fmin;
6355      buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6356      image_op = aco_opcode::image_atomic_fmin;
6357      break;
6358   case nir_intrinsic_image_deref_atomic_fmax:
6359      buf_op = aco_opcode::buffer_atomic_fmax;
6360      buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6361      image_op = aco_opcode::image_atomic_fmax;
6362      break;
6363   default:
6364      unreachable("visit_image_atomic should only be called with "
6365                  "nir_intrinsic_image_deref_atomic_* instructions.");
6366   }
6367
6368   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6369   memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6370
6371   if (dim == GLSL_SAMPLER_DIM_BUF) {
6372      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6373      Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6374                                       ACO_DESC_BUFFER, nullptr, true);
6375      // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
6376      // implemented.");
6377      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6378         is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6379      mubuf->operands[0] = Operand(resource);
6380      mubuf->operands[1] = Operand(vindex);
6381      mubuf->operands[2] = Operand::c32(0);
6382      mubuf->operands[3] = Operand(data);
6383      if (return_previous)
6384         mubuf->definitions[0] = Definition(dst);
6385      mubuf->offset = 0;
6386      mubuf->idxen = true;
6387      mubuf->glc = return_previous;
6388      mubuf->dlc = false; /* Not needed for atomics */
6389      mubuf->disable_wqm = true;
6390      mubuf->sync = sync;
6391      ctx->program->needs_exact = true;
6392      ctx->block->instructions.emplace_back(std::move(mubuf));
6393      return;
6394   }
6395
6396   std::vector<Temp> coords = get_image_coords(ctx, instr);
6397   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6398                                    ACO_DESC_IMAGE, nullptr, true);
6399   Definition def = return_previous ? Definition(dst) : Definition();
6400   MIMG_instruction* mimg =
6401      emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6402   mimg->glc = return_previous;
6403   mimg->dlc = false; /* Not needed for atomics */
6404   mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6405   mimg->dmask = (1 << data.size()) - 1;
6406   mimg->unrm = true;
6407   mimg->da = should_declare_array(ctx, dim, is_array);
6408   mimg->disable_wqm = true;
6409   mimg->sync = sync;
6410   ctx->program->needs_exact = true;
6411   return;
6412}
6413
6414void
6415get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
6416{
6417   if (ctx->options->chip_class == GFX8) {
6418      /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6419      Builder bld(ctx->program, ctx->block);
6420
6421      Temp size = emit_extract_vector(ctx, desc, 2, s1);
6422
6423      Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
6424                                bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
6425      size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
6426                           bld.as_uniform(size_div3), Operand::c32(1u));
6427
6428      Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6429      stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
6430                        Operand::c32((5u << 16) | 16u));
6431
6432      Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
6433      size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6434
6435      Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6436      bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
6437               bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6438      if (dst.type() == RegType::vgpr)
6439         bld.copy(Definition(dst), shr_dst);
6440
6441      /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6442   } else {
6443      emit_extract_vector(ctx, desc, 2, dst);
6444   }
6445}
6446
6447void
6448visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6449{
6450   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6451   bool is_array = nir_intrinsic_image_array(instr);
6452   Builder bld(ctx->program, ctx->block);
6453
6454   if (dim == GLSL_SAMPLER_DIM_BUF) {
6455      Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6456                                   ACO_DESC_BUFFER, NULL, false);
6457      return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6458   }
6459
6460   /* LOD */
6461   assert(nir_src_as_uint(instr->src[1]) == 0);
6462   std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6463
6464   /* Resource */
6465   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6466                                    ACO_DESC_IMAGE, NULL, false);
6467
6468   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6469
6470   MIMG_instruction* mimg =
6471      emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6472   uint8_t& dmask = mimg->dmask;
6473   mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6474   mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6475   mimg->da = is_array;
6476
6477   if (ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
6478      assert(instr->dest.ssa.num_components == 2);
6479      dmask = 0x5;
6480   }
6481
6482   emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6483}
6484
6485void
6486get_image_samples(isel_context* ctx, Definition dst, Temp resource)
6487{
6488   Builder bld(ctx->program, ctx->block);
6489
6490   Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6491   Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6492                                Operand::c32(16u | 4u << 16));
6493   Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
6494                           samples_log2);
6495   Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6496                        Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));
6497
6498   Operand default_sample = Operand::c32(1u);
6499   if (ctx->options->robust_buffer_access) {
6500      /* Extract the second dword of the descriptor, if it's
6501       * all zero, then it's a null descriptor.
6502       */
6503      Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
6504      Temp is_non_null_descriptor =
6505         bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
6506      default_sample = Operand(is_non_null_descriptor);
6507   }
6508
6509   Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
6510   bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
6511}
6512
6513void
6514visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
6515{
6516   Builder bld(ctx->program, ctx->block);
6517   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6518   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6519                                    ACO_DESC_IMAGE, NULL, false);
6520   get_image_samples(ctx, Definition(dst), resource);
6521}
6522
6523void
6524visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6525{
6526   Builder bld(ctx->program, ctx->block);
6527   unsigned num_components = instr->num_components;
6528
6529   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6530   Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6531
6532   unsigned access = nir_intrinsic_access(instr);
6533   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6534   unsigned size = instr->dest.ssa.bit_size / 8;
6535
6536   bool allow_smem = access & ACCESS_CAN_REORDER;
6537
6538   load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6539               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6540               get_memory_sync_info(instr, storage_buffer, 0));
6541}
6542
6543void
6544visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6545{
6546   Builder bld(ctx->program, ctx->block);
6547   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6548   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6549   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6550   Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6551
6552   Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6553
6554   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6555   bool glc =
6556      nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6557
6558   unsigned write_count = 0;
6559   Temp write_datas[32];
6560   unsigned offsets[32];
6561   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6562                      write_datas, offsets);
6563
6564   /* GFX6-7 are affected by a hw bug that prevents address clamping to work
6565    * correctly when the SGPR offset is used.
6566    */
6567   if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6568      offset = as_vgpr(ctx, offset);
6569
6570   for (unsigned i = 0; i < write_count; i++) {
6571      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6572
6573      aco_ptr<MUBUF_instruction> store{
6574         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6575      store->operands[0] = Operand(rsrc);
6576      store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6577      store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6578      store->operands[3] = Operand(write_datas[i]);
6579      store->offset = offsets[i];
6580      store->offen = (offset.type() == RegType::vgpr);
6581      store->glc = glc;
6582      store->dlc = false;
6583      store->disable_wqm = true;
6584      store->sync = sync;
6585      ctx->program->needs_exact = true;
6586      ctx->block->instructions.emplace_back(std::move(store));
6587   }
6588}
6589
6590void
6591visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6592{
6593   Builder bld(ctx->program, ctx->block);
6594   bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6595   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6596
6597   if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6598      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6599                        get_ssa_temp(ctx, instr->src[3].ssa), data);
6600
6601   Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6602   Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6603
6604   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6605
6606   aco_opcode op32, op64;
6607   switch (instr->intrinsic) {
6608   case nir_intrinsic_ssbo_atomic_add:
6609      op32 = aco_opcode::buffer_atomic_add;
6610      op64 = aco_opcode::buffer_atomic_add_x2;
6611      break;
6612   case nir_intrinsic_ssbo_atomic_imin:
6613      op32 = aco_opcode::buffer_atomic_smin;
6614      op64 = aco_opcode::buffer_atomic_smin_x2;
6615      break;
6616   case nir_intrinsic_ssbo_atomic_umin:
6617      op32 = aco_opcode::buffer_atomic_umin;
6618      op64 = aco_opcode::buffer_atomic_umin_x2;
6619      break;
6620   case nir_intrinsic_ssbo_atomic_imax:
6621      op32 = aco_opcode::buffer_atomic_smax;
6622      op64 = aco_opcode::buffer_atomic_smax_x2;
6623      break;
6624   case nir_intrinsic_ssbo_atomic_umax:
6625      op32 = aco_opcode::buffer_atomic_umax;
6626      op64 = aco_opcode::buffer_atomic_umax_x2;
6627      break;
6628   case nir_intrinsic_ssbo_atomic_and:
6629      op32 = aco_opcode::buffer_atomic_and;
6630      op64 = aco_opcode::buffer_atomic_and_x2;
6631      break;
6632   case nir_intrinsic_ssbo_atomic_or:
6633      op32 = aco_opcode::buffer_atomic_or;
6634      op64 = aco_opcode::buffer_atomic_or_x2;
6635      break;
6636   case nir_intrinsic_ssbo_atomic_xor:
6637      op32 = aco_opcode::buffer_atomic_xor;
6638      op64 = aco_opcode::buffer_atomic_xor_x2;
6639      break;
6640   case nir_intrinsic_ssbo_atomic_exchange:
6641      op32 = aco_opcode::buffer_atomic_swap;
6642      op64 = aco_opcode::buffer_atomic_swap_x2;
6643      break;
6644   case nir_intrinsic_ssbo_atomic_comp_swap:
6645      op32 = aco_opcode::buffer_atomic_cmpswap;
6646      op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6647      break;
6648   case nir_intrinsic_ssbo_atomic_fmin:
6649      op32 = aco_opcode::buffer_atomic_fmin;
6650      op64 = aco_opcode::buffer_atomic_fmin_x2;
6651      break;
6652   case nir_intrinsic_ssbo_atomic_fmax:
6653      op32 = aco_opcode::buffer_atomic_fmax;
6654      op64 = aco_opcode::buffer_atomic_fmax_x2;
6655      break;
6656   default:
6657      unreachable(
6658         "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6659   }
6660   aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6661   aco_ptr<MUBUF_instruction> mubuf{
6662      create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6663   mubuf->operands[0] = Operand(rsrc);
6664   mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6665   mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6666   mubuf->operands[3] = Operand(data);
6667   if (return_previous)
6668      mubuf->definitions[0] = Definition(dst);
6669   mubuf->offset = 0;
6670   mubuf->offen = (offset.type() == RegType::vgpr);
6671   mubuf->glc = return_previous;
6672   mubuf->dlc = false; /* Not needed for atomics */
6673   mubuf->disable_wqm = true;
6674   mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6675   ctx->program->needs_exact = true;
6676   ctx->block->instructions.emplace_back(std::move(mubuf));
6677}
6678
6679void
6680visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
6681{
6682
6683   Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
6684   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6685   bool non_uniform = dst.type() == RegType::vgpr;
6686
6687   Builder bld(ctx->program, ctx->block);
6688   if (non_uniform) {
6689      Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
6690      Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
6691      Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
6692      index = convert_pointer_to_64_bit(ctx, index, non_uniform);
6693
6694      LoadEmitInfo info = {Operand(index), dst, 1, 4};
6695      info.align_mul = 4;
6696      info.const_offset = 8;
6697      emit_load(ctx, bld, info, global_load_params);
6698   } else {
6699      emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
6700   }
6701}
6702
6703void
6704visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6705{
6706   Builder bld(ctx->program, ctx->block);
6707   unsigned num_components = instr->num_components;
6708   unsigned component_size = instr->dest.ssa.bit_size / 8;
6709
6710   LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6711                        get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
6712   info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6713   info.align_mul = nir_intrinsic_align_mul(instr);
6714   info.align_offset = nir_intrinsic_align_offset(instr);
6715   info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6716   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6717    * it's safe to use SMEM */
6718   bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6719   if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
6720       !can_use_smem) {
6721      emit_load(ctx, bld, info, global_load_params);
6722   } else {
6723      info.offset = Operand(bld.as_uniform(info.offset));
6724      emit_load(ctx, bld, info, smem_load_params);
6725   }
6726}
6727
6728void
6729visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6730{
6731   Builder bld(ctx->program, ctx->block);
6732   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6733   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6734
6735   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6736   Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6737   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6738   bool glc =
6739      nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6740
6741   if (ctx->options->chip_class >= GFX7)
6742      addr = as_vgpr(ctx, addr);
6743
6744   unsigned write_count = 0;
6745   Temp write_datas[32];
6746   unsigned offsets[32];
6747   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6748                      write_datas, offsets);
6749
6750   for (unsigned i = 0; i < write_count; i++) {
6751      if (ctx->options->chip_class >= GFX7) {
6752         unsigned offset = offsets[i];
6753         Temp store_addr = addr;
6754         if (offset > 0 && ctx->options->chip_class < GFX9) {
6755            Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6756            Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6757            Temp carry = bld.tmp(bld.lm);
6758            bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6759
6760            bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6761                     bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6762            bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6763                     Operand::zero(), addr1, carry)
6764               .def(1)
6765               .setHint(vcc);
6766
6767            store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6768
6769            offset = 0;
6770         }
6771
6772         bool global = ctx->options->chip_class >= GFX9;
6773         aco_opcode op;
6774         switch (write_datas[i].bytes()) {
6775         case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6776         case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6777         case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6778         case 8:
6779            op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6780            break;
6781         case 12:
6782            op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6783            break;
6784         case 16:
6785            op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6786            break;
6787         default: unreachable("store_global not implemented for this size.");
6788         }
6789
6790         aco_ptr<FLAT_instruction> flat{
6791            create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6792         flat->operands[0] = Operand(store_addr);
6793         flat->operands[1] = Operand(s1);
6794         flat->operands[2] = Operand(write_datas[i]);
6795         flat->glc = glc;
6796         flat->dlc = false;
6797         flat->offset = offset;
6798         flat->disable_wqm = true;
6799         flat->sync = sync;
6800         ctx->program->needs_exact = true;
6801         ctx->block->instructions.emplace_back(std::move(flat));
6802      } else {
6803         assert(ctx->options->chip_class == GFX6);
6804
6805         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6806
6807         Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6808
6809         aco_ptr<MUBUF_instruction> mubuf{
6810            create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6811         mubuf->operands[0] = Operand(rsrc);
6812         mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6813         mubuf->operands[2] = Operand::zero();
6814         mubuf->operands[3] = Operand(write_datas[i]);
6815         mubuf->glc = glc;
6816         mubuf->dlc = false;
6817         mubuf->offset = offsets[i];
6818         mubuf->addr64 = addr.type() == RegType::vgpr;
6819         mubuf->disable_wqm = true;
6820         mubuf->sync = sync;
6821         ctx->program->needs_exact = true;
6822         ctx->block->instructions.emplace_back(std::move(mubuf));
6823      }
6824   }
6825}
6826
6827void
6828visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6829{
6830   Builder bld(ctx->program, ctx->block);
6831   bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6832   Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6833   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6834
6835   if (ctx->options->chip_class >= GFX7)
6836      addr = as_vgpr(ctx, addr);
6837
6838   if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6839      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6840                        get_ssa_temp(ctx, instr->src[2].ssa), data);
6841
6842   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6843
6844   aco_opcode op32, op64;
6845
6846   if (ctx->options->chip_class >= GFX7) {
6847      bool global = ctx->options->chip_class >= GFX9;
6848      switch (instr->intrinsic) {
6849      case nir_intrinsic_global_atomic_add:
6850         op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6851         op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6852         break;
6853      case nir_intrinsic_global_atomic_imin:
6854         op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6855         op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6856         break;
6857      case nir_intrinsic_global_atomic_umin:
6858         op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6859         op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6860         break;
6861      case nir_intrinsic_global_atomic_imax:
6862         op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6863         op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6864         break;
6865      case nir_intrinsic_global_atomic_umax:
6866         op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6867         op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6868         break;
6869      case nir_intrinsic_global_atomic_and:
6870         op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6871         op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6872         break;
6873      case nir_intrinsic_global_atomic_or:
6874         op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6875         op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6876         break;
6877      case nir_intrinsic_global_atomic_xor:
6878         op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6879         op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6880         break;
6881      case nir_intrinsic_global_atomic_exchange:
6882         op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6883         op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6884         break;
6885      case nir_intrinsic_global_atomic_comp_swap:
6886         op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6887         op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6888         break;
6889      case nir_intrinsic_global_atomic_fmin:
6890         op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6891         op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6892         break;
6893      case nir_intrinsic_global_atomic_fmax:
6894         op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6895         op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6896         break;
6897      default:
6898         unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6899                     "instructions.");
6900      }
6901
6902      aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6903      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6904         op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6905      flat->operands[0] = Operand(addr);
6906      flat->operands[1] = Operand(s1);
6907      flat->operands[2] = Operand(data);
6908      if (return_previous)
6909         flat->definitions[0] = Definition(dst);
6910      flat->glc = return_previous;
6911      flat->dlc = false; /* Not needed for atomics */
6912      flat->offset = 0;
6913      flat->disable_wqm = true;
6914      flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6915      ctx->program->needs_exact = true;
6916      ctx->block->instructions.emplace_back(std::move(flat));
6917   } else {
6918      assert(ctx->options->chip_class == GFX6);
6919
6920      switch (instr->intrinsic) {
6921      case nir_intrinsic_global_atomic_add:
6922         op32 = aco_opcode::buffer_atomic_add;
6923         op64 = aco_opcode::buffer_atomic_add_x2;
6924         break;
6925      case nir_intrinsic_global_atomic_imin:
6926         op32 = aco_opcode::buffer_atomic_smin;
6927         op64 = aco_opcode::buffer_atomic_smin_x2;
6928         break;
6929      case nir_intrinsic_global_atomic_umin:
6930         op32 = aco_opcode::buffer_atomic_umin;
6931         op64 = aco_opcode::buffer_atomic_umin_x2;
6932         break;
6933      case nir_intrinsic_global_atomic_imax:
6934         op32 = aco_opcode::buffer_atomic_smax;
6935         op64 = aco_opcode::buffer_atomic_smax_x2;
6936         break;
6937      case nir_intrinsic_global_atomic_umax:
6938         op32 = aco_opcode::buffer_atomic_umax;
6939         op64 = aco_opcode::buffer_atomic_umax_x2;
6940         break;
6941      case nir_intrinsic_global_atomic_and:
6942         op32 = aco_opcode::buffer_atomic_and;
6943         op64 = aco_opcode::buffer_atomic_and_x2;
6944         break;
6945      case nir_intrinsic_global_atomic_or:
6946         op32 = aco_opcode::buffer_atomic_or;
6947         op64 = aco_opcode::buffer_atomic_or_x2;
6948         break;
6949      case nir_intrinsic_global_atomic_xor:
6950         op32 = aco_opcode::buffer_atomic_xor;
6951         op64 = aco_opcode::buffer_atomic_xor_x2;
6952         break;
6953      case nir_intrinsic_global_atomic_exchange:
6954         op32 = aco_opcode::buffer_atomic_swap;
6955         op64 = aco_opcode::buffer_atomic_swap_x2;
6956         break;
6957      case nir_intrinsic_global_atomic_comp_swap:
6958         op32 = aco_opcode::buffer_atomic_cmpswap;
6959         op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6960         break;
6961      case nir_intrinsic_global_atomic_fmin:
6962         op32 = aco_opcode::buffer_atomic_fmin;
6963         op64 = aco_opcode::buffer_atomic_fmin_x2;
6964         break;
6965      case nir_intrinsic_global_atomic_fmax:
6966         op32 = aco_opcode::buffer_atomic_fmax;
6967         op64 = aco_opcode::buffer_atomic_fmax_x2;
6968         break;
6969      default:
6970         unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6971                     "instructions.");
6972      }
6973
6974      Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6975
6976      aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6977
6978      aco_ptr<MUBUF_instruction> mubuf{
6979         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6980      mubuf->operands[0] = Operand(rsrc);
6981      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6982      mubuf->operands[2] = Operand::zero();
6983      mubuf->operands[3] = Operand(data);
6984      if (return_previous)
6985         mubuf->definitions[0] = Definition(dst);
6986      mubuf->glc = return_previous;
6987      mubuf->dlc = false;
6988      mubuf->offset = 0;
6989      mubuf->addr64 = addr.type() == RegType::vgpr;
6990      mubuf->disable_wqm = true;
6991      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6992      ctx->program->needs_exact = true;
6993      ctx->block->instructions.emplace_back(std::move(mubuf));
6994   }
6995}
6996
6997void
6998visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6999{
7000   Builder bld(ctx->program, ctx->block);
7001
7002   Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
7003   Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7004   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7005   Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7006
7007   bool swizzled = nir_intrinsic_is_swizzled(intrin);
7008   bool reorder = nir_intrinsic_can_reorder(intrin);
7009   bool slc = nir_intrinsic_slc_amd(intrin);
7010
7011   unsigned const_offset = nir_intrinsic_base(intrin);
7012   unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
7013   unsigned num_components = intrin->dest.ssa.num_components;
7014   unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;
7015
7016   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7017                   num_components, swizzle_element_size, !swizzled, reorder, slc);
7018}
7019
7020void
7021visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7022{
7023   Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7024   Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
7025   Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
7026   Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
7027
7028   bool swizzled = nir_intrinsic_is_swizzled(intrin);
7029   bool slc = nir_intrinsic_slc_amd(intrin);
7030
7031   unsigned const_offset = nir_intrinsic_base(intrin);
7032   unsigned write_mask = nir_intrinsic_write_mask(intrin);
7033   unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7034
7035   nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7036   memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);
7037
7038   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7039                    write_mask, !swizzled, sync, slc);
7040}
7041
7042sync_scope
7043translate_nir_scope(nir_scope scope)
7044{
7045   switch (scope) {
7046   case NIR_SCOPE_NONE:
7047   case NIR_SCOPE_INVOCATION: return scope_invocation;
7048   case NIR_SCOPE_SUBGROUP: return scope_subgroup;
7049   case NIR_SCOPE_WORKGROUP: return scope_workgroup;
7050   case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7051   case NIR_SCOPE_DEVICE: return scope_device;
7052   case NIR_SCOPE_SHADER_CALL: return scope_invocation;
7053   }
7054   unreachable("invalid scope");
7055}
7056
7057void
7058emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7059{
7060   Builder bld(ctx->program, ctx->block);
7061
7062   unsigned semantics = 0;
7063   unsigned storage = 0;
7064   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7065   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7066
7067   /* We use shared storage for the following:
7068    * - compute shaders expose it in their API
7069    * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7070    * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7071    * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7072    */
7073   bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
7074                              ctx->stage.hw == HWStage::HS ||
7075                              (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
7076                              ctx->stage.hw == HWStage::NGG;
7077
7078   /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7079    * They are allowed in CS, TCS, and in any NGG shader.
7080    */
7081   ASSERTED bool workgroup_scope_allowed =
7082      ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;
7083
7084   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7085   if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
7086      storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
7087   if (shared_storage_used && (nir_storage & nir_var_mem_shared))
7088      storage |= storage_shared;
7089
7090   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7091   if (nir_semantics & NIR_MEMORY_ACQUIRE)
7092      semantics |= semantic_acquire | semantic_release;
7093   if (nir_semantics & NIR_MEMORY_RELEASE)
7094      semantics |= semantic_acquire | semantic_release;
7095
7096   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7097   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7098
7099   bld.barrier(aco_opcode::p_barrier,
7100               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7101               exec_scope);
7102}
7103
7104void
7105visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7106{
7107   // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
7108   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7109   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7110   Builder bld(ctx->program, ctx->block);
7111
7112   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
7113   unsigned num_components = instr->dest.ssa.num_components;
7114   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7115   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7116}
7117
7118void
7119visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7120{
7121   unsigned writemask = nir_intrinsic_write_mask(instr);
7122   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7123   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7124   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7125
7126   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7127   store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7128}
7129
7130void
7131visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7132{
7133   unsigned offset = nir_intrinsic_base(instr);
7134   Builder bld(ctx->program, ctx->block);
7135   Operand m = load_lds_size_m0(bld);
7136   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7137   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7138
7139   unsigned num_operands = 3;
7140   aco_opcode op32, op64, op32_rtn, op64_rtn;
7141   switch (instr->intrinsic) {
7142   case nir_intrinsic_shared_atomic_add:
7143      op32 = aco_opcode::ds_add_u32;
7144      op64 = aco_opcode::ds_add_u64;
7145      op32_rtn = aco_opcode::ds_add_rtn_u32;
7146      op64_rtn = aco_opcode::ds_add_rtn_u64;
7147      break;
7148   case nir_intrinsic_shared_atomic_imin:
7149      op32 = aco_opcode::ds_min_i32;
7150      op64 = aco_opcode::ds_min_i64;
7151      op32_rtn = aco_opcode::ds_min_rtn_i32;
7152      op64_rtn = aco_opcode::ds_min_rtn_i64;
7153      break;
7154   case nir_intrinsic_shared_atomic_umin:
7155      op32 = aco_opcode::ds_min_u32;
7156      op64 = aco_opcode::ds_min_u64;
7157      op32_rtn = aco_opcode::ds_min_rtn_u32;
7158      op64_rtn = aco_opcode::ds_min_rtn_u64;
7159      break;
7160   case nir_intrinsic_shared_atomic_imax:
7161      op32 = aco_opcode::ds_max_i32;
7162      op64 = aco_opcode::ds_max_i64;
7163      op32_rtn = aco_opcode::ds_max_rtn_i32;
7164      op64_rtn = aco_opcode::ds_max_rtn_i64;
7165      break;
7166   case nir_intrinsic_shared_atomic_umax:
7167      op32 = aco_opcode::ds_max_u32;
7168      op64 = aco_opcode::ds_max_u64;
7169      op32_rtn = aco_opcode::ds_max_rtn_u32;
7170      op64_rtn = aco_opcode::ds_max_rtn_u64;
7171      break;
7172   case nir_intrinsic_shared_atomic_and:
7173      op32 = aco_opcode::ds_and_b32;
7174      op64 = aco_opcode::ds_and_b64;
7175      op32_rtn = aco_opcode::ds_and_rtn_b32;
7176      op64_rtn = aco_opcode::ds_and_rtn_b64;
7177      break;
7178   case nir_intrinsic_shared_atomic_or:
7179      op32 = aco_opcode::ds_or_b32;
7180      op64 = aco_opcode::ds_or_b64;
7181      op32_rtn = aco_opcode::ds_or_rtn_b32;
7182      op64_rtn = aco_opcode::ds_or_rtn_b64;
7183      break;
7184   case nir_intrinsic_shared_atomic_xor:
7185      op32 = aco_opcode::ds_xor_b32;
7186      op64 = aco_opcode::ds_xor_b64;
7187      op32_rtn = aco_opcode::ds_xor_rtn_b32;
7188      op64_rtn = aco_opcode::ds_xor_rtn_b64;
7189      break;
7190   case nir_intrinsic_shared_atomic_exchange:
7191      op32 = aco_opcode::ds_write_b32;
7192      op64 = aco_opcode::ds_write_b64;
7193      op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7194      op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7195      break;
7196   case nir_intrinsic_shared_atomic_comp_swap:
7197      op32 = aco_opcode::ds_cmpst_b32;
7198      op64 = aco_opcode::ds_cmpst_b64;
7199      op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7200      op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7201      num_operands = 4;
7202      break;
7203   case nir_intrinsic_shared_atomic_fadd:
7204      op32 = aco_opcode::ds_add_f32;
7205      op32_rtn = aco_opcode::ds_add_rtn_f32;
7206      op64 = aco_opcode::num_opcodes;
7207      op64_rtn = aco_opcode::num_opcodes;
7208      break;
7209   case nir_intrinsic_shared_atomic_fmin:
7210      op32 = aco_opcode::ds_min_f32;
7211      op32_rtn = aco_opcode::ds_min_rtn_f32;
7212      op64 = aco_opcode::ds_min_f64;
7213      op64_rtn = aco_opcode::ds_min_rtn_f64;
7214      break;
7215   case nir_intrinsic_shared_atomic_fmax:
7216      op32 = aco_opcode::ds_max_f32;
7217      op32_rtn = aco_opcode::ds_max_rtn_f32;
7218      op64 = aco_opcode::ds_max_f64;
7219      op64_rtn = aco_opcode::ds_max_rtn_f64;
7220      break;
7221   default: unreachable("Unhandled shared atomic intrinsic");
7222   }
7223
7224   bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7225
7226   aco_opcode op;
7227   if (data.size() == 1) {
7228      assert(instr->dest.ssa.bit_size == 32);
7229      op = return_previous ? op32_rtn : op32;
7230   } else {
7231      assert(instr->dest.ssa.bit_size == 64);
7232      op = return_previous ? op64_rtn : op64;
7233   }
7234
7235   if (offset > 65535) {
7236      address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7237      offset = 0;
7238   }
7239
7240   aco_ptr<DS_instruction> ds;
7241   ds.reset(
7242      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7243   ds->operands[0] = Operand(address);
7244   ds->operands[1] = Operand(data);
7245   if (num_operands == 4) {
7246      Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7247      ds->operands[2] = Operand(data2);
7248   }
7249   ds->operands[num_operands - 1] = m;
7250   ds->offset0 = offset;
7251   if (return_previous)
7252      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
7253   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7254
7255   if (m.isUndefined())
7256      ds->operands.pop_back();
7257
7258   ctx->block->instructions.emplace_back(std::move(ds));
7259}
7260
7261Temp
7262get_scratch_resource(isel_context* ctx)
7263{
7264   Builder bld(ctx->program, ctx->block);
7265   Temp scratch_addr = ctx->program->private_segment_buffer;
7266   if (ctx->stage != compute_cs)
7267      scratch_addr =
7268         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7269
7270   uint32_t rsrc_conf =
7271      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7272
7273   if (ctx->program->chip_class >= GFX10) {
7274      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7275                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
7276   } else if (ctx->program->chip_class <=
7277              GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7278      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7279                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7280   }
7281
7282   /* older generations need element size = 4 bytes. element size removed in GFX9 */
7283   if (ctx->program->chip_class <= GFX8)
7284      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7285
7286   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7287                     Operand::c32(rsrc_conf));
7288}
7289
7290void
7291visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7292{
7293   Builder bld(ctx->program, ctx->block);
7294   Temp rsrc = get_scratch_resource(ctx);
7295   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7296   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7297
7298   LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
7299                        instr->dest.ssa.bit_size / 8u, rsrc};
7300   info.align_mul = nir_intrinsic_align_mul(instr);
7301   info.align_offset = nir_intrinsic_align_offset(instr);
7302   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
7303   info.sync = memory_sync_info(storage_scratch, semantic_private);
7304   info.soffset = ctx->program->scratch_offset;
7305   emit_load(ctx, bld, info, scratch_load_params);
7306}
7307
7308void
7309visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7310{
7311   Builder bld(ctx->program, ctx->block);
7312   Temp rsrc = get_scratch_resource(ctx);
7313   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7314   Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7315
7316   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7317   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7318
7319   unsigned write_count = 0;
7320   Temp write_datas[32];
7321   unsigned offsets[32];
7322   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
7323   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7324                      &write_count, write_datas, offsets);
7325
7326   for (unsigned i = 0; i < write_count; i++) {
7327      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7328      Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
7329                                     offsets[i], true, true);
7330      mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7331   }
7332}
7333
7334void
7335visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7336{
7337   uint8_t log2_ps_iter_samples;
7338   if (ctx->program->info->ps.uses_sample_shading) {
7339      log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples);
7340   } else {
7341      log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples;
7342   }
7343
7344   Builder bld(ctx->program, ctx->block);
7345
7346   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7347
7348   if (log2_ps_iter_samples) {
7349      /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
7350      Temp sample_id =
7351         bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7352                  Operand::c32(8u), Operand::c32(4u));
7353      Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7354                           bld.copy(bld.def(v1), Operand::c32(1u)));
7355      bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7356               get_arg(ctx, ctx->args->ac.sample_coverage));
7357   } else {
7358      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7359   }
7360}
7361
7362void
7363visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7364{
7365   Builder bld(ctx->program, ctx->block);
7366
7367   unsigned stream = nir_intrinsic_stream_id(instr);
7368   Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7369   next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7370   nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7371
7372   /* get GSVS ring */
7373   Temp gsvs_ring =
7374      bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7375               Operand::c32(RING_GSVS_GS * 16u));
7376
7377   unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7378
7379   unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7380   unsigned stream_offset = 0;
7381   for (unsigned i = 0; i < stream; i++) {
7382      unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7383                             ctx->shader->info.gs.vertices_out;
7384      stream_offset += prev_stride * ctx->program->wave_size;
7385   }
7386
7387   /* Limit on the stride field for <= GFX7. */
7388   assert(stride < (1 << 14));
7389
7390   Temp gsvs_dwords[4];
7391   for (unsigned i = 0; i < 4; i++)
7392      gsvs_dwords[i] = bld.tmp(s1);
7393   bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7394              Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7395
7396   if (stream_offset) {
7397      Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7398
7399      Temp carry = bld.tmp(s1);
7400      gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7401                                gsvs_dwords[0], stream_offset_tmp);
7402      gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7403                                gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7404   }
7405
7406   gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7407                             Operand::c32(S_008F04_STRIDE(stride)));
7408   gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7409
7410   gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7411                          gsvs_dwords[2], gsvs_dwords[3]);
7412
7413   unsigned offset = 0;
7414   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7415      if (ctx->program->info->gs.output_streams[i] != stream)
7416         continue;
7417
7418      for (unsigned j = 0; j < 4; j++) {
7419         if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7420            continue;
7421
7422         if (ctx->outputs.mask[i] & (1 << j)) {
7423            Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7424            unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
7425            if (const_offset >= 4096u) {
7426               if (vaddr_offset.isUndefined())
7427                  vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7428               else
7429                  vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7430                                            vaddr_offset);
7431               const_offset %= 4096u;
7432            }
7433
7434            aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7435               aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7436            mtbuf->operands[0] = Operand(gsvs_ring);
7437            mtbuf->operands[1] = vaddr_offset;
7438            mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7439            mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7440            mtbuf->offen = !vaddr_offset.isUndefined();
7441            mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7442            mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7443            mtbuf->offset = const_offset;
7444            mtbuf->glc = true;
7445            mtbuf->slc = true;
7446            mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7447            bld.insert(std::move(mtbuf));
7448         }
7449
7450         offset += ctx->shader->info.gs.vertices_out;
7451      }
7452
7453      /* outputs for the next vertex are undefined and keeping them around can
7454       * create invalid IR with control flow */
7455      ctx->outputs.mask[i] = 0;
7456   }
7457
7458   bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7459}
7460
7461Temp
7462emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7463{
7464   Builder bld(ctx->program, ctx->block);
7465
7466   if (cluster_size == 1) {
7467      return src;
7468   }
7469   if (op == nir_op_iand && cluster_size == 4) {
7470      /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
7471      Temp tmp =
7472         bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7473      return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7474                      bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7475   } else if (op == nir_op_ior && cluster_size == 4) {
7476      /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7477      return bld.sop1(
7478         Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7479         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7480   } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7481      /* subgroupAnd(val) -> (exec & ~val) == 0 */
7482      Temp tmp =
7483         bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7484            .def(1)
7485            .getTemp();
7486      Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7487      return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7488   } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7489      /* subgroupOr(val) -> (val & exec) != 0 */
7490      Temp tmp =
7491         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7492            .def(1)
7493            .getTemp();
7494      return bool_to_vector_condition(ctx, tmp);
7495   } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7496      /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7497      Temp tmp =
7498         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7499      tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7500      tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7501               .def(1)
7502               .getTemp();
7503      return bool_to_vector_condition(ctx, tmp);
7504   } else {
7505      /* subgroupClustered{And,Or,Xor}(val, n):
7506       *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7507       *   cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)
7508       * subgroupClusteredAnd():
7509       *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7510       * subgroupClusteredOr():
7511       *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7512       * subgroupClusteredXor():
7513       *   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7514       */
7515      Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7516      Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7517                                     Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7518
7519      Temp tmp;
7520      if (op == nir_op_iand)
7521         tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7522                        Operand(exec, bld.lm));
7523      else
7524         tmp =
7525            bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7526
7527      uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7528
7529      if (ctx->program->chip_class <= GFX7)
7530         tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7531      else if (ctx->program->wave_size == 64)
7532         tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7533      else
7534         tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7535      tmp = emit_extract_vector(ctx, tmp, 0, v1);
7536      if (cluster_mask != 0xffffffff)
7537         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7538
7539      if (op == nir_op_iand) {
7540         return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7541                         tmp);
7542      } else if (op == nir_op_ior) {
7543         return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7544      } else if (op == nir_op_ixor) {
7545         tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7546                        bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7547         return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7548      }
7549      assert(false);
7550      return Temp();
7551   }
7552}
7553
7554Temp
7555emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7556{
7557   Builder bld(ctx->program, ctx->block);
7558   assert(src.regClass() == bld.lm);
7559
7560   /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7561    * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7562    * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7563    */
7564   Temp tmp;
7565   if (op == nir_op_iand)
7566      tmp =
7567         bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7568   else
7569      tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7570
7571   Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7572
7573   if (op == nir_op_iand)
7574      return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7575   else if (op == nir_op_ior)
7576      return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7577   else if (op == nir_op_ixor)
7578      return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7579                      bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7580
7581   assert(false);
7582   return Temp();
7583}
7584
7585Temp
7586emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7587{
7588   Builder bld(ctx->program, ctx->block);
7589
7590   /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7591    * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7592    * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7593    */
7594   Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7595   if (op == nir_op_iand)
7596      return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7597   else if (op == nir_op_ior)
7598      return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7599   else if (op == nir_op_ixor)
7600      return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7601
7602   assert(false);
7603   return Temp();
7604}
7605
7606ReduceOp
7607get_reduce_op(nir_op op, unsigned bit_size)
7608{
7609   switch (op) {
7610#define CASEI(name)                                                                                \
7611   case nir_op_##name:                                                                             \
7612      return (bit_size == 32)   ? name##32                                                         \
7613             : (bit_size == 16) ? name##16                                                         \
7614             : (bit_size == 8)  ? name##8                                                          \
7615                                : name##64;
7616#define CASEF(name)                                                                                \
7617   case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7618      CASEI(iadd)
7619      CASEI(imul)
7620      CASEI(imin)
7621      CASEI(umin)
7622      CASEI(imax)
7623      CASEI(umax)
7624      CASEI(iand)
7625      CASEI(ior)
7626      CASEI(ixor)
7627      CASEF(fadd)
7628      CASEF(fmul)
7629      CASEF(fmin)
7630      CASEF(fmax)
7631   default: unreachable("unknown reduction op");
7632#undef CASEI
7633#undef CASEF
7634   }
7635}
7636
7637void
7638emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7639{
7640   Builder bld(ctx->program, ctx->block);
7641   Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7642   assert(dst.regClass().type() != RegType::vgpr);
7643   if (src.regClass().type() == RegType::vgpr)
7644      bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7645   else
7646      bld.copy(dst, src);
7647}
7648
7649void
7650emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7651{
7652   Builder bld(ctx->program, ctx->block);
7653   Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7654
7655   if (op == nir_op_fadd) {
7656      src_tmp = as_vgpr(ctx, src_tmp);
7657      Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7658
7659      if (src.ssa->bit_size == 16) {
7660         count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7661         bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7662      } else {
7663         assert(src.ssa->bit_size == 32);
7664         count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7665         bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7666      }
7667
7668      if (tmp != dst.getTemp())
7669         bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7670
7671      return;
7672   }
7673
7674   if (dst.regClass() == s1)
7675      src_tmp = bld.as_uniform(src_tmp);
7676
7677   if (op == nir_op_ixor && count.type() == RegType::sgpr)
7678      count =
7679         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7680   else if (op == nir_op_ixor)
7681      count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7682
7683   assert(dst.getTemp().type() == count.type());
7684
7685   if (nir_src_is_const(src)) {
7686      if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7687         bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7688      else if (nir_src_as_uint(src) == 1)
7689         bld.copy(dst, count);
7690      else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7691         bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7692      else if (nir_src_as_uint(src) == 0)
7693         bld.copy(dst, Operand::zero());
7694      else if (count.type() == RegType::vgpr)
7695         bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7696      else
7697         bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7698   } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7699      bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7700   } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7701      bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7702   } else if (dst.getTemp().type() == RegType::vgpr) {
7703      bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7704   } else {
7705      bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7706   }
7707}
7708
7709bool
7710emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7711{
7712   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7713   if (op == nir_op_imul || op == nir_op_fmul)
7714      return false;
7715
7716   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7717      Builder bld(ctx->program, ctx->block);
7718      Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7719      unsigned bit_size = instr->src[0].ssa->bit_size;
7720      if (bit_size > 32)
7721         return false;
7722
7723      Temp thread_count =
7724         bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7725
7726      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7727   } else {
7728      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7729   }
7730
7731   return true;
7732}
7733
7734bool
7735emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7736{
7737   Builder bld(ctx->program, ctx->block);
7738   Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7739   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7740   bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7741
7742   if (op == nir_op_imul || op == nir_op_fmul)
7743      return false;
7744
7745   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7746      if (instr->src[0].ssa->bit_size > 32)
7747         return false;
7748
7749      Temp packed_tid;
7750      if (inc)
7751         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7752      else
7753         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7754
7755      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7756      return true;
7757   }
7758
7759   assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7760          op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7761
7762   if (inc) {
7763      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7764      return true;
7765   }
7766
7767   /* Copy the source and write the reduction operation identity to the first lane. */
7768   Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7769   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7770   ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7771   if (dst.bytes() == 8) {
7772      Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7773      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7774      uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7775      uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7776
7777      lo =
7778         bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7779      hi =
7780         bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7781      bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7782   } else {
7783      uint32_t identity = get_reduction_identity(reduce_op, 0);
7784      bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7785                    as_vgpr(ctx, src));
7786   }
7787
7788   return true;
7789}
7790
7791Temp
7792emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7793                     Definition dst, Temp src)
7794{
7795   assert(src.bytes() <= 8);
7796   assert(src.type() == RegType::vgpr);
7797
7798   Builder bld(ctx->program, ctx->block);
7799
7800   unsigned num_defs = 0;
7801   Definition defs[5];
7802   defs[num_defs++] = dst;
7803   defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7804
7805   /* scalar identity temporary */
7806   bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7807                     aco_op != aco_opcode::p_reduce;
7808   if (aco_op == aco_opcode::p_exclusive_scan) {
7809      need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7810                     op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7811                     op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7812                     op == fmul64);
7813   }
7814   if (need_sitmp)
7815      defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7816
7817   /* scc clobber */
7818   defs[num_defs++] = bld.def(s1, scc);
7819
7820   /* vcc clobber */
7821   bool clobber_vcc = false;
7822   if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7823      clobber_vcc = true;
7824   if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7825      clobber_vcc = true;
7826   if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7827      clobber_vcc = true;
7828
7829   if (clobber_vcc)
7830      defs[num_defs++] = bld.def(bld.lm, vcc);
7831
7832   Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7833      aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7834   reduce->operands[0] = Operand(src);
7835   /* setup_reduce_temp will update these undef operands if needed */
7836   reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7837   reduce->operands[2] = Operand(v1.as_linear());
7838   std::copy(defs, defs + num_defs, reduce->definitions.begin());
7839
7840   reduce->reduce_op = op;
7841   reduce->cluster_size = cluster_size;
7842   bld.insert(std::move(reduce));
7843
7844   return dst.getTemp();
7845}
7846
7847void
7848emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7849{
7850   Builder bld(ctx->program, ctx->block);
7851   Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7852   Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7853
7854   Temp ddx_1, ddx_2, ddy_1, ddy_2;
7855   uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7856   uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7857   uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7858
7859   /* Build DD X/Y */
7860   if (ctx->program->chip_class >= GFX8) {
7861      Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7862      ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7863      ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7864      Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7865      ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7866      ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7867   } else {
7868      Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7869      ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7870      ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7871      ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7872      ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7873      Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7874      ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7875      ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7876      ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7877      ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7878   }
7879
7880   /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7881   aco_opcode mad =
7882      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7883   Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7884   Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7885   tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7886   tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7887   Temp wqm1 = bld.tmp(v1);
7888   emit_wqm(bld, tmp1, wqm1, true);
7889   Temp wqm2 = bld.tmp(v1);
7890   emit_wqm(bld, tmp2, wqm2, true);
7891   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7892   return;
7893}
7894
7895Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7896void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7897static void create_vs_exports(isel_context* ctx);
7898
7899Temp
7900get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
7901                 enum glsl_interp_mode interp)
7902{
7903   bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
7904   if (intrin == nir_intrinsic_load_barycentric_pixel ||
7905       intrin == nir_intrinsic_load_barycentric_at_sample ||
7906       intrin == nir_intrinsic_load_barycentric_at_offset) {
7907      return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center);
7908   } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
7909      return linear ? ctx->linear_centroid : ctx->persp_centroid;
7910   } else {
7911      assert(intrin == nir_intrinsic_load_barycentric_sample);
7912      return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample);
7913   }
7914}
7915
7916void
7917visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7918{
7919   Builder bld(ctx->program, ctx->block);
7920   switch (instr->intrinsic) {
7921   case nir_intrinsic_load_barycentric_sample:
7922   case nir_intrinsic_load_barycentric_pixel:
7923   case nir_intrinsic_load_barycentric_centroid: {
7924      glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7925      Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
7926      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7927      Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7928      Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7929      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7930      emit_split_vector(ctx, dst, 2);
7931      break;
7932   }
7933   case nir_intrinsic_load_barycentric_model: {
7934      Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7935
7936      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7937      Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7938      Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7939      Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7940      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7941                 Operand(p3));
7942      emit_split_vector(ctx, dst, 3);
7943      break;
7944   }
7945   case nir_intrinsic_load_barycentric_at_sample: {
7946      uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7947      switch (ctx->options->key.ps.num_samples) {
7948      case 2: sample_pos_offset += 1 << 3; break;
7949      case 4: sample_pos_offset += 3 << 3; break;
7950      case 8: sample_pos_offset += 7 << 3; break;
7951      default: break;
7952      }
7953      Temp sample_pos;
7954      Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7955      nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7956      Temp private_segment_buffer = ctx->program->private_segment_buffer;
7957      // TODO: bounds checking?
7958      if (addr.type() == RegType::sgpr) {
7959         Operand offset;
7960         if (const_addr) {
7961            sample_pos_offset += const_addr->u32 << 3;
7962            offset = Operand::c32(sample_pos_offset);
7963         } else if (ctx->options->chip_class >= GFX9) {
7964            offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7965                              Operand::c32(sample_pos_offset));
7966         } else {
7967            offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7968                              Operand::c32(3u));
7969            offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7970                              Operand::c32(sample_pos_offset));
7971         }
7972
7973         Operand off = bld.copy(bld.def(s1), Operand(offset));
7974         sample_pos =
7975            bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7976
7977      } else if (ctx->options->chip_class >= GFX9) {
7978         addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7979         sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7980                                 private_segment_buffer, sample_pos_offset);
7981      } else if (ctx->options->chip_class >= GFX7) {
7982         /* addr += private_segment_buffer + sample_pos_offset */
7983         Temp tmp0 = bld.tmp(s1);
7984         Temp tmp1 = bld.tmp(s1);
7985         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7986                    private_segment_buffer);
7987         Definition scc_tmp = bld.def(s1, scc);
7988         tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7989                         Operand::c32(sample_pos_offset));
7990         tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7991                         Operand::zero(), bld.scc(scc_tmp.getTemp()));
7992         addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7993         Temp pck0 = bld.tmp(v1);
7994         Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7995         tmp1 = as_vgpr(ctx, tmp1);
7996         Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
7997                                  bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
7998         addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7999
8000         /* sample_pos = flat_load_dwordx2 addr */
8001         sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
8002      } else {
8003         assert(ctx->options->chip_class == GFX6);
8004
8005         uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
8006                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
8007         Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8008                                Operand::zero(), Operand::c32(rsrc_conf));
8009
8010         addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8011         addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8012
8013         sample_pos = bld.tmp(v2);
8014
8015         aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8016            aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8017         load->definitions[0] = Definition(sample_pos);
8018         load->operands[0] = Operand(rsrc);
8019         load->operands[1] = Operand(addr);
8020         load->operands[2] = Operand::zero();
8021         load->offset = sample_pos_offset;
8022         load->offen = 0;
8023         load->addr64 = true;
8024         load->glc = false;
8025         load->dlc = false;
8026         load->disable_wqm = false;
8027         ctx->block->instructions.emplace_back(std::move(load));
8028      }
8029
8030      /* sample_pos -= 0.5 */
8031      Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8032      Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8033      bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8034      pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8035      pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8036
8037      Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8038      emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8039      break;
8040   }
8041   case nir_intrinsic_load_barycentric_at_offset: {
8042      Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8043      RegClass rc = RegClass(offset.type(), 1);
8044      Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8045      bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8046      Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8047      emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8048      break;
8049   }
8050   case nir_intrinsic_load_front_face: {
8051      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8052               Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8053         .def(0)
8054         .setHint(vcc);
8055      break;
8056   }
8057   case nir_intrinsic_load_view_index: {
8058      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8059      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8060      break;
8061   }
8062   case nir_intrinsic_load_frag_coord: {
8063      emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8064      break;
8065   }
8066   case nir_intrinsic_load_frag_shading_rate:
8067      emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8068      break;
8069   case nir_intrinsic_load_sample_pos: {
8070      Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8071      Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8072      bld.pseudo(
8073         aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8074         posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8075         posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8076      break;
8077   }
8078   case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8079   case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8080   case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8081   case nir_intrinsic_load_input:
8082   case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8083   case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8084   case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8085   case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8086   case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8087   case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8088   case nir_intrinsic_terminate:
8089   case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8090   case nir_intrinsic_terminate_if:
8091   case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8092   case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8093   case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8094   case nir_intrinsic_shared_atomic_add:
8095   case nir_intrinsic_shared_atomic_imin:
8096   case nir_intrinsic_shared_atomic_umin:
8097   case nir_intrinsic_shared_atomic_imax:
8098   case nir_intrinsic_shared_atomic_umax:
8099   case nir_intrinsic_shared_atomic_and:
8100   case nir_intrinsic_shared_atomic_or:
8101   case nir_intrinsic_shared_atomic_xor:
8102   case nir_intrinsic_shared_atomic_exchange:
8103   case nir_intrinsic_shared_atomic_comp_swap:
8104   case nir_intrinsic_shared_atomic_fadd:
8105   case nir_intrinsic_shared_atomic_fmin:
8106   case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
8107   case nir_intrinsic_image_deref_load:
8108   case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8109   case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8110   case nir_intrinsic_image_deref_atomic_add:
8111   case nir_intrinsic_image_deref_atomic_umin:
8112   case nir_intrinsic_image_deref_atomic_imin:
8113   case nir_intrinsic_image_deref_atomic_umax:
8114   case nir_intrinsic_image_deref_atomic_imax:
8115   case nir_intrinsic_image_deref_atomic_and:
8116   case nir_intrinsic_image_deref_atomic_or:
8117   case nir_intrinsic_image_deref_atomic_xor:
8118   case nir_intrinsic_image_deref_atomic_exchange:
8119   case nir_intrinsic_image_deref_atomic_comp_swap:
8120   case nir_intrinsic_image_deref_atomic_fmin:
8121   case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break;
8122   case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8123   case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8124   case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8125   case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8126   case nir_intrinsic_load_global_constant:
8127   case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8128   case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8129   case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8130   case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8131   case nir_intrinsic_global_atomic_add:
8132   case nir_intrinsic_global_atomic_imin:
8133   case nir_intrinsic_global_atomic_umin:
8134   case nir_intrinsic_global_atomic_imax:
8135   case nir_intrinsic_global_atomic_umax:
8136   case nir_intrinsic_global_atomic_and:
8137   case nir_intrinsic_global_atomic_or:
8138   case nir_intrinsic_global_atomic_xor:
8139   case nir_intrinsic_global_atomic_exchange:
8140   case nir_intrinsic_global_atomic_comp_swap:
8141   case nir_intrinsic_global_atomic_fmin:
8142   case nir_intrinsic_global_atomic_fmax: visit_global_atomic(ctx, instr); break;
8143   case nir_intrinsic_ssbo_atomic_add:
8144   case nir_intrinsic_ssbo_atomic_imin:
8145   case nir_intrinsic_ssbo_atomic_umin:
8146   case nir_intrinsic_ssbo_atomic_imax:
8147   case nir_intrinsic_ssbo_atomic_umax:
8148   case nir_intrinsic_ssbo_atomic_and:
8149   case nir_intrinsic_ssbo_atomic_or:
8150   case nir_intrinsic_ssbo_atomic_xor:
8151   case nir_intrinsic_ssbo_atomic_exchange:
8152   case nir_intrinsic_ssbo_atomic_comp_swap:
8153   case nir_intrinsic_ssbo_atomic_fmin:
8154   case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break;
8155   case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8156   case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8157   case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8158   case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8159   case nir_intrinsic_load_num_workgroups: {
8160      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8161      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8162      emit_split_vector(ctx, dst, 3);
8163      break;
8164   }
8165   case nir_intrinsic_load_ray_launch_size: {
8166      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8167      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size)));
8168      emit_split_vector(ctx, dst, 3);
8169      break;
8170   }
8171   case nir_intrinsic_load_local_invocation_id: {
8172      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8173      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8174      emit_split_vector(ctx, dst, 3);
8175      break;
8176   }
8177   case nir_intrinsic_load_workgroup_id: {
8178      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8179      const struct ac_arg* args = ctx->args->ac.workgroup_ids;
8180      bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8181                 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8182                 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8183                 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8184      emit_split_vector(ctx, dst, 3);
8185      break;
8186   }
8187   case nir_intrinsic_load_local_invocation_index: {
8188      if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8189         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8190                  get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8191         break;
8192      } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8193         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8194         break;
8195      }
8196
8197      Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8198
8199      /* The tg_size bits [6:11] contain the subgroup id,
8200       * we need this multiplied by the wave size, and then OR the thread id to it.
8201       */
8202      if (ctx->program->wave_size == 64) {
8203         /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8204          * feed that to v_or */
8205         Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8206                                Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8207         bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8208                  id);
8209      } else {
8210         /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8211         Temp tg_num =
8212            bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8213                     get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8214         bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8215                  tg_num, Operand::c32(0x5u), id);
8216      }
8217      break;
8218   }
8219   case nir_intrinsic_load_subgroup_id: {
8220      if (ctx->stage == compute_cs) {
8221         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8222                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8223                  Operand::c32(0x6u | (0x6u << 16)));
8224      } else if (ctx->stage.hw == HWStage::NGG) {
8225         /* Get the id of the current wave within the threadgroup (workgroup) */
8226         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8227                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8228                  Operand::c32(24u | (4u << 16)));
8229      } else {
8230         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8231      }
8232      break;
8233   }
8234   case nir_intrinsic_load_subgroup_invocation: {
8235      emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8236      break;
8237   }
8238   case nir_intrinsic_load_num_subgroups: {
8239      if (ctx->stage == compute_cs)
8240         bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8241                  bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8242      else if (ctx->stage.hw == HWStage::NGG)
8243         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8244                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8245                  Operand::c32(28u | (4u << 16)));
8246      else
8247         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8248      break;
8249   }
8250   case nir_intrinsic_ballot: {
8251      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8252      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8253
8254      if (instr->src[0].ssa->bit_size == 1) {
8255         assert(src.regClass() == bld.lm);
8256      } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8257         src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8258      } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8259         src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8260      } else {
8261         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8262      }
8263
8264      /* Make sure that all inactive lanes return zero.
8265       * Value-numbering might remove the comparison above */
8266      src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8267      if (dst.size() != bld.lm.size()) {
8268         /* Wave32 with ballot size set to 64 */
8269         src =
8270            bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8271      }
8272
8273      emit_wqm(bld, src, dst);
8274      break;
8275   }
8276   case nir_intrinsic_shuffle:
8277   case nir_intrinsic_read_invocation: {
8278      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8279      if (!nir_src_is_divergent(instr->src[0])) {
8280         emit_uniform_subgroup(ctx, instr, src);
8281      } else {
8282         Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8283         if (instr->intrinsic == nir_intrinsic_read_invocation ||
8284             !nir_src_is_divergent(instr->src[1]))
8285            tid = bld.as_uniform(tid);
8286         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8287
8288         if (instr->dest.ssa.bit_size != 1)
8289            src = as_vgpr(ctx, src);
8290
8291         if (src.regClass() == v1b || src.regClass() == v2b) {
8292            Temp tmp = bld.tmp(v1);
8293            tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8294            if (dst.type() == RegType::vgpr)
8295               bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8296                          bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8297            else
8298               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8299         } else if (src.regClass() == v1) {
8300            emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8301         } else if (src.regClass() == v2) {
8302            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8303            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8304            lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8305            hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8306            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8307            emit_split_vector(ctx, dst, 2);
8308         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
8309            assert(src.regClass() == bld.lm);
8310            Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8311            bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8312         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
8313            assert(src.regClass() == bld.lm);
8314            Temp tmp;
8315            if (ctx->program->chip_class <= GFX7)
8316               tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8317            else if (ctx->program->wave_size == 64)
8318               tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8319            else
8320               tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8321            tmp = emit_extract_vector(ctx, tmp, 0, v1);
8322            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8323            emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8324                     dst);
8325         } else {
8326            isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8327         }
8328      }
8329      break;
8330   }
8331   case nir_intrinsic_load_sample_id: {
8332      bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8333               get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8334      break;
8335   }
8336   case nir_intrinsic_load_sample_mask_in: {
8337      visit_load_sample_mask_in(ctx, instr);
8338      break;
8339   }
8340   case nir_intrinsic_read_first_invocation: {
8341      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8342      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8343      if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8344         emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8345      } else if (src.regClass() == v2) {
8346         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8347         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8348         lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8349         hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8350         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8351         emit_split_vector(ctx, dst, 2);
8352      } else if (instr->dest.ssa.bit_size == 1) {
8353         assert(src.regClass() == bld.lm);
8354         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8355                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8356         bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8357      } else {
8358         bld.copy(Definition(dst), src);
8359      }
8360      break;
8361   }
8362   case nir_intrinsic_vote_all: {
8363      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8364      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8365      assert(src.regClass() == bld.lm);
8366      assert(dst.regClass() == bld.lm);
8367
8368      Temp tmp =
8369         bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
8370            .def(1)
8371            .getTemp();
8372      Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8373      bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8374      break;
8375   }
8376   case nir_intrinsic_vote_any: {
8377      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8378      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8379      assert(src.regClass() == bld.lm);
8380      assert(dst.regClass() == bld.lm);
8381
8382      Temp tmp = bool_to_scalar_condition(ctx, src);
8383      bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8384      break;
8385   }
8386   case nir_intrinsic_reduce:
8387   case nir_intrinsic_inclusive_scan:
8388   case nir_intrinsic_exclusive_scan: {
8389      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8390      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8391      nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8392      unsigned cluster_size =
8393         instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8394      cluster_size = util_next_power_of_two(
8395         MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8396
8397      if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8398          instr->dest.ssa.bit_size != 1) {
8399         /* We use divergence analysis to assign the regclass, so check if it's
8400          * working as expected */
8401         ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8402         if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8403            expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8404         assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
8405
8406         if (instr->intrinsic == nir_intrinsic_reduce) {
8407            if (emit_uniform_reduce(ctx, instr))
8408               break;
8409         } else if (emit_uniform_scan(ctx, instr)) {
8410            break;
8411         }
8412      }
8413
8414      if (instr->dest.ssa.bit_size == 1) {
8415         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8416            op = nir_op_iand;
8417         else if (op == nir_op_iadd)
8418            op = nir_op_ixor;
8419         else if (op == nir_op_umax || op == nir_op_imax)
8420            op = nir_op_ior;
8421         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8422
8423         switch (instr->intrinsic) {
8424         case nir_intrinsic_reduce:
8425            emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8426            break;
8427         case nir_intrinsic_exclusive_scan:
8428            emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8429            break;
8430         case nir_intrinsic_inclusive_scan:
8431            emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8432            break;
8433         default: assert(false);
8434         }
8435      } else if (cluster_size == 1) {
8436         bld.copy(Definition(dst), src);
8437      } else {
8438         unsigned bit_size = instr->src[0].ssa->bit_size;
8439
8440         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8441
8442         ReduceOp reduce_op = get_reduce_op(op, bit_size);
8443
8444         aco_opcode aco_op;
8445         switch (instr->intrinsic) {
8446         case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8447         case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8448         case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8449         default: unreachable("unknown reduce intrinsic");
8450         }
8451
8452         Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8453                                             bld.def(dst.regClass()), src);
8454         emit_wqm(bld, tmp_dst, dst);
8455      }
8456      break;
8457   }
8458   case nir_intrinsic_quad_broadcast:
8459   case nir_intrinsic_quad_swap_horizontal:
8460   case nir_intrinsic_quad_swap_vertical:
8461   case nir_intrinsic_quad_swap_diagonal:
8462   case nir_intrinsic_quad_swizzle_amd: {
8463      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8464
8465      if (!nir_dest_is_divergent(instr->dest)) {
8466         emit_uniform_subgroup(ctx, instr, src);
8467         break;
8468      }
8469
8470      /* Quad broadcast lane. */
8471      unsigned lane = 0;
8472      /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8473      bool bool_use_valu = instr->dest.ssa.bit_size == 1;
8474
8475      uint16_t dpp_ctrl = 0;
8476
8477      switch (instr->intrinsic) {
8478      case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8479      case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8480      case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8481      case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8482      case nir_intrinsic_quad_broadcast:
8483         lane = nir_src_as_const_value(instr->src[1])->u32;
8484         dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8485         bool_use_valu = false;
8486         break;
8487      default: break;
8488      }
8489
8490      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8491      Temp tmp(dst);
8492
8493      /* Setup source. */
8494      if (bool_use_valu)
8495         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8496                            Operand::c32(-1), src);
8497      else if (instr->dest.ssa.bit_size != 1)
8498         src = as_vgpr(ctx, src);
8499
8500      /* Setup temporary destination. */
8501      if (bool_use_valu)
8502         tmp = bld.tmp(v1);
8503      else if (ctx->program->stage == fragment_fs)
8504         tmp = bld.tmp(dst.regClass());
8505
8506      if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8507         /* Special case for quad broadcast using SALU only. */
8508         assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
8509
8510         uint32_t half_mask = 0x11111111u << lane;
8511         Operand mask_tmp = bld.lm.bytes() == 4
8512                               ? Operand::c32(half_mask)
8513                               : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8514                                            Operand::c32(half_mask), Operand::c32(half_mask));
8515
8516         src =
8517            bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8518         src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8519         bld.sop1(Builder::s_wqm, Definition(tmp), src);
8520      } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) {
8521         unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8;
8522         Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
8523
8524         if (ctx->program->chip_class >= GFX8)
8525            bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
8526         else
8527            bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8528
8529         if (excess_bytes)
8530            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
8531                       bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
8532      } else if (instr->dest.ssa.bit_size == 64) {
8533         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8534         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8535
8536         if (ctx->program->chip_class >= GFX8) {
8537            lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
8538            hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
8539         } else {
8540            lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8541            hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8542         }
8543
8544         bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
8545         emit_split_vector(ctx, tmp, 2);
8546      } else {
8547         isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8548      }
8549
8550      if (tmp.id() != dst.id()) {
8551         if (bool_use_valu)
8552            tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
8553
8554         /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8555         emit_wqm(bld, tmp, dst, true);
8556      }
8557
8558      break;
8559   }
8560   case nir_intrinsic_masked_swizzle_amd: {
8561      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8562      if (!nir_dest_is_divergent(instr->dest)) {
8563         emit_uniform_subgroup(ctx, instr, src);
8564         break;
8565      }
8566      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8567      uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8568
8569      if (instr->dest.ssa.bit_size != 1)
8570         src = as_vgpr(ctx, src);
8571
8572      if (instr->dest.ssa.bit_size == 1) {
8573         assert(src.regClass() == bld.lm);
8574         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8575                            Operand::c32(-1), src);
8576         src = emit_masked_swizzle(ctx, bld, src, mask);
8577         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8578         emit_wqm(bld, tmp, dst);
8579      } else if (dst.regClass() == v1b) {
8580         Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8581         emit_extract_vector(ctx, tmp, 0, dst);
8582      } else if (dst.regClass() == v2b) {
8583         Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8584         emit_extract_vector(ctx, tmp, 0, dst);
8585      } else if (dst.regClass() == v1) {
8586         emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8587      } else if (dst.regClass() == v2) {
8588         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8589         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8590         lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8591         hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8592         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8593         emit_split_vector(ctx, dst, 2);
8594      } else {
8595         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8596      }
8597      break;
8598   }
8599   case nir_intrinsic_write_invocation_amd: {
8600      Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8601      Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8602      Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8603      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8604      if (dst.regClass() == v1) {
8605         /* src2 is ignored for writelane. RA assigns the same reg for dst */
8606         emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8607      } else if (dst.regClass() == v2) {
8608         Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8609         Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8610         bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8611         bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8612         Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
8613         Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8614         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8615         emit_split_vector(ctx, dst, 2);
8616      } else {
8617         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8618      }
8619      break;
8620   }
8621   case nir_intrinsic_mbcnt_amd: {
8622      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8623      Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8624      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8625      /* Fit 64-bit mask for wave32 */
8626      src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8627      Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8628      emit_wqm(bld, wqm_tmp, dst);
8629      break;
8630   }
8631   case nir_intrinsic_byte_permute_amd: {
8632      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8633      assert(dst.regClass() == v1);
8634      assert(ctx->program->chip_class >= GFX8);
8635      bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
8636               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
8637               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8638      break;
8639   }
8640   case nir_intrinsic_lane_permute_16_amd: {
8641      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8642      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8643      assert(ctx->program->chip_class >= GFX10);
8644
8645      if (src.regClass() == s1) {
8646         bld.copy(Definition(dst), src);
8647      } else if (dst.regClass() == v1 && src.regClass() == v1) {
8648         bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8649                  bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8650                  bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8651      } else {
8652         isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8653      }
8654      break;
8655   }
8656   case nir_intrinsic_load_helper_invocation:
8657   case nir_intrinsic_is_helper_invocation: {
8658      /* load_helper() after demote() get lowered to is_helper().
8659       * Otherwise, these two behave the same. */
8660      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8661      bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8662      ctx->block->kind |= block_kind_needs_lowering;
8663      ctx->program->needs_exact = true;
8664      break;
8665   }
8666   case nir_intrinsic_demote:
8667      bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));
8668
8669      if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8670         ctx->cf_info.exec_potentially_empty_discard = true;
8671      ctx->block->kind |= block_kind_uses_demote;
8672      ctx->program->needs_exact = true;
8673      break;
8674   case nir_intrinsic_demote_if: {
8675      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8676      assert(src.regClass() == bld.lm);
8677      Temp cond =
8678         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8679      bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8680
8681      if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8682         ctx->cf_info.exec_potentially_empty_discard = true;
8683      ctx->block->kind |= block_kind_uses_demote;
8684      ctx->program->needs_exact = true;
8685      break;
8686   }
8687   case nir_intrinsic_first_invocation: {
8688      emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8689               get_ssa_temp(ctx, &instr->dest.ssa));
8690      break;
8691   }
8692   case nir_intrinsic_last_invocation: {
8693      Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8694      Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8695                           Operand::c32(ctx->program->wave_size - 1u), flbit);
8696      emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
8697      break;
8698   }
8699   case nir_intrinsic_elect: {
8700      /* p_elect is lowered in aco_insert_exec_mask.
8701       * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8702       * two p_elect with different exec masks as the same.
8703       */
8704      Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
8705      emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
8706      ctx->block->kind |= block_kind_needs_lowering;
8707      break;
8708   }
8709   case nir_intrinsic_shader_clock: {
8710      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8711      if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
8712          ctx->options->chip_class >= GFX10_3) {
8713         /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8714         Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8715         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8716      } else {
8717         aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
8718                                ? aco_opcode::s_memrealtime
8719                                : aco_opcode::s_memtime;
8720         bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8721      }
8722      emit_split_vector(ctx, dst, 2);
8723      break;
8724   }
8725   case nir_intrinsic_load_vertex_id_zero_base: {
8726      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8727      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8728      break;
8729   }
8730   case nir_intrinsic_load_first_vertex: {
8731      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8732      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8733      break;
8734   }
8735   case nir_intrinsic_load_base_instance: {
8736      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8737      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8738      break;
8739   }
8740   case nir_intrinsic_load_instance_id: {
8741      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8742      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8743      break;
8744   }
8745   case nir_intrinsic_load_draw_id: {
8746      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8747      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8748      break;
8749   }
8750   case nir_intrinsic_load_invocation_id: {
8751      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8752
8753      if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8754         if (ctx->options->chip_class >= GFX10)
8755            bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8756                         get_arg(ctx, ctx->args->ac.gs_invocation_id));
8757         else
8758            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8759      } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8760         bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
8761                  Operand::c32(8u), Operand::c32(5u));
8762      } else {
8763         unreachable("Unsupported stage for load_invocation_id");
8764      }
8765
8766      break;
8767   }
8768   case nir_intrinsic_load_primitive_id: {
8769      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8770
8771      switch (ctx->shader->info.stage) {
8772      case MESA_SHADER_GEOMETRY:
8773         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8774         break;
8775      case MESA_SHADER_TESS_CTRL:
8776         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8777         break;
8778      case MESA_SHADER_TESS_EVAL:
8779         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8780         break;
8781      default:
8782         if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
8783            /* In case of NGG, the GS threads always have the primitive ID
8784             * even if there is no SW GS. */
8785            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8786            break;
8787         }
8788         unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8789      }
8790
8791      break;
8792   }
8793   case nir_intrinsic_load_patch_vertices_in: {
8794      assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8795             ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8796
8797      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8798      bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.tess_input_vertices));
8799      break;
8800   }
8801   case nir_intrinsic_emit_vertex_with_counter: {
8802      assert(ctx->stage.hw == HWStage::GS);
8803      visit_emit_vertex_with_counter(ctx, instr);
8804      break;
8805   }
8806   case nir_intrinsic_end_primitive_with_counter: {
8807      if (ctx->stage.hw != HWStage::NGG) {
8808         unsigned stream = nir_intrinsic_stream_id(instr);
8809         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
8810                  sendmsg_gs(true, false, stream));
8811      }
8812      break;
8813   }
8814   case nir_intrinsic_set_vertex_and_primitive_count: {
8815      assert(ctx->stage.hw == HWStage::GS);
8816      /* unused in the legacy pipeline, the HW keeps track of this for us */
8817      break;
8818   }
8819   case nir_intrinsic_load_tess_rel_patch_id_amd: {
8820      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
8821      break;
8822   }
8823   case nir_intrinsic_load_ring_tess_factors_amd: {
8824      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8825               ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
8826      break;
8827   }
8828   case nir_intrinsic_load_ring_tess_factors_offset_amd: {
8829      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8830               get_arg(ctx, ctx->args->ac.tcs_factor_offset));
8831      break;
8832   }
8833   case nir_intrinsic_load_ring_tess_offchip_amd: {
8834      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8835               ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
8836      break;
8837   }
8838   case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
8839      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8840               get_arg(ctx, ctx->args->ac.tess_offchip_offset));
8841      break;
8842   }
8843   case nir_intrinsic_load_ring_esgs_amd: {
8844      unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
8845      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8846               ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
8847      break;
8848   }
8849   case nir_intrinsic_load_ring_es2gs_offset_amd: {
8850      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8851               get_arg(ctx, ctx->args->ac.es2gs_offset));
8852      break;
8853   }
8854   case nir_intrinsic_load_gs_vertex_offset_amd: {
8855      /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */
8856      unsigned b = nir_intrinsic_base(instr);
8857      assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5));
8858      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8859               get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
8860      break;
8861   }
8862   case nir_intrinsic_has_input_vertex_amd:
8863   case nir_intrinsic_has_input_primitive_amd: {
8864      assert(ctx->stage.hw == HWStage::NGG);
8865      unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
8866      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
8867      break;
8868   }
8869   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
8870   case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
8871      assert(ctx->stage.hw == HWStage::NGG);
8872      unsigned pos =
8873         instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
8874      bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8875               bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
8876               Operand::c32(pos | (9u << 16u)));
8877      break;
8878   }
8879   case nir_intrinsic_load_initial_edgeflags_amd: {
8880      assert(ctx->stage.hw == HWStage::NGG);
8881
8882      Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
8883      /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
8884      Temp flags =
8885         bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
8886      /* Move the bits to their desired position: 8->9, 9->19, 10->29. */
8887      flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
8888      /* Remove garbage bits that are a byproduct of the multiplication. */
8889      bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8890               Operand::c32(0x20080200), flags);
8891      break;
8892   }
8893   case nir_intrinsic_load_packed_passthrough_primitive_amd: {
8894      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8895               get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
8896      break;
8897   }
8898   case nir_intrinsic_export_vertex_amd: {
8899      ctx->block->kind |= block_kind_export_end;
8900      create_vs_exports(ctx);
8901      break;
8902   }
8903   case nir_intrinsic_export_primitive_amd: {
8904      assert(ctx->stage.hw == HWStage::NGG);
8905      Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
8906      bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
8907              1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
8908              true /* done */, false /* valid mask */);
8909      break;
8910   }
8911   case nir_intrinsic_alloc_vertices_and_primitives_amd: {
8912      assert(ctx->stage.hw == HWStage::NGG);
8913      Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
8914      Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
8915      ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
8916      break;
8917   }
8918   case nir_intrinsic_gds_atomic_add_amd: {
8919      Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8920      Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8921      Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8922      Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8923      bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8924             true);
8925      break;
8926   }
8927   case nir_intrinsic_load_shader_query_enabled_amd: {
8928      unsigned cmp_bit = 0;
8929      Temp shader_query_enabled =
8930         bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8931                  get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
8932      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8933               bool_to_vector_condition(ctx, shader_query_enabled));
8934      break;
8935   }
8936   case nir_intrinsic_load_cull_front_face_enabled_amd:
8937   case nir_intrinsic_load_cull_back_face_enabled_amd:
8938   case nir_intrinsic_load_cull_ccw_amd:
8939   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
8940      unsigned cmp_bit;
8941      if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
8942         cmp_bit = 0;
8943      else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
8944         cmp_bit = 1;
8945      else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
8946         cmp_bit = 2;
8947      else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
8948         cmp_bit = 3;
8949      else
8950         unreachable("unimplemented culling intrinsic");
8951
8952      Builder::Result enabled =
8953         bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8954                  get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
8955      enabled.instr->definitions[0].setNoCSE(true);
8956      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8957               bool_to_vector_condition(ctx, enabled));
8958      break;
8959   }
8960   case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
8961   case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8962   case nir_intrinsic_load_cull_any_enabled_amd: {
8963      Builder::Result cull_any_enabled =
8964         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8965                  get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu));
8966      cull_any_enabled.instr->definitions[1].setNoCSE(true);
8967      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8968               bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
8969      break;
8970   }
8971   case nir_intrinsic_load_cull_small_prim_precision_amd: {
8972      /* Exponent is 8-bit signed int, move that into a signed 32-bit int. */
8973      Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
8974                               get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
8975      /* small_prim_precision = 1.0 * 2^X */
8976      bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8977               Operand::c32(0x3f800000u), Operand(exponent));
8978      break;
8979   }
8980   case nir_intrinsic_load_viewport_x_scale: {
8981      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8982               get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
8983      break;
8984   }
8985   case nir_intrinsic_load_viewport_y_scale: {
8986      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8987               get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
8988      break;
8989   }
8990   case nir_intrinsic_load_viewport_x_offset: {
8991      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8992               get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
8993      break;
8994   }
8995   case nir_intrinsic_load_viewport_y_offset: {
8996      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8997               get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
8998      break;
8999   }
9000   case nir_intrinsic_overwrite_vs_arguments_amd: {
9001      ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9002      ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9003      break;
9004   }
9005   case nir_intrinsic_overwrite_tes_arguments_amd: {
9006      ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9007      ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9008      ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9009         get_ssa_temp(ctx, instr->src[2].ssa);
9010      ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9011      break;
9012   }
9013   default:
9014      isel_err(&instr->instr, "Unimplemented intrinsic instr");
9015      abort();
9016
9017      break;
9018   }
9019}
9020
9021void
9022tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
9023               enum glsl_base_type* stype)
9024{
9025   nir_deref_instr* texture_deref_instr = NULL;
9026   nir_deref_instr* sampler_deref_instr = NULL;
9027   int plane = -1;
9028
9029   for (unsigned i = 0; i < instr->num_srcs; i++) {
9030      switch (instr->src[i].src_type) {
9031      case nir_tex_src_texture_deref:
9032         texture_deref_instr = nir_src_as_deref(instr->src[i].src);
9033         break;
9034      case nir_tex_src_sampler_deref:
9035         sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
9036         break;
9037      case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
9038      default: break;
9039      }
9040   }
9041
9042   *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
9043
9044   if (!sampler_deref_instr)
9045      sampler_deref_instr = texture_deref_instr;
9046
9047   if (plane >= 0) {
9048      assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
9049      *res_ptr = get_sampler_desc(ctx, texture_deref_instr,
9050                                  (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
9051   } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9052      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
9053   } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9054      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9055   } else {
9056      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
9057   }
9058   if (samp_ptr) {
9059      *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);
9060
9061      if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
9062         /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
9063         Builder bld(ctx->program, ctx->block);
9064
9065         /* to avoid unnecessary moves, we split and recombine sampler and image */
9066         Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
9067                        bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9068         Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9069         bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
9070                    Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
9071                    Definition(img[6]), Definition(img[7]), *res_ptr);
9072         bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
9073                    Definition(samp[2]), Definition(samp[3]), *samp_ptr);
9074
9075         samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
9076         *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
9077                               img[3], img[4], img[5], img[6], img[7]);
9078         *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
9079                                samp[3]);
9080      }
9081   }
9082}
9083
9084void
9085build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9086                  Temp* out_tc)
9087{
9088   Builder bld(ctx->program, ctx->block);
9089
9090   Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9091   Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9092   Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9093
9094   Operand neg_one = Operand::c32(0xbf800000u);
9095   Operand one = Operand::c32(0x3f800000u);
9096   Operand two = Operand::c32(0x40000000u);
9097   Operand four = Operand::c32(0x40800000u);
9098
9099   Temp is_ma_positive =
9100      bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
9101   Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
9102   Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);
9103
9104   Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
9105   Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
9106   is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
9107   Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
9108                               bld.def(s1, scc), is_ma_z, is_ma_y);
9109
9110   /* select sc */
9111   Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
9112   Temp sgn = bld.vop2_e64(
9113      aco_opcode::v_cndmask_b32, bld.def(v1),
9114      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
9115   *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9116
9117   /* select tc */
9118   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
9119   sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
9120   *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9121
9122   /* select ma */
9123   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9124                  bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
9125                  deriv_z, is_ma_z);
9126   tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
9127   *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
9128}
9129
9130void
9131prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9132                    bool is_deriv, bool is_array)
9133{
9134   Builder bld(ctx->program, ctx->block);
9135   Temp ma, tc, sc, id;
9136   aco_opcode madak =
9137      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9138   aco_opcode madmk =
9139      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9140
9141   if (is_array) {
9142      coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
9143
9144      /* see comment in ac_prepare_cube_coords() */
9145      if (ctx->options->chip_class <= GFX8)
9146         coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
9147   }
9148
9149   ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9150
9151   aco_ptr<VOP3_instruction> vop3a{
9152      create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
9153   vop3a->operands[0] = Operand(ma);
9154   vop3a->abs[0] = true;
9155   Temp invma = bld.tmp(v1);
9156   vop3a->definitions[0] = Definition(invma);
9157   ctx->block->instructions.emplace_back(std::move(vop3a));
9158
9159   sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9160   if (!is_deriv)
9161      sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9162
9163   tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9164   if (!is_deriv)
9165      tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9166
9167   id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9168
9169   if (is_deriv) {
9170      sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
9171      tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
9172
9173      for (unsigned i = 0; i < 2; i++) {
9174         /* see comment in ac_prepare_cube_coords() */
9175         Temp deriv_ma;
9176         Temp deriv_sc, deriv_tc;
9177         build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9178
9179         deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
9180
9181         Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9182                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
9183                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
9184         Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9185                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
9186                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
9187         *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
9188      }
9189
9190      sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
9191      tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
9192   }
9193
9194   if (is_array)
9195      id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
9196   coords.resize(3);
9197   coords[0] = sc;
9198   coords[1] = tc;
9199   coords[2] = id;
9200}
9201
9202void
9203get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
9204{
9205   if (vec->parent_instr->type != nir_instr_type_alu)
9206      return;
9207   nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9208   if (vec_instr->op != nir_op_vec(vec->num_components))
9209      return;
9210
9211   for (unsigned i = 0; i < vec->num_components; i++) {
9212      cv[i] =
9213         vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9214   }
9215}
9216
9217void
9218visit_tex(isel_context* ctx, nir_tex_instr* instr)
9219{
9220   assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
9221
9222   Builder bld(ctx->program, ctx->block);
9223   bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9224        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9225        has_sample_index = false, has_clamped_lod = false;
9226   Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9227                           offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp();
9228   std::vector<Temp> coords;
9229   std::vector<Temp> derivs;
9230   nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9231   enum glsl_base_type stype;
9232   tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype);
9233
9234   bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
9235                                  (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
9236   bool tg4_integer_cube_workaround =
9237      tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9238
9239   for (unsigned i = 0; i < instr->num_srcs; i++) {
9240      switch (instr->src[i].src_type) {
9241      case nir_tex_src_coord: {
9242         Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9243         for (unsigned j = 0; j < coord.size(); j++)
9244            coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
9245         break;
9246      }
9247      case nir_tex_src_bias:
9248         bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9249         has_bias = true;
9250         break;
9251      case nir_tex_src_lod: {
9252         if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9253            level_zero = true;
9254         } else {
9255            lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9256            has_lod = true;
9257         }
9258         break;
9259      }
9260      case nir_tex_src_min_lod:
9261         clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9262         has_clamped_lod = true;
9263         break;
9264      case nir_tex_src_comparator:
9265         if (instr->is_shadow) {
9266            compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9267            has_compare = true;
9268         }
9269         break;
9270      case nir_tex_src_offset:
9271         offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9272         get_const_vec(instr->src[i].src.ssa, const_offset);
9273         has_offset = true;
9274         break;
9275      case nir_tex_src_ddx:
9276         ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
9277         has_ddx = true;
9278         break;
9279      case nir_tex_src_ddy:
9280         ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
9281         has_ddy = true;
9282         break;
9283      case nir_tex_src_ms_index:
9284         sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
9285         has_sample_index = true;
9286         break;
9287      case nir_tex_src_texture_offset:
9288      case nir_tex_src_sampler_offset:
9289      default: break;
9290      }
9291   }
9292
9293   if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9294      return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9295
9296   if (instr->op == nir_texop_texture_samples) {
9297      get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9298      return;
9299   }
9300
9301   if (has_offset && instr->op != nir_texop_txf) {
9302      aco_ptr<Instruction> tmp_instr;
9303      Temp acc, pack = Temp();
9304
9305      uint32_t pack_const = 0;
9306      for (unsigned i = 0; i < offset.size(); i++) {
9307         if (!const_offset[i])
9308            continue;
9309         pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9310      }
9311
9312      if (offset.type() == RegType::sgpr) {
9313         for (unsigned i = 0; i < offset.size(); i++) {
9314            if (const_offset[i])
9315               continue;
9316
9317            acc = emit_extract_vector(ctx, offset, i, s1);
9318            acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9319                           Operand::c32(0x3Fu));
9320
9321            if (i) {
9322               acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9323                              Operand::c32(8u * i));
9324            }
9325
9326            if (pack == Temp()) {
9327               pack = acc;
9328            } else {
9329               pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9330            }
9331         }
9332
9333         if (pack_const && pack != Temp())
9334            pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9335                            Operand::c32(pack_const), pack);
9336      } else {
9337         for (unsigned i = 0; i < offset.size(); i++) {
9338            if (const_offset[i])
9339               continue;
9340
9341            acc = emit_extract_vector(ctx, offset, i, v1);
9342            acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9343
9344            if (i) {
9345               acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9346            }
9347
9348            if (pack == Temp()) {
9349               pack = acc;
9350            } else {
9351               pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9352            }
9353         }
9354
9355         if (pack_const && pack != Temp())
9356            pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9357      }
9358      if (pack_const && pack == Temp())
9359         offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9360      else if (pack == Temp())
9361         has_offset = false;
9362      else
9363         offset = pack;
9364   }
9365
9366   if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
9367      prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9368                          instr->is_array && instr->op != nir_texop_lod);
9369
9370   /* pack derivatives */
9371   if (has_ddx || has_ddy) {
9372      if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
9373         assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
9374         Temp zero = bld.copy(bld.def(v1), Operand::zero());
9375         derivs = {ddx, zero, ddy, zero};
9376      } else {
9377         for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
9378            derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
9379         for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
9380            derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
9381      }
9382      has_derivs = true;
9383   }
9384
9385   if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9386       instr->is_array && instr->op != nir_texop_txf)
9387      coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
9388
9389   if (instr->coord_components > 2 &&
9390       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9391        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
9392        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9393       instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd &&
9394       instr->op != nir_texop_fragment_mask_fetch_amd)
9395      coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
9396
9397   if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9398       instr->op != nir_texop_lod && instr->coord_components) {
9399      assert(coords.size() > 0 && coords.size() < 3);
9400
9401      coords.insert(std::next(coords.begin()),
9402                    bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
9403                                                                     : Operand::c32(0x3f000000)));
9404   }
9405
9406   bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9407
9408   if (has_offset && instr->op == nir_texop_txf) {
9409      for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
9410         Temp off = emit_extract_vector(ctx, offset, i, v1);
9411         coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
9412      }
9413      has_offset = false;
9414   }
9415
9416   /* Build tex instruction */
9417   unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
9418   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9419      dmask = u_bit_consecutive(0, util_last_bit(dmask));
9420   if (instr->is_sparse)
9421      dmask = MAX2(dmask, 1) | 0x10;
9422   unsigned dim =
9423      ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9424         ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
9425         : 0;
9426   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9427   Temp tmp_dst = dst;
9428
9429   /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9430   if (instr->op == nir_texop_tg4) {
9431      assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
9432      if (instr->is_shadow)
9433         dmask = 1;
9434      else
9435         dmask = 1 << instr->component;
9436      if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9437         tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
9438   } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9439      tmp_dst = bld.tmp(v1);
9440   } else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
9441              dst.type() == RegType::sgpr) {
9442      tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
9443   }
9444
9445   if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
9446      if (!has_lod)
9447         lod = bld.copy(bld.def(v1), Operand::zero());
9448
9449      MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
9450                                        resource, Operand(s4), std::vector<Temp>{lod});
9451      if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
9452          instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
9453         tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
9454      } else if (instr->op == nir_texop_query_levels) {
9455         tex->dmask = 1 << 3;
9456      } else {
9457         tex->dmask = dmask;
9458      }
9459      tex->da = da;
9460      tex->dim = dim;
9461
9462      expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9463      return;
9464   }
9465
9466   Temp tg4_compare_cube_wa64 = Temp();
9467
9468   if (tg4_integer_workarounds) {
9469      Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9470      Temp size = bld.tmp(v2);
9471      MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
9472                                        resource, Operand(s4), std::vector<Temp>{tg4_lod});
9473      tex->dim = dim;
9474      tex->dmask = 0x3;
9475      tex->da = da;
9476      emit_split_vector(ctx, size, size.size());
9477
9478      Temp half_texel[2];
9479      for (unsigned i = 0; i < 2; i++) {
9480         half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9481         half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9482         half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9483         half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9484                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9485      }
9486
9487      if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9488         /* In vulkan, whether the sampler uses unnormalized
9489          * coordinates or not is a dynamic property of the
9490          * sampler. Hence, to figure out whether or not we
9491          * need to divide by the texture size, we need to test
9492          * the sampler at runtime. This tests the bit set by
9493          * radv_init_sampler().
9494          */
9495         unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9496         Temp not_needed =
9497            bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9498
9499         not_needed = bool_to_vector_condition(ctx, not_needed);
9500         half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9501                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9502         half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9503                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9504      }
9505
9506      Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9507                            bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9508
9509      if (tg4_integer_cube_workaround) {
9510         /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9511         Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9512         aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9513            aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9514         split->operands[0] = Operand(resource);
9515         for (unsigned i = 0; i < resource.size(); i++) {
9516            desc[i] = bld.tmp(s1);
9517            split->definitions[i] = Definition(desc[i]);
9518         }
9519         ctx->block->instructions.emplace_back(std::move(split));
9520
9521         Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9522                              Operand::c32(20u | (6u << 16)));
9523         Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9524                                         Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9525
9526         Temp nfmt;
9527         if (stype == GLSL_TYPE_UINT) {
9528            nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9529                            Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9530                            Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9531         } else {
9532            nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9533                            Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9534                            Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9535         }
9536         tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9537         bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9538
9539         nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9540                         Operand::c32(26u));
9541
9542         desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9543                            Operand::c32(C_008F14_NUM_FORMAT));
9544         desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9545
9546         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9547            aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9548         for (unsigned i = 0; i < resource.size(); i++)
9549            vec->operands[i] = Operand(desc[i]);
9550         resource = bld.tmp(resource.regClass());
9551         vec->definitions[0] = Definition(resource);
9552         ctx->block->instructions.emplace_back(std::move(vec));
9553
9554         new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9555                                  tg4_compare_cube_wa64);
9556         new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9557                                  tg4_compare_cube_wa64);
9558      }
9559      coords[0] = new_coords[0];
9560      coords[1] = new_coords[1];
9561   }
9562
9563   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9564      // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9565      // ac_build_buffer_load_format_gfx9_safe()
9566
9567      assert(coords.size() == 1);
9568      aco_opcode op;
9569      switch (util_last_bit(dmask & 0xf)) {
9570      case 1: op = aco_opcode::buffer_load_format_x; break;
9571      case 2: op = aco_opcode::buffer_load_format_xy; break;
9572      case 3: op = aco_opcode::buffer_load_format_xyz; break;
9573      case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9574      default: unreachable("Tex instruction loads more than 4 components.");
9575      }
9576
9577      aco_ptr<MUBUF_instruction> mubuf{
9578         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9579      mubuf->operands[0] = Operand(resource);
9580      mubuf->operands[1] = Operand(coords[0]);
9581      mubuf->operands[2] = Operand::c32(0);
9582      mubuf->definitions[0] = Definition(tmp_dst);
9583      mubuf->idxen = true;
9584      mubuf->tfe = instr->is_sparse;
9585      if (mubuf->tfe)
9586         mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9587      ctx->block->instructions.emplace_back(std::move(mubuf));
9588
9589      expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9590      return;
9591   }
9592
9593   /* gather MIMG address components */
9594   std::vector<Temp> args;
9595   unsigned wqm_mask = 0;
9596   if (has_offset) {
9597      wqm_mask |= u_bit_consecutive(args.size(), 1);
9598      args.emplace_back(offset);
9599   }
9600   if (has_bias)
9601      args.emplace_back(bias);
9602   if (has_compare)
9603      args.emplace_back(compare);
9604   if (has_derivs)
9605      args.insert(args.end(), derivs.begin(), derivs.end());
9606
9607   wqm_mask |= u_bit_consecutive(args.size(), coords.size());
9608   args.insert(args.end(), coords.begin(), coords.end());
9609
9610   if (has_sample_index)
9611      args.emplace_back(sample_index);
9612   if (has_lod)
9613      args.emplace_back(lod);
9614   if (has_clamped_lod)
9615      args.emplace_back(clamped_lod);
9616
9617   if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9618       instr->op == nir_texop_fragment_mask_fetch_amd) {
9619      aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9620                            instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9621                         ? aco_opcode::image_load
9622                         : aco_opcode::image_load_mip;
9623      Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9624      MIMG_instruction* tex =
9625         emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
9626      if (instr->op == nir_texop_fragment_mask_fetch_amd)
9627         tex->dim = da ? ac_image_2darray : ac_image_2d;
9628      else
9629         tex->dim = dim;
9630      tex->dmask = dmask & 0xf;
9631      tex->unrm = true;
9632      tex->da = da;
9633      tex->tfe = instr->is_sparse;
9634
9635      if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9636         /* Use 0x76543210 if the image doesn't have FMASK. */
9637         assert(dmask == 1 && dst.bytes() == 4);
9638         assert(dst.id() != tmp_dst.id());
9639
9640         if (dst.regClass() == s1) {
9641            Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9642                                        emit_extract_vector(ctx, resource, 1, s1));
9643            bld.sop2(aco_opcode::s_cselect_b32, Definition(dst),
9644                     bld.as_uniform(tmp_dst), Operand::c32(0x76543210),
9645                     bld.scc(is_not_null));
9646         } else {
9647            Temp is_not_null = bld.tmp(bld.lm);
9648            bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9649                         emit_extract_vector(ctx, resource, 1, s1))
9650               .def(0)
9651               .setHint(vcc);
9652            bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9653                     bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9654         }
9655      } else {
9656         expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9657      }
9658      return;
9659   }
9660
9661   // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9662   aco_opcode opcode = aco_opcode::image_sample;
9663   if (has_offset) { /* image_sample_*_o */
9664      if (has_clamped_lod) {
9665         if (has_compare) {
9666            opcode = aco_opcode::image_sample_c_cl_o;
9667            if (has_derivs)
9668               opcode = aco_opcode::image_sample_c_d_cl_o;
9669            if (has_bias)
9670               opcode = aco_opcode::image_sample_c_b_cl_o;
9671         } else {
9672            opcode = aco_opcode::image_sample_cl_o;
9673            if (has_derivs)
9674               opcode = aco_opcode::image_sample_d_cl_o;
9675            if (has_bias)
9676               opcode = aco_opcode::image_sample_b_cl_o;
9677         }
9678      } else if (has_compare) {
9679         opcode = aco_opcode::image_sample_c_o;
9680         if (has_derivs)
9681            opcode = aco_opcode::image_sample_c_d_o;
9682         if (has_bias)
9683            opcode = aco_opcode::image_sample_c_b_o;
9684         if (level_zero)
9685            opcode = aco_opcode::image_sample_c_lz_o;
9686         if (has_lod)
9687            opcode = aco_opcode::image_sample_c_l_o;
9688      } else {
9689         opcode = aco_opcode::image_sample_o;
9690         if (has_derivs)
9691            opcode = aco_opcode::image_sample_d_o;
9692         if (has_bias)
9693            opcode = aco_opcode::image_sample_b_o;
9694         if (level_zero)
9695            opcode = aco_opcode::image_sample_lz_o;
9696         if (has_lod)
9697            opcode = aco_opcode::image_sample_l_o;
9698      }
9699   } else if (has_clamped_lod) { /* image_sample_*_cl */
9700      if (has_compare) {
9701         opcode = aco_opcode::image_sample_c_cl;
9702         if (has_derivs)
9703            opcode = aco_opcode::image_sample_c_d_cl;
9704         if (has_bias)
9705            opcode = aco_opcode::image_sample_c_b_cl;
9706      } else {
9707         opcode = aco_opcode::image_sample_cl;
9708         if (has_derivs)
9709            opcode = aco_opcode::image_sample_d_cl;
9710         if (has_bias)
9711            opcode = aco_opcode::image_sample_b_cl;
9712      }
9713   } else { /* no offset */
9714      if (has_compare) {
9715         opcode = aco_opcode::image_sample_c;
9716         if (has_derivs)
9717            opcode = aco_opcode::image_sample_c_d;
9718         if (has_bias)
9719            opcode = aco_opcode::image_sample_c_b;
9720         if (level_zero)
9721            opcode = aco_opcode::image_sample_c_lz;
9722         if (has_lod)
9723            opcode = aco_opcode::image_sample_c_l;
9724      } else {
9725         opcode = aco_opcode::image_sample;
9726         if (has_derivs)
9727            opcode = aco_opcode::image_sample_d;
9728         if (has_bias)
9729            opcode = aco_opcode::image_sample_b;
9730         if (level_zero)
9731            opcode = aco_opcode::image_sample_lz;
9732         if (has_lod)
9733            opcode = aco_opcode::image_sample_l;
9734      }
9735   }
9736
9737   if (instr->op == nir_texop_tg4) {
9738      if (has_offset) { /* image_gather4_*_o */
9739         if (has_compare) {
9740            opcode = aco_opcode::image_gather4_c_lz_o;
9741            if (has_lod)
9742               opcode = aco_opcode::image_gather4_c_l_o;
9743            if (has_bias)
9744               opcode = aco_opcode::image_gather4_c_b_o;
9745         } else {
9746            opcode = aco_opcode::image_gather4_lz_o;
9747            if (has_lod)
9748               opcode = aco_opcode::image_gather4_l_o;
9749            if (has_bias)
9750               opcode = aco_opcode::image_gather4_b_o;
9751         }
9752      } else {
9753         if (has_compare) {
9754            opcode = aco_opcode::image_gather4_c_lz;
9755            if (has_lod)
9756               opcode = aco_opcode::image_gather4_c_l;
9757            if (has_bias)
9758               opcode = aco_opcode::image_gather4_c_b;
9759         } else {
9760            opcode = aco_opcode::image_gather4_lz;
9761            if (has_lod)
9762               opcode = aco_opcode::image_gather4_l;
9763            if (has_bias)
9764               opcode = aco_opcode::image_gather4_b;
9765         }
9766      }
9767   } else if (instr->op == nir_texop_lod) {
9768      opcode = aco_opcode::image_get_lod;
9769   }
9770
9771   bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9772                          !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9773                          instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9774
9775   Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9776   MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
9777                                     args, implicit_derivs ? wqm_mask : 0, vdata);
9778   tex->dim = dim;
9779   tex->dmask = dmask & 0xf;
9780   tex->da = da;
9781   tex->tfe = instr->is_sparse;
9782
9783   if (tg4_integer_cube_workaround) {
9784      assert(tmp_dst.id() != dst.id());
9785      assert(tmp_dst.size() == dst.size());
9786
9787      emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9788      Temp val[4];
9789      for (unsigned i = 0; i < 4; i++) {
9790         val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9791         Temp cvt_val;
9792         if (stype == GLSL_TYPE_UINT)
9793            cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9794         else
9795            cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9796         val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9797                           tg4_compare_cube_wa64);
9798      }
9799
9800      Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9801      if (instr->is_sparse)
9802         tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9803                              val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9804      else
9805         tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9806                              val[3]);
9807   }
9808   unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9809   expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9810}
9811
9812Operand
9813get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
9814{
9815   Temp tmp = get_ssa_temp(ctx, ssa);
9816   if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9817      return Operand(rc);
9818   } else if (logical && ssa->bit_size == 1 &&
9819              ssa->parent_instr->type == nir_instr_type_load_const) {
9820      if (ctx->program->wave_size == 64)
9821         return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
9822                                                                                    : 0u);
9823      else
9824         return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
9825                                                                                    : 0u);
9826   } else {
9827      return Operand(tmp);
9828   }
9829}
9830
9831void
9832visit_phi(isel_context* ctx, nir_phi_instr* instr)
9833{
9834   aco_ptr<Pseudo_instruction> phi;
9835   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9836   assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9837
9838   bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9839   logical |= (ctx->block->kind & block_kind_merge) != 0;
9840   aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9841
9842   /* we want a sorted list of sources, since the predecessor list is also sorted */
9843   std::map<unsigned, nir_ssa_def*> phi_src;
9844   nir_foreach_phi_src (src, instr)
9845      phi_src[src->pred->index] = src->src.ssa;
9846
9847   std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9848   unsigned num_operands = 0;
9849   Operand* const operands = (Operand*)alloca(
9850      (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
9851   unsigned num_defined = 0;
9852   unsigned cur_pred_idx = 0;
9853   for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
9854      if (cur_pred_idx < preds.size()) {
9855         /* handle missing preds (IF merges with discard/break) and extra preds
9856          * (loop exit with discard) */
9857         unsigned block = ctx->cf_info.nir_to_aco[src.first];
9858         unsigned skipped = 0;
9859         while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9860            skipped++;
9861         if (cur_pred_idx + skipped < preds.size()) {
9862            for (unsigned i = 0; i < skipped; i++)
9863               operands[num_operands++] = Operand(dst.regClass());
9864            cur_pred_idx += skipped;
9865         } else {
9866            continue;
9867         }
9868      }
9869      /* Handle missing predecessors at the end. This shouldn't happen with loop
9870       * headers and we can't ignore these sources for loop header phis. */
9871      if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9872         continue;
9873      cur_pred_idx++;
9874      Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9875      operands[num_operands++] = op;
9876      num_defined += !op.isUndefined();
9877   }
9878   /* handle block_kind_continue_or_break at loop exit blocks */
9879   while (cur_pred_idx++ < preds.size())
9880      operands[num_operands++] = Operand(dst.regClass());
9881
9882   /* If the loop ends with a break, still add a linear continue edge in case
9883    * that break is divergent or continue_or_break is used. We'll either remove
9884    * this operand later in visit_loop() if it's not necessary or replace the
9885    * undef with something correct. */
9886   if (!logical && ctx->block->kind & block_kind_loop_header) {
9887      nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9888      nir_block* last = nir_loop_last_block(loop);
9889      if (last->successors[0] != instr->instr.block)
9890         operands[num_operands++] = Operand(RegClass());
9891   }
9892
9893   /* we can use a linear phi in some cases if one src is undef */
9894   if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9895      phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9896                                                       num_operands, 1));
9897
9898      Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9899      Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9900      assert(invert->kind & block_kind_invert);
9901
9902      unsigned then_block = invert->linear_preds[0];
9903
9904      Block* insert_block = NULL;
9905      for (unsigned i = 0; i < num_operands; i++) {
9906         Operand op = operands[i];
9907         if (op.isUndefined())
9908            continue;
9909         insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9910         phi->operands[0] = op;
9911         break;
9912      }
9913      assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9914      phi->operands[1] = Operand(dst.regClass());
9915      phi->definitions[0] = Definition(dst);
9916      insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9917      return;
9918   }
9919
9920   phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9921   for (unsigned i = 0; i < num_operands; i++)
9922      phi->operands[i] = operands[i];
9923   phi->definitions[0] = Definition(dst);
9924   ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9925}
9926
9927void
9928visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
9929{
9930   Temp dst = get_ssa_temp(ctx, &instr->def);
9931
9932   assert(dst.type() == RegType::sgpr);
9933
9934   if (dst.size() == 1) {
9935      Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9936   } else {
9937      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9938         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9939      for (unsigned i = 0; i < dst.size(); i++)
9940         vec->operands[i] = Operand::zero();
9941      vec->definitions[0] = Definition(dst);
9942      ctx->block->instructions.emplace_back(std::move(vec));
9943   }
9944}
9945
9946void
9947begin_loop(isel_context* ctx, loop_context* lc)
9948{
9949   // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9950   append_logical_end(ctx->block);
9951   ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9952   Builder bld(ctx->program, ctx->block);
9953   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
9954   unsigned loop_preheader_idx = ctx->block->index;
9955
9956   lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9957
9958   ctx->program->next_loop_depth++;
9959
9960   Block* loop_header = ctx->program->create_and_insert_block();
9961   loop_header->kind |= block_kind_loop_header;
9962   add_edge(loop_preheader_idx, loop_header);
9963   ctx->block = loop_header;
9964
9965   append_logical_start(ctx->block);
9966
9967   lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9968   lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9969   lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9970   lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9971   lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9972}
9973
9974void
9975end_loop(isel_context* ctx, loop_context* lc)
9976{
9977   // TODO: what if a loop ends with a unconditional or uniformly branched continue
9978   //       and this branch is never taken?
9979   if (!ctx->cf_info.has_branch) {
9980      unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9981      Builder bld(ctx->program, ctx->block);
9982      append_logical_end(ctx->block);
9983
9984      if (ctx->cf_info.exec_potentially_empty_discard ||
9985          ctx->cf_info.exec_potentially_empty_break) {
9986         /* Discards can result in code running with an empty exec mask.
9987          * This would result in divergent breaks not ever being taken. As a
9988          * workaround, break the loop when the loop mask is empty instead of
9989          * always continuing. */
9990         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9991         unsigned block_idx = ctx->block->index;
9992
9993         /* create helper blocks to avoid critical edges */
9994         Block* break_block = ctx->program->create_and_insert_block();
9995         break_block->kind = block_kind_uniform;
9996         bld.reset(break_block);
9997         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
9998         add_linear_edge(block_idx, break_block);
9999         add_linear_edge(break_block->index, &lc->loop_exit);
10000
10001         Block* continue_block = ctx->program->create_and_insert_block();
10002         continue_block->kind = block_kind_uniform;
10003         bld.reset(continue_block);
10004         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10005         add_linear_edge(block_idx, continue_block);
10006         add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10007
10008         if (!ctx->cf_info.parent_loop.has_divergent_branch)
10009            add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10010         ctx->block = &ctx->program->blocks[block_idx];
10011      } else {
10012         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10013         if (!ctx->cf_info.parent_loop.has_divergent_branch)
10014            add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10015         else
10016            add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10017      }
10018
10019      bld.reset(ctx->block);
10020      bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10021   }
10022
10023   ctx->cf_info.has_branch = false;
10024   ctx->program->next_loop_depth--;
10025
10026   // TODO: if the loop has not a single exit, we must add one °°
10027   /* emit loop successor block */
10028   ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10029   append_logical_start(ctx->block);
10030
10031#if 0
10032   // TODO: check if it is beneficial to not branch on continues
10033   /* trim linear phis in loop header */
10034   for (auto&& instr : loop_entry->instructions) {
10035      if (instr->opcode == aco_opcode::p_linear_phi) {
10036         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10037         new_phi->definitions[0] = instr->definitions[0];
10038         for (unsigned i = 0; i < new_phi->operands.size(); i++)
10039            new_phi->operands[i] = instr->operands[i];
10040         /* check that the remaining operands are all the same */
10041         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10042            assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10043         instr.swap(new_phi);
10044      } else if (instr->opcode == aco_opcode::p_phi) {
10045         continue;
10046      } else {
10047         break;
10048      }
10049   }
10050#endif
10051
10052   ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10053   ctx->cf_info.parent_loop.exit = lc->exit_old;
10054   ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10055   ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10056   ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10057   if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10058      ctx->cf_info.exec_potentially_empty_discard = false;
10059}
10060
10061void
10062emit_loop_jump(isel_context* ctx, bool is_break)
10063{
10064   Builder bld(ctx->program, ctx->block);
10065   Block* logical_target;
10066   append_logical_end(ctx->block);
10067   unsigned idx = ctx->block->index;
10068
10069   if (is_break) {
10070      logical_target = ctx->cf_info.parent_loop.exit;
10071      add_logical_edge(idx, logical_target);
10072      ctx->block->kind |= block_kind_break;
10073
10074      if (!ctx->cf_info.parent_if.is_divergent &&
10075          !ctx->cf_info.parent_loop.has_divergent_continue) {
10076         /* uniform break - directly jump out of the loop */
10077         ctx->block->kind |= block_kind_uniform;
10078         ctx->cf_info.has_branch = true;
10079         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10080         add_linear_edge(idx, logical_target);
10081         return;
10082      }
10083      ctx->cf_info.parent_loop.has_divergent_branch = true;
10084   } else {
10085      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10086      add_logical_edge(idx, logical_target);
10087      ctx->block->kind |= block_kind_continue;
10088
10089      if (!ctx->cf_info.parent_if.is_divergent) {
10090         /* uniform continue - directly jump to the loop header */
10091         ctx->block->kind |= block_kind_uniform;
10092         ctx->cf_info.has_branch = true;
10093         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10094         add_linear_edge(idx, logical_target);
10095         return;
10096      }
10097
10098      /* for potential uniform breaks after this continue,
10099         we must ensure that they are handled correctly */
10100      ctx->cf_info.parent_loop.has_divergent_continue = true;
10101      ctx->cf_info.parent_loop.has_divergent_branch = true;
10102   }
10103
10104   if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10105      ctx->cf_info.exec_potentially_empty_break = true;
10106      ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10107   }
10108
10109   /* remove critical edges from linear CFG */
10110   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10111   Block* break_block = ctx->program->create_and_insert_block();
10112   break_block->kind |= block_kind_uniform;
10113   add_linear_edge(idx, break_block);
10114   /* the loop_header pointer might be invalidated by this point */
10115   if (!is_break)
10116      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10117   add_linear_edge(break_block->index, logical_target);
10118   bld.reset(break_block);
10119   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10120
10121   Block* continue_block = ctx->program->create_and_insert_block();
10122   add_linear_edge(idx, continue_block);
10123   append_logical_start(continue_block);
10124   ctx->block = continue_block;
10125}
10126
10127void
10128emit_loop_break(isel_context* ctx)
10129{
10130   emit_loop_jump(ctx, true);
10131}
10132
10133void
10134emit_loop_continue(isel_context* ctx)
10135{
10136   emit_loop_jump(ctx, false);
10137}
10138
10139void
10140visit_jump(isel_context* ctx, nir_jump_instr* instr)
10141{
10142   /* visit_block() would usually do this but divergent jumps updates ctx->block */
10143   ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10144
10145   switch (instr->type) {
10146   case nir_jump_break: emit_loop_break(ctx); break;
10147   case nir_jump_continue: emit_loop_continue(ctx); break;
10148   default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10149   }
10150}
10151
10152void
10153visit_block(isel_context* ctx, nir_block* block)
10154{
10155   nir_foreach_instr (instr, block) {
10156      switch (instr->type) {
10157      case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10158      case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10159      case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10160      case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10161      case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10162      case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10163      case nir_instr_type_deref: break;
10164      case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10165      default: isel_err(instr, "Unknown NIR instr type");
10166      }
10167   }
10168
10169   if (!ctx->cf_info.parent_loop.has_divergent_branch)
10170      ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10171}
10172
10173static Operand
10174create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10175                     aco_ptr<Instruction>& header_phi, Operand* vals)
10176{
10177   vals[0] = Operand(header_phi->definitions[0].getTemp());
10178   RegClass rc = vals[0].regClass();
10179
10180   unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10181
10182   unsigned next_pred = 1;
10183
10184   for (unsigned idx = first + 1; idx <= last; idx++) {
10185      Block& block = ctx->program->blocks[idx];
10186      if (block.loop_nest_depth != loop_nest_depth) {
10187         vals[idx - first] = vals[idx - 1 - first];
10188         continue;
10189      }
10190
10191      if ((block.kind & block_kind_continue) && block.index != last) {
10192         vals[idx - first] = header_phi->operands[next_pred];
10193         next_pred++;
10194         continue;
10195      }
10196
10197      bool all_same = true;
10198      for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10199         all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10200
10201      Operand val;
10202      if (all_same) {
10203         val = vals[block.linear_preds[0] - first];
10204      } else {
10205         aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10206            aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10207         for (unsigned i = 0; i < block.linear_preds.size(); i++)
10208            phi->operands[i] = vals[block.linear_preds[i] - first];
10209         val = Operand(ctx->program->allocateTmp(rc));
10210         phi->definitions[0] = Definition(val.getTemp());
10211         block.instructions.emplace(block.instructions.begin(), std::move(phi));
10212      }
10213      vals[idx - first] = val;
10214   }
10215
10216   return vals[last - first];
10217}
10218
10219static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10220static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10221static void end_uniform_if(isel_context* ctx, if_context* ic);
10222
10223static void
10224visit_loop(isel_context* ctx, nir_loop* loop)
10225{
10226   loop_context lc;
10227   begin_loop(ctx, &lc);
10228
10229   /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10230    * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10231    */
10232   if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10233      Builder bld(ctx->program, ctx->block);
10234      Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10235      if_context ic;
10236      begin_uniform_if_then(ctx, &ic, cond);
10237      emit_loop_break(ctx);
10238      begin_uniform_if_else(ctx, &ic);
10239      end_uniform_if(ctx, &ic);
10240   }
10241
10242   bool unreachable = visit_cf_list(ctx, &loop->body);
10243
10244   unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10245
10246   /* Fixup phis in loop header from unreachable blocks.
10247    * has_branch/has_divergent_branch also indicates if the loop ends with a
10248    * break/continue instruction, but we don't emit those if unreachable=true */
10249   if (unreachable) {
10250      assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10251      bool linear = ctx->cf_info.has_branch;
10252      bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10253      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10254         if ((logical && instr->opcode == aco_opcode::p_phi) ||
10255             (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10256            /* the last operand should be the one that needs to be removed */
10257            instr->operands.pop_back();
10258         } else if (!is_phi(instr)) {
10259            break;
10260         }
10261      }
10262   }
10263
10264   /* Fixup linear phis in loop header from expecting a continue. Both this fixup
10265    * and the previous one shouldn't both happen at once because a break in the
10266    * merge block would get CSE'd */
10267   if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10268      unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10269      Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10270      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10271         if (instr->opcode == aco_opcode::p_linear_phi) {
10272            if (ctx->cf_info.has_branch)
10273               instr->operands.pop_back();
10274            else
10275               instr->operands.back() =
10276                  create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10277         } else if (!is_phi(instr)) {
10278            break;
10279         }
10280      }
10281   }
10282
10283   end_loop(ctx, &lc);
10284}
10285
10286static void
10287begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10288{
10289   ic->cond = cond;
10290
10291   append_logical_end(ctx->block);
10292   ctx->block->kind |= block_kind_branch;
10293
10294   /* branch to linear then block */
10295   assert(cond.regClass() == ctx->program->lane_mask);
10296   aco_ptr<Pseudo_branch_instruction> branch;
10297   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10298                                                              Format::PSEUDO_BRANCH, 1, 1));
10299   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10300   branch->definitions[0].setHint(vcc);
10301   branch->operands[0] = Operand(cond);
10302   ctx->block->instructions.push_back(std::move(branch));
10303
10304   ic->BB_if_idx = ctx->block->index;
10305   ic->BB_invert = Block();
10306   /* Invert blocks are intentionally not marked as top level because they
10307    * are not part of the logical cfg. */
10308   ic->BB_invert.kind |= block_kind_invert;
10309   ic->BB_endif = Block();
10310   ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10311
10312   ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10313   ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10314   ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10315   ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10316   ctx->cf_info.parent_if.is_divergent = true;
10317
10318   /* divergent branches use cbranch_execz */
10319   ctx->cf_info.exec_potentially_empty_discard = false;
10320   ctx->cf_info.exec_potentially_empty_break = false;
10321   ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10322
10323   /** emit logical then block */
10324   ctx->program->next_divergent_if_logical_depth++;
10325   Block* BB_then_logical = ctx->program->create_and_insert_block();
10326   add_edge(ic->BB_if_idx, BB_then_logical);
10327   ctx->block = BB_then_logical;
10328   append_logical_start(BB_then_logical);
10329}
10330
10331static void
10332begin_divergent_if_else(isel_context* ctx, if_context* ic)
10333{
10334   Block* BB_then_logical = ctx->block;
10335   append_logical_end(BB_then_logical);
10336   /* branch from logical then block to invert block */
10337   aco_ptr<Pseudo_branch_instruction> branch;
10338   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10339                                                              Format::PSEUDO_BRANCH, 0, 1));
10340   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10341   branch->definitions[0].setHint(vcc);
10342   BB_then_logical->instructions.emplace_back(std::move(branch));
10343   add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10344   if (!ctx->cf_info.parent_loop.has_divergent_branch)
10345      add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10346   BB_then_logical->kind |= block_kind_uniform;
10347   assert(!ctx->cf_info.has_branch);
10348   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10349   ctx->cf_info.parent_loop.has_divergent_branch = false;
10350   ctx->program->next_divergent_if_logical_depth--;
10351
10352   /** emit linear then block */
10353   Block* BB_then_linear = ctx->program->create_and_insert_block();
10354   BB_then_linear->kind |= block_kind_uniform;
10355   add_linear_edge(ic->BB_if_idx, BB_then_linear);
10356   /* branch from linear then block to invert block */
10357   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10358                                                              Format::PSEUDO_BRANCH, 0, 1));
10359   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10360   branch->definitions[0].setHint(vcc);
10361   BB_then_linear->instructions.emplace_back(std::move(branch));
10362   add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10363
10364   /** emit invert merge block */
10365   ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10366   ic->invert_idx = ctx->block->index;
10367
10368   /* branch to linear else block (skip else) */
10369   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10370                                                              Format::PSEUDO_BRANCH, 0, 1));
10371   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10372   branch->definitions[0].setHint(vcc);
10373   ctx->block->instructions.push_back(std::move(branch));
10374
10375   ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10376   ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10377   ic->exec_potentially_empty_break_depth_old = std::min(
10378      ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10379   /* divergent branches use cbranch_execz */
10380   ctx->cf_info.exec_potentially_empty_discard = false;
10381   ctx->cf_info.exec_potentially_empty_break = false;
10382   ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10383
10384   /** emit logical else block */
10385   ctx->program->next_divergent_if_logical_depth++;
10386   Block* BB_else_logical = ctx->program->create_and_insert_block();
10387   add_logical_edge(ic->BB_if_idx, BB_else_logical);
10388   add_linear_edge(ic->invert_idx, BB_else_logical);
10389   ctx->block = BB_else_logical;
10390   append_logical_start(BB_else_logical);
10391}
10392
10393static void
10394end_divergent_if(isel_context* ctx, if_context* ic)
10395{
10396   Block* BB_else_logical = ctx->block;
10397   append_logical_end(BB_else_logical);
10398
10399   /* branch from logical else block to endif block */
10400   aco_ptr<Pseudo_branch_instruction> branch;
10401   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10402                                                              Format::PSEUDO_BRANCH, 0, 1));
10403   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10404   branch->definitions[0].setHint(vcc);
10405   BB_else_logical->instructions.emplace_back(std::move(branch));
10406   add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10407   if (!ctx->cf_info.parent_loop.has_divergent_branch)
10408      add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10409   BB_else_logical->kind |= block_kind_uniform;
10410   ctx->program->next_divergent_if_logical_depth--;
10411
10412   assert(!ctx->cf_info.has_branch);
10413   ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10414
10415   /** emit linear else block */
10416   Block* BB_else_linear = ctx->program->create_and_insert_block();
10417   BB_else_linear->kind |= block_kind_uniform;
10418   add_linear_edge(ic->invert_idx, BB_else_linear);
10419
10420   /* branch from linear else block to endif block */
10421   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10422                                                              Format::PSEUDO_BRANCH, 0, 1));
10423   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10424   branch->definitions[0].setHint(vcc);
10425   BB_else_linear->instructions.emplace_back(std::move(branch));
10426   add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10427
10428   /** emit endif merge block */
10429   ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10430   append_logical_start(ctx->block);
10431
10432   ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10433   ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10434   ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10435   ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10436      ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10437   if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10438       !ctx->cf_info.parent_if.is_divergent) {
10439      ctx->cf_info.exec_potentially_empty_break = false;
10440      ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10441   }
10442   /* uniform control flow never has an empty exec-mask */
10443   if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10444      ctx->cf_info.exec_potentially_empty_discard = false;
10445      ctx->cf_info.exec_potentially_empty_break = false;
10446      ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10447   }
10448}
10449
10450static void
10451begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10452{
10453   assert(cond.regClass() == s1);
10454
10455   append_logical_end(ctx->block);
10456   ctx->block->kind |= block_kind_uniform;
10457
10458   aco_ptr<Pseudo_branch_instruction> branch;
10459   aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10460   branch.reset(
10461      create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10462   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10463   branch->definitions[0].setHint(vcc);
10464   branch->operands[0] = Operand(cond);
10465   branch->operands[0].setFixed(scc);
10466   ctx->block->instructions.emplace_back(std::move(branch));
10467
10468   ic->BB_if_idx = ctx->block->index;
10469   ic->BB_endif = Block();
10470   ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10471
10472   ctx->cf_info.has_branch = false;
10473   ctx->cf_info.parent_loop.has_divergent_branch = false;
10474
10475   /** emit then block */
10476   ctx->program->next_uniform_if_depth++;
10477   Block* BB_then = ctx->program->create_and_insert_block();
10478   add_edge(ic->BB_if_idx, BB_then);
10479   append_logical_start(BB_then);
10480   ctx->block = BB_then;
10481}
10482
10483static void
10484begin_uniform_if_else(isel_context* ctx, if_context* ic)
10485{
10486   Block* BB_then = ctx->block;
10487
10488   ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10489   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10490
10491   if (!ic->uniform_has_then_branch) {
10492      append_logical_end(BB_then);
10493      /* branch from then block to endif block */
10494      aco_ptr<Pseudo_branch_instruction> branch;
10495      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10496                                                                 Format::PSEUDO_BRANCH, 0, 1));
10497      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10498      branch->definitions[0].setHint(vcc);
10499      BB_then->instructions.emplace_back(std::move(branch));
10500      add_linear_edge(BB_then->index, &ic->BB_endif);
10501      if (!ic->then_branch_divergent)
10502         add_logical_edge(BB_then->index, &ic->BB_endif);
10503      BB_then->kind |= block_kind_uniform;
10504   }
10505
10506   ctx->cf_info.has_branch = false;
10507   ctx->cf_info.parent_loop.has_divergent_branch = false;
10508
10509   /** emit else block */
10510   Block* BB_else = ctx->program->create_and_insert_block();
10511   add_edge(ic->BB_if_idx, BB_else);
10512   append_logical_start(BB_else);
10513   ctx->block = BB_else;
10514}
10515
10516static void
10517end_uniform_if(isel_context* ctx, if_context* ic)
10518{
10519   Block* BB_else = ctx->block;
10520
10521   if (!ctx->cf_info.has_branch) {
10522      append_logical_end(BB_else);
10523      /* branch from then block to endif block */
10524      aco_ptr<Pseudo_branch_instruction> branch;
10525      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10526                                                                 Format::PSEUDO_BRANCH, 0, 1));
10527      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10528      branch->definitions[0].setHint(vcc);
10529      BB_else->instructions.emplace_back(std::move(branch));
10530      add_linear_edge(BB_else->index, &ic->BB_endif);
10531      if (!ctx->cf_info.parent_loop.has_divergent_branch)
10532         add_logical_edge(BB_else->index, &ic->BB_endif);
10533      BB_else->kind |= block_kind_uniform;
10534   }
10535
10536   ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10537   ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10538
10539   /** emit endif merge block */
10540   ctx->program->next_uniform_if_depth--;
10541   if (!ctx->cf_info.has_branch) {
10542      ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10543      append_logical_start(ctx->block);
10544   }
10545}
10546
10547static bool
10548visit_if(isel_context* ctx, nir_if* if_stmt)
10549{
10550   Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10551   Builder bld(ctx->program, ctx->block);
10552   aco_ptr<Pseudo_branch_instruction> branch;
10553   if_context ic;
10554
10555   if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10556      /**
10557       * Uniform conditionals are represented in the following way*) :
10558       *
10559       * The linear and logical CFG:
10560       *                        BB_IF
10561       *                        /    \
10562       *       BB_THEN (logical)      BB_ELSE (logical)
10563       *                        \    /
10564       *                        BB_ENDIF
10565       *
10566       * *) Exceptions may be due to break and continue statements within loops
10567       *    If a break/continue happens within uniform control flow, it branches
10568       *    to the loop exit/entry block. Otherwise, it branches to the next
10569       *    merge block.
10570       **/
10571
10572      assert(cond.regClass() == ctx->program->lane_mask);
10573      cond = bool_to_scalar_condition(ctx, cond);
10574
10575      begin_uniform_if_then(ctx, &ic, cond);
10576      visit_cf_list(ctx, &if_stmt->then_list);
10577
10578      begin_uniform_if_else(ctx, &ic);
10579      visit_cf_list(ctx, &if_stmt->else_list);
10580
10581      end_uniform_if(ctx, &ic);
10582   } else { /* non-uniform condition */
10583      /**
10584       * To maintain a logical and linear CFG without critical edges,
10585       * non-uniform conditionals are represented in the following way*) :
10586       *
10587       * The linear CFG:
10588       *                        BB_IF
10589       *                        /    \
10590       *       BB_THEN (logical)      BB_THEN (linear)
10591       *                        \    /
10592       *                        BB_INVERT (linear)
10593       *                        /    \
10594       *       BB_ELSE (logical)      BB_ELSE (linear)
10595       *                        \    /
10596       *                        BB_ENDIF
10597       *
10598       * The logical CFG:
10599       *                        BB_IF
10600       *                        /    \
10601       *       BB_THEN (logical)      BB_ELSE (logical)
10602       *                        \    /
10603       *                        BB_ENDIF
10604       *
10605       * *) Exceptions may be due to break and continue statements within loops
10606       **/
10607
10608      begin_divergent_if_then(ctx, &ic, cond);
10609      visit_cf_list(ctx, &if_stmt->then_list);
10610
10611      begin_divergent_if_else(ctx, &ic);
10612      visit_cf_list(ctx, &if_stmt->else_list);
10613
10614      end_divergent_if(ctx, &ic);
10615   }
10616
10617   return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10618}
10619
10620static bool
10621visit_cf_list(isel_context* ctx, struct exec_list* list)
10622{
10623   foreach_list_typed (nir_cf_node, node, node, list) {
10624      switch (node->type) {
10625      case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10626      case nir_cf_node_if:
10627         if (!visit_if(ctx, nir_cf_node_as_if(node)))
10628            return true;
10629         break;
10630      case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10631      default: unreachable("unimplemented cf list type");
10632      }
10633   }
10634   return false;
10635}
10636
10637static void
10638export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10639{
10640   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10641
10642   int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10643                   ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10644                   : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10645   unsigned mask = ctx->outputs.mask[slot];
10646   if (!is_pos && !mask)
10647      return;
10648   if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10649      return;
10650   aco_ptr<Export_instruction> exp{
10651      create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10652   exp->enabled_mask = mask;
10653   for (unsigned i = 0; i < 4; ++i) {
10654      if (mask & (1 << i))
10655         exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10656      else
10657         exp->operands[i] = Operand(v1);
10658   }
10659   /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
10660    * Setting valid_mask=1 prevents it and has no other effect.
10661    */
10662   exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10663   exp->done = false;
10664   exp->compressed = false;
10665   if (is_pos)
10666      exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10667   else
10668      exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10669   ctx->block->instructions.emplace_back(std::move(exp));
10670}
10671
10672static void
10673export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10674{
10675   aco_ptr<Export_instruction> exp{
10676      create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10677   exp->enabled_mask = 0;
10678   for (unsigned i = 0; i < 4; ++i)
10679      exp->operands[i] = Operand(v1);
10680   if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10681      exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10682      exp->enabled_mask |= 0x1;
10683   }
10684   if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10685      exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10686      exp->enabled_mask |= 0x4;
10687   }
10688   if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10689      if (ctx->options->chip_class < GFX9) {
10690         exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10691         exp->enabled_mask |= 0x8;
10692      } else {
10693         Builder bld(ctx->program, ctx->block);
10694
10695         Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10696                             Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10697         if (exp->operands[2].isTemp())
10698            out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10699
10700         exp->operands[2] = Operand(out);
10701         exp->enabled_mask |= 0x4;
10702      }
10703   }
10704   if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10705      exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10706      exp->enabled_mask |= 0x2;
10707   } else if (ctx->options->force_vrs_rates) {
10708      /* Bits [2:3] = VRS rate X
10709       * Bits [4:5] = VRS rate Y
10710       *
10711       * The range is [-2, 1]. Values:
10712       *   1: 2x coarser shading rate in that direction.
10713       *   0: normal shading rate
10714       *  -1: 2x finer shading rate (sample shading, not directional)
10715       *  -2: 4x finer shading rate (sample shading, not directional)
10716       *
10717       * Sample shading can't go above 8 samples, so both numbers can't be -2
10718       * at the same time.
10719       */
10720      Builder bld(ctx->program, ctx->block);
10721      Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10722
10723      /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10724      Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10725                           Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10726      rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10727                       bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10728
10729      exp->operands[1] = Operand(rates);
10730      exp->enabled_mask |= 0x2;
10731   }
10732
10733   exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10734   exp->done = false;
10735   exp->compressed = false;
10736   exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10737   ctx->block->instructions.emplace_back(std::move(exp));
10738}
10739
10740static void
10741create_vs_exports(isel_context* ctx)
10742{
10743   assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10744
10745   const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10746                                        ? &ctx->program->info->tes.outinfo
10747                                        : &ctx->program->info->vs.outinfo;
10748
10749   ctx->block->kind |= block_kind_export_end;
10750
10751   if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10752      ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10753      if (ctx->stage.has(SWStage::TES))
10754         ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10755            get_arg(ctx, ctx->args->ac.tes_patch_id);
10756      else
10757         ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10758            get_arg(ctx, ctx->args->ac.vs_prim_id);
10759   }
10760
10761   if (ctx->options->key.has_multiview_view_index) {
10762      ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10763      ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10764         as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10765   }
10766
10767   /* Hardware requires position data to always be exported, even if the
10768    * application did not write gl_Position.
10769    */
10770   ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10771
10772   /* the order these position exports are created is important */
10773   int next_pos = 0;
10774   export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10775
10776   bool writes_primitive_shading_rate =
10777      outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10778   if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10779       writes_primitive_shading_rate) {
10780      export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10781   }
10782   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10783      export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10784   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10785      export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10786
10787   if (ctx->export_clip_dists) {
10788      if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10789         export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10790      if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10791         export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10792   }
10793
10794   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10795      if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10796          i != VARYING_SLOT_VIEWPORT)
10797         continue;
10798
10799      export_vs_varying(ctx, i, false, NULL);
10800   }
10801}
10802
10803static bool
10804export_fs_mrt_z(isel_context* ctx)
10805{
10806   Builder bld(ctx->program, ctx->block);
10807   unsigned enabled_channels = 0;
10808   bool compr = false;
10809   Operand values[4];
10810
10811   for (unsigned i = 0; i < 4; ++i) {
10812      values[i] = Operand(v1);
10813   }
10814
10815   /* Both stencil and sample mask only need 16-bits. */
10816   if (!ctx->program->info->ps.writes_z &&
10817       (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10818      compr = true; /* COMPR flag */
10819
10820      if (ctx->program->info->ps.writes_stencil) {
10821         /* Stencil should be in X[23:16]. */
10822         values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10823         values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10824         enabled_channels |= 0x3;
10825      }
10826
10827      if (ctx->program->info->ps.writes_sample_mask) {
10828         /* SampleMask should be in Y[15:0]. */
10829         values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10830         enabled_channels |= 0xc;
10831      }
10832   } else {
10833      if (ctx->program->info->ps.writes_z) {
10834         values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10835         enabled_channels |= 0x1;
10836      }
10837
10838      if (ctx->program->info->ps.writes_stencil) {
10839         values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10840         enabled_channels |= 0x2;
10841      }
10842
10843      if (ctx->program->info->ps.writes_sample_mask) {
10844         values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10845         enabled_channels |= 0x4;
10846      }
10847   }
10848
10849   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
10850    * writemask component.
10851    */
10852   if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10853       ctx->options->family != CHIP_HAINAN) {
10854      enabled_channels |= 0x1;
10855   }
10856
10857   bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10858           V_008DFC_SQ_EXP_MRTZ, compr);
10859
10860   return true;
10861}
10862
10863static bool
10864export_fs_mrt_color(isel_context* ctx, int slot)
10865{
10866   Builder bld(ctx->program, ctx->block);
10867   unsigned write_mask = ctx->outputs.mask[slot];
10868   Operand values[4];
10869
10870   for (unsigned i = 0; i < 4; ++i) {
10871      if (write_mask & (1 << i)) {
10872         values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10873      } else {
10874         values[i] = Operand(v1);
10875      }
10876   }
10877
10878   unsigned target, col_format;
10879   unsigned enabled_channels = 0;
10880   aco_opcode compr_op = (aco_opcode)0;
10881   bool compr = false;
10882
10883   slot -= FRAG_RESULT_DATA0;
10884   target = V_008DFC_SQ_EXP_MRT + slot;
10885   col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;
10886
10887   bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
10888   bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
10889   bool is_16bit = values[0].regClass() == v2b;
10890
10891   /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10892   if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10893       (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10894        col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10895        col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10896      for (int i = 0; i < 4; i++) {
10897         if (!(write_mask & (1 << i)))
10898            continue;
10899
10900         Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10901                               values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10902         values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10903                              bld.copy(bld.def(v1), Operand::zero()), isnan);
10904      }
10905   }
10906
10907   switch (col_format) {
10908   case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10909
10910   case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10911
10912   case V_028714_SPI_SHADER_32_AR:
10913      if (ctx->options->chip_class >= GFX10) {
10914         /* Special case: on GFX10, the outputs are different for 32_AR */
10915         enabled_channels = 0x3;
10916         values[1] = values[3];
10917         values[3] = Operand(v1);
10918      } else {
10919         enabled_channels = 0x9;
10920      }
10921      break;
10922
10923   case V_028714_SPI_SHADER_FP16_ABGR:
10924      for (int i = 0; i < 2; i++) {
10925         bool enabled = (write_mask >> (i * 2)) & 0x3;
10926         if (enabled) {
10927            enabled_channels |= 0x3 << (i * 2);
10928            if (is_16bit) {
10929               values[i] =
10930                  bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10931                             values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10932                             values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10933            } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10934               values[i] =
10935                  bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10936                           values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10937                           values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10938            } else {
10939               values[i] =
10940                  bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10941                           values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10942                           values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10943            }
10944         } else {
10945            values[i] = Operand(v1);
10946         }
10947      }
10948      values[2] = Operand(v1);
10949      values[3] = Operand(v1);
10950      compr = true;
10951      break;
10952
10953   case V_028714_SPI_SHADER_UNORM16_ABGR:
10954      if (is_16bit && ctx->options->chip_class >= GFX9) {
10955         compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10956      } else {
10957         compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10958      }
10959      break;
10960
10961   case V_028714_SPI_SHADER_SNORM16_ABGR:
10962      if (is_16bit && ctx->options->chip_class >= GFX9) {
10963         compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10964      } else {
10965         compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10966      }
10967      break;
10968
10969   case V_028714_SPI_SHADER_UINT16_ABGR: {
10970      compr_op = aco_opcode::v_cvt_pk_u16_u32;
10971      if (is_int8 || is_int10) {
10972         /* clamp */
10973         uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10974         Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
10975
10976         for (unsigned i = 0; i < 4; i++) {
10977            if ((write_mask >> i) & 1) {
10978               values[i] =
10979                  bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10980                           i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
10981            }
10982         }
10983      } else if (is_16bit) {
10984         for (unsigned i = 0; i < 4; i++) {
10985            if ((write_mask >> i) & 1) {
10986               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10987               values[i] = Operand(tmp);
10988            }
10989         }
10990      }
10991      break;
10992   }
10993
10994   case V_028714_SPI_SHADER_SINT16_ABGR:
10995      compr_op = aco_opcode::v_cvt_pk_i16_i32;
10996      if (is_int8 || is_int10) {
10997         /* clamp */
10998         uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10999         uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11000         Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11001         Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11002
11003         for (unsigned i = 0; i < 4; i++) {
11004            if ((write_mask >> i) & 1) {
11005               values[i] =
11006                  bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11007                           i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11008               values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11009                                    i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11010                                    values[i]);
11011            }
11012         }
11013      } else if (is_16bit) {
11014         for (unsigned i = 0; i < 4; i++) {
11015            if ((write_mask >> i) & 1) {
11016               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11017               values[i] = Operand(tmp);
11018            }
11019         }
11020      }
11021      break;
11022
11023   case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11024
11025   case V_028714_SPI_SHADER_ZERO:
11026   default: return false;
11027   }
11028
11029   if ((bool)compr_op) {
11030      for (int i = 0; i < 2; i++) {
11031         /* check if at least one of the values to be compressed is enabled */
11032         bool enabled = (write_mask >> (i * 2)) & 0x3;
11033         if (enabled) {
11034            enabled_channels |= 0x3 << (i * 2);
11035            values[i] = bld.vop3(
11036               compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11037               values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11038         } else {
11039            values[i] = Operand(v1);
11040         }
11041      }
11042      values[2] = Operand(v1);
11043      values[3] = Operand(v1);
11044      compr = true;
11045   } else if (!compr) {
11046      for (int i = 0; i < 4; i++)
11047         values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11048   }
11049
11050   bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11051           compr);
11052   return true;
11053}
11054
11055static void
11056create_fs_null_export(isel_context* ctx)
11057{
11058   /* FS must always have exports.
11059    * So when there are none, we need to add a null export.
11060    */
11061
11062   Builder bld(ctx->program, ctx->block);
11063   unsigned dest = V_008DFC_SQ_EXP_NULL;
11064   bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11065           /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11066}
11067
11068static void
11069create_fs_exports(isel_context* ctx)
11070{
11071   bool exported = false;
11072
11073   /* Export depth, stencil and sample mask. */
11074   if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
11075       ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11076      exported |= export_fs_mrt_z(ctx);
11077
11078   /* Export all color render targets. */
11079   for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
11080      if (ctx->outputs.mask[i])
11081         exported |= export_fs_mrt_color(ctx, i);
11082
11083   if (!exported)
11084      create_fs_null_export(ctx);
11085
11086   ctx->block->kind |= block_kind_export_end;
11087}
11088
11089static void
11090create_workgroup_barrier(Builder& bld)
11091{
11092   bld.barrier(aco_opcode::p_barrier,
11093               memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
11094}
11095
11096static void
11097emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
11098                   const struct radv_stream_output* output)
11099{
11100   unsigned num_comps = util_bitcount(output->component_mask);
11101   unsigned writemask = (1 << num_comps) - 1;
11102   unsigned loc = output->location;
11103   unsigned buf = output->buffer;
11104
11105   assert(num_comps && num_comps <= 4);
11106   if (!num_comps || num_comps > 4)
11107      return;
11108
11109   unsigned first_comp = ffs(output->component_mask) - 1;
11110
11111   Temp out[4];
11112   bool all_undef = true;
11113   assert(ctx->stage.hw == HWStage::VS);
11114   for (unsigned i = 0; i < num_comps; i++) {
11115      out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
11116      all_undef = all_undef && !out[i].id();
11117   }
11118   if (all_undef)
11119      return;
11120
11121   while (writemask) {
11122      int start, count;
11123      u_bit_scan_consecutive_range(&writemask, &start, &count);
11124      if (count == 3 && ctx->options->chip_class == GFX6) {
11125         /* GFX6 doesn't support storing vec3, split it. */
11126         writemask |= 1u << (start + 2);
11127         count = 2;
11128      }
11129
11130      unsigned offset = output->offset + start * 4;
11131
11132      Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
11133      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
11134         aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
11135      for (int i = 0; i < count; ++i)
11136         vec->operands[i] =
11137            (ctx->outputs.mask[loc] & 1 << (start + first_comp + i)) ? Operand(out[start + i]) : Operand::zero();
11138      vec->definitions[0] = Definition(write_data);
11139      ctx->block->instructions.emplace_back(std::move(vec));
11140
11141      aco_opcode opcode;
11142      switch (count) {
11143      case 1: opcode = aco_opcode::buffer_store_dword; break;
11144      case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
11145      case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
11146      case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
11147      default: unreachable("Unsupported dword count.");
11148      }
11149
11150      aco_ptr<MUBUF_instruction> store{
11151         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
11152      store->operands[0] = Operand(so_buffers[buf]);
11153      store->operands[1] = Operand(so_write_offset[buf]);
11154      store->operands[2] = Operand::c32(0);
11155      store->operands[3] = Operand(write_data);
11156      if (offset > 4095) {
11157         /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
11158         Builder bld(ctx->program, ctx->block);
11159         store->operands[0] =
11160            bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
11161      } else {
11162         store->offset = offset;
11163      }
11164      store->offen = true;
11165      store->glc = true;
11166      store->dlc = false;
11167      store->slc = true;
11168      ctx->block->instructions.emplace_back(std::move(store));
11169   }
11170}
11171
11172static void
11173emit_streamout(isel_context* ctx, unsigned stream)
11174{
11175   Builder bld(ctx->program, ctx->block);
11176
11177   Temp so_vtx_count =
11178      bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11179               get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
11180
11181   Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
11182
11183   Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
11184
11185   if_context ic;
11186   begin_divergent_if_then(ctx, &ic, can_emit);
11187
11188   bld.reset(ctx->block);
11189
11190   Temp so_write_index =
11191      bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
11192
11193   Temp so_buffers[4];
11194   Temp so_write_offset[4];
11195   Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
11196
11197   for (unsigned i = 0; i < 4; i++) {
11198      unsigned stride = ctx->program->info->so.strides[i];
11199      if (!stride)
11200         continue;
11201
11202      so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
11203                               bld.copy(bld.def(s1), Operand::c32(i * 16u)));
11204
11205      if (stride == 1) {
11206         Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11207                                get_arg(ctx, ctx->args->ac.streamout_write_index),
11208                                get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11209         Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
11210
11211         so_write_offset[i] =
11212            bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
11213      } else {
11214         Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
11215         Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
11216                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11217         so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
11218      }
11219   }
11220
11221   for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
11222      const struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
11223      if (stream != output->stream)
11224         continue;
11225
11226      emit_stream_output(ctx, so_buffers, so_write_offset, output);
11227   }
11228
11229   begin_divergent_if_else(ctx, &ic);
11230   end_divergent_if(ctx, &ic);
11231}
11232
11233Pseudo_instruction*
11234add_startpgm(struct isel_context* ctx)
11235{
11236   aco_ptr<Pseudo_instruction> startpgm{
11237      create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)};
11238   for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
11239      if (ctx->args->ac.args[i].skip)
11240         continue;
11241
11242      enum ac_arg_regfile file = ctx->args->ac.args[i].file;
11243      unsigned size = ctx->args->ac.args[i].size;
11244      unsigned reg = ctx->args->ac.args[i].offset;
11245      RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11246      Temp dst = ctx->program->allocateTmp(type);
11247      ctx->arg_temps[i] = dst;
11248      startpgm->definitions[arg] = Definition(dst);
11249      startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11250      arg++;
11251   }
11252   Pseudo_instruction* instr = startpgm.get();
11253   ctx->block->instructions.push_back(std::move(startpgm));
11254
11255   /* Stash these in the program so that they can be accessed later when
11256    * handling spilling.
11257    */
11258   ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11259   ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
11260
11261   if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) {
11262      unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask);
11263      for (unsigned i = 0; i < num_attributes; i++) {
11264         Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
11265
11266         unsigned idx = ctx->args->vs_inputs[i].arg_index;
11267         def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset));
11268
11269         ctx->program->vs_inputs.push_back(def);
11270      }
11271   }
11272
11273   return instr;
11274}
11275
11276void
11277fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
11278{
11279   assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
11280   Builder bld(ctx->program, ctx->block);
11281   constexpr unsigned hs_idx = 1u;
11282   Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11283                                              get_arg(ctx, ctx->args->ac.merged_wave_info),
11284                                              Operand::c32((8u << 16) | (hs_idx * 8u)));
11285   Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11286
11287   /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11288
11289   Temp instance_id =
11290      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
11291               get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
11292   Temp vs_rel_patch_id =
11293      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
11294               get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
11295   Temp vertex_id =
11296      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
11297               get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);
11298
11299   ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
11300   ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11301   ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
11302}
11303
11304void
11305split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11306{
11307   /* Split all arguments except for the first (ring_offsets) and the last
11308    * (exec) so that the dead channels don't stay live throughout the program.
11309    */
11310   for (int i = 1; i < startpgm->definitions.size(); i++) {
11311      if (startpgm->definitions[i].regClass().size() > 1) {
11312         emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11313                           startpgm->definitions[i].regClass().size());
11314      }
11315   }
11316}
11317
11318void
11319handle_bc_optimize(isel_context* ctx)
11320{
11321   /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
11322   Builder bld(ctx->program, ctx->block);
11323   uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
11324   bool uses_center =
11325      G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
11326   bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena);
11327   bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
11328
11329   if (uses_persp_centroid)
11330      ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
11331   if (uses_linear_centroid)
11332      ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
11333
11334   if (uses_center && (uses_persp_centroid || uses_linear_centroid)) {
11335      Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
11336                              get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());
11337
11338      if (uses_persp_centroid) {
11339         Temp new_coord[2];
11340         for (unsigned i = 0; i < 2; i++) {
11341            Temp persp_centroid =
11342               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
11343            Temp persp_center =
11344               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
11345            new_coord[i] =
11346               bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
11347         }
11348         ctx->persp_centroid = bld.tmp(v2);
11349         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
11350                    Operand(new_coord[0]), Operand(new_coord[1]));
11351         emit_split_vector(ctx, ctx->persp_centroid, 2);
11352      }
11353
11354      if (uses_linear_centroid) {
11355         Temp new_coord[2];
11356         for (unsigned i = 0; i < 2; i++) {
11357            Temp linear_centroid =
11358               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
11359            Temp linear_center =
11360               emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
11361            new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
11362                                    linear_center, sel);
11363         }
11364         ctx->linear_centroid = bld.tmp(v2);
11365         bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
11366                    Operand(new_coord[0]), Operand(new_coord[1]));
11367         emit_split_vector(ctx, ctx->linear_centroid, 2);
11368      }
11369   }
11370}
11371
11372void
11373setup_fp_mode(isel_context* ctx, nir_shader* shader)
11374{
11375   Program* program = ctx->program;
11376
11377   unsigned float_controls = shader->info.float_controls_execution_mode;
11378
11379   program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11380      float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11381   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11382      float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11383                        FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11384
11385   program->next_fp_mode.must_flush_denorms32 =
11386      float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11387   program->next_fp_mode.must_flush_denorms16_64 =
11388      float_controls &
11389      (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11390
11391   program->next_fp_mode.care_about_round32 =
11392      float_controls &
11393      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11394
11395   program->next_fp_mode.care_about_round16_64 =
11396      float_controls &
11397      (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11398       FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11399
11400   /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11401    * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11402   if (program->next_fp_mode.must_flush_denorms16_64)
11403      program->next_fp_mode.denorm16_64 = 0;
11404   else
11405      program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11406
11407   /* preserving fp32 denorms is expensive, so only do it if asked */
11408   if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11409      program->next_fp_mode.denorm32 = fp_denorm_keep;
11410   else
11411      program->next_fp_mode.denorm32 = 0;
11412
11413   if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11414      program->next_fp_mode.round32 = fp_round_tz;
11415   else
11416      program->next_fp_mode.round32 = fp_round_ne;
11417
11418   if (float_controls &
11419       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11420      program->next_fp_mode.round16_64 = fp_round_tz;
11421   else
11422      program->next_fp_mode.round16_64 = fp_round_ne;
11423
11424   ctx->block->fp_mode = program->next_fp_mode;
11425}
11426
11427void
11428cleanup_cfg(Program* program)
11429{
11430   /* create linear_succs/logical_succs */
11431   for (Block& BB : program->blocks) {
11432      for (unsigned idx : BB.linear_preds)
11433         program->blocks[idx].linear_succs.emplace_back(BB.index);
11434      for (unsigned idx : BB.logical_preds)
11435         program->blocks[idx].logical_succs.emplace_back(BB.index);
11436   }
11437}
11438
11439Temp
11440lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
11441{
11442   assert(count.regClass() == s1);
11443
11444   Builder bld(ctx->program, ctx->block);
11445   Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11446   Temp cond;
11447
11448   if (ctx->program->wave_size == 64) {
11449      /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */
11450      if (!allow64)
11451         return mask;
11452
11453      /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11454      Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11455                                Operand::c32(6u /* log2(64) */));
11456      cond =
11457         bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11458   } else {
11459      /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11460       * the register */
11461      cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11462   }
11463
11464   return cond;
11465}
11466
11467Temp
11468merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11469{
11470   Builder bld(ctx->program, ctx->block);
11471
11472   /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
11473   Temp count = i == 0
11474                   ? get_arg(ctx, ctx->args->ac.merged_wave_info)
11475                   : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11476                              get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));
11477
11478   return lanecount_to_mask(ctx, count);
11479}
11480
11481void
11482ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
11483{
11484   assert(vtx_cnt.id() && prm_cnt.id());
11485
11486   Builder bld(ctx->program, ctx->block);
11487   Temp prm_cnt_0;
11488
11489   if (ctx->program->chip_class == GFX10 &&
11490       (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
11491      /* Navi 1x workaround: check whether the workgroup has no output.
11492       * If so, change the number of exported vertices and primitives to 1.
11493       */
11494      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
11495      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
11496                         bld.scc(prm_cnt_0));
11497      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
11498                         bld.scc(prm_cnt_0));
11499   }
11500
11501   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
11502   Temp tmp =
11503      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
11504   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
11505
11506   /* Request the SPI to allocate space for the primitives and vertices
11507    * that will be exported by the threadgroup.
11508    */
11509   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
11510
11511   if (prm_cnt_0.id()) {
11512      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
11513       * It can't have all-zero positions because that would render an undesired pixel with
11514       * conservative rasterization.
11515       */
11516      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
11517      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
11518                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
11519      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
11520                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
11521
11522      if_context ic_prim_0;
11523      begin_divergent_if_then(ctx, &ic_prim_0, cond);
11524      bld.reset(ctx->block);
11525      ctx->block->kind |= block_kind_export_end;
11526
11527      /* Use zero: means that it's a triangle whose every vertex index is 0. */
11528      Temp zero = bld.copy(bld.def(v1), Operand::zero());
11529      /* Use NaN for the coordinates, so that the rasterizer allways culls it.  */
11530      Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));
11531
11532      bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
11533              V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
11534              false /* valid mask */);
11535      bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
11536              V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
11537              true /* valid mask */);
11538
11539      begin_divergent_if_else(ctx, &ic_prim_0);
11540      end_divergent_if(ctx, &ic_prim_0);
11541      bld.reset(ctx->block);
11542   }
11543}
11544
11545} /* end namespace */
11546
11547void
11548select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11549               ac_shader_config* config, const struct radv_shader_args* args)
11550{
11551   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
11552   if_context ic_merged_wave_info;
11553   bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
11554
11555   for (unsigned i = 0; i < shader_count; i++) {
11556      nir_shader* nir = shaders[i];
11557      init_context(&ctx, nir);
11558
11559      setup_fp_mode(&ctx, nir);
11560
11561      if (!i) {
11562         /* needs to be after init_context() for FS */
11563         Pseudo_instruction* startpgm = add_startpgm(&ctx);
11564         append_logical_start(ctx.block);
11565
11566         if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11567            fix_ls_vgpr_init_bug(&ctx, startpgm);
11568
11569         split_arguments(&ctx, startpgm);
11570
11571         if (!args->shader_info->vs.has_prolog &&
11572             (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11573            Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11574         }
11575      }
11576
11577      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11578      nir_function_impl* func = nir_shader_get_entrypoint(nir);
11579      bool empty_shader =
11580         nir_cf_list_is_empty_block(&func->body) &&
11581         ((nir->info.stage == MESA_SHADER_VERTEX &&
11582           (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11583          (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11584
11585      bool check_merged_wave_info =
11586         ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11587      bool endif_merged_wave_info =
11588         ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11589
11590      if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
11591          program->stage.num_sw_stages() == 1) {
11592         /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11593          * s_sendmsg(GS_ALLOC_REQ). */
11594         Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11595      }
11596
11597      if (check_merged_wave_info) {
11598         Temp cond = merged_wave_info_to_mask(&ctx, i);
11599         begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
11600      }
11601
11602      if (i) {
11603         Builder bld(ctx.program, ctx.block);
11604
11605         /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11606         bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
11607                                 ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11608
11609         if (!ngg_gs && !tcs_skip_barrier)
11610            create_workgroup_barrier(bld);
11611
11612         if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
11613            ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
11614                                        get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
11615                                        Operand::c32(8u), Operand::zero());
11616         }
11617      } else if (ctx.stage == geometry_gs)
11618         ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);
11619
11620      if (ctx.stage == fragment_fs)
11621         handle_bc_optimize(&ctx);
11622
11623      visit_cf_list(&ctx, &func->body);
11624
11625      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
11626         emit_streamout(&ctx, 0);
11627
11628      if (ctx.stage.hw == HWStage::VS) {
11629         create_vs_exports(&ctx);
11630      } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
11631         Builder bld(ctx.program, ctx.block);
11632         bld.barrier(aco_opcode::p_barrier,
11633                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
11634         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
11635                  sendmsg_gs_done(false, false, 0));
11636      }
11637
11638      if (ctx.stage == fragment_fs) {
11639         create_fs_exports(&ctx);
11640      }
11641
11642      if (endif_merged_wave_info) {
11643         begin_divergent_if_else(&ctx, &ic_merged_wave_info);
11644         end_divergent_if(&ctx, &ic_merged_wave_info);
11645      }
11646
11647      if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11648         /* Outputs of the previous stage are inputs to the next stage */
11649         ctx.inputs = ctx.outputs;
11650         ctx.outputs = shader_io_state();
11651      }
11652
11653      cleanup_context(&ctx);
11654   }
11655
11656   program->config->float_mode = program->blocks[0].fp_mode.val;
11657
11658   append_logical_end(ctx.block);
11659   ctx.block->kind |= block_kind_uniform;
11660   Builder bld(ctx.program, ctx.block);
11661   bld.sopp(aco_opcode::s_endpgm);
11662
11663   cleanup_cfg(program);
11664}
11665
11666void
11667select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
11668                      const struct radv_shader_args* args)
11669{
11670   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11671
11672   ctx.block->fp_mode = program->next_fp_mode;
11673
11674   add_startpgm(&ctx);
11675   append_logical_start(ctx.block);
11676
11677   Builder bld(ctx.program, ctx.block);
11678
11679   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
11680                             program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));
11681
11682   Operand stream_id = Operand::zero();
11683   if (args->shader_info->so.num_outputs)
11684      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11685                           get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));
11686
11687   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
11688                              get_arg(&ctx, ctx.args->ac.vertex_id));
11689
11690   std::stack<if_context, std::vector<if_context>> if_contexts;
11691
11692   for (unsigned stream = 0; stream < 4; stream++) {
11693      if (stream_id.isConstant() && stream != stream_id.constantValue())
11694         continue;
11695
11696      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11697      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11698         continue;
11699
11700      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11701
11702      if (!stream_id.isConstant()) {
11703         Temp cond =
11704            bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
11705         if_contexts.emplace();
11706         begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
11707         bld.reset(ctx.block);
11708      }
11709
11710      unsigned offset = 0;
11711      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11712         if (args->shader_info->gs.output_streams[i] != stream)
11713            continue;
11714
11715         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11716         unsigned length = util_last_bit(output_usage_mask);
11717         for (unsigned j = 0; j < length; ++j) {
11718            if (!(output_usage_mask & (1 << j)))
11719               continue;
11720
11721            Temp val = bld.tmp(v1);
11722            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11723            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
11724                            true, true);
11725
11726            ctx.outputs.mask[i] |= 1 << j;
11727            ctx.outputs.temps[i * 4u + j] = val;
11728
11729            offset++;
11730         }
11731      }
11732
11733      if (args->shader_info->so.num_outputs) {
11734         emit_streamout(&ctx, stream);
11735         bld.reset(ctx.block);
11736      }
11737
11738      if (stream == 0) {
11739         create_vs_exports(&ctx);
11740      }
11741
11742      if (!stream_id.isConstant()) {
11743         begin_uniform_if_else(&ctx, &if_contexts.top());
11744         bld.reset(ctx.block);
11745      }
11746   }
11747
11748   while (!if_contexts.empty()) {
11749      end_uniform_if(&ctx, &if_contexts.top());
11750      if_contexts.pop();
11751   }
11752
11753   program->config->float_mode = program->blocks[0].fp_mode.val;
11754
11755   append_logical_end(ctx.block);
11756   ctx.block->kind |= block_kind_uniform;
11757   bld.reset(ctx.block);
11758   bld.sopp(aco_opcode::s_endpgm);
11759
11760   cleanup_cfg(program);
11761}
11762
11763void
11764select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11765                           const struct radv_shader_args* args)
11766{
11767   assert(args->options->chip_class == GFX8);
11768
11769   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11770                args->options->family, args->options->wgp_mode, config);
11771
11772   isel_context ctx = {};
11773   ctx.program = program;
11774   ctx.args = args;
11775   ctx.options = args->options;
11776   ctx.stage = program->stage;
11777
11778   ctx.block = ctx.program->create_and_insert_block();
11779   ctx.block->kind = block_kind_top_level;
11780
11781   program->workgroup_size = 1; /* XXX */
11782
11783   add_startpgm(&ctx);
11784   append_logical_start(ctx.block);
11785
11786   Builder bld(ctx.program, ctx.block);
11787
11788   /* Load the buffer descriptor from TMA. */
11789   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11790            Operand::zero());
11791
11792   /* Store TTMP0-TTMP1. */
11793   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11794            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
11795
11796   uint32_t hw_regs_idx[] = {
11797      2, /* HW_REG_STATUS */
11798      3, /* HW_REG_TRAP_STS */
11799      4, /* HW_REG_HW_ID */
11800      7, /* HW_REG_IB_STS */
11801   };
11802
11803   /* Store some hardware registers. */
11804   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11805      /* "((size - 1) << 11) | register" */
11806      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11807               ((20 - 1) << 11) | hw_regs_idx[i]);
11808
11809      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11810               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11811   }
11812
11813   program->config->float_mode = program->blocks[0].fp_mode.val;
11814
11815   append_logical_end(ctx.block);
11816   ctx.block->kind |= block_kind_uniform;
11817   bld.sopp(aco_opcode::s_endpgm);
11818
11819   cleanup_cfg(program);
11820}
11821
11822Operand
11823get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
11824{
11825   assert(arg.used);
11826
11827   enum ac_arg_regfile file = args->ac.args[arg.arg_index].file;
11828   unsigned size = args->ac.args[arg.arg_index].size;
11829   unsigned reg = args->ac.args[arg.arg_index].offset;
11830
11831   return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256),
11832                  RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size));
11833}
11834
11835unsigned
11836load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11837{
11838   unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11839
11840   unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
11841   if (bld.program->chip_class >= GFX10 && num_loads > 1)
11842      bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11843
11844   for (unsigned i = 0; i < count;) {
11845      unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11846
11847      if (size == 4)
11848         bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11849                  Operand::c32((start + i) * 16u));
11850      else if (size == 2)
11851         bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11852                  Operand::c32((start + i) * 16u));
11853      else
11854         bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11855                  Operand::c32((start + i) * 16u));
11856
11857      dest = dest.advance(size * 16u);
11858      i += size;
11859   }
11860
11861   return count;
11862}
11863
11864Operand
11865calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
11866                            Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11867                            PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11868{
11869   bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11870            get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));
11871
11872   wait_imm lgkm_imm;
11873   lgkm_imm.lgkm = 0;
11874   bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));
11875
11876   Definition fetch_index_def(tmp_vgpr0, v1);
11877   Operand fetch_index(tmp_vgpr0, v1);
11878
11879   Operand div_info(tmp_sgpr, s1);
11880   if (bld.program->chip_class >= GFX8) {
11881      /* use SDWA */
11882      if (bld.program->chip_class < GFX9) {
11883         bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
11884         div_info = Operand(tmp_vgpr1, v1);
11885      }
11886
11887      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id).instr;
11888
11889      Instruction* instr;
11890      if (bld.program->chip_class >= GFX9)
11891         instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
11892      else
11893         instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
11894                               div_info, fetch_index)
11895                    .instr;
11896      instr->sdwa().sel[0] = SubdwordSel::ubyte1;
11897
11898      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
11899               fetch_index);
11900
11901      instr =
11902         bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
11903      instr->sdwa().sel[0] = SubdwordSel::ubyte2;
11904   } else {
11905      Operand tmp_op(tmp_vgpr1, v1);
11906      Definition tmp_def(tmp_vgpr1, v1);
11907
11908      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11909
11910      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
11911      bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
11912
11913      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
11914               Operand(tmp_sgpr.advance(4), s1));
11915
11916      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
11917      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
11918   }
11919
11920   bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
11921
11922   return fetch_index;
11923}
11924
11925void
11926select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
11927                 const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
11928{
11929   assert(key->num_attributes > 0);
11930
11931   /* This should be enough for any shader/stage. */
11932   unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
11933   *num_preserved_sgprs = max_user_sgprs + 14;
11934
11935   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11936                args->options->family, args->options->wgp_mode, config);
11937
11938   Block* block = program->create_and_insert_block();
11939   block->kind = block_kind_top_level;
11940
11941   program->workgroup_size = 64;
11942   calc_min_waves(program);
11943
11944   Builder bld(program, block);
11945
11946   block->instructions.reserve(16 + key->num_attributes * 4);
11947
11948   bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
11949
11950   uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
11951   bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;
11952
11953   wait_imm lgkm_imm;
11954   lgkm_imm.lgkm = 0;
11955
11956   /* choose sgprs */
11957   PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
11958   PhysReg prolog_input = vertex_buffers.advance(8);
11959   PhysReg desc(
11960      align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
11961
11962   Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
11963   Operand instance_id = get_arg_fixed(args, args->ac.instance_id);
11964
11965   PhysReg attributes_start(256 + args->ac.num_vgprs_used);
11966   /* choose vgprs that won't be used for anything else until the last attribute load */
11967   PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
11968   PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
11969   PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
11970   PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
11971   PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);
11972
11973   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
11974            get_arg_fixed(args, args->ac.vertex_buffers));
11975   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
11976            Operand::c32((unsigned)args->options->address32_hi));
11977
11978   /* calculate vgpr requirements */
11979   unsigned num_vgprs = attributes_start.reg() - 256;
11980   num_vgprs += key->num_attributes * 4;
11981   if (has_nontrivial_divisors && program->chip_class <= GFX8)
11982      num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
11983   unsigned num_sgprs = 0;
11984
11985   for (unsigned loc = 0; loc < key->num_attributes;) {
11986      unsigned num_descs =
11987         load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
11988      num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
11989
11990      if (loc == 0) {
11991         /* perform setup while we load the descriptors */
11992         if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
11993            Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
11994            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
11995            if (program->wave_size == 64) {
11996               bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
11997                        Operand::c32(6u /* log2(64) */));
11998               bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
11999                        Operand(exec, s2), Operand(scc, s1));
12000            }
12001         }
12002
12003         bool needs_instance_index = false;
12004         bool needs_start_instance = false;
12005         u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
12006         {
12007            needs_instance_index |= key->state->divisors[i] == 1;
12008            needs_start_instance |= key->state->divisors[i] == 0;
12009         }
12010         bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;
12011         if (needs_vertex_index)
12012            bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
12013                       get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
12014         if (needs_instance_index)
12015            bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12016                       Operand(s2), true);
12017         if (needs_start_instance)
12018            bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12019      }
12020
12021      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12022
12023      for (unsigned i = 0; i < num_descs; i++, loc++) {
12024         PhysReg dest(attributes_start.reg() + loc * 4u);
12025
12026         /* calculate index */
12027         Operand fetch_index = Operand(vertex_index, v1);
12028         if (key->state->instance_rate_inputs & (1u << loc)) {
12029            uint32_t divisor = key->state->divisors[loc];
12030            if (divisor) {
12031               fetch_index = instance_id;
12032               if (key->state->nontrivial_divisors & (1u << loc)) {
12033                  unsigned index =
12034                     util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
12035                  fetch_index = calc_nontrivial_instance_id(
12036                     bld, args, index, instance_id, start_instance, prolog_input,
12037                     nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12038               } else {
12039                  fetch_index = Operand(instance_index, v1);
12040               }
12041            } else {
12042               fetch_index = Operand(start_instance_vgpr, v1);
12043            }
12044         }
12045
12046         /* perform load */
12047         PhysReg cur_desc = desc.advance(i * 16);
12048         if ((key->misaligned_mask & (1u << loc))) {
12049            unsigned dfmt = key->state->formats[loc] & 0xf;
12050            unsigned nfmt = key->state->formats[loc] >> 4;
12051            const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
12052            for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12053               bool post_shuffle = key->state->post_shuffle & (1u << loc);
12054               unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12055
12056               /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec
12057                * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12058                * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12059                * care).
12060                */
12061               if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
12062                  bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12063                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12064                            false, true);
12065               else
12066                  bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12067                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
12068                            vtx_info->chan_format, nfmt, offset, false, true);
12069            }
12070            uint32_t one =
12071               nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12072                  ? 1u
12073                  : 0x3f800000u;
12074            for (unsigned j = vtx_info->num_channels; j < 4; j++) {
12075               bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12076                        Operand::c32(j == 3 ? one : 0u));
12077            }
12078         } else {
12079            bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12080                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12081         }
12082      }
12083   }
12084
12085   if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
12086      wait_imm vm_imm;
12087      vm_imm.vm = 0;
12088      bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
12089   }
12090
12091   /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
12092    * so we may need to fix it up. */
12093   u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
12094   {
12095      PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12096
12097      unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
12098      alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;
12099
12100      if (alpha_adjust == ALPHA_ADJUST_SSCALED)
12101         bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12102
12103      /* For the integer-like cases, do a natural sign extension.
12104       *
12105       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12106       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12107       * exponent.
12108       */
12109      unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
12110      bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12111               Operand::c32(offset), Operand::c32(2u));
12112
12113      /* Convert back to the right type. */
12114      if (alpha_adjust == ALPHA_ADJUST_SNORM) {
12115         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12116         bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12117                  Operand(alpha, v1));
12118      } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
12119         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12120      }
12121   }
12122
12123   block->kind |= block_kind_uniform;
12124
12125   /* continue on to the main shader */
12126   Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
12127   if (has_nontrivial_divisors) {
12128      bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12129               get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
12130      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12131      continue_pc = Operand(prolog_input, s2);
12132   }
12133
12134   bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12135
12136   program->config->float_mode = program->blocks[0].fp_mode.val;
12137   /* addition on GFX6-8 requires a carry-out (we use VCC) */
12138   program->needs_vcc = program->chip_class <= GFX8;
12139   program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12140   program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12141}
12142} // namespace aco
12143