1/*
2 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3 * Copyright (C) 2020 Collabora Ltd.
4 * Copyright © 2016 Broadcom
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26#include "main/mtypes.h"
27#include "compiler/nir_types.h"
28#include "compiler/nir/nir_builder.h"
29#include "util/u_debug.h"
30#include "util/fast_idiv_by_const.h"
31#include "agx_compile.h"
32#include "agx_compiler.h"
33#include "agx_builder.h"
34
35static const struct debug_named_value agx_debug_options[] = {
36   {"msgs",      AGX_DBG_MSGS,		"Print debug messages"},
37   {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
38   {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
39   {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
40   {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
41   DEBUG_NAMED_VALUE_END
42};
43
44DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
45
46int agx_debug = 0;
47
48#define DBG(fmt, ...) \
49   do { if (agx_debug & AGX_DBG_MSGS) \
50      fprintf(stderr, "%s:%d: "fmt, \
51            __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
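
/* e.g. DBG("emitted %u instructions\n", count) logs only when
 * AGX_MESA_DEBUG=msgs is set */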
52
53static void
54agx_block_add_successor(agx_block *block, agx_block *successor)
55{
56   assert(block != NULL && successor != NULL);
57
58   /* Cull impossible edges */
59   if (block->unconditional_jumps)
60      return;
61
62   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
63      if (block->successors[i]) {
64         if (block->successors[i] == successor)
65            return;
66         else
67            continue;
68      }
69
70      block->successors[i] = successor;
71      _mesa_set_add(successor->predecessors, block);
72      return;
73   }
74
75   unreachable("Too many successors");
76}
77
78static void
79agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
80{
81   /* Ensure we've been scalarized and bit size lowered */
82   unsigned bit_size = instr->def.bit_size;
83   assert(instr->def.num_components == 1);
84   assert(bit_size == 1 || bit_size == 16 || bit_size == 32);
85
86   /* Emit move, later passes can inline/push if useful */
87   agx_mov_imm_to(b,
88                  agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
89                  nir_const_value_as_uint(instr->value[0], bit_size));
90}
91
92/* Emit code dividing P by Q */
93static agx_index
94agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
95{
96   /* P / 1 = P */
97   if (Q == 1) {
98      return P;
99   }
100
   /* P / UINT32_MAX = 0, unless P = UINT32_MAX, in which case the result is 1 */
102   if (Q == UINT32_MAX) {
103      agx_index max = agx_mov_imm(b, 32, UINT32_MAX);
104      agx_index one = agx_mov_imm(b, 32, 1);
105      return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ);
106   }
107
108   /* P / 2^N = P >> N */
109   if (util_is_power_of_two_or_zero(Q)) {
110      return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q)));
111   }
112
113   /* Fall back on multiplication by a magic number */
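   /* That is, emit
    *    q = ((((P >> pre_shift) + increment) * multiplier) >> 32) >> post_shift
    * using the parameters computed by util_compute_fast_udiv_info. */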
114   struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32);
115   agx_index preshift = agx_mov_imm(b, 32, info.pre_shift);
116   agx_index increment = agx_mov_imm(b, 32, info.increment);
117   agx_index postshift = agx_mov_imm(b, 32, info.post_shift);
118   agx_index multiplier = agx_mov_imm(b, 32, info.multiplier);
119   agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64);
120   agx_index n = P;
121
122   if (info.pre_shift != 0) n = agx_ushr(b, n, preshift);
123   if (info.increment != 0) n = agx_iadd(b, n, increment, 0);
124
125   /* 64-bit multiplication, zero extending 32-bit x 32-bit, get the top word */
126   agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0);
127   n = agx_temp(b->shader, AGX_SIZE_32);
128   agx_p_extract_to(b, n, multiplied, 1);
129
130   if (info.post_shift != 0) n = agx_ushr(b, n, postshift);
131
132   return n;
133}
134
/* AGX appears to lack fixed-function vertex attribute fetch. Lower attribute
 * loads to global memory loads. */
136static agx_instr *
137agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)
138{
139   nir_src *offset_src = nir_get_io_offset_src(instr);
140   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
141   unsigned index = nir_intrinsic_base(instr) +
142                    nir_src_as_uint(*offset_src);
143
144   struct agx_shader_key *key = b->shader->key;
145   struct agx_attribute attrib = key->vs.attributes[index];
146
   /* address = base + (stride * element_id) + src_offset, where element_id is
    * the vertex ID or the (divided) instance ID */
148   unsigned buf = attrib.buf;
149   unsigned stride = key->vs.vbuf_strides[buf];
150   unsigned shift = agx_format_shift(attrib.format);
151
152   agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
153   agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
154
155   agx_index vertex_id = agx_register(10, AGX_SIZE_32);
156   agx_index instance_id = agx_register(12, AGX_SIZE_32);
157
   /* A nonzero divisor specifies per-instance data, fetched by dividing the
    * instance ID. A zero divisor specifies per-vertex data. */
160   agx_index element_id = (attrib.divisor == 0) ? vertex_id :
161                          agx_udiv_const(b, instance_id, attrib.divisor);
162
163   agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
164
   /* Each VBO has a 64-bit address (4 x 16-bit units); look up the base
    * address as a sysval */
166   unsigned num_vbos = key->vs.num_vbufs;
167   unsigned base_length = (num_vbos * 4);
168   agx_index base = agx_indexed_sysval(b->shader,
169                                       AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);
170
171   /* Load the data */
172   assert(instr->num_components <= 4);
173
174   bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);
175   agx_index real_dest = agx_dest_index(&instr->dest);
176   agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;
177
178   agx_device_load_to(b, dest, base, offset, attrib.format,
179                      BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
180
181   agx_wait(b, 0);
182
183   if (pad) {
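      /* Pad the missing components with the default attribute vector
       * (0, 0, 0, 1) */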
184      agx_index one = agx_mov_imm(b, 32, fui(1.0));
185      agx_index zero = agx_mov_imm(b, 32, 0);
186      agx_index channels[4] = { zero, zero, zero, one };
187      for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)
188         channels[i] = agx_p_extract(b, dest, i);
189      for (unsigned i = instr->num_components; i < 4; ++i)
190         channels[i] = agx_null();
191      agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);
192   }
193
194   return NULL;
195}
196
197static agx_instr *
198agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)
199{
200   unsigned components = instr->num_components;
201   assert(components >= 1 && components <= 4);
202
203   nir_src *offset = nir_get_io_offset_src(instr);
204   assert(nir_src_is_const(*offset) && "no indirects");
205   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
206   imm_index += nir_src_as_uint(*offset);
207
208   agx_index chan[4] = { agx_null() };
209
210   for (unsigned i = 0; i < components; ++i) {
211      /* vec3 for each vertex, unknown what first 2 channels are for */
212      agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
213      chan[i] = agx_p_extract(b, values, 2);
214   }
215
216   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
217         chan[0], chan[1], chan[2], chan[3]);
218}
219
220static agx_instr *
221agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)
222{
223   ASSERTED unsigned components = instr->num_components;
224   ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
225
226   assert(components >= 1 && components <= 4);
227   assert(parent);
228
229   /* TODO: Interpolation modes */
230   assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);
231
232   nir_src *offset = nir_get_io_offset_src(instr);
233   assert(nir_src_is_const(*offset) && "no indirects");
234   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
235   imm_index += nir_src_as_uint(*offset) * 4;
236
237   return agx_ld_vary_to(b, agx_dest_index(&instr->dest),
238         agx_immediate(imm_index), components, true);
239}
240
241static agx_instr *
242agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
243{
244   nir_src *offset = nir_get_io_offset_src(instr);
245   assert(nir_src_is_const(*offset) && "todo: indirects");
246   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
247   imm_index += nir_intrinsic_component(instr);
248   imm_index += nir_src_as_uint(*offset);
249
250   /* nir_lower_io_to_scalar */
251   assert(nir_intrinsic_write_mask(instr) == 0x1);
252
253   return agx_st_vary(b,
254               agx_immediate(imm_index),
255               agx_src_index(&instr->src[0]));
256}
257
258static agx_instr *
259agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
260{
261   const nir_variable *var =
262      nir_find_variable_with_driver_location(b->shader->nir,
263            nir_var_shader_out, nir_intrinsic_base(instr));
264   assert(var);
265
266   unsigned loc = var->data.location;
267   assert(var->data.index == 0 && "todo: dual-source blending");
268   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
269   unsigned rt = (loc - FRAG_RESULT_DATA0);
270
271   /* TODO: Reverse-engineer interactions with MRT */
272   if (b->shader->nir->info.internal) {
273      /* clear */
274   } else if (b->shader->did_writeout) {
      agx_writeout(b, 0x0004);
   } else {
      agx_writeout(b, 0xC200);
      agx_writeout(b, 0x000C);
279   }
280
281   b->shader->did_writeout = true;
282   return agx_st_tile(b, agx_src_index(&instr->src[0]),
283             b->shader->key->fs.tib_formats[rt]);
284}
285
286static agx_instr *
287agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)
288{
289   const nir_variable *var =
290      nir_find_variable_with_driver_location(b->shader->nir,
291            nir_var_shader_out, nir_intrinsic_base(instr));
292   assert(var);
293
294   unsigned loc = var->data.location;
295   assert(var->data.index == 0 && "todo: dual-source blending");
296   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
297   unsigned rt = (loc - FRAG_RESULT_DATA0);
298
299   /* TODO: Reverse-engineer interactions with MRT */
300   agx_writeout(b, 0xC200);
301   agx_writeout(b, 0x0008);
302   b->shader->did_writeout = true;
303   b->shader->out->reads_tib = true;
304
305   return agx_ld_tile_to(b, agx_dest_index(&instr->dest),
306         b->shader->key->fs.tib_formats[rt]);
307}
308
309static enum agx_format
310agx_format_for_bits(unsigned bits)
311{
312   switch (bits) {
313   case 8: return AGX_FORMAT_I8;
314   case 16: return AGX_FORMAT_I16;
315   case 32: return AGX_FORMAT_I32;
316   default: unreachable("Invalid bit size for load/store");
317   }
318}
319
320static agx_instr *
321agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)
322{
323   bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
324   nir_src *offset = nir_get_io_offset_src(instr);
325
326   if (!kernel_input && !nir_src_is_const(instr->src[0]))
327      unreachable("todo: indirect UBO access");
328
329   /* Constant offsets for device_load are 16-bit */
330   bool offset_is_const = nir_src_is_const(*offset);
331   assert(offset_is_const && "todo: indirect UBO access");
332   int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;
333
334   /* Offsets are shifted by the type size, so divide that out */
335   unsigned bytes = nir_dest_bit_size(instr->dest) / 8;
336   assert((const_offset & (bytes - 1)) == 0);
337   const_offset = const_offset / bytes;
338   int16_t const_as_16 = const_offset;
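   /* If the scaled offset does not fit in 16 bits, fall back to a 32-bit mov
    * below rather than an immediate */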
339
   /* The UBO block index is in src[0] (kernel inputs always use block 0) */
341   uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);
342
343   /* Each UBO has a 64-bit = 4 x 16-bit address */
344   unsigned num_ubos = b->shader->nir->info.num_ubos;
345   unsigned base_length = (num_ubos * 4);
346   unsigned index = block * 4; /* 16 bit units */
347
348   /* Lookup the base address (TODO: indirection) */
349   agx_index base = agx_indexed_sysval(b->shader,
350                                       AGX_PUSH_UBO_BASES, AGX_SIZE_64,
351                                       index, base_length);
352
353   /* Load the data */
354   assert(instr->num_components <= 4);
355
356   agx_device_load_to(b, agx_dest_index(&instr->dest),
357                      base,
358                      (offset_is_const && (const_offset == const_as_16)) ?
359                      agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),
360                      agx_format_for_bits(nir_dest_bit_size(instr->dest)),
361                      BITFIELD_MASK(instr->num_components), 0);
362
363   return agx_wait(b, 0);
364}
365
366static agx_instr *
367agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)
368{
369   agx_index xy[2];
370
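   /* The integer thread position in the grid selects the pixel; convert to
    * float and add 0.5 to land on the pixel center */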
371   for (unsigned i = 0; i < 2; ++i) {
372      xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
373               agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
374               AGX_ROUND_RTE), agx_immediate_f(0.5f));
375   }
376
377   /* Ordering by the ABI */
378   agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);
379   agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);
380
381   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
382         xy[0], xy[1], z, w);
383}
384
385static agx_instr *
386agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
387{
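   /* Push offsets/lengths are in 16-bit units, so each 32-bit blend constant
    * channel occupies 2 units within the 8-unit (vec4) range */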
   agx_index val = agx_indexed_sysval(b->shader,
         AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);

   return agx_mov_to(b, dst, val);
392}
393
394static agx_instr *
395agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
396{
   agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
      agx_dest_index(&instr->dest) : agx_null();
   gl_shader_stage stage = b->shader->stage;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      /* handled later via load_vary */
      return NULL;
   case nir_intrinsic_load_interpolated_input:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_vary(b, instr);

   case nir_intrinsic_load_input:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_load_vary_flat(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_load_attr(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_store_output:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_fragment_out(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_store_vary(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_load_output:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_tile(b, instr);

   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_kernel_input:
      return agx_emit_load_ubo(b, instr);

   case nir_intrinsic_load_frag_coord:
      return agx_emit_load_frag_coord(b, instr);

   case nir_intrinsic_load_back_face_agx:
      return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);

   case nir_intrinsic_load_vertex_id:
      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));

   case nir_intrinsic_load_instance_id:
      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));

   case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
   case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
   case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
   case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);

   default:
      fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
      unreachable("Unhandled intrinsic");
   }
458}
459
460static agx_index
461agx_alu_src_index(agx_builder *b, nir_alu_src src)
462{
463   /* Check well-formedness of the input NIR */
464   ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
465   unsigned comps = nir_src_num_components(src.src);
466   unsigned channel = src.swizzle[0];
467
468   assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
469   assert(!(src.negate || src.abs));
470   assert(channel < comps);
471
472   agx_index idx = agx_src_index(&src.src);
473
474   /* We only deal with scalars, emit p_extract if needed */
475   if (comps > 1)
476      return agx_p_extract(b, idx, channel);
477   else
478      return idx;
479}
480
481static agx_instr *
482agx_emit_alu_bool(agx_builder *b, nir_op op,
483      agx_index dst, agx_index s0, agx_index s1, agx_index s2)
484{
485   /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
486    * This will give the optimizer flexibility. */
487   agx_index f = agx_immediate(0);
488   agx_index t = agx_immediate(0x1);
489
490   switch (op) {
491   case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
492   case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
493   case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
494   case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
495
496   case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
497   case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
498   case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
499   case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
500   case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
501   case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
502
503   case nir_op_mov: return agx_mov_to(b, dst, s0);
504   case nir_op_iand: return agx_and_to(b, dst, s0, s1);
505   case nir_op_ior: return agx_or_to(b, dst, s0, s1);
506   case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
507   case nir_op_inot: return agx_xor_to(b, dst, s0, t);
508
509   case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
510   case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
511   case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
512
513   case nir_op_bcsel:
514      return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
515
516   default:
517      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
518      unreachable("Unhandled boolean ALU instruction");
519   }
520}
521
522static agx_instr *
523agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
524{
525   unsigned srcs = nir_op_infos[instr->op].num_inputs;
526   unsigned sz = nir_dest_bit_size(instr->dest.dest);
527   unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
528   ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
529
530   assert(comps == 1 || nir_op_is_vec(instr->op));
531   assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
532
533   agx_index dst = agx_dest_index(&instr->dest.dest);
534   agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
535   agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
536   agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
537   agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
538
539   /* 1-bit bools are a bit special, only handle with select ops */
540   if (sz == 1)
541      return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
542
543#define UNOP(nop, aop) \
544   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
545#define BINOP(nop, aop) \
546   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
547#define TRIOP(nop, aop) \
548   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
549
550   switch (instr->op) {
551   BINOP(fadd, fadd);
552   BINOP(fmul, fmul);
553   TRIOP(ffma, fma);
554
555   UNOP(f2f16, fmov);
556   UNOP(f2f32, fmov);
557   UNOP(fround_even, roundeven);
558   UNOP(ftrunc, trunc);
559   UNOP(ffloor, floor);
560   UNOP(fceil, ceil);
561   UNOP(frcp, rcp);
562   UNOP(frsq, rsqrt);
563   UNOP(flog2, log2);
564   UNOP(fexp2, exp2);
565
566   UNOP(fddx, dfdx);
567   UNOP(fddx_coarse, dfdx);
568   UNOP(fddx_fine, dfdx);
569
570   UNOP(fddy, dfdy);
571   UNOP(fddy_coarse, dfdy);
572   UNOP(fddy_fine, dfdy);
573
574   UNOP(mov, mov);
575   UNOP(u2u16, mov);
576   UNOP(u2u32, mov);
577   UNOP(inot, not);
578   BINOP(iand, and);
579   BINOP(ior, or);
580   BINOP(ixor, xor);
581
582   case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
583   case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
584   case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
585   case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
586
587   case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
588   case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
589   case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
590   case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
591   case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
592   case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
593
594   case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
595   case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
596   case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
597   case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
598
599   case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
600   case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
601   case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
602
603   case nir_op_bcsel:
604      return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
605
606   case nir_op_b2i32:
607   case nir_op_b2i16:
608      return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
609
610   case nir_op_b2f16:
611   case nir_op_b2f32:
612   {
613      /* At this point, boolean is just zero/nonzero, so compare with zero */
614      agx_index one = (sz == 16) ?
615         agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
616         agx_mov_imm(b, 32, fui(1.0));
617
618      agx_index zero = agx_zero();
619
620      return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
621   }
622
623   case nir_op_i2i32:
624   {
625      if (s0.size != AGX_SIZE_16)
626         unreachable("todo: more conversions");
627
628      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
629   }
630
631   case nir_op_i2i16:
632   {
633      if (s0.size != AGX_SIZE_32)
634         unreachable("todo: more conversions");
635
636      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
637   }
638
639   case nir_op_iadd_sat:
640   {
641      agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
642      I->saturate = true;
643      return I;
644   }
645
646   case nir_op_isub_sat:
647   {
648      agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
649      I->saturate = true;
650      return I;
651   }
652
653   case nir_op_uadd_sat:
654   {
655      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
656      I->saturate = true;
657      return I;
658   }
659
660   case nir_op_usub_sat:
661   {
662      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
663      I->saturate = true;
664      return I;
665   }
666
667   case nir_op_fsat:
668   {
669      agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
670      I->saturate = true;
671      return I;
672   }
673
674   case nir_op_fsin_agx:
675   {
676      agx_index fixup = agx_sin_pt_1(b, s0);
677      agx_index sinc = agx_sin_pt_2(b, fixup);
678      return agx_fmul_to(b, dst, sinc, fixup);
679   }
680
681   case nir_op_f2i16:
682      return agx_convert_to(b, dst,
683            agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
684
685   case nir_op_f2i32:
686      return agx_convert_to(b, dst,
687            agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
688
689   case nir_op_f2u16:
690      return agx_convert_to(b, dst,
691            agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
692
693   case nir_op_f2u32:
694      return agx_convert_to(b, dst,
695            agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
696
697   case nir_op_u2f16:
698   case nir_op_u2f32:
699   {
700      if (src_sz == 64)
701         unreachable("64-bit conversions unimplemented");
702
703      enum agx_convert mode =
704         (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
705         (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
706                          AGX_CONVERT_U8_TO_F;
707
708      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
709   }
710
711   case nir_op_i2f16:
712   case nir_op_i2f32:
713   {
714      if (src_sz == 64)
715         unreachable("64-bit conversions unimplemented");
716
717      enum agx_convert mode =
718         (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
719         (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
720                          AGX_CONVERT_S8_TO_F;
721
722      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
723   }
724
725   case nir_op_vec2:
726   case nir_op_vec3:
727   case nir_op_vec4:
728      return agx_p_combine_to(b, dst, s0, s1, s2, s3);
729
730   case nir_op_vec8:
731   case nir_op_vec16:
732      unreachable("should've been lowered");
733
734   default:
735      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
736      unreachable("Unhandled ALU instruction");
737   }
738}
739
740static enum agx_dim
741agx_tex_dim(enum glsl_sampler_dim dim, bool array)
742{
743   switch (dim) {
744   case GLSL_SAMPLER_DIM_1D:
745   case GLSL_SAMPLER_DIM_BUF:
746      return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;
747
748   case GLSL_SAMPLER_DIM_2D:
749   case GLSL_SAMPLER_DIM_RECT:
750   case GLSL_SAMPLER_DIM_EXTERNAL:
751      return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;
752
753   case GLSL_SAMPLER_DIM_MS:
754      assert(!array && "multisampled arrays unsupported");
755      return AGX_DIM_TEX_2D_MS;
756
757   case GLSL_SAMPLER_DIM_3D:
758      assert(!array && "3D arrays unsupported");
759      return AGX_DIM_TEX_3D;
760
761   case GLSL_SAMPLER_DIM_CUBE:
762      return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;
763
764   default:
765      unreachable("Invalid sampler dim\n");
766   }
767}
768
769static void
770agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
771{
772   switch (instr->op) {
773   case nir_texop_tex:
774   case nir_texop_txl:
775      break;
776   default:
777      unreachable("Unhandled texture op");
778   }
779
780   enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?
781      AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
782
783   agx_index coords = agx_null(),
784             texture = agx_immediate(instr->texture_index),
785             sampler = agx_immediate(instr->sampler_index),
786             lod = agx_immediate(0),
787             offset = agx_null();
788
789   for (unsigned i = 0; i < instr->num_srcs; ++i) {
790      agx_index index = agx_src_index(&instr->src[i].src);
791
792      switch (instr->src[i].src_type) {
793      case nir_tex_src_coord:
794         coords = index;
795         break;
796
797      case nir_tex_src_lod:
798         lod = index;
799         break;
800
801      case nir_tex_src_bias:
802      case nir_tex_src_ms_index:
803      case nir_tex_src_offset:
804      case nir_tex_src_comparator:
805      case nir_tex_src_texture_offset:
806      case nir_tex_src_sampler_offset:
807      default:
808         unreachable("todo");
809      }
810   }
811
812   agx_texture_sample_to(b, agx_dest_index(&instr->dest),
813         coords, lod, texture, sampler, offset,
814         agx_tex_dim(instr->sampler_dim, instr->is_array),
815         lod_mode,
816         0xF, /* TODO: wrmask */
817         0);
818
819   agx_wait(b, 0);
820}
821
822/* NIR loops are treated as a pair of AGX loops:
823 *
824 *    do {
825 *       do {
826 *          ...
827 *       } while (0);
828 *    } while (cond);
829 *
 * By manipulating the nesting counter (r0l), we may break out of nested loops.
 * Under this model, both break and continue are implemented as breaks: break
 * exits the outer loop (2 layers) and continue exits the inner loop (1 layer).
834 *
835 * After manipulating the nesting counter directly, pop_exec #0 must be used to
836 * flush the update to the execution mask.
837 */
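
/* For example, directly inside a loop body (loop_nesting = 0), a continue
 * writes 1 to r0l, popping just the inner loop, while a break writes 2,
 * popping both. Each enclosing if within the loop adds one more layer to pop.
 */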
838
839static void
840agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
841{
842   agx_context *ctx = b->shader;
843   assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);
844
845   /* Break out of either one or two loops */
846   unsigned nestings = b->shader->loop_nesting;
847
848   if (instr->type == nir_jump_continue) {
849      nestings += 1;
850      agx_block_add_successor(ctx->current_block, ctx->continue_block);
851   } else if (instr->type == nir_jump_break) {
852      nestings += 2;
853      agx_block_add_successor(ctx->current_block, ctx->break_block);
854   }
855
856   /* Update the counter and flush */
   agx_index r0l = agx_register(0, AGX_SIZE_16);
858   agx_mov_to(b, r0l, agx_immediate(nestings));
859   agx_pop_exec(b, 0);
860
861   ctx->current_block->unconditional_jumps = true;
862}
863
864static void
865agx_emit_instr(agx_builder *b, struct nir_instr *instr)
866{
867   switch (instr->type) {
868   case nir_instr_type_load_const:
869      agx_emit_load_const(b, nir_instr_as_load_const(instr));
870      break;
871
872   case nir_instr_type_intrinsic:
873      agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
874      break;
875
876   case nir_instr_type_alu:
877      agx_emit_alu(b, nir_instr_as_alu(instr));
878      break;
879
880   case nir_instr_type_tex:
881      agx_emit_tex(b, nir_instr_as_tex(instr));
882      break;
883
884   case nir_instr_type_jump:
885      agx_emit_jump(b, nir_instr_as_jump(instr));
886      break;
887
888   default:
889      unreachable("should've been lowered");
890   }
891}
892
893static agx_block *
894agx_create_block(agx_context *ctx)
895{
896   agx_block *blk = rzalloc(ctx, agx_block);
897
898   blk->predecessors = _mesa_set_create(blk,
899         _mesa_hash_pointer, _mesa_key_pointer_equal);
900
901   return blk;
902}
903
904static agx_block *
905emit_block(agx_context *ctx, nir_block *block)
906{
907   if (ctx->after_block) {
908      ctx->current_block = ctx->after_block;
909      ctx->after_block = NULL;
910   } else {
911      ctx->current_block = agx_create_block(ctx);
912   }
913
914   agx_block *blk = ctx->current_block;
915   list_addtail(&blk->link, &ctx->blocks);
916   list_inithead(&blk->instructions);
917
918   agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
919
920   nir_foreach_instr(instr, block) {
921      agx_emit_instr(&_b, instr);
922   }
923
924   return blk;
925}
926
927static agx_block *
928emit_cf_list(agx_context *ctx, struct exec_list *list);
929
930/* Emit if-else as
931 *
932 *    if_icmp cond != 0
933 *       ...
934 *    else_icmp cond == 0
935 *       ...
936 *    pop_exec
937 *
938 * If the else is empty, we can omit the else_icmp. This is not usually
939 * optimal, but it's a start.
940 */
941
942static void
943emit_if(agx_context *ctx, nir_if *nif)
944{
945   nir_block *nir_else_block = nir_if_first_else_block(nif);
946   bool empty_else_block =
947      (nir_else_block == nir_if_last_else_block(nif) &&
948       exec_list_is_empty(&nir_else_block->instr_list));
949
950   agx_block *first_block = ctx->current_block;
951   agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
952   agx_index cond = agx_src_index(&nif->condition);
953
954   agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
955   ctx->loop_nesting++;
956
957   /* Emit the two subblocks. */
958   agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
959   agx_block *end_then = ctx->current_block;
960
961   if (!empty_else_block) {
962      _b.cursor = agx_after_block(ctx->current_block);
963      agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
964   }
965
966   agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
967   agx_block *end_else = ctx->current_block;
968
969   ctx->after_block = agx_create_block(ctx);
970
971   agx_block_add_successor(first_block, if_block);
972   agx_block_add_successor(first_block, else_block);
973   agx_block_add_successor(end_then, ctx->after_block);
974   agx_block_add_successor(end_else, ctx->after_block);
975
976   _b.cursor = agx_after_block(ctx->current_block);
977   agx_pop_exec(&_b, 1);
978   ctx->loop_nesting--;
979}
980
981static void
982emit_loop(agx_context *ctx, nir_loop *nloop)
983{
984   /* We only track nesting within the innermost loop, so reset */
985   ctx->loop_nesting = 0;
986
987   agx_block *popped_break = ctx->break_block;
988   agx_block *popped_continue = ctx->continue_block;
989
990   ctx->break_block = agx_create_block(ctx);
991   ctx->continue_block = agx_create_block(ctx);
992
993   /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
994   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
995   agx_push_exec(&_b, 2);
996
997   /* Fallthrough to body */
998   agx_block_add_successor(ctx->current_block, ctx->continue_block);
999
1000   /* Emit the body */
1001   ctx->after_block = ctx->continue_block;
1002   agx_block *start_block = emit_cf_list(ctx, &nloop->body);
1003
1004   /* Fix up the nesting counter via an always true while_icmp, and branch back
1005    * to start of loop if any lanes are active */
1006   _b.cursor = agx_after_block(ctx->current_block);
1007   agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
1008   agx_jmp_exec_any(&_b, start_block);
1009   agx_pop_exec(&_b, 2);
1010   agx_block_add_successor(ctx->current_block, ctx->continue_block);
1011
1012   /* Pop off */
1013   ctx->after_block = ctx->break_block;
1014   ctx->break_block = popped_break;
1015   ctx->continue_block = popped_continue;
1016
1017   /* Update shader-db stats */
1018   ++ctx->loop_count;
1019
1020   /* All nested control flow must have finished */
1021   assert(ctx->loop_nesting == 0);
1022}
1023
1024/* Before the first control flow structure, the nesting counter (r0l) needs to
1025 * be zeroed for correct operation. This only happens at most once, since by
1026 * definition this occurs at the end of the first block, which dominates the
1027 * rest of the program. */
1028
1029static void
1030emit_first_cf(agx_context *ctx)
1031{
1032   if (ctx->any_cf)
1033      return;
1034
1035   agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
   agx_index r0l = agx_register(0, AGX_SIZE_16);
1037
1038   agx_mov_to(&_b, r0l, agx_immediate(0));
1039   ctx->any_cf = true;
1040}
1041
1042static agx_block *
1043emit_cf_list(agx_context *ctx, struct exec_list *list)
1044{
1045   agx_block *start_block = NULL;
1046
1047   foreach_list_typed(nir_cf_node, node, node, list) {
1048      switch (node->type) {
1049      case nir_cf_node_block: {
1050         agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
1051
1052         if (!start_block)
1053            start_block = block;
1054
1055         break;
1056      }
1057
1058      case nir_cf_node_if:
1059         emit_first_cf(ctx);
1060         emit_if(ctx, nir_cf_node_as_if(node));
1061         break;
1062
1063      case nir_cf_node_loop:
1064         emit_first_cf(ctx);
1065         emit_loop(ctx, nir_cf_node_as_loop(node));
1066         break;
1067
1068      default:
1069         unreachable("Unknown control flow");
1070      }
1071   }
1072
1073   return start_block;
1074}
1075
1076static void
1077agx_set_st_vary_final(agx_context *ctx)
1078{
1079   agx_foreach_instr_global_rev(ctx, I) {
1080      if (I->op == AGX_OPCODE_ST_VARY) {
1081         I->last = true;
1082         return;
1083      }
1084   }
1085}
1086
1087static void
1088agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
1089{
   unsigned nr_ins = 0, nr_bytes = size, nr_threads = 1;

   /* TODO: count instructions and threads */
   fprintf(fp, "%s shader: %u inst, %u bytes, %u threads, %u loops, "
           "%u:%u spills:fills\n",
           ctx->nir->info.label ?: "",
           nr_ins, nr_bytes, nr_threads, ctx->loop_count,
           ctx->spills, ctx->fills);
1098}
1099
1100static int
1101glsl_type_size(const struct glsl_type *type, bool bindless)
1102{
1103   return glsl_count_attribute_slots(type, false);
1104}
1105
1106static bool
1107agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
1108{
1109   if (instr->type != nir_instr_type_alu)
1110      return false;
1111
1112   nir_alu_instr *alu = nir_instr_as_alu(instr);
1113   return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
1114}
1115
1116/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
1117 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
1118 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
1119 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
1120 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
1121 * need to change units from radians to quadrants modulo turns. Cosine is
1122 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
1123 */
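
/* For example, x = pi/2 radians gives turns = 0.25, quadrants = 1.0, and
 * fsin_agx(1.0) = sin(pi/2) = 1.0 as expected.
 */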
1124
1125static nir_ssa_def *
1126agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
1127{
1128   nir_alu_instr *alu = nir_instr_as_alu(instr);
1129   nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
1130   nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
1131
1132   if (alu->op == nir_op_fcos)
1133      turns = nir_fadd_imm(b, turns, 0.25f);
1134
1135   nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
1136   return nir_fsin_agx(b, quadrants);
1137}
1138
1139static bool
1140agx_lower_sincos(nir_shader *shader)
1141{
1142   return nir_shader_lower_instructions(shader,
1143         agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
1144}
1145
1146static bool
1147agx_lower_front_face(struct nir_builder *b,
1148                     nir_instr *instr, UNUSED void *data)
1149{
1150   if (instr->type != nir_instr_type_intrinsic)
1151      return false;
1152
1153   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1154   if (intr->intrinsic != nir_intrinsic_load_front_face)
1155      return false;
1156
1157   assert(intr->dest.is_ssa);
1158   nir_ssa_def *def = &intr->dest.ssa;
1159   assert(def->bit_size == 1);
1160
1161   b->cursor = nir_before_instr(&intr->instr);
1162   nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
1163   return true;
1164}
1165
1166static bool
1167agx_lower_point_coord(struct nir_builder *b,
1168                      nir_instr *instr, UNUSED void *data)
1169{
1170   if (instr->type != nir_instr_type_intrinsic)
1171      return false;
1172
1173   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1174
1175   if (intr->intrinsic != nir_intrinsic_load_deref)
1176      return false;
1177
1178   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
1179   nir_variable *var = nir_deref_instr_get_variable(deref);
1180
1181   if (var->data.mode != nir_var_shader_in)
1182      return false;
1183
1184   if (var->data.location != VARYING_SLOT_PNTC)
1185      return false;
1186
1187   assert(intr->dest.is_ssa);
1188   assert(intr->dest.ssa.num_components == 2);
1189
1190   b->cursor = nir_after_instr(&intr->instr);
1191   nir_ssa_def *def = nir_load_deref(b, deref);
1192   nir_ssa_def *y = nir_channel(b, def, 1);
1193   nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0);
1194   nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y);
1195   nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped);
1196   return true;
1197}
1198
1199static void
1200agx_optimize_nir(nir_shader *nir)
1201{
1202   bool progress;
1203
1204   nir_lower_idiv_options idiv_options = {
1205      .imprecise_32bit_lowering = true,
1206      .allow_fp16 = true,
1207   };
1208
1209   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
1210   NIR_PASS_V(nir, nir_lower_int64);
1211   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
1212   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1213   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1214   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
1215   NIR_PASS_V(nir, agx_lower_sincos);
1216   NIR_PASS_V(nir, nir_shader_instructions_pass,
1217         agx_lower_front_face,
1218         nir_metadata_block_index | nir_metadata_dominance, NULL);
1219
1220   do {
1221      progress = false;
1222
1223      NIR_PASS(progress, nir, nir_lower_var_copies);
1224      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
1225
1226      NIR_PASS(progress, nir, nir_copy_prop);
1227      NIR_PASS(progress, nir, nir_opt_remove_phis);
1228      NIR_PASS(progress, nir, nir_opt_dce);
1229      NIR_PASS(progress, nir, nir_opt_dead_cf);
1230      NIR_PASS(progress, nir, nir_opt_cse);
1231      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
1232      NIR_PASS(progress, nir, nir_opt_algebraic);
1233      NIR_PASS(progress, nir, nir_opt_constant_folding);
1234
1235      NIR_PASS(progress, nir, nir_opt_undef);
1236      NIR_PASS(progress, nir, nir_lower_undef_to_zero);
1237
1238      NIR_PASS(progress, nir, nir_opt_loop_unroll);
1239   } while (progress);
1240
1241   NIR_PASS_V(nir, nir_opt_algebraic_late);
1242   NIR_PASS_V(nir, nir_opt_constant_folding);
1243   NIR_PASS_V(nir, nir_copy_prop);
1244   NIR_PASS_V(nir, nir_opt_dce);
1245   NIR_PASS_V(nir, nir_opt_cse);
1246   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1247   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1248
1249   /* Cleanup optimizations */
1250   nir_move_options move_all =
1251      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
1252      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
1253
1254   NIR_PASS_V(nir, nir_opt_sink, move_all);
1255   NIR_PASS_V(nir, nir_opt_move, move_all);
1256   NIR_PASS_V(nir, nir_convert_from_ssa, true);
1257}
1258
1259/* ABI: position first, then user, then psiz */
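/* For example, a vertex shader writing gl_Position, one vec4 user varying, and
 * gl_PointSize gets base slots 0, 4, and 8 respectively, with nr_slots = 9. */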
1260static void
1261agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
1262                      unsigned *remap)
1263{
1264   unsigned base = 0;
1265
1266   nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
1267   if (pos) {
1268      assert(pos->data.driver_location < AGX_MAX_VARYINGS);
1269      remap[pos->data.driver_location] = base;
1270      base += 4;
1271   }
1272
1273   nir_foreach_shader_out_variable(var, nir) {
1274      unsigned loc = var->data.location;
1275
1276      if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
1277         continue;
1278      }
1279
1280      assert(var->data.driver_location < AGX_MAX_VARYINGS);
1281      remap[var->data.driver_location] = base;
1282      base += 4;
1283   }
1284
1285   nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
1286   if (psiz) {
1287      assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
1288      remap[psiz->data.driver_location] = base;
1289      base += 1;
1290   }
1291
1292   varyings->nr_slots = base;
1293}
1294
1295static void
1296agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
1297                      unsigned *remap)
1298{
1299   struct agx_varying_packed *packed = varyings->packed;
1300   unsigned base = 0;
1301
1302   agx_pack(packed, VARYING, cfg) {
1303      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
1304      cfg.components = 1;
1305      cfg.triangle_slot = cfg.point_slot = base;
1306   }
1307
1308   base++;
1309   packed++;
1310
1311   agx_pack(packed, VARYING, cfg) {
1312      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
1313      cfg.components = 1;
1314      cfg.triangle_slot = cfg.point_slot = base;
1315   }
1316
1317   base++;
1318   packed++;
1319
1320   unsigned comps[MAX_VARYING] = { 0 };
1321
1322   nir_foreach_shader_in_variable(var, nir) {
1323     unsigned loc = var->data.driver_location;
1324     const struct glsl_type *column =
1325        glsl_without_array_or_matrix(var->type);
1326     unsigned chan = glsl_get_components(column);
1327
     /* If we have a fractional location added, we need to increase the size
      * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
      * We could do better, but this is an edge case as it is; normally,
      * packed varyings will be aligned.
      */
1333     chan += var->data.location_frac;
1334     comps[loc] = MAX2(comps[loc], chan);
1335   }
1336
1337   nir_foreach_shader_in_variable(var, nir) {
1338     unsigned loc = var->data.driver_location;
     unsigned sz = glsl_count_attribute_slots(var->type, false);
     unsigned channels = comps[loc];

     assert(var->data.driver_location < AGX_MAX_VARYINGS);
1343     remap[var->data.driver_location] = base;
1344
1345     for (int c = 0; c < sz; ++c) {
1346        agx_pack(packed, VARYING, cfg) {
1347           cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
1348              AGX_VARYING_TYPE_POINT_COORDINATES :
1349              (var->data.interpolation == INTERP_MODE_FLAT) ?
1350                 AGX_VARYING_TYPE_FLAT_LAST :
1351                 AGX_VARYING_TYPE_SMOOTH;
1352
1353           cfg.components = channels;
1354           cfg.triangle_slot = cfg.point_slot = base;
1355        }
1356
1357        base += channels;
1358        packed++;
1359     }
1360   }
1361
1362   varyings->nr_descs = (packed - varyings->packed);
1363   varyings->nr_slots = base;
1364}
1365
1366void
1367agx_compile_shader_nir(nir_shader *nir,
1368      struct agx_shader_key *key,
1369      struct util_dynarray *binary,
1370      struct agx_shader_info *out)
1371{
1372   agx_debug = debug_get_option_agx_debug();
1373
1374   agx_context *ctx = rzalloc(NULL, agx_context);
1375   ctx->nir = nir;
1376   ctx->out = out;
1377   ctx->key = key;
1378   ctx->stage = nir->info.stage;
1379   list_inithead(&ctx->blocks);
1380
1381   if (ctx->stage == MESA_SHADER_VERTEX) {
1382      out->writes_psiz = nir->info.outputs_written &
1383         BITFIELD_BIT(VARYING_SLOT_PSIZ);
1384   }
1385
1386   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1387
1388   /* Lower large arrays to scratch and small arrays to csel */
1389   NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
1390         glsl_get_natural_size_align_bytes);
1391   NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
1392
1393   if (ctx->stage == MESA_SHADER_VERTEX) {
1394      /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
1395      if (!key->vs.clip_halfz)
1396         NIR_PASS_V(nir, nir_lower_clip_halfz);
1397   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1398      /* Flip point coordinate since OpenGL and Metal disagree */
1399      NIR_PASS_V(nir, nir_shader_instructions_pass,
1400            agx_lower_point_coord,
1401            nir_metadata_block_index | nir_metadata_dominance, NULL);
1402   }
1403
1404   NIR_PASS_V(nir, nir_split_var_copies);
1405   NIR_PASS_V(nir, nir_lower_global_vars_to_local);
1406   NIR_PASS_V(nir, nir_lower_var_copies);
1407   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1408   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1409         glsl_type_size, 0);
1410   if (ctx->stage == MESA_SHADER_FRAGMENT) {
1411      NIR_PASS_V(nir, nir_lower_mediump_io,
1412            nir_var_shader_in | nir_var_shader_out, ~0, false);
1413   }
1414   NIR_PASS_V(nir, nir_lower_ssbo);
1415
1416   /* Varying output is scalar, other I/O is vector */
1417   if (ctx->stage == MESA_SHADER_VERTEX) {
1418      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
1419   }
1420
1421   nir_lower_tex_options lower_tex_options = {
1422      .lower_txs_lod = true,
1423      .lower_txp = ~0,
1424   };
1425
1426   nir_tex_src_type_constraints tex_constraints = {
1427      [nir_tex_src_lod] = { true, 16 }
1428   };
1429
1430   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
1431   NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
1432
1433   agx_optimize_nir(nir);
1434
1435   /* Must be last since NIR passes can remap driver_location freely */
1436   if (ctx->stage == MESA_SHADER_VERTEX) {
1437      agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
1438   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1439      agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
1440   }
1441
1442   bool skip_internal = nir->info.internal;
1443   skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);
1444
1445   if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
1446      nir_print_shader(nir, stdout);
1447   }
1448
1449   nir_foreach_function(func, nir) {
1450      if (!func->impl)
1451         continue;
1452
1453      /* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing
1454       * the mapping of nir_register to hardware registers and guaranteeing bad
1455       * performance and breaking spilling... */
1456      ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc);
1457
1458      /* Leave the last 4 registers for hacky p-copy lowering */
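      /* (Register indices count 16-bit halves, so that is 8 units) */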
1459      unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2);
1460
1461      /* Assign backwards so we don't need to guess a size */
1462      nir_foreach_register(reg, &func->impl->registers) {
1463         /* Ensure alignment */
1464         if (reg->bit_size >= 32 && (nir_regalloc & 1))
1465            nir_regalloc--;
1466
1467         unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16);
1468         nir_regalloc -= size;
1469         ctx->nir_regalloc[reg->index] = nir_regalloc;
1470      }
1471
1472      ctx->max_register = nir_regalloc;
1473      ctx->alloc += func->impl->ssa_alloc;
1474      emit_cf_list(ctx, &func->impl->body);
1475      break; /* TODO: Multi-function shaders */
1476   }
1477
   /* TODO: Actual RA... this way passes don't need to deal with nir_register */
1479   agx_foreach_instr_global(ctx, I) {
1480      agx_foreach_dest(I, d) {
1481         if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) {
1482            I->dest[d].type = AGX_INDEX_REGISTER;
1483            I->dest[d].value = ctx->nir_regalloc[I->dest[d].value];
1484         }
1485      }
1486
1487      agx_foreach_src(I, s) {
1488         if (I->src[s].type == AGX_INDEX_NIR_REGISTER) {
1489            I->src[s].type = AGX_INDEX_REGISTER;
1490            I->src[s].value = ctx->nir_regalloc[I->src[s].value];
1491         }
1492      }
1493   }
1494
1495   /* Terminate the shader after the exit block */
1496   agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
1497   agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
1498   agx_stop(&_b);
1499
   /* Also add traps to match the blob; unsure what their function is */
1501   for (unsigned i = 0; i < 8; ++i)
1502      agx_trap(&_b);
1503
1504   unsigned block_source_count = 0;
1505
1506   /* Name blocks now that we're done emitting so the order is consistent */
1507   agx_foreach_block(ctx, block)
1508      block->name = block_source_count++;
1509
1510   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1511      agx_print_shader(ctx, stdout);
1512
1513   agx_optimizer(ctx);
1514   agx_dce(ctx);
1515
1516   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1517      agx_print_shader(ctx, stdout);
1518
1519   agx_ra(ctx);
1520
1521   if (ctx->stage == MESA_SHADER_VERTEX)
1522      agx_set_st_vary_final(ctx);
1523
1524   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1525      agx_print_shader(ctx, stdout);
1526
1527   agx_pack_binary(ctx, binary);
1528
1529   if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
1530      agx_print_stats(ctx, binary->size, stderr);
1531
1532   ralloc_free(ctx);
1533}
1534