1/*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Jonathan Marek <jonathan@marek.ca>
25 */
26
27#include "ir2_private.h"
28
29#include "fd2_program.h"
30#include "freedreno_util.h"
31
/* NIR compiler options for the a2xx backend: request lowering of every
 * operation this handwritten backend does not implement natively.
 */
static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .force_indirect_unrolling = nir_var_all,
};
54
/* Return the NIR compiler options shared by all a2xx shader compiles. */
const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}
60
/* Run a NIR pass and evaluate to true iff the pass reported progress. */
#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
/* Run a NIR pass when the progress result is not needed. */
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
68
/* Run the standard set of NIR optimization passes in a loop until none of
 * them makes further progress.
 */
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}
103
/* the trig workarounds are the same as ir3's, but we don't want to include
 * ir3 here, so just declare the pass ourselves
 */
105bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
106
/* Lower and optimize a NIR shader for ir2 consumption.
 *
 * Returns 0 on success, or -1 when the shader can't be handled (currently:
 * a fragment shader that writes FRAG_RESULT_DEPTH).
 */
int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
   };

   /* dump the incoming NIR when disasm debugging is enabled */
   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   /* "lower" is false when the caller has already done these lowerings */
   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO we dont want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}
146
147static struct ir2_src
148load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
149{
150   struct fd2_shader_stateobj *so = ctx->so;
151   unsigned imm_ncomp, swiz, idx, i, j;
152   uint32_t *value = (uint32_t *)value_f;
153
154   /* try to merge with existing immediate (TODO: try with neg) */
155   for (idx = 0; idx < so->num_immediates; idx++) {
156      swiz = 0;
157      imm_ncomp = so->immediates[idx].ncomp;
158      for (i = 0; i < ncomp; i++) {
159         for (j = 0; j < imm_ncomp; j++) {
160            if (value[i] == so->immediates[idx].val[j])
161               break;
162         }
163         if (j == imm_ncomp) {
164            if (j == 4)
165               break;
166            so->immediates[idx].val[imm_ncomp++] = value[i];
167         }
168         swiz |= swiz_set(j, i);
169      }
170      /* matched all components */
171      if (i == ncomp)
172         break;
173   }
174
175   /* need to allocate new immediate */
176   if (idx == so->num_immediates) {
177      swiz = 0;
178      imm_ncomp = 0;
179      for (i = 0; i < ncomp; i++) {
180         for (j = 0; j < imm_ncomp; j++) {
181            if (value[i] == ctx->so->immediates[idx].val[j])
182               break;
183         }
184         if (j == imm_ncomp) {
185            so->immediates[idx].val[imm_ncomp++] = value[i];
186         }
187         swiz |= swiz_set(j, i);
188      }
189      so->num_immediates++;
190   }
191   so->immediates[idx].ncomp = imm_ncomp;
192
193   if (ncomp == 1)
194      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
195
196   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
197}
198
199struct ir2_src
200ir2_zero(struct ir2_context *ctx)
201{
202   return load_const(ctx, (float[]){0.0f}, 1);
203}
204
/* Update the live-range bookkeeping of a register on each use: records the
 * loop depth where it was first seen and the block index after which it can
 * be freed.
 */
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   /* first use: remember the loop depth it was created at */
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      /* used from a deeper loop than it was created in: keep it alive until
       * the end of that inner loop
       */
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX dont do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}
226
/* Build an ir2_src from a nir_src: constant values become immediates,
 * register sources reference ctx->reg, SSA sources are resolved through
 * ctx->ssa_map.  Also updates the live range of the referenced reg.
 */
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      /* materialize the constant as an immediate */
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      /* SSA def must already have been emitted and mapped */
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}
256
/* Attach a nir_dest to an instruction: SSA dests are recorded in ssa_map,
 * register dests convert the instruction into a reg write.
 */
static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      /* must not already have been converted to a reg write */
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}
273
274static struct ir2_instr *
275ir2_instr_create(struct ir2_context *ctx, int type)
276{
277   struct ir2_instr *instr;
278
279   instr = &ctx->instr[ctx->instr_count++];
280   instr->idx = ctx->instr_count - 1;
281   instr->type = type;
282   instr->block_idx = ctx->block_idx;
283   instr->pred = ctx->pred;
284   instr->is_ssa = true;
285   return instr;
286}
287
/* Create an ALU instruction for a NIR opcode (or the pseudo-op
 * ir2_op_cube), selecting the a2xx scalar/vector opcodes from a lookup
 * table.  An entry of -1 means "no opcode of that kind"; at least one of
 * the two must be valid.
 */
static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
   /* no fsat, fneg, fabs since source mods deal with those */

   /* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   /* opcode must map to at least a scalar or a vector instruction */
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}
346
347static struct ir2_instr *
348instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
349                     struct ir2_instr *share_reg)
350{
351   struct ir2_instr *instr;
352   struct ir2_reg *reg;
353
354   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
355   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
356
357   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
358   instr->alu.write_mask = write_mask;
359   instr->reg = reg;
360   instr->is_ssa = false;
361   return instr;
362}
363
364static struct ir2_instr *
365instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
366{
367   struct ir2_instr *instr;
368   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
369   set_index(ctx, dst, instr);
370   return instr;
371}
372
373static struct ir2_instr *
374ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
375                       instr_fetch_opc_t opc)
376{
377   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
378   instr->fetch.opc = opc;
379   instr->src_count = 1;
380   instr->ssa.ncomp = nir_dest_num_components(*dst);
381   set_index(ctx, dst, instr);
382   return instr;
383}
384
385static struct ir2_src
386make_src_noconst(struct ir2_context *ctx, nir_src src)
387{
388   struct ir2_instr *instr;
389
390   if (nir_src_as_const_value(src)) {
391      assert(src.is_ssa);
392      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
393      instr->src[0] = make_src(ctx, src);
394      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
395   }
396
397   return make_src(ctx, src);
398}
399
/* Emit an IR2 ALU instruction for a NIR ALU instruction: builds the srcs
 * (compressing swizzles against the writemask), then applies per-opcode
 * fixups for NIR ops that don't map 1:1 to a2xx opcodes.
 */
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) { /* note: deliberately shadows outer i */
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      /* implemented as a mov (MAX) with the negate source modifier */
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      /* implemented as a mov (MAX) with the abs source modifier */
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      /* implemented as a mov (MAX) with the saturate dest modifier */
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      /* slt(a,b) = (b > a): swap srcs for SETGTv */
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      /* swap the selected sources to match CNDEv's operand order */
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      /* implemented as ADD with the second source negated */
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      /* DOT2ADDv adds a third source to the result; pass zero */
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}
488
/* Load shader input "idx" into dst.  Vertex shader inputs become vertex
 * fetch instructions; fragment shader inputs are read from input registers,
 * with special assembly for gl_FragCoord.
 */
static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      /* fetch state lives in consts starting at 20, three fetches packed
       * per const index (hence the /3 and %3 split)
       */
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      /* z comes straight from the fragcoord varying */
      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      /* w = 1 / fragcoord.y-component (w is inverted, see above) */
      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      /* regular varying: simple mov from the input register */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}
541
542static unsigned
543output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
544{
545   int slot = -1;
546   unsigned idx = nir_intrinsic_base(intr);
547   nir_foreach_shader_out_variable (var, ctx->nir) {
548      if (var->data.driver_location == idx) {
549         slot = var->data.location;
550         break;
551      }
552   }
553   assert(slot != -1);
554   return slot;
555}
556
/* Emit the export for a store_output.  VS position/psize get fixed export
 * indices 62/63; other VS outputs are exported at the index of the matching
 * fragment shader input.  For the FS, only color output is implemented.
 */
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         /* remember the position src for extra_position_exports() */
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         /* varying not consumed by the fragment shader: drop the store */
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}
591
/* Emit IR2 code for a NIR intrinsic instruction. */
static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      /* note: the constant offset arrives as a float value here */
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      /* KILLNEs kills when src != 0; unconditional discard uses
       * KILLEs with a zero source (0 == 0 always kills)
       */
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      /* front-facing = (1/param.x >= 0) */
      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}
656
/* Emit a texture fetch, including the coordinate math required for cube
 * map sampling and the extra lod/bias source when present.
 */
static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   /* collect the sources we understand */
   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      /* CUBEv with the swizzled coord pair (writes xyzw, mask 15) */
      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      /* 1 / |tmp.z| */
      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      /* tmp.xy = tmp.xy * (1/|tmp.z|) + 1.5 */
      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   /* cube path consumes tmp.xyw (see comment above) */
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}
755
/* Record a fragment shader input variable in the input table; vertex
 * shader inputs need no setup here (they are handled in load_input).
 */
static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   /* arrays of inputs are not supported */
   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}
788
789static void
790emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
791{
792   /* TODO we don't want to emit anything for undefs */
793
794   struct ir2_instr *instr;
795
796   instr = instr_create_alu_dest(
797      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
798   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
799}
800
/* Dispatch one NIR instruction to the matching emit function. */
static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      /* only record the jump; emit_block emits the CF instruction */
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}
830
/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   /* nothing to do unless fragcoord is consumed or we are binning */
   if (ctx->f->fragcoord < 0 && !binning)
      return;

   /* clamp w to be non-negative before taking its reciprocal */
   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   /* rcp = 1 / max(position.w, 0) */
   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   /* sc = position * (1/w): the perspective divide */
   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   /* wincoord = C66 * sc + C65 (presumably the viewport scale/offset
    * consts set up by the driver -- confirm against fd2_program)
    */
   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      /* z from the window-space coordinate */
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      /* w from the raw clip-space position (write_mask 2 = .y lane) */
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}
892
893static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
894
/* Emit all instructions of a NIR block, then emit a CF jump to the
 * successor when one is needed.  Returns true when a jump was emitted.
 */
static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   /* no successor, or successor is the entry block: nothing to jump to */
   if (!succs || !succs->index)
      return false;

   /* we want to be smart and always jump and have the backend cleanup
    * but we are not, so there are two cases where jump is needed:
    *  loops (succs index lower)
    *  jumps (jump instruction seen in block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}
924
/* Emit an if using the hardware predicate: the condition is evaluated with
 * PRED_SETNEs (or PRED_SETNE_PUSHv when nested), the then-list is emitted
 * under that predicate, PRED_SET_INVs flips it for the else-list, and for
 * nested ifs PRED_SET_POPs restores the outer predicate afterwards.
 */
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   /* save the enclosing predicate state so it can be restored at the end */
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use same register for condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0; /* the predicate-setting instr itself is unpredicated */

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if these is no else branch we don't need this
    * and if the else branch is simple, can just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}
993
/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   /* only the last CF node of the body needs to be examined: block
    * indices increase in source order, so the highest index is in the
    * trailing node
    */
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      /* recurse: a trailing nested loop's last block is our last block */
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      /* NOTE(review): "ctx" is not a parameter of this function; this can
       * only compile if compile_error() is a macro that discards its first
       * argument — verify against ir2_private.h before touching this
       */
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}
1015
1016static void
1017emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1018{
1019   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1020   emit_cf_list(ctx, &nloop->body);
1021   ctx->loop_depth--;
1022}
1023
1024static bool
1025emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1026{
1027   bool ret = false;
1028   foreach_list_typed (nir_cf_node, node, node, list) {
1029      ret = false;
1030      switch (node->type) {
1031      case nir_cf_node_block:
1032         ret = emit_block(ctx, nir_cf_node_as_block(node));
1033         break;
1034      case nir_cf_node_if:
1035         emit_if(ctx, nir_cf_node_as_if(node));
1036         break;
1037      case nir_cf_node_loop:
1038         emit_loop(ctx, nir_cf_node_as_loop(node));
1039         break;
1040      case nir_cf_node_function:
1041         compile_error(ctx, "Not supported\n");
1042         break;
1043      }
1044   }
1045   return ret;
1046}
1047
1048static void
1049cleanup_binning(struct ir2_context *ctx)
1050{
1051   assert(ctx->so->type == MESA_SHADER_VERTEX);
1052
1053   /* kill non-position outputs for binning variant */
1054   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
1055      nir_foreach_instr_safe (instr, block) {
1056         if (instr->type != nir_instr_type_intrinsic)
1057            continue;
1058
1059         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1060         if (intr->intrinsic != nir_intrinsic_store_output)
1061            continue;
1062
1063         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1064            nir_instr_remove(instr);
1065      }
1066   }
1067
1068   ir2_optimize_nir(ctx->nir, false);
1069}
1070
1071static bool
1072ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
1073{
1074   if (instr->type != nir_instr_type_alu)
1075      return false;
1076
1077   nir_alu_instr *alu = nir_instr_as_alu(instr);
1078   switch (alu->op) {
1079   case nir_op_frsq:
1080   case nir_op_frcp:
1081   case nir_op_flog2:
1082   case nir_op_fexp2:
1083   case nir_op_fsqrt:
1084   case nir_op_fcos:
1085   case nir_op_fsin:
1086      return true;
1087   default:
1088      break;
1089   }
1090
1091   return false;
1092}
1093
/* Compile the shader's NIR into ir2 instructions. Clones the NIR so the
 * state object's copy stays untouched, runs the backend lowering/opt
 * pipeline, sets up inputs/registers, then emits the CF list.
 * NOTE: the pass order below is deliberate — presumably later passes
 * depend on the lowering done by earlier ones; do not reorder casually.
 */
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   /* 0xff marks every ssa_map entry as unassigned */
   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   /* clone so the binning variant's edits don't affect so->nir */
   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   /* a2xx is float-only: lower integer and boolean ops to float */
   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   /* iterate algebraic opts to a fixed point */
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   /* scalarize only the ops the filter callback selects */
   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   /* leave SSA: the emit code below consumes registers + remaining SSA */
   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not
       * (see the !so->need_param cleanup at the end of this function)
       */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      /* vertex shader: inputs 0 and 2 are reserved here; their exact
       * meaning isn't visible in this file — see setup in fd2_program
       */
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   /* mirror NIR register info into the ir2 register array */
   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   /* block indices are used by the jump/loop logic during emission */
   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   /* the cloned NIR is no longer needed once emission is done */
   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}
1184