1/*
2 * Copyright 2010 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "pipe/p_defines.h"
24
25#include "compiler/nir/nir.h"
26
27#include "nv50/nv50_program.h"
28#include "nv50/nv50_context.h"
29
30#include "codegen/nv50_ir_driver.h"
31
/* Count the set bits in the low nibble of val; bits above bit 3 are ignored.
 * The result is in the range 0..4.
 */
static inline unsigned
bitcount4(const uint32_t val)
{
   unsigned count = 0;
   unsigned bit;

   for (bit = 0; bit < 4; ++bit)
      count += (val >> bit) & 1;

   return count;
}
39
40static int
41nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
42{
43   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
44   unsigned i, n, c;
45
46   n = 0;
47   for (i = 0; i < info->numInputs; ++i) {
48      prog->in[i].id = i;
49      prog->in[i].sn = info->in[i].sn;
50      prog->in[i].si = info->in[i].si;
51      prog->in[i].hw = n;
52      prog->in[i].mask = info->in[i].mask;
53
54      prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
55
56      for (c = 0; c < 4; ++c)
57         if (info->in[i].mask & (1 << c))
58            info->in[i].slot[c] = n++;
59
60      if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
61         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
62   }
63   prog->in_nr = info->numInputs;
64
65   for (i = 0; i < info->numSysVals; ++i) {
66      switch (info->sv[i].sn) {
67      case TGSI_SEMANTIC_INSTANCEID:
68         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
69         continue;
70      case TGSI_SEMANTIC_VERTEXID:
71         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
72         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
73         continue;
74      default:
75         break;
76      }
77   }
78
79   /*
80    * Corner case: VP has no inputs, but we will still need to submit data to
81    * draw it. HW will shout at us and won't draw anything if we don't enable
82    * any input, so let's just pretend it's the first one.
83    */
84   if (prog->vp.attrs[0] == 0 &&
85       prog->vp.attrs[1] == 0 &&
86       prog->vp.attrs[2] == 0)
87      prog->vp.attrs[0] |= 0xf;
88
89   /* VertexID before InstanceID */
90   if (info->io.vertexId < info->numSysVals)
91      info->sv[info->io.vertexId].slot[0] = n++;
92   if (info->io.instanceId < info->numSysVals)
93      info->sv[info->io.instanceId].slot[0] = n++;
94
95   n = 0;
96   for (i = 0; i < info->numOutputs; ++i) {
97      switch (info->out[i].sn) {
98      case TGSI_SEMANTIC_PSIZE:
99         prog->vp.psiz = i;
100         break;
101      case TGSI_SEMANTIC_CLIPDIST:
102         prog->vp.clpd[info->out[i].si] = n;
103         break;
104      case TGSI_SEMANTIC_EDGEFLAG:
105         prog->vp.edgeflag = i;
106         break;
107      case TGSI_SEMANTIC_BCOLOR:
108         prog->vp.bfc[info->out[i].si] = i;
109         break;
110      case TGSI_SEMANTIC_LAYER:
111         prog->gp.has_layer = true;
112         prog->gp.layerid = n;
113         break;
114      case TGSI_SEMANTIC_VIEWPORT_INDEX:
115         prog->gp.has_viewport = true;
116         prog->gp.viewportid = n;
117         break;
118      default:
119         break;
120      }
121      prog->out[i].id = i;
122      prog->out[i].sn = info->out[i].sn;
123      prog->out[i].si = info->out[i].si;
124      prog->out[i].hw = n;
125      prog->out[i].mask = info->out[i].mask;
126
127      for (c = 0; c < 4; ++c)
128         if (info->out[i].mask & (1 << c))
129            info->out[i].slot[c] = n++;
130   }
131   prog->out_nr = info->numOutputs;
132   prog->max_out = n;
133   if (!prog->max_out)
134      prog->max_out = 1;
135
136   if (prog->vp.psiz < info->numOutputs)
137      prog->vp.psiz = prog->out[prog->vp.psiz].hw;
138
139   return 0;
140}
141
142static int
143nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
144{
145   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
146   unsigned i, n, m, c;
147   unsigned nvary;
148   unsigned nflat;
149   unsigned nintp = 0;
150
151   /* count recorded non-flat inputs */
152   for (m = 0, i = 0; i < info->numInputs; ++i) {
153      switch (info->in[i].sn) {
154      case TGSI_SEMANTIC_POSITION:
155         continue;
156      default:
157         m += info->in[i].flat ? 0 : 1;
158         break;
159      }
160   }
161   /* careful: id may be != i in info->in[prog->in[i].id] */
162
163   /* Fill prog->in[] so that non-flat inputs are first and
164    * kick out special inputs that don't use the RESULT_MAP.
165    */
166   for (n = 0, i = 0; i < info->numInputs; ++i) {
167      if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
168         prog->fp.interp |= info->in[i].mask << 24;
169         for (c = 0; c < 4; ++c)
170            if (info->in[i].mask & (1 << c))
171               info->in[i].slot[c] = nintp++;
172      } else {
173         unsigned j = info->in[i].flat ? m++ : n++;
174
175         if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
176            prog->vp.bfc[info->in[i].si] = j;
177         else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
178            prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
179
180         prog->in[j].id = i;
181         prog->in[j].mask = info->in[i].mask;
182         prog->in[j].sn = info->in[i].sn;
183         prog->in[j].si = info->in[i].si;
184         prog->in[j].linear = info->in[i].linear;
185
186         prog->in_nr++;
187      }
188   }
189   if (!(prog->fp.interp & (8 << 24))) {
190      ++nintp;
191      prog->fp.interp |= 8 << 24;
192   }
193
194   for (i = 0; i < prog->in_nr; ++i) {
195      int j = prog->in[i].id;
196
197      prog->in[i].hw = nintp;
198      for (c = 0; c < 4; ++c)
199         if (prog->in[i].mask & (1 << c))
200            info->in[j].slot[c] = nintp++;
201   }
202   /* (n == m) if m never increased, i.e. no flat inputs */
203   nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
204   nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
205   nvary = nintp - nflat;
206
207   prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
208   prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
209
210   /* put front/back colors right after HPOS */
211   prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
212   for (i = 0; i < 2; ++i)
213      if (prog->vp.bfc[i] < 0xff)
214         prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
215
216   /* FP outputs */
217
218   if (info->prop.fp.numColourResults > 1)
219      prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
220
221   for (i = 0; i < info->numOutputs; ++i) {
222      prog->out[i].id = i;
223      prog->out[i].sn = info->out[i].sn;
224      prog->out[i].si = info->out[i].si;
225      prog->out[i].mask = info->out[i].mask;
226
227      if (i == info->io.fragDepth || i == info->io.sampleMask)
228         continue;
229      prog->out[i].hw = info->out[i].si * 4;
230
231      for (c = 0; c < 4; ++c)
232         info->out[i].slot[c] = prog->out[i].hw + c;
233
234      prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
235   }
236
237   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
238      info->out[info->io.sampleMask].slot[0] = prog->max_out++;
239      prog->fp.has_samplemask = 1;
240   }
241
242   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
243      info->out[info->io.fragDepth].slot[2] = prog->max_out++;
244
245   if (!prog->max_out)
246      prog->max_out = 4;
247
248   return 0;
249}
250
251static int
252nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
253{
254   switch (info->type) {
255   case PIPE_SHADER_VERTEX:
256      return nv50_vertprog_assign_slots(info);
257   case PIPE_SHADER_GEOMETRY:
258      return nv50_vertprog_assign_slots(info);
259   case PIPE_SHADER_FRAGMENT:
260      return nv50_fragprog_assign_slots(info);
261   case PIPE_SHADER_COMPUTE:
262      return 0;
263   default:
264      return -1;
265   }
266}
267
268static struct nv50_stream_output_state *
269nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
270                                  const struct pipe_stream_output_info *pso)
271{
272   struct nv50_stream_output_state *so;
273   unsigned b, i, c;
274   unsigned base[4];
275
276   so = MALLOC_STRUCT(nv50_stream_output_state);
277   if (!so)
278      return NULL;
279   memset(so->map, 0xff, sizeof(so->map));
280
281   for (b = 0; b < 4; ++b)
282      so->num_attribs[b] = 0;
283   for (i = 0; i < pso->num_outputs; ++i) {
284      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
285      b = pso->output[i].output_buffer;
286      assert(b < 4);
287      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
288   }
289
290   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
291
292   so->stride[0] = pso->stride[0] * 4;
293   base[0] = 0;
294   for (b = 1; b < 4; ++b) {
295      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
296      so->stride[b] = so->num_attribs[b] * 4;
297      if (so->num_attribs[b])
298         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
299      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
300   }
301   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
302      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
303      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
304   }
305
306   so->map_size = base[3] + so->num_attribs[3];
307
308   for (i = 0; i < pso->num_outputs; ++i) {
309      const unsigned s = pso->output[i].start_component;
310      const unsigned p = pso->output[i].dst_offset;
311      const unsigned r = pso->output[i].register_index;
312      b = pso->output[i].output_buffer;
313
314      if (r >= info->numOutputs)
315         continue;
316
317      for (c = 0; c < pso->output[i].num_components; ++c)
318         so->map[base[b] + p + c] = info->out[r].slot[s + c];
319   }
320
321   return so;
322}
323
324bool
325nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
326                       struct pipe_debug_callback *debug)
327{
328   struct nv50_ir_prog_info *info;
329   int i, ret;
330   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
331
332   info = CALLOC_STRUCT(nv50_ir_prog_info);
333   if (!info)
334      return false;
335
336   info->type = prog->type;
337   info->target = chipset;
338
339   info->bin.sourceRep = prog->pipe.type;
340   switch (prog->pipe.type) {
341   case PIPE_SHADER_IR_TGSI:
342      info->bin.source = (void *)prog->pipe.tokens;
343      break;
344   case PIPE_SHADER_IR_NIR:
345      info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);
346      break;
347   default:
348      assert(!"unsupported IR!");
349      free(info);
350      return false;
351   }
352
353   info->bin.smemSize = prog->cp.smem_size;
354   info->io.auxCBSlot = 15;
355   info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
356   info->io.genUserClip = prog->vp.clpd_nr;
357   if (prog->fp.alphatest)
358      info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
359
360   info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
361   info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
362   info->io.msInfoCBSlot = 15;
363   info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
364
365   info->assignSlots = nv50_program_assign_varying_slots;
366
367   prog->vp.bfc[0] = 0xff;
368   prog->vp.bfc[1] = 0xff;
369   prog->vp.edgeflag = 0xff;
370   prog->vp.clpd[0] = map_undef;
371   prog->vp.clpd[1] = map_undef;
372   prog->vp.psiz = map_undef;
373   prog->gp.has_layer = 0;
374   prog->gp.has_viewport = 0;
375
376   if (prog->type == PIPE_SHADER_COMPUTE)
377      info->prop.cp.inputOffset = 0x10;
378
379   info->driverPriv = prog;
380
381#ifdef DEBUG
382   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
383   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
384   info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
385#else
386   info->optLevel = 3;
387#endif
388
389   ret = nv50_ir_generate_code(info);
390   if (ret) {
391      NOUVEAU_ERR("shader translation failed: %i\n", ret);
392      goto out;
393   }
394
395   prog->code = info->bin.code;
396   prog->code_size = info->bin.codeSize;
397   prog->fixups = info->bin.relocData;
398   prog->interps = info->bin.fixupData;
399   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
400   prog->tls_space = info->bin.tlsSpace;
401   prog->cp.smem_size = info->bin.smemSize;
402   prog->mul_zero_wins = info->io.mul_zero_wins;
403   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
404
405   prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
406   prog->vp.cull_enable =
407      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
408   prog->vp.clip_mode = 0;
409   for (i = 0; i < info->io.cullDistances; ++i)
410      prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
411
412   if (prog->type == PIPE_SHADER_FRAGMENT) {
413      if (info->prop.fp.writesDepth) {
414         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
415         prog->fp.flags[1] = 0x11;
416      }
417      if (info->prop.fp.usesDiscard)
418         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
419   } else
420   if (prog->type == PIPE_SHADER_GEOMETRY) {
421      switch (info->prop.gp.outputPrim) {
422      case PIPE_PRIM_LINE_STRIP:
423         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
424         break;
425      case PIPE_PRIM_TRIANGLE_STRIP:
426         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
427         break;
428      case PIPE_PRIM_POINTS:
429      default:
430         assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
431         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
432         break;
433      }
434      prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024);
435   }
436
437   if (prog->type == PIPE_SHADER_COMPUTE) {
438      prog->cp.syms = info->bin.syms;
439      prog->cp.num_syms = info->bin.numSyms;
440   } else {
441      FREE(info->bin.syms);
442   }
443
444   if (prog->pipe.stream_output.num_outputs)
445      prog->so = nv50_program_create_strmout_state(info,
446                                                   &prog->pipe.stream_output);
447
448   pipe_debug_message(debug, SHADER_INFO,
449                      "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
450                      prog->type, info->bin.tlsSpace, info->bin.smemSize,
451                      prog->max_gpr, info->bin.instructions,
452                      info->bin.codeSize);
453
454out:
455   if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
456      ralloc_free((void *)info->bin.source);
457   FREE(info);
458   return !ret;
459}
460
461bool
462nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
463{
464   struct nouveau_heap *heap;
465   int ret;
466   uint32_t size = align(prog->code_size, 0x40);
467   uint8_t prog_type;
468
469   switch (prog->type) {
470   case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
471   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
472   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
473   case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
474   default:
475      assert(!"invalid program type");
476      return false;
477   }
478
479   ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
480   if (ret) {
481      /* Out of space: evict everything to compactify the code segment, hoping
482       * the working set is much smaller and drifts slowly. Improve me !
483       */
484      while (heap->next) {
485         struct nv50_program *evict = heap->next->priv;
486         if (evict)
487            nouveau_heap_free(&evict->mem);
488      }
489      debug_printf("WARNING: out of code space, evicting all shaders.\n");
490      ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
491      if (ret) {
492         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
493         return false;
494      }
495   }
496
497   if (prog->type == PIPE_SHADER_COMPUTE) {
498      /* CP code must be uploaded in FP code segment. */
499      prog_type = 1;
500   } else {
501      prog->code_base = prog->mem->start;
502      prog_type = prog->type;
503   }
504
505   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
506   if (ret < 0) {
507      nouveau_heap_free(&prog->mem);
508      return false;
509   }
510   if (ret > 0)
511      nv50->state.new_tls_space = true;
512
513   if (prog->fixups)
514      nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
515   if (prog->interps)
516      nv50_ir_apply_fixups(prog->interps, prog->code,
517                           prog->fp.force_persample_interp,
518                           false /* flatshade */,
519                           prog->fp.alphatest - 1);
520
521   nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
522                       (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
523                       NOUVEAU_BO_VRAM, prog->code_size, prog->code);
524
525   BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
526   PUSH_DATA (nv50->base.pushbuf, 0);
527
528   return true;
529}
530
531void
532nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
533{
534   const struct pipe_shader_state pipe = p->pipe;
535   const ubyte type = p->type;
536
537   if (p->mem)
538      nouveau_heap_free(&p->mem);
539
540   FREE(p->code);
541
542   FREE(p->fixups);
543   FREE(p->interps);
544   FREE(p->so);
545
546   if (type == PIPE_SHADER_COMPUTE)
547      FREE(p->cp.syms);
548
549   memset(p, 0, sizeof(*p));
550
551   p->pipe = pipe;
552   p->type = type;
553}
554