1/*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Rob Clark <robclark@freedesktop.org>
25 */
26
27#include "pipe/p_screen.h"
28#include "pipe/p_state.h"
29#include "tgsi/tgsi_dump.h"
30#include "tgsi/tgsi_parse.h"
31#include "util/format/u_format.h"
32#include "util/u_inlines.h"
33#include "util/u_memory.h"
34#include "util/u_string.h"
35
36#include "nir/tgsi_to_nir.h"
37
38#include "freedreno_context.h"
39#include "freedreno_util.h"
40
41#include "ir3/ir3_cache.h"
42#include "ir3/ir3_compiler.h"
43#include "ir3/ir3_gallium.h"
44#include "ir3/ir3_nir.h"
45#include "ir3/ir3_shader.h"
46
47/**
48 * The hardware cso for shader state
49 *
50 * Initially just a container for the ir3_shader, but this is where we'll
51 * plumb in async compile.
52 */
53struct ir3_shader_state {
54   struct ir3_shader *shader;
55
56   /* Fence signalled when async compile is completed: */
57   struct util_queue_fence ready;
58};
59
60/**
61 * Should initial variants be compiled synchronously?
62 *
63 * The only case where pipe_debug_message() is used in the initial-variants
64 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
65 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
66 * compile the initial shader variant asynchronously.
67 */
68static bool
69initial_variants_synchronous(struct fd_context *ctx)
70{
71   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
72          FD_DBG(SERIALC);
73}
74
75static void
76dump_shader_info(struct ir3_shader_variant *v,
77                 struct pipe_debug_callback *debug)
78{
79   if (!FD_DBG(SHADERDB))
80      return;
81
82   pipe_debug_message(
83      debug, SHADER_INFO,
84      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
85      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
86      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
87      "%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
88      "%d loops\n",
89      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
90      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
91      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
92      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
93      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
94      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
95      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
96      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
97      v->info.stp_count, v->info.ldp_count, v->info.sstall,
98      v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
99}
100
101static void
102upload_shader_variant(struct ir3_shader_variant *v)
103{
104   struct shader_info *info = &v->shader->nir->info;
105   struct ir3_compiler *compiler = v->shader->compiler;
106
107   assert(!v->bo);
108
109   v->bo =
110      fd_bo_new(compiler->dev, v->info.size, 0,
111                "%s:%s", ir3_shader_stage(v), info->name);
112
113   /* Always include shaders in kernel crash dumps. */
114   fd_bo_mark_for_dump(v->bo);
115
116   memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
117}
118
119struct ir3_shader_variant *
120ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
121                   bool binning_pass, struct pipe_debug_callback *debug)
122{
123   struct ir3_shader_variant *v;
124   bool created = false;
125
126   /* Some shader key values may not be used by a given ir3_shader (for
127    * example, fragment shader saturates in the vertex shader), so clean out
128    * those flags to avoid recompiling.
129    */
130   ir3_key_clear_unused(&key, shader);
131
132   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);
133
134   if (created) {
135      if (shader->initial_variants_done) {
136         perf_debug_message(debug, SHADER_INFO,
137                            "%s shader: recompiling at draw time: global "
138                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
139                            ir3_shader_stage(v), key.global, key.vsamples,
140                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
141      }
142
143      dump_shader_info(v, debug);
144      upload_shader_variant(v);
145
146      if (v->binning) {
147         upload_shader_variant(v->binning);
148         dump_shader_info(v->binning, debug);
149      }
150   }
151
152   return v;
153}
154
155static void
156copy_stream_out(struct ir3_stream_output_info *i,
157                const struct pipe_stream_output_info *p)
158{
159   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
160   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));
161
162   i->num_outputs = p->num_outputs;
163   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
164      i->stride[n] = p->stride[n];
165
166   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
167      i->output[n].register_index = p->output[n].register_index;
168      i->output[n].start_component = p->output[n].start_component;
169      i->output[n].num_components = p->output[n].num_components;
170      i->output[n].output_buffer = p->output[n].output_buffer;
171      i->output[n].dst_offset = p->output[n].dst_offset;
172      i->output[n].stream = p->output[n].stream;
173   }
174}
175
176static void
177create_initial_variants(struct ir3_shader_state *hwcso,
178                        struct pipe_debug_callback *debug)
179{
180   struct ir3_shader *shader = hwcso->shader;
181   struct ir3_compiler *compiler = shader->compiler;
182   nir_shader *nir = shader->nir;
183
184   /* Compile standard variants immediately to try to avoid draw-time stalls
185    * to run the compiler.
186    */
187   struct ir3_shader_key key = {
188      .tessellation = IR3_TESS_NONE,
189      .ucp_enables = MASK(nir->info.clip_distance_array_size),
190      .msaa = true,
191   };
192
193   switch (nir->info.stage) {
194   case MESA_SHADER_TESS_EVAL:
195      key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);
196      break;
197
198   case MESA_SHADER_TESS_CTRL:
199      /* The primitive_mode field, while it exists for TCS, is not
200       * populated (since separable shaders between TCS/TES are legal,
201       * so TCS wouldn't have access to TES's declaration).  Make a
202       * guess so that we shader-db something plausible for TCS.
203       */
204      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
205         key.tessellation = IR3_TESS_TRIANGLES;
206      else
207         key.tessellation = IR3_TESS_ISOLINES;
208      break;
209
210   case MESA_SHADER_GEOMETRY:
211      key.has_gs = true;
212      break;
213
214   default:
215      break;
216   }
217
218   key.safe_constlen = false;
219   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
220   if (!v)
221      return;
222
223   if (v->constlen > compiler->max_const_safe) {
224      key.safe_constlen = true;
225      ir3_shader_variant(shader, key, false, debug);
226   }
227
228   /* For vertex shaders, also compile initial binning pass shader: */
229   if (nir->info.stage == MESA_SHADER_VERTEX) {
230      key.safe_constlen = false;
231      v = ir3_shader_variant(shader, key, true, debug);
232      if (!v)
233         return;
234
235      if (v->constlen > compiler->max_const_safe) {
236         key.safe_constlen = true;
237         ir3_shader_variant(shader, key, true, debug);
238      }
239   }
240
241   shader->initial_variants_done = true;
242}
243
244static void
245create_initial_variants_async(void *job, void *gdata, int thread_index)
246{
247   struct ir3_shader_state *hwcso = job;
248   struct pipe_debug_callback debug = {};
249
250   create_initial_variants(hwcso, &debug);
251}
252
253static void
254create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
255{
256   struct ir3_shader_state *hwcso = job;
257   struct ir3_shader *shader = hwcso->shader;
258   struct pipe_debug_callback debug = {};
259   static struct ir3_shader_key key; /* static is implicitly zeroed */
260
261   ir3_shader_variant(shader, key, false, &debug);
262   shader->initial_variants_done = true;
263}
264
265/* a bit annoying that compute-shader and normal shader state objects
266 * aren't a bit more aligned.
267 */
268void *
269ir3_shader_compute_state_create(struct pipe_context *pctx,
270                                const struct pipe_compute_state *cso)
271{
272   struct fd_context *ctx = fd_context(pctx);
273
274   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
275    * This isn't a perfect test because I guess it is possible (but
276    * uncommon) for none for the kernel parameters to be a global,
277    * but ctx->set_global_bindings() can't fail, so this is the next
278    * best place to fail if we need a newer version of kernel driver:
279    */
280   if ((cso->req_input_mem > 0) &&
281       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
282      return NULL;
283   }
284
285   struct ir3_compiler *compiler = ctx->screen->compiler;
286   nir_shader *nir;
287
288   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
289      /* we take ownership of the reference: */
290      nir = (nir_shader *)cso->prog;
291   } else {
292      debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
293      if (ir3_shader_debug & IR3_DBG_DISASM) {
294         tgsi_dump(cso->prog, 0);
295      }
296      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
297   }
298
299   struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
300   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
301
302   util_queue_fence_init(&hwcso->ready);
303   hwcso->shader = shader;
304
305   /* Immediately compile a standard variant.  We have so few variants in our
306    * shaders, that doing so almost eliminates draw-time recompiles.  (This
307    * is also how we get data from shader-db's ./run)
308    */
309
310   if (initial_variants_synchronous(ctx)) {
311      static struct ir3_shader_key key; /* static is implicitly zeroed */
312      ir3_shader_variant(shader, key, false, &ctx->debug);
313      shader->initial_variants_done = true;
314   } else {
315      struct fd_screen *screen = ctx->screen;
316      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
317                         create_initial_compute_variants_async, NULL, 0);
318   }
319
320   return hwcso;
321}
322
323void *
324ir3_shader_state_create(struct pipe_context *pctx,
325                        const struct pipe_shader_state *cso)
326{
327   struct fd_context *ctx = fd_context(pctx);
328   struct ir3_compiler *compiler = ctx->screen->compiler;
329   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
330
331   /*
332    * Convert to nir (if necessary):
333    */
334
335   nir_shader *nir;
336   if (cso->type == PIPE_SHADER_IR_NIR) {
337      /* we take ownership of the reference: */
338      nir = cso->ir.nir;
339   } else {
340      debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
341      if (ir3_shader_debug & IR3_DBG_DISASM) {
342         tgsi_dump(cso->tokens, 0);
343      }
344      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
345   }
346
347   /*
348    * Create ir3_shader:
349    *
350    * This part is cheap, it doesn't compile initial variants
351    */
352
353   struct ir3_stream_output_info stream_output = {};
354   copy_stream_out(&stream_output, &cso->stream_output);
355
356   hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);
357
358   /*
359    * Create initial variants to avoid draw-time stalls.  This is
360    * normally done asynchronously, unless debug is enabled (which
361    * will be the case for shader-db)
362    */
363
364   util_queue_fence_init(&hwcso->ready);
365
366   if (initial_variants_synchronous(ctx)) {
367      create_initial_variants(hwcso, &ctx->debug);
368   } else {
369      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
370                         create_initial_variants_async, NULL, 0);
371   }
372
373   return hwcso;
374}
375
376void
377ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
378{
379   struct fd_context *ctx = fd_context(pctx);
380   struct fd_screen *screen = ctx->screen;
381   struct ir3_shader_state *hwcso = _hwcso;
382   struct ir3_shader *so = hwcso->shader;
383
384   ir3_cache_invalidate(ctx->shader_cache, hwcso);
385
386   /* util_queue_drop_job() guarantees that either:
387    *  1) job did not execute
388    *  2) job completed
389    *
390    * In either case the fence is signaled
391    */
392   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);
393
394   /* free the uploaded shaders, since this is handled outside of the
395    * shared ir3 code (ie. not used by turnip):
396    */
397   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
398      fd_bo_del(v->bo);
399      v->bo = NULL;
400
401      if (v->binning && v->binning->bo) {
402         fd_bo_del(v->binning->bo);
403         v->binning->bo = NULL;
404      }
405   }
406
407   ir3_shader_destroy(so);
408   util_queue_fence_destroy(&hwcso->ready);
409   free(hwcso);
410}
411
412struct ir3_shader *
413ir3_get_shader(struct ir3_shader_state *hwcso)
414{
415   if (!hwcso)
416      return NULL;
417
418   struct ir3_shader *shader = hwcso->shader;
419   perf_time (1000, "waited for %s:%s:%s variants",
420              _mesa_shader_stage_to_abbrev(shader->type),
421              shader->nir->info.name,
422              shader->nir->info.label) {
423      /* wait for initial variants to compile: */
424      util_queue_fence_wait(&hwcso->ready);
425   }
426
427   return shader;
428}
429
430struct shader_info *
431ir3_get_shader_info(struct ir3_shader_state *hwcso)
432{
433   if (!hwcso)
434      return NULL;
435   return &hwcso->shader->nir->info;
436}
437
438/* fixup dirty shader state in case some "unrelated" (from the state-
439 * tracker's perspective) state change causes us to switch to a
440 * different variant.
441 */
442void
443ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
444{
445   struct fd_context *ctx = fd_context(pctx);
446
447   if (!ir3_shader_key_equal(ctx->last.key, key)) {
448      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
449         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
450                                 FD_DIRTY_SHADER_PROG);
451      }
452
453      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
454         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
455      }
456
457      /* NOTE: currently only a6xx has gs/tess, but needs no
458       * gs/tess specific lowering.
459       */
460
461      *ctx->last.key = *key;
462   }
463}
464
465static char *
466ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
467{
468   struct fd_screen *screen = fd_screen(pscreen);
469
470   ir3_nir_lower_io_to_temporaries(nir);
471   ir3_finalize_nir(screen->compiler, nir);
472
473   return NULL;
474}
475
476static void
477ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
478                                    unsigned max_threads)
479{
480   struct fd_screen *screen = fd_screen(pscreen);
481
482   /* This function doesn't allow a greater number of threads than
483    * the queue had at its creation.
484    */
485   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
486}
487
488static bool
489ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
490                                            void *shader,
491                                            enum pipe_shader_type shader_type)
492{
493   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;
494
495   return util_queue_fence_is_signalled(&hwcso->ready);
496}
497
498void
499ir3_prog_init(struct pipe_context *pctx)
500{
501   pctx->create_vs_state = ir3_shader_state_create;
502   pctx->delete_vs_state = ir3_shader_state_delete;
503
504   pctx->create_tcs_state = ir3_shader_state_create;
505   pctx->delete_tcs_state = ir3_shader_state_delete;
506
507   pctx->create_tes_state = ir3_shader_state_create;
508   pctx->delete_tes_state = ir3_shader_state_delete;
509
510   pctx->create_gs_state = ir3_shader_state_create;
511   pctx->delete_gs_state = ir3_shader_state_delete;
512
513   pctx->create_fs_state = ir3_shader_state_create;
514   pctx->delete_fs_state = ir3_shader_state_delete;
515}
516
517void
518ir3_screen_init(struct pipe_screen *pscreen)
519{
520   struct fd_screen *screen = fd_screen(pscreen);
521
522   screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id, false);
523
524   /* TODO do we want to limit things to # of fast cores, or just limit
525    * based on total # of both big and little cores.  The little cores
526    * tend to be in-order and probably much slower for compiling than
527    * big cores.  OTOH if they are sitting idle, maybe it is useful to
528    * use them?
529    */
530   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
531
532   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
533                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
534                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);
535
536   pscreen->finalize_nir = ir3_screen_finalize_nir;
537   pscreen->set_max_shader_compiler_threads =
538      ir3_set_max_shader_compiler_threads;
539   pscreen->is_parallel_shader_compilation_finished =
540      ir3_is_parallel_shader_compilation_finished;
541}
542
543void
544ir3_screen_fini(struct pipe_screen *pscreen)
545{
546   struct fd_screen *screen = fd_screen(pscreen);
547
548   util_queue_destroy(&screen->compile_queue);
549   ir3_compiler_destroy(screen->compiler);
550   screen->compiler = NULL;
551}
552
553void
554ir3_update_max_tf_vtx(struct fd_context *ctx,
555                      const struct ir3_shader_variant *v)
556{
557   struct fd_streamout_stateobj *so = &ctx->streamout;
558   struct ir3_stream_output_info *info = &v->shader->stream_output;
559   uint32_t maxvtxcnt = 0x7fffffff;
560
561   if (v->shader->stream_output.num_outputs == 0)
562      ctx->streamout.max_tf_vtx = 0;
563   if (so->num_targets == 0)
564      ctx->streamout.max_tf_vtx = 0;
565
566   /* offset to write to is:
567    *
568    *   total_vtxcnt = vtxcnt + offsets[i]
569    *   offset = total_vtxcnt * stride[i]
570    *
571    *   offset =   vtxcnt * stride[i]       ; calculated in shader
572    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
573    *
574    * assuming for each vtx, each target buffer will have data written
575    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
576    *
577    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
578    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
579    *
580    * but shader is actually doing a less-than (rather than less-than-
581    * equal) check, so we can drop the -stride[i].
582    *
583    * TODO is assumption about `offset + stride[i]` legit?
584    */
585   for (unsigned i = 0; i < so->num_targets; i++) {
586      struct pipe_stream_output_target *target = so->targets[i];
587      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
588      if (target) {
589         uint32_t max = target->buffer_size / stride;
590         maxvtxcnt = MIN2(maxvtxcnt, max);
591      }
592   }
593
594   ctx->streamout.max_tf_vtx = maxvtxcnt;
595}
596