1/*
2 * Copyright 2016 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#include "ac_exp_param.h"
26#include "ac_nir_to_llvm.h"
27#include "ac_rtld.h"
28#include "si_pipe.h"
29#include "si_shader_internal.h"
30#include "sid.h"
31#include "tgsi/tgsi_from_mesa.h"
32#include "util/u_memory.h"
33
/* Context passed to the LLVM diagnostic handler callback. */
struct si_llvm_diagnostics {
   struct pipe_debug_callback *debug; /* sink for pipe_debug_message() output */
   unsigned retval;                   /* set to 1 when LLVM reports an error */
};
38
39static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
40{
41   struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
42   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
43   const char *severity_str = NULL;
44
45   switch (severity) {
46   case LLVMDSError:
47      severity_str = "error";
48      break;
49   case LLVMDSWarning:
50      severity_str = "warning";
51      break;
52   case LLVMDSRemark:
53   case LLVMDSNote:
54   default:
55      return;
56   }
57
58   char *description = LLVMGetDiagInfoDescription(di);
59
60   pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
61                      description);
62
63   if (severity == LLVMDSError) {
64      diag->retval = 1;
65      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
66   }
67
68   LLVMDisposeMessage(description);
69}
70
/* Compile the LLVM module to an in-memory ELF binary and read the shader
 * config (register counts etc.) out of it via ac_rtld.
 *
 * Returns false if LLVM reported an error or the ELF could not be opened/read.
 */
bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
                     gl_shader_stage stage, const char *name, bool less_optimized)
{
   /* Global compilation counter; also passed to si_replace_shader() below. */
   unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

   if (si_can_dump_shader(sscreen, stage)) {
      fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

      if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
         fprintf(stderr, "%s LLVM IR:\n\n", name);
         ac_dump_module(ac->module);
         fprintf(stderr, "\n");
      }
   }

   if (sscreen->record_llvm_ir) {
      /* Keep a heap copy of the IR text for later debug output. */
      char *ir = LLVMPrintModuleToString(ac->module);
      binary->llvm_ir_string = strdup(ir);
      LLVMDisposeMessage(ir);
   }

   /* NOTE(review): si_replace_shader() presumably substitutes a pre-built
    * binary for shader "count" when requested; only compile when it didn't. */
   if (!si_replace_shader(count, binary)) {
      struct ac_compiler_passes *passes = compiler->passes;

      if (less_optimized && compiler->low_opt_passes)
         passes = compiler->low_opt_passes;

      /* diag.retval starts at 0; the handler sets it to 1 on error. */
      struct si_llvm_diagnostics diag = {debug};
      LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);

      if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer,
                                    &binary->elf_size))
         diag.retval = 1;

      if (diag.retval != 0) {
         pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
         return false;
      }
   }

   /* Parse the ELF to extract the shader config. */
   struct ac_rtld_binary rtld;
   if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
                               .info = &sscreen->info,
                               .shader_type = stage,
                               .wave_size = ac->wave_size,
                               .num_parts = 1,
                               .elf_ptrs = &binary->elf_buffer,
                               .elf_sizes = &binary->elf_size}))
      return false;

   bool ok = ac_rtld_read_config(&sscreen->info, &rtld, conf);
   ac_rtld_close(&rtld);
   return ok;
}
127
128void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
129                          struct ac_llvm_compiler *compiler, unsigned wave_size)
130{
131   memset(ctx, 0, sizeof(*ctx));
132   ctx->screen = sscreen;
133   ctx->compiler = compiler;
134
135   ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family,
136                        &sscreen->info, AC_FLOAT_MODE_DEFAULT_OPENGL, wave_size, 64);
137}
138
139void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
140                         unsigned num_return_elems, unsigned max_workgroup_size)
141{
142   LLVMTypeRef ret_type;
143   enum ac_llvm_calling_convention call_conv;
144
145   if (num_return_elems)
146      ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
147   else
148      ret_type = ctx->ac.voidt;
149
150   gl_shader_stage real_stage = ctx->stage;
151
152   /* LS is merged into HS (TCS), and ES is merged into GS. */
153   if (ctx->screen->info.chip_class >= GFX9) {
154      if (ctx->shader->key.as_ls)
155         real_stage = MESA_SHADER_TESS_CTRL;
156      else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
157         real_stage = MESA_SHADER_GEOMETRY;
158   }
159
160   switch (real_stage) {
161   case MESA_SHADER_VERTEX:
162   case MESA_SHADER_TESS_EVAL:
163      call_conv = AC_LLVM_AMDGPU_VS;
164      break;
165   case MESA_SHADER_TESS_CTRL:
166      call_conv = AC_LLVM_AMDGPU_HS;
167      break;
168   case MESA_SHADER_GEOMETRY:
169      call_conv = AC_LLVM_AMDGPU_GS;
170      break;
171   case MESA_SHADER_FRAGMENT:
172      call_conv = AC_LLVM_AMDGPU_PS;
173      break;
174   case MESA_SHADER_COMPUTE:
175      call_conv = AC_LLVM_AMDGPU_CS;
176      break;
177   default:
178      unreachable("Unhandle shader type");
179   }
180
181   /* Setup the function */
182   ctx->return_type = ret_type;
183   ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
184   ctx->return_value = LLVMGetUndef(ctx->return_type);
185
186   if (ctx->screen->info.address32_hi) {
187      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits",
188                                           ctx->screen->info.address32_hi);
189   }
190
191   ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
192   ac_llvm_set_target_features(ctx->main_fn, &ctx->ac);
193}
194
195void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
196{
197   struct si_shader *shader = ctx->shader;
198   LLVMTypeRef returns[AC_MAX_ARGS];
199   unsigned i;
200
201   si_init_shader_args(ctx, ngg_cull_shader);
202
203   for (i = 0; i < ctx->args.num_sgprs_returned; i++)
204      returns[i] = ctx->ac.i32; /* SGPR */
205   for (; i < ctx->args.return_count; i++)
206      returns[i] = ctx->ac.f32; /* VGPR */
207
208   si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns,
209                       ctx->args.return_count, si_get_max_workgroup_size(shader));
210
211   /* Reserve register locations for VGPR inputs the PS prolog may need. */
212   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
213      ac_llvm_add_target_dep_function_attr(
214         ctx->main_fn, "InitialPSInputAddr",
215         S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
216            S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
217            S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
218            S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
219   }
220
221
222   if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
223      if (USE_LDS_SYMBOLS) {
224         /* The LSHS size is not known until draw time, so we append it
225          * at the end of whatever LDS use there may be in the rest of
226          * the shader (currently none, unless LLVM decides to do its
227          * own LDS-based lowering).
228          */
229         ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
230                                                   "__lds_end", AC_ADDR_SPACE_LDS);
231         LLVMSetAlignment(ctx->ac.lds, 256);
232      } else {
233         ac_declare_lds_as_pointer(&ctx->ac);
234      }
235   }
236
237   /* Unlike radv, we override these arguments in the prolog, so to the
238    * API shader they appear as normal arguments.
239    */
240   if (ctx->stage == MESA_SHADER_VERTEX) {
241      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
242      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
243   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
244      ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
245      ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
246   }
247}
248
249void si_llvm_optimize_module(struct si_shader_context *ctx)
250{
251   /* Dump LLVM IR before any optimization passes */
252   if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->stage))
253      LLVMDumpModule(ctx->ac.module);
254
255   /* Run the pass */
256   LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
257   LLVMDisposeBuilder(ctx->ac.builder);
258}
259
/* Free all LLVM objects owned by the shader context. */
void si_llvm_dispose(struct si_shader_context *ctx)
{
   /* Dispose the module before the context it was created in. */
   LLVMDisposeModule(ctx->ac.module);
   LLVMContextDispose(ctx->ac.context);
   ac_llvm_context_dispose(&ctx->ac);
}
266
267/**
268 * Load a dword from a constant buffer.
269 */
270LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
271                                  LLVMValueRef offset)
272{
273   return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, ctx->ac.f32,
274                               0, true, true);
275}
276
277void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
278{
279   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
280      LLVMBuildRetVoid(ctx->ac.builder);
281   else
282      LLVMBuildRet(ctx->ac.builder, ret);
283}
284
285LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
286                                 struct ac_arg param, unsigned return_index)
287{
288   return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
289}
290
291LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
292                                       struct ac_arg param, unsigned return_index)
293{
294   LLVMBuilderRef builder = ctx->ac.builder;
295   LLVMValueRef p = ac_get_arg(&ctx->ac, param);
296
297   return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
298}
299
300LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
301                                 struct ac_arg param, unsigned return_index)
302{
303   LLVMBuilderRef builder = ctx->ac.builder;
304   LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
305   ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
306   return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
307}
308
309LLVMValueRef si_prolog_get_internal_bindings(struct si_shader_context *ctx)
310{
311   LLVMValueRef ptr[2], list;
312   bool merged_shader = si_is_merged_shader(ctx->shader);
313
314   ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_INTERNAL_BINDINGS);
315   list =
316      LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
317   return list;
318}
319
320void si_llvm_emit_barrier(struct si_shader_context *ctx)
321{
322   /* GFX6 only (thanks to a hw bug workaround):
323    * The real barrier instruction isn’t needed, because an entire patch
324    * always fits into a single wave.
325    */
326   if (ctx->screen->info.chip_class == GFX6 && ctx->stage == MESA_SHADER_TESS_CTRL) {
327      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
328      return;
329   }
330
331   ac_build_s_barrier(&ctx->ac);
332}
333
334/* Ensure that the esgs ring is declared.
335 *
336 * We declare it with 64KB alignment as a hint that the
337 * pointer value will always be 0.
338 */
339void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
340{
341   if (ctx->esgs_ring)
342      return;
343
344   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
345
346   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
347                                                "esgs_ring", AC_ADDR_SPACE_LDS);
348   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
349   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
350}
351
352static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
353                                    unsigned bitoffset)
354{
355   LLVMValueRef args[] = {
356      ac_get_arg(&ctx->ac, param),
357      LLVMConstInt(ctx->ac.i32, bitoffset, 0),
358   };
359   ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2,
360                      AC_FUNC_ATTR_CONVERGENT);
361}
362
363/**
364 * Get the value of a shader input parameter and extract a bitfield.
365 */
366static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
367                                      unsigned rshift, unsigned bitwidth)
368{
369   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
370      value = ac_to_integer(&ctx->ac, value);
371
372   if (rshift)
373      value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");
374
375   if (rshift + bitwidth < 32) {
376      unsigned mask = (1 << bitwidth) - 1;
377      value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
378   }
379
380   return value;
381}
382
383LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
384                             unsigned bitwidth)
385{
386   LLVMValueRef value = ac_get_arg(&ctx->ac, param);
387
388   return unpack_llvm_param(ctx, value, rshift, bitwidth);
389}
390
391LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle)
392{
393   if (swizzle > 0)
394      return ctx->ac.i32_0;
395
396   switch (ctx->stage) {
397   case MESA_SHADER_VERTEX:
398      return ac_get_arg(&ctx->ac, ctx->args.vs_prim_id);
399   case MESA_SHADER_TESS_CTRL:
400      return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
401   case MESA_SHADER_TESS_EVAL:
402      return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
403   case MESA_SHADER_GEOMETRY:
404      return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
405   default:
406      assert(0);
407      return ctx->ac.i32_0;
408   }
409}
410
411static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
412{
413   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
414
415   assert(ctx->shader->selector->info.base.workgroup_size_variable &&
416          ctx->shader->selector->info.uses_variable_block_size);
417
418   LLVMValueRef chan[3] = {
419      si_unpack_param(ctx, ctx->block_size, 0, 10),
420      si_unpack_param(ctx, ctx->block_size, 10, 10),
421      si_unpack_param(ctx, ctx->block_size, 20, 10),
422   };
423   return ac_build_gather_values(&ctx->ac, chan, 3);
424}
425
426static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
427{
428   struct si_shader_selector *sel = ctx->shader->selector;
429   unsigned lds_size = sel->info.base.shared_size;
430
431   LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
432   LLVMValueRef var;
433
434   assert(!ctx->ac.lds);
435
436   var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size),
437                                     "compute_lds", AC_ADDR_SPACE_LDS);
438   LLVMSetAlignment(var, 64 * 1024);
439
440   ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
441}
442
443static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
444{
445   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
446      unsigned colors_read = ctx->shader->selector->info.colors_read;
447      LLVMValueRef main_fn = ctx->main_fn;
448
449      LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);
450
451      unsigned offset = SI_PARAM_POS_FIXED_PT + 1;
452
453      if (colors_read & 0x0f) {
454         unsigned mask = colors_read & 0x0f;
455         LLVMValueRef values[4];
456         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
457         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
458         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
459         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
460         ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
461      }
462      if (colors_read & 0xf0) {
463         unsigned mask = (colors_read & 0xf0) >> 4;
464         LLVMValueRef values[4];
465         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
466         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
467         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
468         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
469         ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
470      }
471
472      ctx->abi.interp_at_sample_force_center =
473         ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;
474
475      ctx->abi.kill_ps_if_inf_interp =
476         ctx->screen->options.no_infinite_interp &&
477         (ctx->shader->selector->info.uses_persp_center ||
478          ctx->shader->selector->info.uses_persp_centroid ||
479          ctx->shader->selector->info.uses_persp_sample);
480
481   } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
482      if (nir->info.cs.user_data_components_amd) {
483         ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
484         ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
485                                                      nir->info.cs.user_data_components_amd);
486      }
487
488      if (ctx->shader->selector->info.base.shared_size)
489         si_llvm_declare_compute_memory(ctx);
490   }
491
492   ctx->abi.clamp_shadow_reference = true;
493   ctx->abi.robust_buffer_access = true;
494   ctx->abi.convert_undef_to_zero = true;
495   ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero;
496   ctx->abi.adjust_frag_coord_z = false;
497
498   const struct si_shader_info *info = &ctx->shader->selector->info;
499   for (unsigned i = 0; i < info->num_outputs; i++) {
500      LLVMTypeRef type = ctx->ac.f32;
501
502      /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */
503      if (nir->info.stage == MESA_SHADER_FRAGMENT &&
504          nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
505         type = ctx->ac.f16;
506
507      for (unsigned j = 0; j < 4; j++)
508         ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
509   }
510
511   ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
512
513   return true;
514}
515
516/**
517 * Given a list of shader part functions, build a wrapper function that
518 * runs them in sequence to form a monolithic shader.
519 */
520void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
521                               unsigned num_parts, unsigned main_part,
522                               unsigned next_shader_first_part, bool same_thread_count)
523{
524   LLVMBuilderRef builder = ctx->ac.builder;
525   /* PS epilog has one arg per color component; gfx9 merged shader
526    * prologs need to forward 40 SGPRs.
527    */
528   LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
529   LLVMTypeRef function_type;
530   unsigned num_first_params;
531   unsigned num_out, initial_num_out;
532   ASSERTED unsigned num_out_sgpr;         /* used in debug checks */
533   ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
534   unsigned num_sgprs, num_vgprs;
535   unsigned gprs;
536
537   memset(&ctx->args, 0, sizeof(ctx->args));
538
539   for (unsigned i = 0; i < num_parts; ++i) {
540      ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE);
541      LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
542   }
543
544   /* The parameters of the wrapper function correspond to those of the
545    * first part in terms of SGPRs and VGPRs, but we use the types of the
546    * main part to get the right types. This is relevant for the
547    * dereferenceable attribute on descriptor table pointers.
548    */
549   num_sgprs = 0;
550   num_vgprs = 0;
551
552   function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
553   num_first_params = LLVMCountParamTypes(function_type);
554
555   for (unsigned i = 0; i < num_first_params; ++i) {
556      LLVMValueRef param = LLVMGetParam(parts[0], i);
557
558      if (ac_is_sgpr_param(param)) {
559         assert(num_vgprs == 0);
560         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
561      } else {
562         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
563      }
564   }
565
566   gprs = 0;
567   while (gprs < num_sgprs + num_vgprs) {
568      LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
569      LLVMTypeRef type = LLVMTypeOf(param);
570      unsigned size = ac_get_type_size(type) / 4;
571
572      /* This is going to get casted anyways, so we don't have to
573       * have the exact same type. But we do have to preserve the
574       * pointer-ness so that LLVM knows about it.
575       */
576      enum ac_arg_type arg_type = AC_ARG_INT;
577      if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
578         type = LLVMGetElementType(type);
579
580         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
581            if (LLVMGetVectorSize(type) == 4)
582               arg_type = AC_ARG_CONST_DESC_PTR;
583            else if (LLVMGetVectorSize(type) == 8)
584               arg_type = AC_ARG_CONST_IMAGE_PTR;
585            else
586               assert(0);
587         } else if (type == ctx->ac.f32) {
588            arg_type = AC_ARG_CONST_FLOAT_PTR;
589         } else {
590            assert(0);
591         }
592      }
593
594      ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL);
595
596      assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
597      assert(gprs + size <= num_sgprs + num_vgprs &&
598             (gprs >= num_sgprs || gprs + size <= num_sgprs));
599
600      gprs += size;
601   }
602
603   /* Prepare the return type. */
604   unsigned num_returns = 0;
605   LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
606
607   last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
608   return_type = LLVMGetReturnType(last_func_type);
609
610   switch (LLVMGetTypeKind(return_type)) {
611   case LLVMStructTypeKind:
612      num_returns = LLVMCountStructElementTypes(return_type);
613      assert(num_returns <= ARRAY_SIZE(returns));
614      LLVMGetStructElementTypes(return_type, returns);
615      break;
616   case LLVMVoidTypeKind:
617      break;
618   default:
619      unreachable("unexpected type");
620   }
621
622   si_llvm_create_func(ctx, "wrapper", returns, num_returns,
623                       si_get_max_workgroup_size(ctx->shader));
624
625   if (si_is_merged_shader(ctx->shader) && !same_thread_count)
626      ac_init_exec_full_mask(&ctx->ac);
627
628   /* Record the arguments of the function as if they were an output of
629    * a previous part.
630    */
631   num_out = 0;
632   num_out_sgpr = 0;
633
634   for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
635      LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
636      LLVMTypeRef param_type = LLVMTypeOf(param);
637      LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
638      unsigned size = ac_get_type_size(param_type) / 4;
639
640      if (size == 1) {
641         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
642            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
643            param_type = ctx->ac.i32;
644         }
645
646         if (param_type != out_type)
647            param = LLVMBuildBitCast(builder, param, out_type, "");
648         out[num_out++] = param;
649      } else {
650         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
651
652         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
653            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
654            param_type = ctx->ac.i64;
655         }
656
657         if (param_type != vector_type)
658            param = LLVMBuildBitCast(builder, param, vector_type, "");
659
660         for (unsigned j = 0; j < size; ++j)
661            out[num_out++] =
662               LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
663      }
664
665      if (ctx->args.args[i].file == AC_ARG_SGPR)
666         num_out_sgpr = num_out;
667   }
668
669   memcpy(initial, out, sizeof(out));
670   initial_num_out = num_out;
671   initial_num_out_sgpr = num_out_sgpr;
672
673   /* Now chain the parts. */
674   LLVMValueRef ret = NULL;
675   for (unsigned part = 0; part < num_parts; ++part) {
676      LLVMValueRef in[AC_MAX_ARGS];
677      LLVMTypeRef ret_type;
678      unsigned out_idx = 0;
679      unsigned num_params = LLVMCountParams(parts[part]);
680
681      /* Merged shaders are executed conditionally depending
682       * on the number of enabled threads passed in the input SGPRs. */
683      if (si_is_multi_part_shader(ctx->shader) && part == 0) {
684         if (same_thread_count) {
685            struct ac_arg arg;
686            arg.arg_index = 3;
687            arg.used = true;
688
689            si_init_exec_from_input(ctx, arg, 0);
690         } else {
691            LLVMValueRef ena, count = initial[3];
692
693            count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
694            ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
695            ac_build_ifcc(&ctx->ac, ena, 6506);
696         }
697      }
698
699      /* Derive arguments for the next part from outputs of the
700       * previous one.
701       */
702      for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
703         LLVMValueRef param;
704         LLVMTypeRef param_type;
705         bool is_sgpr;
706         unsigned param_size;
707         LLVMValueRef arg = NULL;
708
709         param = LLVMGetParam(parts[part], param_idx);
710         param_type = LLVMTypeOf(param);
711         param_size = ac_get_type_size(param_type) / 4;
712         is_sgpr = ac_is_sgpr_param(param);
713
714         if (is_sgpr) {
715            ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG);
716         } else if (out_idx < num_out_sgpr) {
717            /* Skip returned SGPRs the current part doesn't
718             * declare on the input. */
719            out_idx = num_out_sgpr;
720         }
721
722         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
723
724         if (param_size == 1)
725            arg = out[out_idx];
726         else
727            arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
728
729         if (LLVMTypeOf(arg) != param_type) {
730            if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
731               if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) {
732                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
733                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
734               } else {
735                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
736                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
737               }
738            } else {
739               arg = LLVMBuildBitCast(builder, arg, param_type, "");
740            }
741         }
742
743         in[param_idx] = arg;
744         out_idx += param_size;
745      }
746
747      ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
748
749      if (!same_thread_count &&
750          si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
751         ac_build_endif(&ctx->ac, 6506);
752
753         /* The second half of the merged shader should use
754          * the inputs from the toplevel (wrapper) function,
755          * not the return value from the last call.
756          *
757          * That's because the last call was executed condi-
758          * tionally, so we can't consume it in the main
759          * block.
760          */
761         memcpy(out, initial, sizeof(initial));
762         num_out = initial_num_out;
763         num_out_sgpr = initial_num_out_sgpr;
764
765         /* Execute the second shader conditionally based on the number of
766          * enabled threads there.
767          */
768         if (ctx->stage == MESA_SHADER_TESS_CTRL) {
769            LLVMValueRef ena, count = initial[3];
770
771            count = LLVMBuildLShr(builder, count, LLVMConstInt(ctx->ac.i32, 8, 0), "");
772            count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
773            ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
774            ac_build_ifcc(&ctx->ac, ena, 6507);
775         }
776         continue;
777      }
778
779      /* Extract the returned GPRs. */
780      ret_type = LLVMTypeOf(ret);
781      num_out = 0;
782      num_out_sgpr = 0;
783
784      if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
785         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
786
787         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
788
789         for (unsigned i = 0; i < ret_size; ++i) {
790            LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, "");
791
792            assert(num_out < ARRAY_SIZE(out));
793            out[num_out++] = val;
794
795            if (LLVMTypeOf(val) == ctx->ac.i32) {
796               assert(num_out_sgpr + 1 == num_out);
797               num_out_sgpr = num_out;
798            }
799         }
800      }
801   }
802
803   /* Close the conditional wrapping the second shader. */
804   if (ctx->stage == MESA_SHADER_TESS_CTRL &&
805       !same_thread_count && si_is_multi_part_shader(ctx->shader))
806      ac_build_endif(&ctx->ac, 6507);
807
808   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
809      LLVMBuildRetVoid(builder);
810   else
811      LLVMBuildRet(builder, ret);
812}
813
/**
 * Translate a NIR shader into an LLVM IR main function.
 *
 * This sets up the per-stage ABI callbacks, creates the LLVM main function,
 * declares stage-specific LDS globals and allocas, emits the merged-shader
 * preamble (EXEC initialization, thread-enable conditional, barriers) for
 * GFX9+ merged shaders, and finally builds the shader body from NIR.
 *
 * \param ctx              shader context; ctx->main_fn receives the result
 * \param shader           the shader variant being compiled
 * \param nir              NIR shader to translate
 * \param free_nir         whether \p nir is freed by the caller afterwards
 * \param ngg_cull_shader  whether this builds the NGG culling shader part
 * \return true on success, false on an unsupported stage or NIR failure
 */
bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
{
   struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;

   ctx->shader = shader;
   ctx->stage = sel->info.stage;

   ctx->num_const_buffers = info->base.num_ubos;
   ctx->num_shader_buffers = info->base.num_ssbos;

   /* BITSET_LAST_BIT yields one past the highest used texture slot. */
   ctx->num_samplers = BITSET_LAST_BIT(info->base.textures_used);
   ctx->num_images = info->base.num_images;

   si_llvm_init_resource_callbacks(ctx);

   /* Install the stage-specific ABI callbacks used during NIR translation. */
   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_TESS_CTRL:
      si_llvm_init_tcs_callbacks(ctx);
      break;
   case MESA_SHADER_TESS_EVAL:
      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_GEOMETRY:
      si_llvm_init_gs_callbacks(ctx);
      break;
   case MESA_SHADER_FRAGMENT:
      si_llvm_init_ps_callbacks(ctx);
      break;
   case MESA_SHADER_COMPUTE:
      ctx->abi.load_local_group_size = si_llvm_get_block_size;
      break;
   default:
      assert(!"Unsupported shader type");
      return false;
   }

   si_llvm_create_main_func(ctx, ngg_cull_shader);

   /* Preload ring buffer descriptors where the stage needs them. */
   if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_esgs_ring(ctx);

   if (ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_gs_rings(ctx);
   else if (ctx->stage == MESA_SHADER_TESS_EVAL)
      si_llvm_preload_tes_rings(ctx);

   /* Scratch allocas for tess factors gathered into invocation 0. */
   if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
      for (unsigned i = 0; i < 6; i++) {
         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      }
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      /* Per-stream emitted-vertex counters (presumably one per GS vertex
       * stream — 4 streams). */
      for (unsigned i = 0; i < 4; i++) {
         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
      }
      if (shader->key.as_ngg) {
         for (unsigned i = 0; i < 4; ++i) {
            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
         }

         /* LDS scratch area shared between NGG GS waves. */
         assert(!ctx->gs_ngg_scratch);
         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

         /* Zero-length array with external linkage: the actual size is
          * resolved at link time. */
         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
      }
   }

   /* NGG VS/TES (not ES, not GS): declare LDS for streamout/compaction. */
   if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
      /* Unconditionally declare scratch space base for streamout and
       * vertex compaction. Whether space is actually allocated is
       * determined during linking / PM4 creation.
       */
      si_llvm_declare_esgs_ring(ctx);

      /* This is really only needed when streamout and / or vertex
       * compaction is enabled.
       */
      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
      }
   }

   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
   if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) {
      LLVMValueRef thread_enabled = NULL;

      /* TES is special because it has only 1 shader part if NGG shader culling is disabled,
       * and therefore it doesn't use the wrapper function.
       */
      bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es &&
                             !shader->key.opt.ngg_culling;

      /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
       * instead. For monolithic shaders, the wrapper function does this.
       */
      if ((!shader->is_monolithic || no_wrapper_func) &&
          (ctx->stage == MESA_SHADER_TESS_EVAL ||
           (ctx->stage == MESA_SHADER_VERTEX &&
            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader))))
         ac_init_exec_full_mask(&ctx->ac);

      /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
       * register usage.
       */
      if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
         /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
         if (ctx->screen->info.chip_class == GFX10)
            ac_build_s_barrier(&ctx->ac);

         gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);

         /* Build the primitive export at the beginning
          * of the shader if possible.
          */
         if (gfx10_ngg_export_prim_early(shader))
            gfx10_ngg_build_export_prim(ctx, NULL, NULL);
      }

      /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
      if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg)
         gfx10_ngg_gs_emit_prologue(ctx);

      if (ctx->stage == MESA_SHADER_GEOMETRY ||
          (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
         /* Wrap both shaders in an if statement according to the number of enabled threads
          * there. For monolithic TCS, the if statement is inserted by the wrapper function,
          * not here.
          */
         thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
      } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) ||
                 (shader->key.as_ngg && !shader->key.as_es)) {
         /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
          * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
          * the if statement is inserted by the wrapper function.
          */
         thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
      }

      if (thread_enabled) {
         /* Remember where the conditional was opened; the matching endif is
          * emitted later using the same label. */
         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
         ctx->merged_wrap_if_label = 11500;
         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
      }

      /* Execute a barrier before the second shader in
       * a merged shader.
       *
       * Execute the barrier inside the conditional block,
       * so that empty waves can jump directly to s_endpgm,
       * which will also signal the barrier.
       *
       * This is possible in gfx9, because an empty wave
       * for the second shader does not participate in
       * the epilogue. With NGG, empty waves may still
       * be required to export data (e.g. GS output vertices),
       * so we cannot let them exit early.
       *
       * If the shader is TCS and the TCS epilog is present
       * and contains a barrier, it will wait there and then
       * reach s_endpgm.
       */
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.opt.same_patch_vertices ||
             shader->selector->info.base.inputs_read &
             ~shader->selector->tcs_vgpr_only_inputs)
            ac_build_s_barrier(&ctx->ac);
      } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) {
         /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
         ac_build_s_barrier(&ctx->ac);
      }
   }

   /* Emit the actual shader body from NIR. */
   bool success = si_nir_build_llvm(ctx, nir);
   if (free_nir)
      ralloc_free(nir);
   if (!success) {
      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
      return false;
   }

   si_llvm_build_ret(ctx, ctx->return_value);
   return true;
}
1017
1018static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
1019                                    struct si_shader_selector *sel)
1020{
1021   if (!compiler->low_opt_passes)
1022      return false;
1023
1024   /* Assume a slow CPU. */
1025   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
1026
1027   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
1028    * buffer stores. */
1029   return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
1030}
1031
1032static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1033{
1034   struct si_shader *shader = ctx->shader;
1035   struct si_shader_info *info = &shader->selector->info;
1036   unsigned skip_vs_optim_mask = 0;
1037
1038   if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
1039       shader->key.as_ls || shader->key.as_es)
1040      return;
1041
1042   /* Optimizing these outputs is not possible, since they might be overriden
1043    * at runtime with S_028644_PT_SPRITE_TEX. */
1044   for (int i = 0; i < info->num_outputs; i++) {
1045      if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
1046          (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
1047           info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
1048         skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
1049      }
1050   }
1051
1052   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
1053                          info->num_outputs, skip_vs_optim_mask,
1054                          &shader->info.nr_param_exports);
1055}
1056
/**
 * Compile a shader variant all the way from NIR to machine code.
 *
 * Translates the main part (and, when NGG culling is enabled, the culling
 * part) to LLVM IR, and for monolithic shaders builds and links the extra
 * prolog/epilog/merged parts into one wrapper function. Then runs LLVM
 * optimizations and compiles the module to \c shader->binary.
 *
 * Note: each si_llvm_build_* / si_llvm_translate_nir call leaves its function
 * in ctx.main_fn, which is why the code snapshots ctx.main_fn into local
 * "parts" slots between calls.
 *
 * \return true on success; on failure the context is disposed and an error
 *         is printed to stderr.
 */
bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                            struct si_shader *shader, struct pipe_debug_callback *debug,
                            struct nir_shader *nir, bool free_nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_context ctx;

   si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));

   /* Build the NGG culling shader part first (without freeing the NIR,
    * which is translated again below for the main part). */
   LLVMValueRef ngg_cull_main_fn = NULL;
   if (shader->key.opt.ngg_culling) {
      if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
         si_llvm_dispose(&ctx);
         return false;
      }
      ngg_cull_main_fn = ctx.main_fn;
      ctx.main_fn = NULL;
   }

   /* Build the main shader part. */
   if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
      si_llvm_dispose(&ctx);
      return false;
   }

   if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
      /* Monolithic VS: link [prolog?] + [NGG cull part?] + [prolog?] + main. */
      LLVMValueRef parts[4];
      unsigned num_parts = 0;
      bool first_is_prolog = false;
      LLVMValueRef main_fn = ctx.main_fn;

      if (ngg_cull_main_fn) {
         /* Prolog variant feeding the culling part. */
         if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
            union si_shader_part_key prolog_key;
            si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
                                 &shader->key.part.vs.prolog, shader, &prolog_key);
            prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &prolog_key);
            parts[num_parts++] = ctx.main_fn;
            first_is_prolog = true;
         }
         parts[num_parts++] = ngg_cull_main_fn;
      }

      /* Prolog variant feeding the main part. */
      if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
         union si_shader_part_key prolog_key;
         si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
                              &shader->key.part.vs.prolog, shader, &prolog_key);
         prolog_key.vs_prolog.is_monolithic = true;
         si_llvm_build_vs_prolog(&ctx, &prolog_key);
         parts[num_parts++] = ctx.main_fn;
         if (num_parts == 1)
            first_is_prolog = true;
      }
      parts[num_parts++] = main_fn;

      si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
      /* Monolithic NGG-culling TES: link [cull part, VGPR-reload prolog, main]. */
      LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;

      /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
      union si_shader_part_key prolog_key;
      memset(&prolog_key, 0, sizeof(prolog_key));
      prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
      prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
      prolog_key.vs_prolog.as_ngg = 1;
      prolog_key.vs_prolog.load_vgprs_after_culling = 1;
      prolog_key.vs_prolog.is_monolithic = true;
      si_llvm_build_vs_prolog(&ctx, &prolog_key);
      prolog = ctx.main_fn;

      parts[0] = ngg_cull_main_fn;
      parts[1] = prolog;
      parts[2] = main_fn;

      si_build_wrapper_function(&ctx, parts, 3, 0, 0, false);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
      if (sscreen->info.chip_class >= GFX9) {
         /* GFX9+ merged LS-HS: link [LS prolog?, LS main, TCS main, TCS epilog]. */
         struct si_shader_selector *ls = shader->key.part.tcs.ls;
         LLVMValueRef parts[4];
         bool vs_needs_prolog =
            si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);

         /* TCS main part */
         parts[2] = ctx.main_fn;

         /* TCS epilog */
         union si_shader_part_key tcs_epilog_key;
         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
         parts[3] = ctx.main_fn;

         /* VS as LS main part */
         ctx.next_shader_sel = ctx.shader->selector;
         nir = si_get_nir_shader(ls, NULL, &free_nir);
         struct si_shader shader_ls = {};
         shader_ls.selector = ls;
         shader_ls.key.as_ls = 1;
         shader_ls.key.mono = shader->key.mono;
         shader_ls.key.opt = shader->key.opt;
         shader_ls.is_monolithic = true;

         if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= ls->info.uses_instanceid;
         parts[1] = ctx.main_fn;

         /* LS prolog */
         if (vs_needs_prolog) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
                                 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            parts[0] = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_TESS_CTRL;

         si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1,
                                   shader->key.opt.same_patch_vertices);
      } else {
         /* Pre-GFX9 TCS: just main + epilog. */
         LLVMValueRef parts[2];
         union si_shader_part_key epilog_key;

         parts[0] = ctx.main_fn;

         memset(&epilog_key, 0, sizeof(epilog_key));
         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &epilog_key);
         parts[1] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
      if (ctx.screen->info.chip_class >= GFX9) {
         /* GFX9+ merged ES-GS: link [ES prolog?, ES main, GS prolog, GS main]. */
         struct si_shader_selector *es = shader->key.part.gs.es;
         LLVMValueRef es_prolog = NULL;
         LLVMValueRef es_main = NULL;
         LLVMValueRef gs_prolog = NULL;
         LLVMValueRef gs_main = ctx.main_fn;

         /* GS prolog */
         union si_shader_part_key gs_prolog_key;
         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
         si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
         gs_prolog = ctx.main_fn;

         /* ES main part */
         nir = si_get_nir_shader(es, NULL, &free_nir);
         struct si_shader shader_es = {};
         shader_es.selector = es;
         shader_es.key.as_es = 1;
         shader_es.key.as_ngg = shader->key.as_ngg;
         shader_es.key.mono = shader->key.mono;
         shader_es.key.opt = shader->key.opt;
         shader_es.is_monolithic = true;

         if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= es->info.uses_instanceid;
         es_main = ctx.main_fn;

         /* ES prolog */
         if (es->info.stage == MESA_SHADER_VERTEX &&
             si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
                                 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            es_prolog = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_GEOMETRY;

         /* Prepare the array of shader parts. */
         LLVMValueRef parts[4];
         unsigned num_parts = 0, main_part, next_first_part;

         if (es_prolog)
            parts[num_parts++] = es_prolog;

         parts[main_part = num_parts++] = es_main;
         parts[next_first_part = num_parts++] = gs_prolog;
         parts[num_parts++] = gs_main;

         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false);
      } else {
         /* Pre-GFX9 GS: just prolog + main. */
         LLVMValueRef parts[2];
         union si_shader_part_key prolog_key;

         parts[1] = ctx.main_fn;

         memset(&prolog_key, 0, sizeof(prolog_key));
         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         si_llvm_build_gs_prolog(&ctx, &prolog_key);
         parts[0] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 1, 0, false);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
      si_llvm_build_monolithic_ps(&ctx, shader);
   }

   si_llvm_optimize_module(&ctx);

   /* Post-optimization transformations and analysis. */
   si_optimize_vs_outputs(&ctx);

   /* Scratch usage is only gathered when someone will report it. */
   if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
      ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
   }

   /* Make sure the input is a pointer and not integer followed by inttoptr. */
   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);

   /* Compile to bytecode. */
   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
                        ctx.stage, si_get_shader_name(shader),
                        si_should_optimize_less(compiler, shader->selector))) {
      si_llvm_dispose(&ctx);
      fprintf(stderr, "LLVM failed to compile shader\n");
      return false;
   }

   si_llvm_dispose(&ctx);
   return true;
}
1297