1/*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28#include "nir/nir.h"
29#include "radv_debug.h"
30#include "radv_llvm_helper.h"
31#include "radv_private.h"
32#include "radv_shader.h"
33#include "radv_shader_args.h"
34
35#include "ac_binary.h"
36#include "ac_exp_param.h"
37#include "ac_llvm_build.h"
38#include "ac_nir_to_llvm.h"
39#include "ac_shader_abi.h"
40#include "ac_shader_util.h"
41#include "sid.h"
42
43struct radv_shader_context {
44   struct ac_llvm_context ac;
45   const struct nir_shader *shader;
46   struct ac_shader_abi abi;
47   const struct radv_shader_args *args;
48
49   gl_shader_stage stage;
50
51   unsigned max_workgroup_size;
52   LLVMContextRef context;
53   LLVMValueRef main_function;
54
55   LLVMValueRef descriptor_sets[MAX_SETS];
56
57   LLVMValueRef ring_offsets;
58
59   LLVMValueRef vs_rel_patch_id;
60
61   LLVMValueRef gs_wave_id;
62   LLVMValueRef gs_vtx_offset[6];
63
64   LLVMValueRef esgs_ring;
65   LLVMValueRef gsvs_ring[4];
66   LLVMValueRef hs_ring_tess_offchip;
67   LLVMValueRef hs_ring_tess_factor;
68
69   uint64_t output_mask;
70
71   LLVMValueRef gs_next_vertex[4];
72   LLVMValueRef gs_curprim_verts[4];
73   LLVMValueRef gs_generated_prims[4];
74   LLVMValueRef gs_ngg_emit;
75   LLVMValueRef gs_ngg_scratch;
76
77   LLVMValueRef vertexptr; /* GFX10 only */
78};
79
80struct radv_shader_output_values {
81   LLVMValueRef values[4];
82   unsigned slot_name;
83   unsigned slot_index;
84   unsigned usage_mask;
85};
86
87static inline struct radv_shader_context *
88radv_shader_context_from_abi(struct ac_shader_abi *abi)
89{
90   return container_of(abi, struct radv_shader_context, abi);
91}
92
93static LLVMValueRef
94create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
95                     const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
96                     unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
97{
98   LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
99
100   if (options->address32_hi) {
101      ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
102                                           options->address32_hi);
103   }
104
105   ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
106   ac_llvm_set_target_features(main_function, ctx);
107
108   return main_function;
109}
110
111static void
112load_descriptor_sets(struct radv_shader_context *ctx)
113{
114   struct radv_userdata_locations *user_sgprs_locs = &ctx->args->shader_info->user_sgprs_locs;
115   uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
116
117   if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
118      LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
119      while (mask) {
120         int i = u_bit_scan(&mask);
121
122         ctx->descriptor_sets[i] =
123            ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
124         LLVMSetAlignment(ctx->descriptor_sets[i], 4);
125      }
126   } else {
127      while (mask) {
128         int i = u_bit_scan(&mask);
129
130         ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
131      }
132   }
133}
134
135static enum ac_llvm_calling_convention
136get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
137{
138   switch (stage) {
139   case MESA_SHADER_VERTEX:
140   case MESA_SHADER_TESS_EVAL:
141      return AC_LLVM_AMDGPU_VS;
142      break;
143   case MESA_SHADER_GEOMETRY:
144      return AC_LLVM_AMDGPU_GS;
145      break;
146   case MESA_SHADER_TESS_CTRL:
147      return AC_LLVM_AMDGPU_HS;
148      break;
149   case MESA_SHADER_FRAGMENT:
150      return AC_LLVM_AMDGPU_PS;
151      break;
152   case MESA_SHADER_COMPUTE:
153      return AC_LLVM_AMDGPU_CS;
154      break;
155   default:
156      unreachable("Unhandle shader type");
157   }
158}
159
160/* Returns whether the stage is a stage that can be directly before the GS */
161static bool
162is_pre_gs_stage(gl_shader_stage stage)
163{
164   return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
165}
166
167static void
168create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
169{
170   if (ctx->ac.chip_class >= GFX10) {
171      if (is_pre_gs_stage(stage) && ctx->args->shader_info->is_ngg) {
172         /* On GFX10, VS is merged into GS for NGG. */
173         stage = MESA_SHADER_GEOMETRY;
174         has_previous_stage = true;
175      }
176   }
177
178   ctx->main_function =
179      create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
180                           get_llvm_calling_convention(ctx->main_function, stage),
181                           ctx->max_workgroup_size, ctx->args->options);
182
183   ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
184                                          LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
185                                          AC_FUNC_ATTR_READNONE);
186   ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
187                                        ac_array_in_const_addr_space(ctx->ac.v4i32), "");
188
189   load_descriptor_sets(ctx);
190
191   if (stage == MESA_SHADER_TESS_CTRL ||
192       (stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_ls) ||
193       /* GFX9 has the ESGS ring buffer in LDS. */
194       (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
195      ac_declare_lds_as_pointer(&ctx->ac);
196   }
197}
198
199static LLVMValueRef
200radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,
201                   unsigned binding)
202{
203   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
204   LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
205   struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
206   struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
207   unsigned base_offset = layout->binding[binding].offset;
208   LLVMValueRef offset, stride;
209
210   if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
211       layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
212      unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
213                     layout->binding[binding].dynamic_offset_offset;
214      desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
215      base_offset = pipeline_layout->push_constant_size + 16 * idx;
216      stride = LLVMConstInt(ctx->ac.i32, 16, false);
217   } else
218      stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
219
220   offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
221
222   if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
223      offset = ac_build_imad(&ctx->ac, index, stride, offset);
224   }
225
226   desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, "");
227
228   LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0};
229   return ac_build_gather_values(&ctx->ac, res, 3);
230}
231
232static uint32_t
233radv_get_sample_pos_offset(uint32_t num_samples)
234{
235   uint32_t sample_pos_offset = 0;
236
237   switch (num_samples) {
238   case 2:
239      sample_pos_offset = 1;
240      break;
241   case 4:
242      sample_pos_offset = 3;
243      break;
244   case 8:
245      sample_pos_offset = 7;
246      break;
247   default:
248      break;
249   }
250   return sample_pos_offset;
251}
252
253static LLVMValueRef
254load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
255{
256   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
257
258   LLVMValueRef result;
259   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
260   LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");
261
262   ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");
263
264   uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.ps.num_samples);
265
266   sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
267                            LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
268   result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
269
270   return result;
271}
272
273static LLVMValueRef
274load_sample_mask_in(struct ac_shader_abi *abi)
275{
276   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
277   uint8_t log2_ps_iter_samples;
278
279   if (ctx->args->shader_info->ps.uses_sample_shading) {
280      log2_ps_iter_samples = util_logbase2(ctx->args->options->key.ps.num_samples);
281   } else {
282      log2_ps_iter_samples = ctx->args->options->key.ps.log2_ps_iter_samples;
283   }
284
285   LLVMValueRef result, sample_id;
286   if (log2_ps_iter_samples) {
287      /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
288      sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
289      sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, "");
290      result = LLVMBuildAnd(ctx->ac.builder, sample_id,
291                            ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
292   } else {
293      result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage);
294   }
295
296   return result;
297}
298
299static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream,
300                                     LLVMValueRef vertexidx, LLVMValueRef *addrs);
301
302static void
303visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
304                               LLVMValueRef *addrs)
305{
306   unsigned offset = 0;
307   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
308
309   if (ctx->args->shader_info->is_ngg) {
310      gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
311      return;
312   }
313
314   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
315      unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
316      uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
317      LLVMValueRef *out_ptr = &addrs[i * 4];
318      int length = util_last_bit(output_usage_mask);
319
320      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
321         continue;
322
323      for (unsigned j = 0; j < length; j++) {
324         if (!(output_usage_mask & (1 << j)))
325            continue;
326
327         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
328         LLVMValueRef voffset =
329            LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);
330
331         offset++;
332
333         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
334         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
335
336         out_val = ac_to_integer(&ctx->ac, out_val);
337         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
338
339         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset,
340                                     ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0,
341                                     ac_glc | ac_slc | ac_swizzled);
342      }
343   }
344
345   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
346                    ctx->gs_wave_id);
347}
348
349static void
350visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
351{
352   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
353
354   if (ctx->args->shader_info->is_ngg) {
355      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
356      return;
357   }
358
359   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
360                    ctx->gs_wave_id);
361}
362
363static LLVMValueRef
364load_ring_tess_factors(struct ac_shader_abi *abi)
365{
366   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
367   assert(ctx->stage == MESA_SHADER_TESS_CTRL);
368
369   return ctx->hs_ring_tess_factor;
370}
371
372static LLVMValueRef
373load_ring_tess_offchip(struct ac_shader_abi *abi)
374{
375   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
376   assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL);
377
378   return ctx->hs_ring_tess_offchip;
379}
380
381static LLVMValueRef
382load_ring_esgs(struct ac_shader_abi *abi)
383{
384   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
385   assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL ||
386          ctx->stage == MESA_SHADER_GEOMETRY);
387
388   return ctx->esgs_ring;
389}
390
391static LLVMValueRef
392radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
393{
394   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
395   return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
396}
397
398static LLVMValueRef
399get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform)
400{
401   LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0);
402   LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1);
403   ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
404
405   unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT;
406   if (non_uniform) {
407      /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */
408      LLVMValueRef dwords[] = {ptr,
409                               LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)};
410      ptr = ac_build_gather_values(&ctx->ac, dwords, 2);
411      ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
412      addr_space = AC_ADDR_SPACE_CONST;
413   }
414   return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), "");
415}
416
417static LLVMValueRef
418radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
419{
420   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
421   LLVMValueRef result;
422
423   buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform);
424   if (!non_uniform)
425      LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
426
427   result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
428   LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
429   LLVMSetAlignment(result, 4);
430
431   return result;
432}
433
434static LLVMValueRef
435radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding,
436              LLVMValueRef buffer_ptr)
437{
438   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
439   LLVMValueRef result;
440
441   if (valid_binding) {
442      struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
443      struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
444
445      if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
446         LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0);
447         LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1);
448         buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
449
450         uint32_t desc_type =
451            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
452            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
453
454         if (ctx->ac.chip_class >= GFX10) {
455            desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
456                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
457         } else {
458            desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
459                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
460         }
461
462         LLVMValueRef desc_components[4] = {
463            LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""),
464            LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi),
465                         false),
466            LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
467            LLVMConstInt(ctx->ac.i32, desc_type, false),
468         };
469
470         return ac_build_gather_values(&ctx->ac, desc_components, 4);
471      }
472   }
473
474   buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false);
475   LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
476
477   result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
478   LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
479   LLVMSetAlignment(result, 4);
480
481   return result;
482}
483
484static LLVMValueRef
485radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
486                      unsigned constant_index, LLVMValueRef index,
487                      enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
488{
489   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
490   LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
491   struct radv_descriptor_set_layout *layout =
492      ctx->args->options->layout->set[descriptor_set].layout;
493   struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
494   unsigned offset = binding->offset;
495   unsigned stride = binding->size;
496   unsigned type_size;
497   LLVMBuilderRef builder = ctx->ac.builder;
498   LLVMTypeRef type;
499
500   assert(base_index < layout->binding_count);
501
502   if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE && desc_type == AC_DESC_FMASK)
503      return NULL;
504
505   switch (desc_type) {
506   case AC_DESC_IMAGE:
507      type = ctx->ac.v8i32;
508      type_size = 32;
509      break;
510   case AC_DESC_FMASK:
511      type = ctx->ac.v8i32;
512      offset += 32;
513      type_size = 32;
514      break;
515   case AC_DESC_SAMPLER:
516      type = ctx->ac.v4i32;
517      if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
518         offset += radv_combined_image_descriptor_sampler_offset(binding);
519      }
520
521      type_size = 16;
522      break;
523   case AC_DESC_BUFFER:
524      type = ctx->ac.v4i32;
525      type_size = 16;
526      break;
527   case AC_DESC_PLANE_0:
528   case AC_DESC_PLANE_1:
529   case AC_DESC_PLANE_2:
530      type = ctx->ac.v8i32;
531      type_size = 32;
532      offset += 32 * (desc_type - AC_DESC_PLANE_0);
533      break;
534   default:
535      unreachable("invalid desc_type\n");
536   }
537
538   offset += constant_index * stride;
539
540   if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
541       (!index || binding->immutable_samplers_equal)) {
542      if (binding->immutable_samplers_equal)
543         constant_index = 0;
544
545      const uint32_t *samplers = radv_immutable_samplers(layout, binding);
546
547      LLVMValueRef constants[] = {
548         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
549         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
550         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
551         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
552      };
553      return ac_build_gather_values(&ctx->ac, constants, 4);
554   }
555
556   assert(stride % type_size == 0);
557
558   LLVMValueRef adjusted_index = index;
559   if (!adjusted_index)
560      adjusted_index = ctx->ac.i32_0;
561
562   adjusted_index =
563      LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
564
565   LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
566   list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
567   list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), "");
568
569   LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
570
571   /* 3 plane formats always have same size and format for plane 1 & 2, so
572    * use the tail from plane 1 so that we can store only the first 16 bytes
573    * of the last plane. */
574   if (desc_type == AC_DESC_PLANE_2) {
575      LLVMValueRef descriptor2 =
576         radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index,
577                               AC_DESC_PLANE_1, image, write, bindless);
578
579      LLVMValueRef components[8];
580      for (unsigned i = 0; i < 4; ++i)
581         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
582
583      for (unsigned i = 4; i < 8; ++i)
584         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
585      descriptor = ac_build_gather_values(&ctx->ac, components, 8);
586   } else if (desc_type == AC_DESC_IMAGE &&
587              ctx->args->options->has_image_load_dcc_bug &&
588              image && !write) {
589      LLVMValueRef components[8];
590
591      for (unsigned i = 0; i < 8; i++)
592         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
593
594      /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a hardware bug. */
595      components[6] = LLVMBuildAnd(ctx->ac.builder, components[6],
596                                   LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), "");
597
598      descriptor = ac_build_gather_values(&ctx->ac, components, 8);
599   }
600
601   return descriptor;
602}
603
604/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
605 * so we may need to fix it up. */
606static LLVMValueRef
607adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha)
608{
609   if (adjustment == ALPHA_ADJUST_NONE)
610      return alpha;
611
612   LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
613
614   alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
615
616   if (adjustment == ALPHA_ADJUST_SSCALED)
617      alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
618   else
619      alpha = ac_to_integer(&ctx->ac, alpha);
620
621   /* For the integer-like cases, do a natural sign extension.
622    *
623    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
624    * and happen to contain 0, 1, 2, 3 as the two LSBs of the
625    * exponent.
626    */
627   alpha =
628      LLVMBuildShl(ctx->ac.builder, alpha,
629                   adjustment == ALPHA_ADJUST_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
630   alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
631
632   /* Convert back to the right type. */
633   if (adjustment == ALPHA_ADJUST_SNORM) {
634      LLVMValueRef clamp;
635      LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
636      alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
637      clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
638      alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
639   } else if (adjustment == ALPHA_ADJUST_SSCALED) {
640      alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
641   }
642
643   return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
644}
645
646static LLVMValueRef
647radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
648                                unsigned num_channels, bool is_float)
649{
650   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
651   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
652   LLVMValueRef chan[4];
653
654   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
655      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
656
657      if (num_channels == 4 && num_channels == vec_size)
658         return value;
659
660      num_channels = MIN2(num_channels, vec_size);
661
662      for (unsigned i = 0; i < num_channels; i++)
663         chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
664   } else {
665      assert(num_channels == 1);
666      chan[0] = value;
667   }
668
669   for (unsigned i = num_channels; i < 4; i++) {
670      chan[i] = i == 3 ? one : zero;
671      chan[i] = ac_to_integer(&ctx->ac, chan[i]);
672   }
673
674   return ac_build_gather_values(&ctx->ac, chan, 4);
675}
676
677static void
678load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type,
679              LLVMValueRef out[4])
680{
681   LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
682   LLVMValueRef t_offset;
683   LLVMValueRef t_list;
684   LLVMValueRef input;
685   LLVMValueRef buffer_index;
686   unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0;
687   unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
688   unsigned data_format = attrib_format & 0x0f;
689   unsigned num_format = (attrib_format >> 4) & 0x07;
690   bool is_float =
691      num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
692   uint8_t input_usage_mask =
693      ctx->args->shader_info->vs.input_usage_mask[driver_location];
694   unsigned num_input_channels = util_last_bit(input_usage_mask);
695
696   if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
697      uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];
698
699      if (divisor) {
700         buffer_index = ctx->abi.instance_id;
701
702         if (divisor != 1) {
703            buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
704                                         LLVMConstInt(ctx->ac.i32, divisor, 0), "");
705         }
706      } else {
707         buffer_index = ctx->ac.i32_0;
708      }
709
710      buffer_index = LLVMBuildAdd(
711         ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
712   } else {
713      buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
714                                  ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
715   }
716
717   const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);
718
719   /* Adjust the number of channels to load based on the vertex attribute format. */
720   unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
721   unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
722   unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
723   unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];
724   unsigned alpha_adjust = ctx->args->options->key.vs.vertex_alpha_adjust[attrib_index];
725
726   if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
727      /* Always load, at least, 3 channels for formats that need to be shuffled because X<->Z. */
728      num_channels = MAX2(num_channels, 3);
729   }
730
731   unsigned desc_index =
732      ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
733   desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask &
734                              u_bit_consecutive(0, desc_index));
735   t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
736   t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
737
738   /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that
739    * triggers memory violations and eventually a GPU hang. This can happen if the stride (static or
740    * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and
741    * VBO offset is 2 for R16G16B16A16_SNORM).
742    */
743   if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) {
744      unsigned chan_format = vtx_info->chan_format;
745      LLVMValueRef values[4];
746
747      assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10);
748
749      for (unsigned chan = 0; chan < num_channels; chan++) {
750         unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
751         LLVMValueRef chan_index = buffer_index;
752
753         if (attrib_stride != 0 && chan_offset > attrib_stride) {
754            LLVMValueRef buffer_offset =
755               LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);
756
757            chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
758
759            chan_offset = chan_offset % attrib_stride;
760         }
761
762         values[chan] = ac_build_struct_tbuffer_load(
763            &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
764            ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
765      }
766
767      input = ac_build_gather_values(&ctx->ac, values, num_channels);
768   } else {
769      if (attrib_stride != 0 && attrib_offset > attrib_stride) {
770         LLVMValueRef buffer_offset =
771            LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);
772
773         buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
774
775         attrib_offset = attrib_offset % attrib_stride;
776      }
777
778      input = ac_build_struct_tbuffer_load(
779         &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
780         ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
781   }
782
783   if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
784      LLVMValueRef c[4];
785      c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
786      c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
787      c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
788      c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
789
790      input = ac_build_gather_values(&ctx->ac, c, 4);
791   }
792
793   input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);
794
795   for (unsigned chan = 0; chan < 4; chan++) {
796      LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
797      out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
798      if (dest_type == ctx->ac.i16 && is_float) {
799         out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, "");
800         out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, "");
801      }
802   }
803
804   out[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, out[3]);
805
806   for (unsigned chan = 0; chan < 4; chan++) {
807      out[chan] = ac_to_integer(&ctx->ac, out[chan]);
808      if (dest_type == ctx->ac.i16 && !is_float)
809         out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, "");
810   }
811}
812
813static LLVMValueRef
814radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component,
815                    unsigned num_components, unsigned vertex_index, LLVMTypeRef type)
816{
817   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
818   LLVMValueRef values[4];
819
820   load_vs_input(ctx, driver_location, type, values);
821
822   for (unsigned i = 0; i < 4; i++)
823      values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
824
825   return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
826}
827
828static void
829prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
830{
831   bool uses_center = false;
832   bool uses_centroid = false;
833   nir_foreach_shader_in_variable (variable, nir) {
834      if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
835          variable->data.sample)
836         continue;
837
838      if (variable->data.centroid)
839         uses_centroid = true;
840      else
841         uses_center = true;
842   }
843
844   ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
845   ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);
846
847   if (uses_center && uses_centroid) {
848      LLVMValueRef sel =
849         LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
850                       ctx->ac.i32_0, "");
851      ctx->abi.persp_centroid =
852         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
853                         ctx->abi.persp_centroid, "");
854      ctx->abi.linear_centroid =
855         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
856                         ctx->abi.linear_centroid, "");
857   }
858}
859
860static void
861scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
862                        struct nir_shader *shader, gl_shader_stage stage)
863{
864   int idx = variable->data.driver_location;
865   unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
866   uint64_t mask_attribs;
867
868   if (variable->data.compact) {
869      unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
870      attrib_count = (component_count + 3) / 4;
871   }
872
873   mask_attribs = ((1ull << attrib_count) - 1) << idx;
874
875   ctx->output_mask |= mask_attribs;
876}
877
878/* Initialize arguments for the shader export intrinsic */
879static void
880si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
881                         unsigned enabled_channels, unsigned target, struct ac_export_args *args)
882{
883   /* Specify the channels that are enabled. */
884   args->enabled_channels = enabled_channels;
885
886   /* Specify whether the EXEC mask represents the valid mask */
887   args->valid_mask = 0;
888
889   /* Specify whether this is the last export */
890   args->done = 0;
891
892   /* Specify the target we are exporting */
893   args->target = target;
894
895   args->compr = false;
896   args->out[0] = LLVMGetUndef(ctx->ac.f32);
897   args->out[1] = LLVMGetUndef(ctx->ac.f32);
898   args->out[2] = LLVMGetUndef(ctx->ac.f32);
899   args->out[3] = LLVMGetUndef(ctx->ac.f32);
900
901   if (!values)
902      return;
903
904   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
905   if (ctx->stage == MESA_SHADER_FRAGMENT) {
906      unsigned index = target - V_008DFC_SQ_EXP_MRT;
907      unsigned col_format = (ctx->args->options->key.ps.col_format >> (4 * index)) & 0xf;
908      bool is_int8 = (ctx->args->options->key.ps.is_int8 >> index) & 1;
909      bool is_int10 = (ctx->args->options->key.ps.is_int10 >> index) & 1;
910
911      LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
912      LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
913                            bool hi) = NULL;
914
915      switch (col_format) {
916      case V_028714_SPI_SHADER_ZERO:
917         args->enabled_channels = 0; /* writemask */
918         args->target = V_008DFC_SQ_EXP_NULL;
919         break;
920
921      case V_028714_SPI_SHADER_32_R:
922         args->enabled_channels = 1;
923         args->out[0] = values[0];
924         break;
925
926      case V_028714_SPI_SHADER_32_GR:
927         args->enabled_channels = 0x3;
928         args->out[0] = values[0];
929         args->out[1] = values[1];
930         break;
931
932      case V_028714_SPI_SHADER_32_AR:
933         if (ctx->ac.chip_class >= GFX10) {
934            args->enabled_channels = 0x3;
935            args->out[0] = values[0];
936            args->out[1] = values[3];
937         } else {
938            args->enabled_channels = 0x9;
939            args->out[0] = values[0];
940            args->out[3] = values[3];
941         }
942         break;
943
944      case V_028714_SPI_SHADER_FP16_ABGR:
945         args->enabled_channels = 0xf;
946         packf = ac_build_cvt_pkrtz_f16;
947         if (is_16bit) {
948            for (unsigned chan = 0; chan < 4; chan++)
949               values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
950         }
951         break;
952
953      case V_028714_SPI_SHADER_UNORM16_ABGR:
954         args->enabled_channels = 0xf;
955         packf = ac_build_cvt_pknorm_u16;
956         break;
957
958      case V_028714_SPI_SHADER_SNORM16_ABGR:
959         args->enabled_channels = 0xf;
960         packf = ac_build_cvt_pknorm_i16;
961         break;
962
963      case V_028714_SPI_SHADER_UINT16_ABGR:
964         args->enabled_channels = 0xf;
965         packi = ac_build_cvt_pk_u16;
966         if (is_16bit) {
967            for (unsigned chan = 0; chan < 4; chan++)
968               values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
969                                            ctx->ac.i32, "");
970         }
971         break;
972
973      case V_028714_SPI_SHADER_SINT16_ABGR:
974         args->enabled_channels = 0xf;
975         packi = ac_build_cvt_pk_i16;
976         if (is_16bit) {
977            for (unsigned chan = 0; chan < 4; chan++)
978               values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
979                                            ctx->ac.i32, "");
980         }
981         break;
982
983      default:
984      case V_028714_SPI_SHADER_32_ABGR:
985         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
986         break;
987      }
988
989      /* Replace NaN by zero (only 32-bit) to fix game bugs if
990       * requested.
991       */
992      if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit &&
993          (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
994           col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
995           col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
996         for (unsigned i = 0; i < 4; i++) {
997            LLVMValueRef class_args[2] = {values[i],
998                                          LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
999            LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
1000                                                    class_args, 2, AC_FUNC_ATTR_READNONE);
1001            values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
1002         }
1003      }
1004
1005      /* Pack f16 or norm_i16/u16. */
1006      if (packf) {
1007         for (unsigned chan = 0; chan < 2; chan++) {
1008            LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
1009            LLVMValueRef packed;
1010
1011            packed = packf(&ctx->ac, pack_args);
1012            args->out[chan] = ac_to_float(&ctx->ac, packed);
1013         }
1014         args->compr = 1; /* COMPR flag */
1015      }
1016
1017      /* Pack i16/u16. */
1018      if (packi) {
1019         for (unsigned chan = 0; chan < 2; chan++) {
1020            LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
1021                                         ac_to_integer(&ctx->ac, values[2 * chan + 1])};
1022            LLVMValueRef packed;
1023
1024            packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
1025            args->out[chan] = ac_to_float(&ctx->ac, packed);
1026         }
1027         args->compr = 1; /* COMPR flag */
1028      }
1029      return;
1030   }
1031
1032   if (is_16bit) {
1033      for (unsigned chan = 0; chan < 4; chan++) {
1034         values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
1035         args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
1036      }
1037   } else
1038      memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1039
1040   for (unsigned i = 0; i < 4; ++i)
1041      args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
1042}
1043
1044static void
1045radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
1046                  unsigned enabled_channels)
1047{
1048   struct ac_export_args args;
1049
1050   si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
1051   ac_build_export(&ctx->ac, &args);
1052}
1053
1054static LLVMValueRef
1055radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
1056{
1057   LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
1058   return LLVMBuildLoad(ctx->ac.builder, output, "");
1059}
1060
1061static void
1062radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
1063                        LLVMValueRef const *so_write_offsets,
1064                        const struct radv_stream_output *output,
1065                        struct radv_shader_output_values *shader_out)
1066{
1067   unsigned num_comps = util_bitcount(output->component_mask);
1068   unsigned buf = output->buffer;
1069   unsigned offset = output->offset;
1070   unsigned start;
1071   LLVMValueRef out[4];
1072
1073   assert(num_comps && num_comps <= 4);
1074   if (!num_comps || num_comps > 4)
1075      return;
1076
1077   /* Get the first component. */
1078   start = ffs(output->component_mask) - 1;
1079
1080   /* Load the output as int. */
1081   for (int i = 0; i < num_comps; i++) {
1082      out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
1083   }
1084
1085   /* Pack the output. */
1086   LLVMValueRef vdata = NULL;
1087
1088   switch (num_comps) {
1089   case 1: /* as i32 */
1090      vdata = out[0];
1091      break;
1092   case 2: /* as v2i32 */
1093   case 3: /* as v4i32 (aligned to 4) */
1094      out[3] = LLVMGetUndef(ctx->ac.i32);
1095      FALLTHROUGH;
1096   case 4: /* as v4i32 */
1097      vdata = ac_build_gather_values(&ctx->ac, out,
1098                                     !ac_has_vec3_support(ctx->ac.chip_class, false)
1099                                        ? util_next_power_of_two(num_comps)
1100                                        : num_comps);
1101      break;
1102   }
1103
1104   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf],
1105                               ctx->ac.i32_0, offset, ac_glc | ac_slc);
1106}
1107
1108static void
1109radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
1110{
1111   int i;
1112
1113   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1114   assert(ctx->args->ac.streamout_config.used);
1115   LLVMValueRef so_vtx_count = ac_build_bfe(
1116      &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
1117      LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
1118
1119   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1120
1121   /* can_emit = tid < so_vtx_count; */
1122   LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
1123
1124   /* Emit the streamout code conditionally. This actually avoids
1125    * out-of-bounds buffer access. The hw tells us via the SGPR
1126    * (so_vtx_count) which threads are allowed to emit streamout data.
1127    */
1128   ac_build_ifcc(&ctx->ac, can_emit, 6501);
1129   {
1130      /* The buffer offset is computed as follows:
1131       *   ByteOffset = streamout_offset[buffer_id]*4 +
1132       *                (streamout_write_index + thread_id)*stride[buffer_id] +
1133       *                attrib_offset
1134       */
1135      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
1136
1137      /* Compute (streamout_write_index + thread_id). */
1138      so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
1139
1140      /* Load the descriptor and compute the write offset for each
1141       * enabled buffer.
1142       */
1143      LLVMValueRef so_write_offset[4] = {0};
1144      LLVMValueRef so_buffers[4] = {0};
1145      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
1146
1147      for (i = 0; i < 4; i++) {
1148         uint16_t stride = ctx->args->shader_info->so.strides[i];
1149
1150         if (!stride)
1151            continue;
1152
1153         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
1154
1155         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
1156
1157         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
1158
1159         so_offset =
1160            LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
1161
1162         so_write_offset[i] = ac_build_imad(
1163            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
1164      }
1165
1166      /* Write streamout data. */
1167      for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
1168         struct radv_shader_output_values shader_out = {0};
1169         struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];
1170
1171         if (stream != output->stream)
1172            continue;
1173
1174         for (int j = 0; j < 4; j++) {
1175            shader_out.values[j] = radv_load_output(ctx, output->location, j);
1176         }
1177
1178         radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
1179      }
1180   }
1181   ac_build_endif(&ctx->ac, 6501);
1182}
1183
1184static void
1185radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1186                         unsigned noutput, struct radv_vs_output_info *outinfo,
1187                         bool export_clip_dists)
1188{
1189   for (unsigned i = 0; i < noutput; i++) {
1190      unsigned slot_name = outputs[i].slot_name;
1191      unsigned usage_mask = outputs[i].usage_mask;
1192
1193      if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
1194          slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
1195          slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
1196         continue;
1197
1198      if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
1199          !export_clip_dists)
1200         continue;
1201
1202      radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values,
1203                        usage_mask);
1204   }
1205}
1206
1207/* Generate export instructions for hardware VS shader stage or NGG GS stage
1208 * (position and parameter data only).
1209 */
1210static void
1211radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1212                    unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists)
1213{
1214   LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
1215   LLVMValueRef primitive_shading_rate = NULL;
1216   struct ac_export_args pos_args[4] = {0};
1217   unsigned pos_idx, index;
1218   int i;
1219
1220   /* Build position exports */
1221   for (i = 0; i < noutput; i++) {
1222      switch (outputs[i].slot_name) {
1223      case VARYING_SLOT_POS:
1224         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
1225         break;
1226      case VARYING_SLOT_PSIZ:
1227         psize_value = outputs[i].values[0];
1228         break;
1229      case VARYING_SLOT_LAYER:
1230         layer_value = outputs[i].values[0];
1231         break;
1232      case VARYING_SLOT_VIEWPORT:
1233         viewport_value = outputs[i].values[0];
1234         break;
1235      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
1236         primitive_shading_rate = outputs[i].values[0];
1237         break;
1238      case VARYING_SLOT_CLIP_DIST0:
1239      case VARYING_SLOT_CLIP_DIST1:
1240         index = 2 + outputs[i].slot_index;
1241         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
1242                                  &pos_args[index]);
1243         break;
1244      default:
1245         break;
1246      }
1247   }
1248
1249   /* We need to add the position output manually if it's missing. */
1250   if (!pos_args[0].out[0]) {
1251      pos_args[0].enabled_channels = 0xf; /* writemask */
1252      pos_args[0].valid_mask = 0;         /* EXEC mask */
1253      pos_args[0].done = 0;               /* last export? */
1254      pos_args[0].target = V_008DFC_SQ_EXP_POS;
1255      pos_args[0].compr = 0;              /* COMPR flag */
1256      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
1257      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
1258      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
1259      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
1260   }
1261
1262   bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate ||
1263                                        ctx->args->options->force_vrs_rates;
1264
1265   if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_layer ||
1266       outinfo->writes_viewport_index || writes_primitive_shading_rate) {
1267      pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
1268                                      (writes_primitive_shading_rate == true ? 2 : 0) |
1269                                      (outinfo->writes_layer == true ? 4 : 0));
1270      pos_args[1].valid_mask = 0;
1271      pos_args[1].done = 0;
1272      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
1273      pos_args[1].compr = 0;
1274      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
1275      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
1276      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
1277      pos_args[1].out[3] = ctx->ac.f32_0; /* W */
1278
1279      if (outinfo->writes_pointsize == true)
1280         pos_args[1].out[0] = psize_value;
1281      if (outinfo->writes_layer == true)
1282         pos_args[1].out[2] = layer_value;
1283      if (outinfo->writes_viewport_index == true) {
1284         if (ctx->args->options->chip_class >= GFX9) {
1285            /* GFX9 has the layer in out.z[10:0] and the viewport
1286             * index in out.z[19:16].
1287             */
1288            LLVMValueRef v = viewport_value;
1289            v = ac_to_integer(&ctx->ac, v);
1290            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
1291            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
1292
1293            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
1294            pos_args[1].enabled_channels |= 1 << 2;
1295         } else {
1296            pos_args[1].out[3] = viewport_value;
1297            pos_args[1].enabled_channels |= 1 << 3;
1298         }
1299      }
1300
1301      if (outinfo->writes_primitive_shading_rate) {
1302         pos_args[1].out[1] = primitive_shading_rate;
1303      } else if (ctx->args->options->force_vrs_rates) {
1304         /* Bits [2:3] = VRS rate X
1305          * Bits [4:5] = VRS rate Y
1306          *
1307          * The range is [-2, 1]. Values:
1308          *   1: 2x coarser shading rate in that direction.
1309          *   0: normal shading rate
1310          *  -1: 2x finer shading rate (sample shading, not directional)
1311          *  -2: 4x finer shading rate (sample shading, not directional)
1312          *
1313          * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time.
1314          */
1315         LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false);
1316         LLVMValueRef cond;
1317         LLVMValueRef v;
1318
1319         /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
1320         cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, "");
1321         v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, "");
1322
1323         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
1324      }
1325   }
1326
1327   /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
1328    * Setting valid_mask=1 prevents it and has no other effect.
1329    */
1330   if (ctx->ac.chip_class == GFX10)
1331      pos_args[0].valid_mask = 1;
1332
1333   pos_idx = 0;
1334   for (i = 0; i < 4; i++) {
1335      if (!pos_args[i].out[0])
1336         continue;
1337
1338      /* Specify the target we are exporting */
1339      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1340
1341      if (pos_idx == outinfo->pos_exports)
1342         /* Specify that this is the last export */
1343         pos_args[i].done = 1;
1344
1345      ac_build_export(&ctx->ac, &pos_args[i]);
1346   }
1347
1348   /* Build parameter exports */
1349   radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
1350}
1351
1352static void
1353handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
1354                       struct radv_vs_output_info *outinfo)
1355{
1356   struct radv_shader_output_values *outputs;
1357   unsigned noutput = 0;
1358
1359   if (ctx->args->options->key.has_multiview_view_index) {
1360      LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
1361      if (!*tmp_out) {
1362         for (unsigned i = 0; i < 4; ++i)
1363            ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
1364               ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
1365      }
1366
1367      LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
1368      LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
1369      ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
1370   }
1371
1372   if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) {
1373      /* The GS copy shader emission already emits streamout. */
1374      radv_emit_streamout(ctx, 0);
1375   }
1376
1377   /* Allocate a temporary array for the output values. */
1378   unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
1379   outputs = malloc(num_outputs * sizeof(outputs[0]));
1380
1381   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1382      if (!(ctx->output_mask & (1ull << i)))
1383         continue;
1384
1385      outputs[noutput].slot_name = i;
1386      outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1387
1388      if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
1389         outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i];
1390      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1391         outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i];
1392      } else {
1393         assert(ctx->args->is_gs_copy_shader);
1394         outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1395      }
1396
1397      for (unsigned j = 0; j < 4; j++) {
1398         outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
1399      }
1400
1401      noutput++;
1402   }
1403
1404   /* Export PrimitiveID. */
1405   if (export_prim_id) {
1406      outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
1407      outputs[noutput].slot_index = 0;
1408      outputs[noutput].usage_mask = 0x1;
1409      if (ctx->stage == MESA_SHADER_TESS_EVAL)
1410         outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1411      else
1412         outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id);
1413      for (unsigned j = 1; j < 4; j++)
1414         outputs[noutput].values[j] = ctx->ac.f32_0;
1415      noutput++;
1416   }
1417
1418   radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
1419
1420   free(outputs);
1421}
1422
1423static LLVMValueRef
1424get_wave_id_in_tg(struct radv_shader_context *ctx)
1425{
1426   return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4);
1427}
1428
1429static LLVMValueRef
1430get_tgsize(struct radv_shader_context *ctx)
1431{
1432   return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4);
1433}
1434
1435static LLVMValueRef
1436get_thread_id_in_tg(struct radv_shader_context *ctx)
1437{
1438   LLVMBuilderRef builder = ctx->ac.builder;
1439   LLVMValueRef tmp;
1440   tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
1441                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
1442   return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
1443}
1444
1445static LLVMValueRef
1446ngg_get_vtx_cnt(struct radv_shader_context *ctx)
1447{
1448   return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1449                       LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false),
1450                       false);
1451}
1452
1453static LLVMValueRef
1454ngg_get_prim_cnt(struct radv_shader_context *ctx)
1455{
1456   return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1457                       LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false),
1458                       false);
1459}
1460
1461static LLVMValueRef
1462ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
1463{
1464   unsigned num_outputs = util_bitcount64(ctx->output_mask);
1465
1466   if (ctx->args->options->key.has_multiview_view_index)
1467      num_outputs++;
1468
1469   LLVMTypeRef elements[2] = {
1470      LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
1471      LLVMArrayType(ctx->ac.i8, 4),
1472   };
1473   LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1474   type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1475   return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1476}
1477
1478/**
1479 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1480 * is in emit order; that is:
1481 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1482 * - during vertex emit, i.e. while the API GS shader invocation is running,
1483 *   N = threadidx * gs_max_out_vertices + emitidx
1484 *
1485 * Goals of the LDS memory layout:
1486 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1487 *    in uniform control flow
1488 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1489 *    culling
1490 * 3. Agnostic to the number of waves (since we don't know it before compiling)
1491 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1492 * 5. Avoid wasting memory.
1493 *
1494 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1495 * layout, elimination of bank conflicts requires that each vertex occupy an
1496 * odd number of dwords. We use the additional dword to store the output stream
1497 * index as well as a flag to indicate whether this vertex ends a primitive
1498 * for rasterization.
1499 *
1500 * Swizzling is required to satisfy points 1 and 2 simultaneously.
1501 *
1502 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
1503 * Indices are swizzled in groups of 32, which ensures point 1 without
1504 * disturbing point 2.
1505 *
1506 * \return an LDS pointer to type {[N x i32], [4 x i8]}
1507 */
1508static LLVMValueRef
1509ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
1510{
1511   LLVMBuilderRef builder = ctx->ac.builder;
1512   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1513
1514   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1515   unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
1516   if (write_stride_2exp) {
1517      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1518      LLVMValueRef swizzle = LLVMBuildAnd(
1519         builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1520      vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1521   }
1522
1523   return ac_build_gep0(&ctx->ac, storage, vertexidx);
1524}
1525
1526static LLVMValueRef
1527ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx)
1528{
1529   LLVMBuilderRef builder = ctx->ac.builder;
1530   LLVMValueRef tmp;
1531
1532   tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
1533   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1534   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1535   return ngg_gs_vertex_ptr(ctx, vertexidx);
1536}
1537
1538static LLVMValueRef
1539ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1540                           unsigned out_idx)
1541{
1542   LLVMValueRef gep_idx[3] = {
1543      ctx->ac.i32_0, /* implied C-style array */
1544      ctx->ac.i32_0, /* first struct entry */
1545      LLVMConstInt(ctx->ac.i32, out_idx, false),
1546   };
1547   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1548}
1549
1550static LLVMValueRef
1551ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1552                             unsigned stream)
1553{
1554   LLVMValueRef gep_idx[3] = {
1555      ctx->ac.i32_0, /* implied C-style array */
1556      ctx->ac.i32_1, /* second struct entry */
1557      LLVMConstInt(ctx->ac.i32, stream, false),
1558   };
1559   return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1560}
1561
1562static void
1563handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
1564{
1565   LLVMBuilderRef builder = ctx->ac.builder;
1566   LLVMValueRef tmp;
1567
1568   assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1569          !ctx->args->is_gs_copy_shader);
1570
1571   LLVMValueRef prims_in_wave =
1572      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
1573   LLVMValueRef vtx_in_wave =
1574      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8);
1575   LLVMValueRef is_gs_thread =
1576      LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, "");
1577   LLVMValueRef is_es_thread =
1578      LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
1579   LLVMValueRef vtxindex[] = {
1580      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16),
1581      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16),
1582      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[1]), 0, 16),
1583   };
1584
1585   /* Determine the number of vertices per primitive. */
1586   unsigned num_vertices;
1587
1588   if (ctx->stage == MESA_SHADER_VERTEX) {
1589      num_vertices = 3; /* TODO: optimize for points & lines */
1590   } else {
1591      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1592
1593      if (ctx->shader->info.tess.point_mode)
1594         num_vertices = 1;
1595      else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
1596         num_vertices = 2;
1597      else
1598         num_vertices = 3;
1599   }
1600
1601   /* Copy Primitive IDs from GS threads to the LDS address corresponding
1602    * to the ES thread of the provoking vertex.
1603    */
1604   if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.outinfo.export_prim_id) {
1605      ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1606
1607      LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);
1608
1609      /* For provoking vertex last mode, use num_vtx_in_prim - 1. */
1610      if (ctx->args->options->key.vs.provoking_vtx_last) {
1611         uint8_t outprim = si_conv_prim_to_gs_out(ctx->args->options->key.vs.topology);
1612         provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, outprim, false);
1613      }
1614
1615      /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1616      LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1617      LLVMValueRef provoking_vtx_index =
1618         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1619
1620      LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),
1621                     ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
1622      ac_build_endif(&ctx->ac, 5400);
1623   }
1624
1625   /* TODO: primitive culling */
1626
1627   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
1628                                 ngg_get_prim_cnt(ctx));
1629
1630   /* TODO: streamout queries */
1631   /* Export primitive data to the index buffer.
1632    *
1633    * For the first version, we will always build up all three indices
1634    * independent of the primitive type. The additional garbage data
1635    * shouldn't hurt.
1636    *
1637    * TODO: culling depends on the primitive type, so can have some
1638    * interaction here.
1639    */
1640   ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
1641   {
1642      struct ac_ngg_prim prim = {0};
1643
1644      if (ctx->args->shader_info->is_ngg_passthrough) {
1645         prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]);
1646      } else {
1647         prim.num_vertices = num_vertices;
1648         prim.isnull = ctx->ac.i1false;
1649         prim.edgeflags = ctx->ac.i32_0;
1650         memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
1651      }
1652
1653      ac_build_export_prim(&ctx->ac, &prim);
1654   }
1655   ac_build_endif(&ctx->ac, 6001);
1656
1657   /* Export per-vertex data (positions and parameters). */
1658   ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1659   {
1660      struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
1661                                               ? &ctx->args->shader_info->tes.outinfo
1662                                               : &ctx->args->shader_info->vs.outinfo;
1663
1664      /* Exporting the primitive ID is handled below. */
1665      /* TODO: use the new VS export path */
1666      handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo);
1667
1668      if (outinfo->export_prim_id) {
1669         LLVMValueRef values[4];
1670
1671         if (ctx->stage == MESA_SHADER_VERTEX) {
1672            /* Wait for GS stores to finish. */
1673            ac_build_s_barrier(&ctx->ac);
1674
1675            tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
1676            values[0] = LLVMBuildLoad(builder, tmp, "");
1677         } else {
1678            assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1679            values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1680         }
1681
1682         values[0] = ac_to_float(&ctx->ac, values[0]);
1683         for (unsigned j = 1; j < 4; j++)
1684            values[j] = ctx->ac.f32_0;
1685
1686         radv_export_param(ctx, outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID], values,
1687                           0x1);
1688      }
1689   }
1690   ac_build_endif(&ctx->ac, 6002);
1691}
1692
1693static void
1694gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
1695{
1696   /* Zero out the part of LDS scratch that is used to accumulate the
1697    * per-stream generated primitive count.
1698    */
1699   LLVMBuilderRef builder = ctx->ac.builder;
1700   LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1701   LLVMValueRef tid = get_thread_id_in_tg(ctx);
1702   LLVMBasicBlockRef merge_block;
1703   LLVMValueRef cond;
1704
1705   LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
1706   LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1707   merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1708
1709   cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1710   LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
1711   LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);
1712
1713   LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1714   LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1715
1716   LLVMBuildBr(ctx->ac.builder, merge_block);
1717   LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
1718
1719   ac_build_s_barrier(&ctx->ac);
1720}
1721
1722static void
1723gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
1724{
1725   LLVMBuilderRef builder = ctx->ac.builder;
1726   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1727   LLVMValueRef tmp;
1728
1729   /* Zero out remaining (non-emitted) primitive flags.
1730    *
1731    * Note: Alternatively, we could pass the relevant gs_next_vertex to
1732    *       the emit threads via LDS. This is likely worse in the expected
1733    *       typical case where each GS thread emits the full set of
1734    *       vertices.
1735    */
1736   for (unsigned stream = 0; stream < 4; ++stream) {
1737      unsigned num_components;
1738
1739      num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1740      if (!num_components)
1741         continue;
1742
1743      const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1744
1745      ac_build_bgnloop(&ctx->ac, 5100);
1746
1747      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1748      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1749                          LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
1750      ac_build_ifcc(&ctx->ac, tmp, 5101);
1751      ac_build_break(&ctx->ac);
1752      ac_build_endif(&ctx->ac, 5101);
1753
1754      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1755      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1756
1757      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1758      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1759
1760      ac_build_endloop(&ctx->ac, 5100);
1761   }
1762
1763   /* Accumulate generated primitives counts across the entire threadgroup. */
1764   for (unsigned stream = 0; stream < 4; ++stream) {
1765      unsigned num_components;
1766
1767      num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1768      if (!num_components)
1769         continue;
1770
1771      LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1772      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1773
1774      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1775      ac_build_ifcc(&ctx->ac, tmp, 5105);
1776      {
1777         LLVMBuildAtomicRMW(
1778            builder, LLVMAtomicRMWBinOpAdd,
1779            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
1780            numprims, LLVMAtomicOrderingMonotonic, false);
1781      }
1782      ac_build_endif(&ctx->ac, 5105);
1783   }
1784}
1785
1786static void
1787gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
1788{
1789   const unsigned verts_per_prim =
1790      si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);
1791   LLVMBuilderRef builder = ctx->ac.builder;
1792   LLVMValueRef tmp, tmp2;
1793
1794   ac_build_s_barrier(&ctx->ac);
1795
1796   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1797   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1798
1799   /* Write shader query data. */
1800   tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
1801   tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1802   ac_build_ifcc(&ctx->ac, tmp, 5109);
1803   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1804   ac_build_ifcc(&ctx->ac, tmp, 5110);
1805   {
1806      tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1807
1808      ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);
1809
1810      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
1811      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
1812
1813      const char *sync_scope = "workgroup-one-as";
1814
1815      /* Use a plain GDS atomic to accumulate the number of generated
1816       * primitives.
1817       */
1818      ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope);
1819   }
1820   ac_build_endif(&ctx->ac, 5110);
1821   ac_build_endif(&ctx->ac, 5109);
1822
1823   /* TODO: culling */
1824
1825   /* Determine vertex liveness. */
1826   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1827
1828   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1829   ac_build_ifcc(&ctx->ac, tmp, 5120);
1830   {
1831      for (unsigned i = 0; i < verts_per_prim; ++i) {
1832         const LLVMValueRef primidx =
1833            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
1834
1835         if (i > 0) {
1836            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1837            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1838         }
1839
1840         /* Load primitive liveness */
1841         tmp = ngg_gs_vertex_ptr(ctx, primidx);
1842         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1843         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1844
1845         tmp = LLVMBuildLoad(builder, vertliveptr, "");
1846         tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);
1847
1848         if (i > 0)
1849            ac_build_endif(&ctx->ac, 5121 + i);
1850      }
1851   }
1852   ac_build_endif(&ctx->ac, 5120);
1853
1854   /* Inclusive scan addition across the current wave. */
1855   LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1856   struct ac_wg_scan vertlive_scan = {0};
1857   vertlive_scan.op = nir_op_iadd;
1858   vertlive_scan.enable_reduce = true;
1859   vertlive_scan.enable_exclusive = true;
1860   vertlive_scan.src = vertlive;
1861   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1862   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1863   vertlive_scan.numwaves = get_tgsize(ctx);
1864   vertlive_scan.maxwaves = 8;
1865
1866   ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1867
1868   /* Skip all exports (including index exports) when possible. At least on
1869    * early gfx10 revisions this is also to avoid hangs.
1870    */
1871   LLVMValueRef have_exports =
1872      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1873   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1874
1875   /* Allocate export space. Send this message as early as possible, to
1876    * hide the latency of the SQ <-> SPI roundtrip.
1877    *
1878    * Note: We could consider compacting primitives for export as well.
1879    *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
1880    *       prim data per clock and skips null primitives at no additional
1881    *       cost. So compacting primitives can only be beneficial when
1882    *       there are 4 or more contiguous null primitives in the export
1883    *       (in the common case of single-dword prim exports).
1884    */
1885   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
1886                                 num_emit_threads);
1887
1888   /* Setup the reverse vertex compaction permutation. We re-use stream 1
1889    * of the primitive liveness flags, relying on the fact that each
1890    * threadgroup can have at most 256 threads. */
1891   ac_build_ifcc(&ctx->ac, vertlive, 5130);
1892   {
1893      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1894      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1895      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1896   }
1897   ac_build_endif(&ctx->ac, 5130);
1898
1899   ac_build_s_barrier(&ctx->ac);
1900
1901   /* Export primitive data */
1902   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1903   ac_build_ifcc(&ctx->ac, tmp, 5140);
1904   {
1905      LLVMValueRef flags;
1906      struct ac_ngg_prim prim = {0};
1907      prim.num_vertices = verts_per_prim;
1908
1909      tmp = ngg_gs_vertex_ptr(ctx, tid);
1910      flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1911      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1912      prim.edgeflags = ctx->ac.i32_0;
1913
1914      for (unsigned i = 0; i < verts_per_prim; ++i) {
1915         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1916                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1917      }
1918
1919      /* Geometry shaders output triangle strips, but NGG expects triangles. */
1920      if (verts_per_prim == 3) {
1921         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1922         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1923
1924         LLVMValueRef flatshade_first =
1925            LLVMConstInt(ctx->ac.i1, !ctx->args->options->key.vs.provoking_vtx_last, false);
1926
1927         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
1928      }
1929
1930      ac_build_export_prim(&ctx->ac, &prim);
1931   }
1932   ac_build_endif(&ctx->ac, 5140);
1933
1934   /* Export position and parameter data */
1935   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1936   ac_build_ifcc(&ctx->ac, tmp, 5145);
1937   {
1938      struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;
1939      bool export_view_index = ctx->args->options->key.has_multiview_view_index;
1940      struct radv_shader_output_values *outputs;
1941      unsigned noutput = 0;
1942
1943      /* Allocate a temporary array for the output values. */
1944      unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
1945      outputs = calloc(num_outputs, sizeof(outputs[0]));
1946
1947      tmp = ngg_gs_vertex_ptr(ctx, tid);
1948      tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1949      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1950      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1951
1952      unsigned out_idx = 0;
1953      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1954         unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1955         int length = util_last_bit(output_usage_mask);
1956
1957         if (!(ctx->output_mask & (1ull << i)))
1958            continue;
1959
1960         outputs[noutput].slot_name = i;
1961         outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1962         outputs[noutput].usage_mask = output_usage_mask;
1963
1964         for (unsigned j = 0; j < length; j++, out_idx++) {
1965            if (!(output_usage_mask & (1 << j)))
1966               continue;
1967
1968            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1969            tmp = LLVMBuildLoad(builder, tmp, "");
1970
1971            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
1972            if (ac_get_type_size(type) == 2) {
1973               tmp = ac_to_integer(&ctx->ac, tmp);
1974               tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
1975            }
1976
1977            outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
1978         }
1979
1980         for (unsigned j = length; j < 4; j++)
1981            outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
1982
1983         noutput++;
1984      }
1985
1986      /* Export ViewIndex. */
1987      if (export_view_index) {
1988         outputs[noutput].slot_name = VARYING_SLOT_LAYER;
1989         outputs[noutput].slot_index = 0;
1990         outputs[noutput].usage_mask = 0x1;
1991         outputs[noutput].values[0] =
1992            ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));
1993         for (unsigned j = 1; j < 4; j++)
1994            outputs[noutput].values[j] = ctx->ac.f32_0;
1995         noutput++;
1996      }
1997
1998      radv_llvm_export_vs(ctx, outputs, noutput, outinfo, outinfo->export_clip_dists);
1999      FREE(outputs);
2000   }
2001   ac_build_endif(&ctx->ac, 5145);
2002}
2003
2004static void
2005gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx,
2006                         LLVMValueRef *addrs)
2007{
2008   LLVMBuilderRef builder = ctx->ac.builder;
2009   LLVMValueRef tmp;
2010
2011   const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
2012   unsigned out_idx = 0;
2013   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2014      unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2015      uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
2016      LLVMValueRef *out_ptr = &addrs[i * 4];
2017      int length = util_last_bit(output_usage_mask);
2018
2019      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2020         continue;
2021
2022      for (unsigned j = 0; j < length; j++, out_idx++) {
2023         if (!(output_usage_mask & (1 << j)))
2024            continue;
2025
2026         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2027         out_val = ac_to_integer(&ctx->ac, out_val);
2028         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
2029
2030         LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
2031      }
2032   }
2033   assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
2034
2035   /* Store the current number of emitted vertices to zero out remaining
2036    * primitive flags in case the geometry shader doesn't emit the maximum
2037    * number of vertices.
2038    */
2039   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
2040   LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
2041
2042   /* Determine and store whether this vertex completed a primitive. */
2043   const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
2044
2045   tmp = LLVMConstInt(
2046      ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);
2047   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
2048
2049   /* Since the geometry shader emits triangle strips, we need to
2050    * track which primitive is odd and swap vertex indices to get
2051    * the correct vertex order.
2052    */
2053   LLVMValueRef is_odd = ctx->ac.i1false;
2054   if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {
2055      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
2056      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
2057   }
2058
2059   tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
2060   LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
2061
2062   /* The per-vertex primitive flag encoding:
2063    *   bit 0: whether this vertex finishes a primitive
2064    *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
2065    */
2066   tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
2067   tmp = LLVMBuildOr(
2068      builder, tmp,
2069      LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
2070   LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
2071
2072   tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
2073   tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
2074   LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
2075}
2076
2077static bool
2078si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
2079                    struct ac_export_args *args)
2080{
2081   /* Export */
2082   si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
2083   if (!args->enabled_channels)
2084      return false; /* unnecessary NULL export */
2085
2086   return true;
2087}
2088
2089static void
2090radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
2091                  LLVMValueRef samplemask)
2092{
2093   struct ac_export_args args;
2094
2095   ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
2096
2097   ac_build_export(&ctx->ac, &args);
2098}
2099
2100static void
2101handle_fs_outputs_post(struct radv_shader_context *ctx)
2102{
2103   unsigned index = 0;
2104   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
2105   struct ac_export_args color_args[8];
2106
2107   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2108      LLVMValueRef values[4];
2109
2110      if (!(ctx->output_mask & (1ull << i)))
2111         continue;
2112
2113      if (i < FRAG_RESULT_DATA0)
2114         continue;
2115
2116      for (unsigned j = 0; j < 4; j++)
2117         values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
2118
2119      bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
2120      if (ret)
2121         index++;
2122   }
2123
2124   /* Process depth, stencil, samplemask. */
2125   if (ctx->args->shader_info->ps.writes_z) {
2126      depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
2127   }
2128   if (ctx->args->shader_info->ps.writes_stencil) {
2129      stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
2130   }
2131   if (ctx->args->shader_info->ps.writes_sample_mask) {
2132      samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
2133   }
2134
2135   /* Set the DONE bit on last non-null color export only if Z isn't
2136    * exported.
2137    */
2138   if (index > 0 && !ctx->args->shader_info->ps.writes_z &&
2139       !ctx->args->shader_info->ps.writes_stencil &&
2140       !ctx->args->shader_info->ps.writes_sample_mask) {
2141      unsigned last = index - 1;
2142
2143      color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
2144      color_args[last].done = 1;       /* DONE bit */
2145   }
2146
2147   /* Export PS outputs. */
2148   for (unsigned i = 0; i < index; i++)
2149      ac_build_export(&ctx->ac, &color_args[i]);
2150
2151   if (depth || stencil || samplemask)
2152      radv_export_mrt_z(ctx, depth, stencil, samplemask);
2153   else if (!index)
2154      ac_build_export_null(&ctx->ac);
2155}
2156
2157static void
2158emit_gs_epilogue(struct radv_shader_context *ctx)
2159{
2160   if (ctx->args->shader_info->is_ngg) {
2161      gfx10_ngg_gs_emit_epilogue_1(ctx);
2162      return;
2163   }
2164
2165   if (ctx->ac.chip_class >= GFX10)
2166      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
2167
2168   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
2169}
2170
2171static void
2172handle_shader_outputs_post(struct ac_shader_abi *abi)
2173{
2174   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
2175
2176   switch (ctx->stage) {
2177   case MESA_SHADER_VERTEX:
2178      if (ctx->args->shader_info->vs.as_ls)
2179         break; /* Lowered in NIR */
2180      else if (ctx->args->shader_info->vs.as_es)
2181         break; /* Lowered in NIR */
2182      else if (ctx->args->shader_info->is_ngg)
2183         break;
2184      else
2185         handle_vs_outputs_post(ctx, ctx->args->shader_info->vs.outinfo.export_prim_id,
2186                                ctx->args->shader_info->vs.outinfo.export_clip_dists,
2187                                &ctx->args->shader_info->vs.outinfo);
2188      break;
2189   case MESA_SHADER_FRAGMENT:
2190      handle_fs_outputs_post(ctx);
2191      break;
2192   case MESA_SHADER_GEOMETRY:
2193      emit_gs_epilogue(ctx);
2194      break;
2195   case MESA_SHADER_TESS_CTRL:
2196      break; /* Lowered in NIR */
2197   case MESA_SHADER_TESS_EVAL:
2198      if (ctx->args->shader_info->tes.as_es)
2199         break; /* Lowered in NIR */
2200      else if (ctx->args->shader_info->is_ngg)
2201         break;
2202      else
2203         handle_vs_outputs_post(ctx, ctx->args->shader_info->tes.outinfo.export_prim_id,
2204                                ctx->args->shader_info->tes.outinfo.export_clip_dists,
2205                                &ctx->args->shader_info->tes.outinfo);
2206      break;
2207   default:
2208      break;
2209   }
2210}
2211
2212static void
2213ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr,
2214                        const struct radv_nir_compiler_options *options)
2215{
2216   LLVMRunPassManager(passmgr, ctx->ac.module);
2217   LLVMDisposeBuilder(ctx->ac.builder);
2218
2219   ac_llvm_context_dispose(&ctx->ac);
2220}
2221
2222static void
2223ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
2224{
2225   struct radv_vs_output_info *outinfo;
2226
2227   switch (ctx->stage) {
2228   case MESA_SHADER_FRAGMENT:
2229   case MESA_SHADER_COMPUTE:
2230   case MESA_SHADER_TESS_CTRL:
2231   case MESA_SHADER_GEOMETRY:
2232      return;
2233   case MESA_SHADER_VERTEX:
2234      if (ctx->args->shader_info->vs.as_ls ||
2235          ctx->args->shader_info->vs.as_es)
2236         return;
2237      outinfo = &ctx->args->shader_info->vs.outinfo;
2238      break;
2239   case MESA_SHADER_TESS_EVAL:
2240      if (ctx->args->shader_info->tes.as_es)
2241         return;
2242      outinfo = &ctx->args->shader_info->tes.outinfo;
2243      break;
2244   default:
2245      unreachable("Unhandled shader type");
2246   }
2247
2248   ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset,
2249                          VARYING_SLOT_MAX, 0, &outinfo->param_exports);
2250}
2251
2252static void
2253ac_setup_rings(struct radv_shader_context *ctx)
2254{
2255   if (ctx->args->options->chip_class <= GFX8 &&
2256       (ctx->stage == MESA_SHADER_GEOMETRY ||
2257        (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_es) ||
2258        (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->args->shader_info->tes.as_es))) {
2259      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
2260      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
2261
2262      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
2263   }
2264
2265   if (ctx->args->is_gs_copy_shader) {
2266      ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2267                                                LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
2268   }
2269
2270   if (ctx->stage == MESA_SHADER_GEOMETRY) {
2271      /* The conceptual layout of the GSVS ring is
2272       *   v0c0 .. vLv0 v0c1 .. vLc1 ..
2273       * but the real memory layout is swizzled across
2274       * threads:
2275       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
2276       *   t16v0c0 ..
2277       * Override the buffer descriptor accordingly.
2278       */
2279      LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
2280      uint64_t stream_offset = 0;
2281      unsigned num_records = ctx->ac.wave_size;
2282      LLVMValueRef base_ring;
2283
2284      base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2285                                        LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
2286
2287      for (unsigned stream = 0; stream < 4; stream++) {
2288         unsigned num_components, stride;
2289         LLVMValueRef ring, tmp;
2290
2291         num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2292
2293         if (!num_components)
2294            continue;
2295
2296         stride = 4 * num_components * ctx->shader->info.gs.vertices_out;
2297
2298         /* Limit on the stride field for <= GFX7. */
2299         assert(stride < (1 << 14));
2300
2301         ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
2302         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
2303         tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
2304         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");
2305
2306         stream_offset += stride * ctx->ac.wave_size;
2307
2308         ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");
2309
2310         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
2311         tmp = LLVMBuildOr(ctx->ac.builder, tmp,
2312                           LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
2313         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");
2314
2315         ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
2316                                       LLVMConstInt(ctx->ac.i32, num_records, false),
2317                                       LLVMConstInt(ctx->ac.i32, 2, false), "");
2318
2319         ctx->gsvs_ring[stream] = ring;
2320      }
2321   }
2322
2323   if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
2324      ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
2325         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
2326      ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
2327         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
2328   }
2329}
2330
2331/* Fixup the HW not emitting the TCS regs if there are no HS threads. */
2332static void
2333ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
2334{
2335   LLVMValueRef count =
2336      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
2337   LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
2338   ctx->abi.instance_id =
2339      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2340                      ctx->abi.instance_id, "");
2341   ctx->vs_rel_patch_id =
2342      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
2343                      ctx->vs_rel_patch_id, "");
2344   ctx->abi.vertex_id =
2345      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
2346                      ctx->abi.vertex_id, "");
2347}
2348
2349static void
2350prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
2351{
2352   if (merged) {
2353      for (int i = 5; i >= 0; --i) {
2354         ctx->gs_vtx_offset[i] = ac_unpack_param(
2355            &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i / 2]), (i & 1) * 16, 16);
2356      }
2357
2358      ctx->gs_wave_id =
2359         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
2360   } else {
2361      for (int i = 0; i < 6; i++)
2362         ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]);
2363      ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
2364   }
2365}
2366
2367/* Ensure that the esgs ring is declared.
2368 *
2369 * We declare it with 64KB alignment as a hint that the
2370 * pointer value will always be 0.
2371 */
2372static void
2373declare_esgs_ring(struct radv_shader_context *ctx)
2374{
2375   if (ctx->esgs_ring)
2376      return;
2377
2378   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
2379
2380   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
2381                                                "esgs_ring", AC_ADDR_SPACE_LDS);
2382   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
2383   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
2384}
2385
2386static LLVMModuleRef
2387ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders,
2388                         int shader_count, const struct radv_shader_args *args)
2389{
2390   struct radv_shader_context ctx = {0};
2391   ctx.args = args;
2392
2393   enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
2394
2395   if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
2396      float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
2397   }
2398
2399   ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2400                        args->options->info, float_mode, args->shader_info->wave_size,
2401                        args->shader_info->ballot_bit_size);
2402   ctx.context = ctx.ac.context;
2403
2404   ctx.max_workgroup_size = args->shader_info->workgroup_size;
2405
2406   if (ctx.ac.chip_class >= GFX10) {
2407      if (is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg) {
2408         ctx.max_workgroup_size = 128;
2409      }
2410   }
2411
2412   create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
2413
2414   ctx.abi.emit_outputs = handle_shader_outputs_post;
2415   ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
2416   ctx.abi.load_ubo = radv_load_ubo;
2417   ctx.abi.load_ssbo = radv_load_ssbo;
2418   ctx.abi.load_sampler_desc = radv_get_sampler_desc;
2419   ctx.abi.load_resource = radv_load_resource;
2420   ctx.abi.load_ring_tess_factors = load_ring_tess_factors;
2421   ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip;
2422   ctx.abi.load_ring_esgs = load_ring_esgs;
2423   ctx.abi.clamp_shadow_reference = false;
2424   ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z;
2425   ctx.abi.robust_buffer_access = args->options->robust_buffer_access;
2426
2427   bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg;
2428   if (shader_count >= 2 || is_ngg)
2429      ac_init_exec_full_mask(&ctx.ac);
2430
2431   if (args->ac.vertex_id.used)
2432      ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
2433   if (args->ac.vs_rel_patch_id.used)
2434      ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
2435   if (args->ac.instance_id.used)
2436      ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
2437
2438   if (args->options->has_ls_vgpr_init_bug &&
2439       shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
2440      ac_nir_fixup_ls_hs_input_vgprs(&ctx);
2441
2442   if (is_ngg) {
2443      /* Declare scratch space base for streamout and vertex
2444       * compaction. Whether space is actually allocated is
2445       * determined during linking / PM4 creation.
2446       *
2447       * Add an extra dword per vertex to ensure an odd stride, which
2448       * avoids bank conflicts for SoA accesses.
2449       */
2450      if (!args->shader_info->is_ngg_passthrough)
2451         declare_esgs_ring(&ctx);
2452
2453      /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
2454      if (ctx.ac.chip_class == GFX10 && shader_count == 1)
2455         ac_build_s_barrier(&ctx.ac);
2456   }
2457
2458   for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
2459      ctx.stage = shaders[shader_idx]->info.stage;
2460      ctx.shader = shaders[shader_idx];
2461      ctx.output_mask = 0;
2462
2463      if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) {
2464         for (int i = 0; i < 4; i++) {
2465            ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2466         }
2467         if (args->shader_info->is_ngg) {
2468            for (unsigned i = 0; i < 4; ++i) {
2469               ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2470               ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2471            }
2472
2473            LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
2474            ctx.gs_ngg_scratch =
2475               LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
2476            LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
2477            LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
2478
2479            ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(
2480               ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
2481            LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);
2482            LLVMSetAlignment(ctx.gs_ngg_emit, 4);
2483         }
2484
2485         ctx.abi.emit_primitive = visit_end_primitive;
2486      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
2487      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
2488         ctx.abi.load_base_vertex = radv_load_base_vertex;
2489         ctx.abi.load_inputs = radv_load_vs_inputs;
2490      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
2491         ctx.abi.load_sample_position = load_sample_position;
2492         ctx.abi.load_sample_mask_in = load_sample_mask_in;
2493      }
2494
2495      if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX &&
2496          args->shader_info->is_ngg &&
2497          args->shader_info->vs.outinfo.export_prim_id) {
2498         declare_esgs_ring(&ctx);
2499      }
2500
2501      bool nested_barrier = false;
2502
2503      if (shader_idx) {
2504         if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2505             args->shader_info->is_ngg) {
2506            gfx10_ngg_gs_emit_prologue(&ctx);
2507            nested_barrier = false;
2508         } else {
2509            nested_barrier = true;
2510         }
2511      }
2512
2513      if (nested_barrier) {
2514         /* Execute a barrier before the second shader in
2515          * a merged shader.
2516          *
2517          * Execute the barrier inside the conditional block,
2518          * so that empty waves can jump directly to s_endpgm,
2519          * which will also signal the barrier.
2520          *
2521          * This is possible in gfx9, because an empty wave
2522          * for the second shader does not participate in
2523          * the epilogue. With NGG, empty waves may still
2524          * be required to export data (e.g. GS output vertices),
2525          * so we cannot let them exit early.
2526          *
2527          * If the shader is TCS and the TCS epilog is present
2528          * and contains a barrier, it will wait there and then
2529          * reach s_endpgm.
2530          */
2531         ac_emit_barrier(&ctx.ac, ctx.stage);
2532      }
2533
2534      nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(
2535         &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);
2536
2537      ac_setup_rings(&ctx);
2538
2539      LLVMBasicBlockRef merge_block = NULL;
2540      if (shader_count >= 2 || is_ngg) {
2541         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
2542         LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2543         merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2544
2545         LLVMValueRef count = ac_unpack_param(
2546            &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
2547         LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
2548         LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
2549         LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
2550
2551         LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
2552      }
2553
2554      if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
2555         prepare_interp_optimize(&ctx, shaders[shader_idx]);
2556      else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY)
2557         prepare_gs_input_vgprs(&ctx, shader_count >= 2);
2558
2559      ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);
2560
2561      if (shader_count >= 2 || is_ngg) {
2562         LLVMBuildBr(ctx.ac.builder, merge_block);
2563         LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
2564      }
2565
2566      /* This needs to be outside the if wrapping the shader body, as sometimes
2567       * the HW generates waves with 0 es/vs threads. */
2568      if (is_pre_gs_stage(shaders[shader_idx]->info.stage) &&
2569          args->shader_info->is_ngg && shader_idx == shader_count - 1) {
2570         handle_ngg_outputs_post_2(&ctx);
2571      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2572                 args->shader_info->is_ngg) {
2573         gfx10_ngg_gs_emit_epilogue_2(&ctx);
2574      }
2575   }
2576
2577   LLVMBuildRetVoid(ctx.ac.builder);
2578
2579   if (args->options->dump_preoptir) {
2580      fprintf(stderr, "%s LLVM IR:\n\n",
2581              radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage));
2582      ac_dump_module(ctx.ac.module);
2583      fprintf(stderr, "\n");
2584   }
2585
2586   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2587
2588   if (shader_count == 1)
2589      ac_nir_eliminate_const_vs_outputs(&ctx);
2590
2591   return ctx.ac.module;
2592}
2593
2594static void
2595ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
2596{
2597   unsigned *retval = (unsigned *)context;
2598   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
2599   char *description = LLVMGetDiagInfoDescription(di);
2600
2601   if (severity == LLVMDSError) {
2602      *retval = 1;
2603      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
2604   }
2605
2606   LLVMDisposeMessage(description);
2607}
2608
2609static unsigned
2610radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
2611                  struct ac_llvm_compiler *ac_llvm)
2612{
2613   unsigned retval = 0;
2614   LLVMContextRef llvm_ctx;
2615
2616   /* Setup Diagnostic Handler*/
2617   llvm_ctx = LLVMGetModuleContext(M);
2618
2619   LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);
2620
2621   /* Compile IR*/
2622   if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
2623      retval = 1;
2624   return retval;
2625}
2626
2627static void
2628ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
2629                       struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
2630                       const struct radv_nir_compiler_options *options)
2631{
2632   char *elf_buffer = NULL;
2633   size_t elf_size = 0;
2634   char *llvm_ir_string = NULL;
2635
2636   if (options->dump_shader) {
2637      fprintf(stderr, "%s LLVM IR:\n\n", name);
2638      ac_dump_module(llvm_module);
2639      fprintf(stderr, "\n");
2640   }
2641
2642   if (options->record_ir) {
2643      char *llvm_ir = LLVMPrintModuleToString(llvm_module);
2644      llvm_ir_string = strdup(llvm_ir);
2645      LLVMDisposeMessage(llvm_ir);
2646   }
2647
2648   int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
2649   if (v) {
2650      fprintf(stderr, "compile failed\n");
2651   }
2652
2653   LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
2654   LLVMDisposeModule(llvm_module);
2655   LLVMContextDispose(ctx);
2656
2657   size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
2658   size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
2659   struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
2660   memcpy(rbin->data, elf_buffer, elf_size);
2661   if (llvm_ir_string)
2662      memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
2663
2664   rbin->base.type = RADV_BINARY_TYPE_RTLD;
2665   rbin->base.stage = stage;
2666   rbin->base.total_size = alloc_size;
2667   rbin->elf_size = elf_size;
2668   rbin->llvm_ir_size = llvm_ir_size;
2669   *rbinary = &rbin->base;
2670
2671   free(llvm_ir_string);
2672   free(elf_buffer);
2673}
2674
2675static void
2676radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary,
2677                        const struct radv_shader_args *args, struct nir_shader *const *nir,
2678                        int nir_count)
2679{
2680
2681   LLVMModuleRef llvm_module;
2682
2683   llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);
2684
2685   ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
2686                          radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage),
2687                          args->options);
2688}
2689
2690static void
2691ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
2692{
2693   LLVMValueRef vtx_offset =
2694      LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2695                   LLVMConstInt(ctx->ac.i32, 4, false), "");
2696   LLVMValueRef stream_id;
2697
2698   /* Fetch the vertex stream ID. */
2699   if (ctx->args->shader_info->so.num_outputs) {
2700      stream_id =
2701         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
2702   } else {
2703      stream_id = ctx->ac.i32_0;
2704   }
2705
2706   LLVMBasicBlockRef end_bb;
2707   LLVMValueRef switch_inst;
2708
2709   end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
2710   switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
2711
2712   for (unsigned stream = 0; stream < 4; stream++) {
2713      unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2714      LLVMBasicBlockRef bb;
2715      unsigned offset;
2716
2717      if (stream > 0 && !num_components)
2718         continue;
2719
2720      if (stream > 0 && !ctx->args->shader_info->so.num_outputs)
2721         continue;
2722
2723      bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
2724      LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
2725      LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
2726
2727      offset = 0;
2728      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2729         unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2730         unsigned output_stream = ctx->args->shader_info->gs.output_streams[i];
2731         int length = util_last_bit(output_usage_mask);
2732
2733         if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2734            continue;
2735
2736         for (unsigned j = 0; j < length; j++) {
2737            LLVMValueRef value, soffset;
2738
2739            if (!(output_usage_mask & (1 << j)))
2740               continue;
2741
2742            soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
2743                                   false);
2744
2745            offset++;
2746
2747            value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
2748                                         soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false);
2749
2750            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2751            if (ac_get_type_size(type) == 2) {
2752               value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
2753               value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
2754            }
2755
2756            LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
2757                           ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2758         }
2759      }
2760
2761      if (ctx->args->shader_info->so.num_outputs)
2762         radv_emit_streamout(ctx, stream);
2763
2764      if (stream == 0) {
2765         handle_vs_outputs_post(ctx, false, ctx->args->shader_info->vs.outinfo.export_clip_dists,
2766                                &ctx->args->shader_info->vs.outinfo);
2767      }
2768
2769      LLVMBuildBr(ctx->ac.builder, end_bb);
2770   }
2771
2772   LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
2773}
2774
2775static void
2776radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader,
2777                            struct radv_shader_binary **rbinary,
2778                            const struct radv_shader_args *args)
2779{
2780   struct radv_shader_context ctx = {0};
2781   ctx.args = args;
2782
2783   assert(args->is_gs_copy_shader);
2784
2785   ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2786                        args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64);
2787   ctx.context = ctx.ac.context;
2788
2789   ctx.stage = MESA_SHADER_VERTEX;
2790   ctx.shader = geom_shader;
2791
2792   create_function(&ctx, MESA_SHADER_VERTEX, false);
2793
2794   ac_setup_rings(&ctx);
2795
2796   nir_foreach_shader_out_variable(variable, geom_shader)
2797   {
2798      scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
2799      ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
2800   }
2801
2802   ac_gs_copy_shader_emit(&ctx);
2803
2804   LLVMBuildRetVoid(ctx.ac.builder);
2805
2806   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2807
2808   ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
2809                          args->options);
2810   (*rbinary)->is_gs_copy_shader = true;
2811}
2812
2813void
2814llvm_compile_shader(struct radv_device *device, unsigned shader_count,
2815                    struct nir_shader *const *shaders, struct radv_shader_binary **binary,
2816                    struct radv_shader_args *args)
2817{
2818   enum ac_target_machine_options tm_options = 0;
2819   struct ac_llvm_compiler ac_llvm;
2820
2821   tm_options |= AC_TM_SUPPORTS_SPILL;
2822   if (args->options->check_ir)
2823      tm_options |= AC_TM_CHECK_IR;
2824
2825   radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options,
2826                           args->shader_info->wave_size);
2827
2828   if (args->is_gs_copy_shader) {
2829      radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
2830   } else {
2831      radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count);
2832   }
2833}
2834