1848b8605Smrg/*
2848b8605Smrg * Copyright 2012 Advanced Micro Devices, Inc.
3b8e80941Smrg * All Rights Reserved.
4848b8605Smrg *
5848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
6848b8605Smrg * copy of this software and associated documentation files (the "Software"),
7848b8605Smrg * to deal in the Software without restriction, including without limitation
8848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub
9848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom
10848b8605Smrg * the Software is furnished to do so, subject to the following conditions:
11848b8605Smrg *
12848b8605Smrg * The above copyright notice and this permission notice (including the next
13848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the
14848b8605Smrg * Software.
15848b8605Smrg *
16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19848b8605Smrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
23848b8605Smrg */
24848b8605Smrg
25848b8605Smrg#include "util/u_memory.h"
26b8e80941Smrg#include "util/u_string.h"
27b8e80941Smrg#include "tgsi/tgsi_build.h"
28848b8605Smrg#include "tgsi/tgsi_util.h"
29848b8605Smrg#include "tgsi/tgsi_dump.h"
30848b8605Smrg
31b8e80941Smrg#include "ac_exp_param.h"
32b8e80941Smrg#include "ac_shader_util.h"
33b8e80941Smrg#include "ac_llvm_util.h"
34b8e80941Smrg#include "si_shader_internal.h"
35848b8605Smrg#include "si_pipe.h"
36848b8605Smrg#include "sid.h"
37848b8605Smrg
38b8e80941Smrg#include "compiler/nir/nir.h"
39b8e80941Smrg
/* Symbol names for the two scratch resource descriptor dwords.
 * NOTE(review): presumably looked up by name when the shader binary is
 * processed -- confirm at the use sites.
 */
static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0";
static const char *scratch_rsrc_dword1_symbol = "SCRATCH_RSRC_DWORD1";
45848b8605Smrg
/**
 * Values and identification of one shader output.
 */
struct si_shader_output_values
{
	/* Up to four LLVM values for the output.
	 * NOTE(review): presumably one per channel (x, y, z, w) -- confirm.
	 */
	LLVMValueRef values[4];
	/* Semantic name and index identifying the output. */
	unsigned semantic_name;
	unsigned semantic_index;
	/* Vertex stream associated with each of the four values. */
	ubyte vertex_stream[4];
};
53848b8605Smrg
/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 *
 * Arguments are appended with add_arg_assign() and its wrappers; all SGPR
 * arguments have to be added before the first VGPR argument.
 */
struct si_function_info {
	/* LLVM type of each argument, indexed by argument position. */
	LLVMTypeRef types[100];
	/* Optional per-argument pointer (NULL when the argument was added
	 * through add_arg()).
	 * NOTE(review): presumably receives the argument's LLVM value once
	 * the function has been created -- confirm in the creation code.
	 */
	LLVMValueRef *assign[100];
	/* Number of leading arguments that are SGPRs. */
	unsigned num_sgpr_params;
	/* Total number of arguments recorded so far. */
	unsigned num_params;
};
64b8e80941Smrg
/* Register file in which a function argument is passed. */
enum si_arg_regfile {
	ARG_SGPR,	/* scalar register argument */
	ARG_VGPR	/* vector register argument */
};
69848b8605Smrg
/* Forward declarations of file-local functions that are referenced before
 * their definitions.
 */

/* Shader context setup. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct ac_llvm_compiler *compiler);

/* TGSI action callback for barriers. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

/* Debug output of a shader key to the given stream. */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

/* Builders for the separately described shader parts (prologs/epilogs). */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_fix_resource_usage(struct si_screen *sscreen,
				  struct si_shader *shader);
91b8e80941Smrg
/* Ideally pass the sample mask input to the PS epilog as v14, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 *
 * NOTE(review): presumably used as the lowest VGPR index at which the
 * sample mask may be placed -- confirm at the use sites.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
96b8e80941Smrg
97b8e80941Smrgstatic bool llvm_type_is_64bit(struct si_shader_context *ctx,
98b8e80941Smrg			       LLVMTypeRef type)
99b8e80941Smrg{
100b8e80941Smrg	if (type == ctx->ac.i64 || type == ctx->ac.f64)
101b8e80941Smrg		return true;
102b8e80941Smrg
103b8e80941Smrg	return false;
104b8e80941Smrg}
105b8e80941Smrg
106b8e80941Smrgstatic bool is_merged_shader(struct si_shader_context *ctx)
107848b8605Smrg{
108b8e80941Smrg	if (ctx->screen->info.chip_class <= VI)
109b8e80941Smrg		return false;
110b8e80941Smrg
111b8e80941Smrg	return ctx->shader->key.as_ls ||
112b8e80941Smrg	       ctx->shader->key.as_es ||
113b8e80941Smrg	       ctx->type == PIPE_SHADER_TESS_CTRL ||
114b8e80941Smrg	       ctx->type == PIPE_SHADER_GEOMETRY;
115848b8605Smrg}
116848b8605Smrg
117b8e80941Smrgstatic void si_init_function_info(struct si_function_info *fninfo)
118b8e80941Smrg{
119b8e80941Smrg	fninfo->num_params = 0;
120b8e80941Smrg	fninfo->num_sgpr_params = 0;
121b8e80941Smrg}
122848b8605Smrg
123b8e80941Smrgstatic unsigned add_arg_assign(struct si_function_info *fninfo,
124b8e80941Smrg			enum si_arg_regfile regfile, LLVMTypeRef type,
125b8e80941Smrg			LLVMValueRef *assign)
126b8e80941Smrg{
127b8e80941Smrg	assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
128848b8605Smrg
129b8e80941Smrg	unsigned idx = fninfo->num_params++;
130b8e80941Smrg	assert(idx < ARRAY_SIZE(fninfo->types));
131848b8605Smrg
132b8e80941Smrg	if (regfile == ARG_SGPR)
133b8e80941Smrg		fninfo->num_sgpr_params = fninfo->num_params;
134848b8605Smrg
135b8e80941Smrg	fninfo->types[idx] = type;
136b8e80941Smrg	fninfo->assign[idx] = assign;
137b8e80941Smrg	return idx;
138b8e80941Smrg}
139848b8605Smrg
140b8e80941Smrgstatic unsigned add_arg(struct si_function_info *fninfo,
141b8e80941Smrg			enum si_arg_regfile regfile, LLVMTypeRef type)
142b8e80941Smrg{
143b8e80941Smrg	return add_arg_assign(fninfo, regfile, type, NULL);
144b8e80941Smrg}
145848b8605Smrg
146b8e80941Smrgstatic void add_arg_assign_checked(struct si_function_info *fninfo,
147b8e80941Smrg				   enum si_arg_regfile regfile, LLVMTypeRef type,
148b8e80941Smrg				   LLVMValueRef *assign, unsigned idx)
149b8e80941Smrg{
150b8e80941Smrg	MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
151b8e80941Smrg	assert(actual == idx);
152b8e80941Smrg}
153848b8605Smrg
154b8e80941Smrgstatic void add_arg_checked(struct si_function_info *fninfo,
155b8e80941Smrg			    enum si_arg_regfile regfile, LLVMTypeRef type,
156b8e80941Smrg			    unsigned idx)
157b8e80941Smrg{
158b8e80941Smrg	add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
159b8e80941Smrg}
160848b8605Smrg
161848b8605Smrg/**
162b8e80941Smrg * Returns a unique index for a per-patch semantic name and index. The index
163b8e80941Smrg * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
164b8e80941Smrg * can be calculated.
165848b8605Smrg */
166b8e80941Smrgunsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
167848b8605Smrg{
168b8e80941Smrg	switch (semantic_name) {
169b8e80941Smrg	case TGSI_SEMANTIC_TESSOUTER:
170b8e80941Smrg		return 0;
171b8e80941Smrg	case TGSI_SEMANTIC_TESSINNER:
172b8e80941Smrg		return 1;
173b8e80941Smrg	case TGSI_SEMANTIC_PATCH:
174b8e80941Smrg		assert(index < 30);
175b8e80941Smrg		return 2 + index;
176848b8605Smrg
177b8e80941Smrg	default:
178b8e80941Smrg		assert(!"invalid semantic name");
179b8e80941Smrg		return 0;
180b8e80941Smrg	}
181b8e80941Smrg}
182848b8605Smrg
183b8e80941Smrg/**
184b8e80941Smrg * Returns a unique index for a semantic name and index. The index must be
185b8e80941Smrg * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
186b8e80941Smrg * calculated.
187b8e80941Smrg */
188b8e80941Smrgunsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
189b8e80941Smrg				       unsigned is_varying)
190b8e80941Smrg{
191b8e80941Smrg	switch (semantic_name) {
192b8e80941Smrg	case TGSI_SEMANTIC_POSITION:
193b8e80941Smrg		return 0;
194b8e80941Smrg	case TGSI_SEMANTIC_GENERIC:
195b8e80941Smrg		/* Since some shader stages use the the highest used IO index
196b8e80941Smrg		 * to determine the size to allocate for inputs/outputs
197b8e80941Smrg		 * (in LDS, tess and GS rings). GENERIC should be placed right
198b8e80941Smrg		 * after POSITION to make that size as small as possible.
199b8e80941Smrg		 */
200b8e80941Smrg		if (index < SI_MAX_IO_GENERIC)
201b8e80941Smrg			return 1 + index;
202b8e80941Smrg
203b8e80941Smrg		assert(!"invalid generic index");
204b8e80941Smrg		return 0;
205b8e80941Smrg	case TGSI_SEMANTIC_PSIZE:
206b8e80941Smrg		return SI_MAX_IO_GENERIC + 1;
207b8e80941Smrg	case TGSI_SEMANTIC_CLIPDIST:
208b8e80941Smrg		assert(index <= 1);
209b8e80941Smrg		return SI_MAX_IO_GENERIC + 2 + index;
210b8e80941Smrg	case TGSI_SEMANTIC_FOG:
211b8e80941Smrg		return SI_MAX_IO_GENERIC + 4;
212b8e80941Smrg	case TGSI_SEMANTIC_LAYER:
213b8e80941Smrg		return SI_MAX_IO_GENERIC + 5;
214b8e80941Smrg	case TGSI_SEMANTIC_VIEWPORT_INDEX:
215b8e80941Smrg		return SI_MAX_IO_GENERIC + 6;
216b8e80941Smrg	case TGSI_SEMANTIC_PRIMID:
217b8e80941Smrg		return SI_MAX_IO_GENERIC + 7;
218b8e80941Smrg	case TGSI_SEMANTIC_COLOR:
219b8e80941Smrg		assert(index < 2);
220b8e80941Smrg		return SI_MAX_IO_GENERIC + 8 + index;
221b8e80941Smrg	case TGSI_SEMANTIC_BCOLOR:
222b8e80941Smrg		assert(index < 2);
223b8e80941Smrg		/* If it's a varying, COLOR and BCOLOR alias. */
224b8e80941Smrg		if (is_varying)
225b8e80941Smrg			return SI_MAX_IO_GENERIC + 8 + index;
226b8e80941Smrg		else
227b8e80941Smrg			return SI_MAX_IO_GENERIC + 10 + index;
228b8e80941Smrg	case TGSI_SEMANTIC_TEXCOORD:
229b8e80941Smrg		assert(index < 8);
230b8e80941Smrg		STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63);
231b8e80941Smrg		return SI_MAX_IO_GENERIC + 12 + index;
232b8e80941Smrg	case TGSI_SEMANTIC_CLIPVERTEX:
233b8e80941Smrg		return 63;
234b8e80941Smrg	default:
235b8e80941Smrg		fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
236b8e80941Smrg		assert(!"invalid semantic name");
237b8e80941Smrg		return 0;
238b8e80941Smrg	}
239848b8605Smrg}
240848b8605Smrg
241b8e80941Smrg/**
242b8e80941Smrg * Get the value of a shader input parameter and extract a bitfield.
243b8e80941Smrg */
244b8e80941Smrgstatic LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
245b8e80941Smrg				      LLVMValueRef value, unsigned rshift,
246b8e80941Smrg				      unsigned bitwidth)
247848b8605Smrg{
248b8e80941Smrg	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
249b8e80941Smrg		value = ac_to_integer(&ctx->ac, value);
250848b8605Smrg
251b8e80941Smrg	if (rshift)
252b8e80941Smrg		value = LLVMBuildLShr(ctx->ac.builder, value,
253b8e80941Smrg				      LLVMConstInt(ctx->i32, rshift, 0), "");
254848b8605Smrg
255b8e80941Smrg	if (rshift + bitwidth < 32) {
256b8e80941Smrg		unsigned mask = (1 << bitwidth) - 1;
257b8e80941Smrg		value = LLVMBuildAnd(ctx->ac.builder, value,
258b8e80941Smrg				     LLVMConstInt(ctx->i32, mask, 0), "");
259b8e80941Smrg	}
260848b8605Smrg
261b8e80941Smrg	return value;
262848b8605Smrg}
263848b8605Smrg
264b8e80941SmrgLLVMValueRef si_unpack_param(struct si_shader_context *ctx,
265b8e80941Smrg			     unsigned param, unsigned rshift,
266b8e80941Smrg			     unsigned bitwidth)
267848b8605Smrg{
268b8e80941Smrg	LLVMValueRef value = LLVMGetParam(ctx->main_fn, param);
269848b8605Smrg
270b8e80941Smrg	return unpack_llvm_param(ctx, value, rshift, bitwidth);
271b8e80941Smrg}
272848b8605Smrg
273b8e80941Smrgstatic LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
274b8e80941Smrg{
275b8e80941Smrg	switch (ctx->type) {
276b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
277b8e80941Smrg		return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8);
278b8e80941Smrg
279b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
280b8e80941Smrg		return LLVMGetParam(ctx->main_fn,
281b8e80941Smrg				    ctx->param_tes_rel_patch_id);
282b8e80941Smrg
283b8e80941Smrg	default:
284b8e80941Smrg		assert(0);
285b8e80941Smrg		return NULL;
286848b8605Smrg	}
287b8e80941Smrg}
288b8e80941Smrg
289b8e80941Smrg/* Tessellation shaders pass outputs to the next shader using LDS.
290b8e80941Smrg *
291b8e80941Smrg * LS outputs = TCS inputs
292b8e80941Smrg * TCS outputs = TES inputs
293b8e80941Smrg *
294b8e80941Smrg * The LDS layout is:
295b8e80941Smrg * - TCS inputs for patch 0
296b8e80941Smrg * - TCS inputs for patch 1
297b8e80941Smrg * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
298b8e80941Smrg * - ...
299b8e80941Smrg * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
300b8e80941Smrg * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
301b8e80941Smrg * - TCS outputs for patch 1
302b8e80941Smrg * - Per-patch TCS outputs for patch 1
303b8e80941Smrg * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
304b8e80941Smrg * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
305b8e80941Smrg * - ...
306b8e80941Smrg *
307b8e80941Smrg * All three shaders VS(LS), TCS, TES share the same LDS space.
308b8e80941Smrg */
309848b8605Smrg
310b8e80941Smrgstatic LLVMValueRef
311b8e80941Smrgget_tcs_in_patch_stride(struct si_shader_context *ctx)
312b8e80941Smrg{
313b8e80941Smrg	return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
314848b8605Smrg}
315848b8605Smrg
316b8e80941Smrgstatic unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
317848b8605Smrg{
318b8e80941Smrg	assert(ctx->type == PIPE_SHADER_TESS_CTRL);
319848b8605Smrg
320b8e80941Smrg	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
321b8e80941Smrg		return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
322848b8605Smrg
323b8e80941Smrg	return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
324b8e80941Smrg}
325848b8605Smrg
326b8e80941Smrgstatic LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
327b8e80941Smrg{
328b8e80941Smrg	unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
329848b8605Smrg
330b8e80941Smrg	return LLVMConstInt(ctx->i32, stride, 0);
331b8e80941Smrg}
332848b8605Smrg
333b8e80941Smrgstatic LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
334b8e80941Smrg{
335b8e80941Smrg	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
336b8e80941Smrg		return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
337b8e80941Smrg
338b8e80941Smrg	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
339b8e80941Smrg	unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
340b8e80941Smrg	unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
341b8e80941Smrg	unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
342b8e80941Smrg	unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
343b8e80941Smrg				   num_patch_outputs * 4;
344b8e80941Smrg	return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
345b8e80941Smrg}
346848b8605Smrg
347b8e80941Smrgstatic LLVMValueRef
348b8e80941Smrgget_tcs_out_patch0_offset(struct si_shader_context *ctx)
349b8e80941Smrg{
350b8e80941Smrg	return LLVMBuildMul(ctx->ac.builder,
351b8e80941Smrg			    si_unpack_param(ctx,
352b8e80941Smrg					    ctx->param_tcs_out_lds_offsets,
353b8e80941Smrg					    0, 16),
354b8e80941Smrg			    LLVMConstInt(ctx->i32, 4, 0), "");
355b8e80941Smrg}
356848b8605Smrg
357b8e80941Smrgstatic LLVMValueRef
358b8e80941Smrgget_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
359b8e80941Smrg{
360b8e80941Smrg	return LLVMBuildMul(ctx->ac.builder,
361b8e80941Smrg			    si_unpack_param(ctx,
362b8e80941Smrg					    ctx->param_tcs_out_lds_offsets,
363b8e80941Smrg					    16, 16),
364b8e80941Smrg			    LLVMConstInt(ctx->i32, 4, 0), "");
365b8e80941Smrg}
366848b8605Smrg
367b8e80941Smrgstatic LLVMValueRef
368b8e80941Smrgget_tcs_in_current_patch_offset(struct si_shader_context *ctx)
369b8e80941Smrg{
370b8e80941Smrg	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
371b8e80941Smrg	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
372b8e80941Smrg
373b8e80941Smrg	return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
374848b8605Smrg}
375848b8605Smrg
376b8e80941Smrgstatic LLVMValueRef
377b8e80941Smrgget_tcs_out_current_patch_offset(struct si_shader_context *ctx)
378848b8605Smrg{
379b8e80941Smrg	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
380b8e80941Smrg	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
381b8e80941Smrg	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
382848b8605Smrg
383b8e80941Smrg	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
384b8e80941Smrg}
385b8e80941Smrg
386b8e80941Smrgstatic LLVMValueRef
387b8e80941Smrgget_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
388b8e80941Smrg{
389b8e80941Smrg	LLVMValueRef patch0_patch_data_offset =
390b8e80941Smrg		get_tcs_out_patch0_patch_data_offset(ctx);
391b8e80941Smrg	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
392b8e80941Smrg	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
393848b8605Smrg
394b8e80941Smrg	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
395848b8605Smrg}
396848b8605Smrg
397b8e80941Smrgstatic LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
398b8e80941Smrg{
399b8e80941Smrg	unsigned tcs_out_vertices =
400b8e80941Smrg		ctx->shader->selector ?
401b8e80941Smrg		ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
402848b8605Smrg
403b8e80941Smrg	/* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
404b8e80941Smrg	if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
405b8e80941Smrg		return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
406848b8605Smrg
407b8e80941Smrg	return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
408b8e80941Smrg}
409b8e80941Smrg
410b8e80941Smrgstatic LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
411b8e80941Smrg{
412b8e80941Smrg	unsigned stride;
413b8e80941Smrg
414b8e80941Smrg	switch (ctx->type) {
415b8e80941Smrg	case PIPE_SHADER_VERTEX:
416b8e80941Smrg		stride = ctx->shader->selector->lshs_vertex_stride / 4;
417b8e80941Smrg		return LLVMConstInt(ctx->i32, stride, 0);
418b8e80941Smrg
419b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
420b8e80941Smrg		if (ctx->screen->info.chip_class >= GFX9 &&
421b8e80941Smrg		    ctx->shader->is_monolithic) {
422b8e80941Smrg			stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
423b8e80941Smrg			return LLVMConstInt(ctx->i32, stride, 0);
424848b8605Smrg		}
425b8e80941Smrg		return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
426b8e80941Smrg
427b8e80941Smrg	default:
428b8e80941Smrg		assert(0);
429b8e80941Smrg		return NULL;
430848b8605Smrg	}
431b8e80941Smrg}
432848b8605Smrg
433b8e80941Smrg/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
434b8e80941Smrg * to float. */
435b8e80941Smrgstatic LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
436b8e80941Smrg					    LLVMValueRef vec4,
437b8e80941Smrg					    unsigned double_index)
438b8e80941Smrg{
439b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
440b8e80941Smrg	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
441b8e80941Smrg	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
442b8e80941Smrg					      LLVMVectorType(f64, 2), "");
443b8e80941Smrg	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
444b8e80941Smrg	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
445b8e80941Smrg	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
446848b8605Smrg}
447848b8605Smrg
448b8e80941Smrgstatic LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
449b8e80941Smrg				 LLVMValueRef i32, unsigned index)
450848b8605Smrg{
451b8e80941Smrg	assert(index <= 1);
452848b8605Smrg
453b8e80941Smrg	if (index == 1)
454b8e80941Smrg		return LLVMBuildAShr(ctx->ac.builder, i32,
455b8e80941Smrg				     LLVMConstInt(ctx->i32, 16, 0), "");
456848b8605Smrg
457b8e80941Smrg	return LLVMBuildSExt(ctx->ac.builder,
458b8e80941Smrg			     LLVMBuildTrunc(ctx->ac.builder, i32,
459b8e80941Smrg					    ctx->ac.i16, ""),
460b8e80941Smrg			     ctx->i32, "");
461b8e80941Smrg}
462848b8605Smrg
/**
 * Load one vertex shader input and return its four channel values in out[].
 *
 * There are two paths:
 * - If the shader sets TGSI_PROPERTY_VS_BLIT_SGPRS, nothing is fetched from
 *   memory. The values are built from the function parameters starting at
 *   ctx->param_vs_blit_inputs and from the vertex ID. Input 0 is the
 *   position, input 1 is either a color or texture coordinates.
 * - Otherwise the buffer descriptor at index "input_index" is loaded from
 *   the vertex buffer list, the attribute is fetched with one or more
 *   ac_build_buffer_load_format() calls, and the result is post-processed
 *   according to ctx->shader->key.mono.vs_fix_fetch[input_index].
 *
 * \param ctx          shader context
 * \param input_index  index of the vertex input to load
 * \param out          receives the four channel values
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];

	if (vs_blit_property) {
		LLVMValueRef vertex_id = ctx->abi.vertex_id;
		/* vertex_id <= 1 (unsigned compare) selects x1, otherwise x2. */
		LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntULE, vertex_id,
						    ctx->i32_1, "");
		/* Use LLVMIntNE, because we have 3 vertices and only
		 * the middle one should use y2.
		 */
		LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntNE, vertex_id,
						    ctx->i32_1, "");

		if (input_index == 0) {
			/* Position: */
			/* Two parameters, each packing a pair of signed
			 * 16-bit coordinates: (x1, y1) and (x2, y2).
			 */
			LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs);
			LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs + 1);

			LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
			LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
			LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
			LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

			LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
							 x1, x2, "");
			LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
							 y1, y2, "");

			out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
			out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
			/* z is passed through from the third blit parameter;
			 * w is ctx->ac.f32_1.
			 */
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 2);
			out[3] = ctx->ac.f32_1;
			return;
		}

		/* Color or texture coordinates: */
		assert(input_index == 1);

		if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
			/* The color is passed as four consecutive parameters. */
			for (int i = 0; i < 4; i++) {
				out[i] = LLVMGetParam(ctx->main_fn,
						      ctx->param_vs_blit_inputs + 3 + i);
			}
		} else {
			assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
			/* The first two components are chosen per vertex with
			 * the same selectors as the position; the last two
			 * are passed through unchanged.
			 */
			LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 3);
			LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 4);
			LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 5);
			LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 6);

			out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
						 x1, x2, "");
			out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
						 y1, y2, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 7);
			out[3] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 8);
		}
		return;
	}

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;
	unsigned num_channels;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	/* One entry per fetch; at most 3 fetches are issued below. */
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	/* Descriptor of this input, read from entry input_index of the list. */
	t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

	/* Each input has its own vertex index parameter. */
	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RG_64_FLOAT:
		num_fetches = 1; /* 1 2-dword or 4-dword load */
		fetch_stride = 0;
		if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
			num_channels = 4; /* 2 doubles in 4 dwords */
		else
			num_channels = 2; /* 1 double in 2 dwords */
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		num_channels = 2;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		num_channels = 4;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		/* 3 single-channel loads, 1 byte apart */
		num_fetches = 3;
		fetch_stride = 1;
		num_channels = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* 3 single-channel loads, 2 bytes apart */
		num_fetches = 3;
		fetch_stride = 2;
		num_channels = 1;
		break;
	default:
		/* Single load of as many channels as the shader uses. */
		num_fetches = 1;
		fetch_stride = 0;
		num_channels = util_last_bit(info->input_usage_mask[input_index]);
	}

	/* Issue the fetches: fetch i uses the constant offset
	 * fetch_stride * i and its result is expanded to a vec4.
	 */
	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       num_channels, false, true);
		input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
	}

	/* Break up the vec4 into individual components */
	/* This is the default result, taken from the first fetch; the
	 * fix-ups below overwrite the channels they need to.
	 */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
						    input[0], llvm_chan, "");
	}

	/* Format fix-ups. Values of fix_fetch without a case here leave
	 * out[] as extracted above.
	 */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
		else
			tmp = ac_to_integer(&ctx->ac, tmp);

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		/* Move the two relevant bits to the top of the word, then
		 * shift them back down arithmetically to sign-extend.
		 */
		tmp = LLVMBuildShl(ctx->ac.builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
			/* Clamp to -1.0 from below (the 2-bit value -2). */
			clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
		}
		/* For A2_SINT, tmp stays an integer value. */

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		/* Treat each channel as an unsigned integer and scale it by
		 * 1 / UINT_MAX.
		 */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		/* Treat each channel as a signed integer and scale it:
		 * by 1 / 0x10000 for FIXED, by 1 / INT_MAX for SNORM.
		 * NOTE(review): the comparison below assumes that both
		 * *_32_FIXED enumerators are ordered after both *_32_SNORM
		 * ones -- confirm against the enum definition.
		 */
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		/* Unsigned integer to float, no scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		/* Signed integer to float, no scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		/* Both doubles come from the single fetch; z = 0, w = 1. */
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* One double per fetch; w = 1. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two doubles per fetch. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* One channel per fetch: take element 0 of each result. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		/* Alpha is 1.0 for the non-INT formats and the integer 1
		 * (passed through ac_to_float) for the INT formats.
		 */
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
		}
		break;
	}
}
746848b8605Smrg
747b8e80941Smrgstatic void declare_input_vs(
748b8e80941Smrg	struct si_shader_context *ctx,
749b8e80941Smrg	unsigned input_index,
750b8e80941Smrg	const struct tgsi_full_declaration *decl,
751b8e80941Smrg	LLVMValueRef out[4])
752b8e80941Smrg{
753b8e80941Smrg	si_llvm_load_input_vs(ctx, input_index, out);
754b8e80941Smrg}
755b8e80941Smrg
756b8e80941Smrgstatic LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
757b8e80941Smrg				     unsigned swizzle)
758b8e80941Smrg{
759b8e80941Smrg	if (swizzle > 0)
760b8e80941Smrg		return ctx->i32_0;
761b8e80941Smrg
762b8e80941Smrg	switch (ctx->type) {
763b8e80941Smrg	case PIPE_SHADER_VERTEX:
764b8e80941Smrg		return LLVMGetParam(ctx->main_fn,
765b8e80941Smrg				    ctx->param_vs_prim_id);
766b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
767b8e80941Smrg		return ctx->abi.tcs_patch_id;
768b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
769b8e80941Smrg		return ctx->abi.tes_patch_id;
770b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
771b8e80941Smrg		return ctx->abi.gs_prim_id;
772848b8605Smrg	default:
773b8e80941Smrg		assert(0);
774b8e80941Smrg		return ctx->i32_0;
775848b8605Smrg	}
776848b8605Smrg}
777848b8605Smrg
778b8e80941Smrg/**
779b8e80941Smrg * Return the value of tgsi_ind_register for indexing.
780b8e80941Smrg * This is the indirect index with the constant offset added to it.
781b8e80941Smrg */
782b8e80941SmrgLLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
783b8e80941Smrg				   const struct tgsi_ind_register *ind,
784b8e80941Smrg				   unsigned addr_mul,
785b8e80941Smrg				   int rel_index)
786848b8605Smrg{
787848b8605Smrg	LLVMValueRef result;
788848b8605Smrg
789b8e80941Smrg	if (ind->File == TGSI_FILE_ADDRESS) {
790b8e80941Smrg		result = ctx->addrs[ind->Index][ind->Swizzle];
791b8e80941Smrg		result = LLVMBuildLoad(ctx->ac.builder, result, "");
792b8e80941Smrg	} else {
793b8e80941Smrg		struct tgsi_full_src_register src = {};
794848b8605Smrg
795b8e80941Smrg		src.Register.File = ind->File;
796b8e80941Smrg		src.Register.Index = ind->Index;
797848b8605Smrg
798b8e80941Smrg		/* Set the second index to 0 for constants. */
799b8e80941Smrg		if (ind->File == TGSI_FILE_CONSTANT)
800b8e80941Smrg			src.Register.Dimension = 1;
801848b8605Smrg
802b8e80941Smrg		result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
803b8e80941Smrg								   TGSI_TYPE_SIGNED,
804b8e80941Smrg								   ind->Swizzle);
805b8e80941Smrg		result = ac_to_integer(&ctx->ac, result);
806b8e80941Smrg	}
807848b8605Smrg
808b8e80941Smrg	return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0),
809b8e80941Smrg			     LLVMConstInt(ctx->i32, rel_index, 0));
810b8e80941Smrg}
811848b8605Smrg
812b8e80941Smrg/**
813b8e80941Smrg * Like si_get_indirect_index, but restricts the return value to a (possibly
814b8e80941Smrg * undefined) value inside [0..num).
815b8e80941Smrg */
816b8e80941SmrgLLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
817b8e80941Smrg					   const struct tgsi_ind_register *ind,
818b8e80941Smrg					   int rel_index, unsigned num)
819b8e80941Smrg{
820b8e80941Smrg	LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
821848b8605Smrg
822b8e80941Smrg	return si_llvm_bound_index(ctx, result, num);
823848b8605Smrg}
824848b8605Smrg
825b8e80941Smrgstatic LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
826b8e80941Smrg							LLVMValueRef vertex_dw_stride,
827b8e80941Smrg							LLVMValueRef base_addr,
828b8e80941Smrg							LLVMValueRef vertex_index,
829b8e80941Smrg							LLVMValueRef param_index,
830b8e80941Smrg							unsigned input_index,
831b8e80941Smrg							ubyte *name,
832b8e80941Smrg							ubyte *index,
833b8e80941Smrg							bool is_patch)
834848b8605Smrg{
835b8e80941Smrg	if (vertex_dw_stride) {
836b8e80941Smrg		base_addr = ac_build_imad(&ctx->ac, vertex_index,
837b8e80941Smrg					  vertex_dw_stride, base_addr);
838b8e80941Smrg	}
839848b8605Smrg
840b8e80941Smrg	if (param_index) {
841b8e80941Smrg		base_addr = ac_build_imad(&ctx->ac, param_index,
842b8e80941Smrg					  LLVMConstInt(ctx->i32, 4, 0), base_addr);
843b8e80941Smrg	}
844848b8605Smrg
845b8e80941Smrg	int param = is_patch ?
846b8e80941Smrg		si_shader_io_get_unique_index_patch(name[input_index],
847b8e80941Smrg						    index[input_index]) :
848b8e80941Smrg		si_shader_io_get_unique_index(name[input_index],
849b8e80941Smrg					      index[input_index], false);
850848b8605Smrg
851b8e80941Smrg	/* Add the base address of the element. */
852b8e80941Smrg	return LLVMBuildAdd(ctx->ac.builder, base_addr,
853b8e80941Smrg			    LLVMConstInt(ctx->i32, param * 4, 0), "");
854b8e80941Smrg}
855848b8605Smrg
856b8e80941Smrg/**
857b8e80941Smrg * Calculate a dword address given an input or output register and a stride.
858b8e80941Smrg */
859b8e80941Smrgstatic LLVMValueRef get_dw_address(struct si_shader_context *ctx,
860b8e80941Smrg				   const struct tgsi_full_dst_register *dst,
861b8e80941Smrg				   const struct tgsi_full_src_register *src,
862b8e80941Smrg				   LLVMValueRef vertex_dw_stride,
863b8e80941Smrg				   LLVMValueRef base_addr)
864b8e80941Smrg{
865b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
866b8e80941Smrg	ubyte *name, *index, *array_first;
867b8e80941Smrg	int input_index;
868b8e80941Smrg	struct tgsi_full_dst_register reg;
869b8e80941Smrg	LLVMValueRef vertex_index = NULL;
870b8e80941Smrg	LLVMValueRef ind_index = NULL;
871b8e80941Smrg
872b8e80941Smrg	/* Set the register description. The address computation is the same
873b8e80941Smrg	 * for sources and destinations. */
874b8e80941Smrg	if (src) {
875b8e80941Smrg		reg.Register.File = src->Register.File;
876b8e80941Smrg		reg.Register.Index = src->Register.Index;
877b8e80941Smrg		reg.Register.Indirect = src->Register.Indirect;
878b8e80941Smrg		reg.Register.Dimension = src->Register.Dimension;
879b8e80941Smrg		reg.Indirect = src->Indirect;
880b8e80941Smrg		reg.Dimension = src->Dimension;
881b8e80941Smrg		reg.DimIndirect = src->DimIndirect;
882b8e80941Smrg	} else
883b8e80941Smrg		reg = *dst;
884b8e80941Smrg
885b8e80941Smrg	/* If the register is 2-dimensional (e.g. an array of vertices
886b8e80941Smrg	 * in a primitive), calculate the base address of the vertex. */
887b8e80941Smrg	if (reg.Register.Dimension) {
888b8e80941Smrg		if (reg.Dimension.Indirect)
889b8e80941Smrg			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
890b8e80941Smrg						      1, reg.Dimension.Index);
891b8e80941Smrg		else
892b8e80941Smrg			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
893848b8605Smrg	}
894848b8605Smrg
895b8e80941Smrg	/* Get information about the register. */
896b8e80941Smrg	if (reg.Register.File == TGSI_FILE_INPUT) {
897b8e80941Smrg		name = info->input_semantic_name;
898b8e80941Smrg		index = info->input_semantic_index;
899b8e80941Smrg		array_first = info->input_array_first;
900b8e80941Smrg	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
901b8e80941Smrg		name = info->output_semantic_name;
902b8e80941Smrg		index = info->output_semantic_index;
903b8e80941Smrg		array_first = info->output_array_first;
904848b8605Smrg	} else {
905b8e80941Smrg		assert(0);
906b8e80941Smrg		return NULL;
907848b8605Smrg	}
908848b8605Smrg
909b8e80941Smrg	if (reg.Register.Indirect) {
910b8e80941Smrg		/* Add the relative address of the element. */
911b8e80941Smrg		if (reg.Indirect.ArrayID)
912b8e80941Smrg			input_index = array_first[reg.Indirect.ArrayID];
913b8e80941Smrg		else
914b8e80941Smrg			input_index = reg.Register.Index;
915848b8605Smrg
916b8e80941Smrg		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
917b8e80941Smrg						  1, reg.Register.Index - input_index);
918b8e80941Smrg	} else {
919b8e80941Smrg		input_index = reg.Register.Index;
920b8e80941Smrg	}
921848b8605Smrg
922b8e80941Smrg	return get_dw_address_from_generic_indices(ctx, vertex_dw_stride,
923b8e80941Smrg						   base_addr, vertex_index,
924b8e80941Smrg						   ind_index, input_index,
925b8e80941Smrg						   name, index,
926b8e80941Smrg						   !reg.Register.Dimension);
927b8e80941Smrg}
928848b8605Smrg
929b8e80941Smrg/* The offchip buffer layout for TCS->TES is
930b8e80941Smrg *
931b8e80941Smrg * - attribute 0 of patch 0 vertex 0
932b8e80941Smrg * - attribute 0 of patch 0 vertex 1
933b8e80941Smrg * - attribute 0 of patch 0 vertex 2
934b8e80941Smrg *   ...
935b8e80941Smrg * - attribute 0 of patch 1 vertex 0
936b8e80941Smrg * - attribute 0 of patch 1 vertex 1
937b8e80941Smrg *   ...
938b8e80941Smrg * - attribute 1 of patch 0 vertex 0
939b8e80941Smrg * - attribute 1 of patch 0 vertex 1
940b8e80941Smrg *   ...
941b8e80941Smrg * - per patch attribute 0 of patch 0
942b8e80941Smrg * - per patch attribute 0 of patch 1
943b8e80941Smrg *   ...
944b8e80941Smrg *
945b8e80941Smrg * Note that every attribute has 4 components.
946b8e80941Smrg */
947b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
948b8e80941Smrg					       LLVMValueRef rel_patch_id,
949b8e80941Smrg                                               LLVMValueRef vertex_index,
950b8e80941Smrg                                               LLVMValueRef param_index)
951b8e80941Smrg{
952b8e80941Smrg	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
953b8e80941Smrg	LLVMValueRef param_stride, constant16;
954b8e80941Smrg
955b8e80941Smrg	vertices_per_patch = get_num_tcs_out_vertices(ctx);
956b8e80941Smrg	num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
957b8e80941Smrg	total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
958b8e80941Smrg	                              num_patches, "");
959b8e80941Smrg
960b8e80941Smrg	constant16 = LLVMConstInt(ctx->i32, 16, 0);
961b8e80941Smrg	if (vertex_index) {
962b8e80941Smrg		base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
963b8e80941Smrg					  vertices_per_patch, vertex_index);
964b8e80941Smrg		param_stride = total_vertices;
965b8e80941Smrg	} else {
966b8e80941Smrg		base_addr = rel_patch_id;
967b8e80941Smrg		param_stride = num_patches;
968b8e80941Smrg	}
969848b8605Smrg
970b8e80941Smrg	base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
971b8e80941Smrg	base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
972b8e80941Smrg
973b8e80941Smrg	if (!vertex_index) {
974b8e80941Smrg		LLVMValueRef patch_data_offset =
975b8e80941Smrg		           si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
976b8e80941Smrg
977b8e80941Smrg		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
978b8e80941Smrg		                         patch_data_offset, "");
979b8e80941Smrg	}
980b8e80941Smrg	return base_addr;
981848b8605Smrg}
982848b8605Smrg
983b8e80941Smrg/* This is a generic helper that can be shared by the NIR and TGSI backends */
984b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
985b8e80941Smrg					struct si_shader_context *ctx,
986b8e80941Smrg					LLVMValueRef vertex_index,
987b8e80941Smrg					LLVMValueRef param_index,
988b8e80941Smrg					unsigned param_base,
989b8e80941Smrg					ubyte *name,
990b8e80941Smrg					ubyte *index,
991b8e80941Smrg					bool is_patch)
992848b8605Smrg{
993b8e80941Smrg	unsigned param_index_base;
994848b8605Smrg
995b8e80941Smrg	param_index_base = is_patch ?
996b8e80941Smrg		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
997b8e80941Smrg		si_shader_io_get_unique_index(name[param_base], index[param_base], false);
998b8e80941Smrg
999b8e80941Smrg	if (param_index) {
1000b8e80941Smrg		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1001b8e80941Smrg					   LLVMConstInt(ctx->i32, param_index_base, 0),
1002b8e80941Smrg					   "");
1003b8e80941Smrg	} else {
1004b8e80941Smrg		param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
1005b8e80941Smrg	}
1006848b8605Smrg
1007b8e80941Smrg	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
1008b8e80941Smrg					  vertex_index, param_index);
1009848b8605Smrg}
1010848b8605Smrg
1011b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address_from_reg(
1012b8e80941Smrg                                       struct si_shader_context *ctx,
1013b8e80941Smrg                                       const struct tgsi_full_dst_register *dst,
1014b8e80941Smrg                                       const struct tgsi_full_src_register *src)
1015848b8605Smrg{
1016b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
1017b8e80941Smrg	ubyte *name, *index, *array_first;
1018b8e80941Smrg	struct tgsi_full_src_register reg;
1019b8e80941Smrg	LLVMValueRef vertex_index = NULL;
1020b8e80941Smrg	LLVMValueRef param_index = NULL;
1021b8e80941Smrg	unsigned param_base;
1022848b8605Smrg
1023b8e80941Smrg	reg = src ? *src : tgsi_full_src_register_from_dst(dst);
1024b8e80941Smrg
1025b8e80941Smrg	if (reg.Register.Dimension) {
1026b8e80941Smrg
1027b8e80941Smrg		if (reg.Dimension.Indirect)
1028b8e80941Smrg			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
1029b8e80941Smrg							     1, reg.Dimension.Index);
1030b8e80941Smrg		else
1031b8e80941Smrg			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
1032b8e80941Smrg	}
1033b8e80941Smrg
1034b8e80941Smrg	/* Get information about the register. */
1035b8e80941Smrg	if (reg.Register.File == TGSI_FILE_INPUT) {
1036b8e80941Smrg		name = info->input_semantic_name;
1037b8e80941Smrg		index = info->input_semantic_index;
1038b8e80941Smrg		array_first = info->input_array_first;
1039b8e80941Smrg	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1040b8e80941Smrg		name = info->output_semantic_name;
1041b8e80941Smrg		index = info->output_semantic_index;
1042b8e80941Smrg		array_first = info->output_array_first;
1043b8e80941Smrg	} else {
1044b8e80941Smrg		assert(0);
1045b8e80941Smrg		return NULL;
1046b8e80941Smrg	}
1047b8e80941Smrg
1048b8e80941Smrg	if (reg.Register.Indirect) {
1049b8e80941Smrg		if (reg.Indirect.ArrayID)
1050b8e80941Smrg			param_base = array_first[reg.Indirect.ArrayID];
1051b8e80941Smrg		else
1052b8e80941Smrg			param_base = reg.Register.Index;
1053b8e80941Smrg
1054b8e80941Smrg		param_index = si_get_indirect_index(ctx, &reg.Indirect,
1055b8e80941Smrg						    1, reg.Register.Index - param_base);
1056848b8605Smrg
1057848b8605Smrg	} else {
1058b8e80941Smrg		param_base = reg.Register.Index;
1059848b8605Smrg	}
1060b8e80941Smrg
1061b8e80941Smrg	return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1062b8e80941Smrg							       param_index, param_base,
1063b8e80941Smrg							       name, index, !reg.Register.Dimension);
1064848b8605Smrg}
1065848b8605Smrg
1066b8e80941Smrgstatic LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
1067b8e80941Smrg                                LLVMTypeRef type, unsigned swizzle,
1068b8e80941Smrg                                LLVMValueRef buffer, LLVMValueRef offset,
1069b8e80941Smrg                                LLVMValueRef base, bool can_speculate)
1070848b8605Smrg{
1071b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1072b8e80941Smrg	LLVMValueRef value, value2;
1073b8e80941Smrg	LLVMTypeRef vec_type = LLVMVectorType(type, 4);
1074848b8605Smrg
1075b8e80941Smrg	if (swizzle == ~0) {
1076b8e80941Smrg		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1077b8e80941Smrg					     0, 1, 0, can_speculate, false);
1078848b8605Smrg
1079b8e80941Smrg		return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1080b8e80941Smrg	}
1081848b8605Smrg
1082b8e80941Smrg	if (!llvm_type_is_64bit(ctx, type)) {
1083b8e80941Smrg		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1084b8e80941Smrg					     0, 1, 0, can_speculate, false);
1085848b8605Smrg
1086b8e80941Smrg		value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1087b8e80941Smrg		return LLVMBuildExtractElement(ctx->ac.builder, value,
1088b8e80941Smrg		                    LLVMConstInt(ctx->i32, swizzle, 0), "");
1089b8e80941Smrg	}
1090848b8605Smrg
1091b8e80941Smrg	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1092b8e80941Smrg	                          swizzle * 4, 1, 0, can_speculate, false);
1093b8e80941Smrg
1094b8e80941Smrg	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1095b8e80941Smrg	                           swizzle * 4 + 4, 1, 0, can_speculate, false);
1096b8e80941Smrg
1097b8e80941Smrg	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1098b8e80941Smrg}
1099b8e80941Smrg
1100b8e80941Smrg/**
1101b8e80941Smrg * Load from LDS.
1102b8e80941Smrg *
1103b8e80941Smrg * \param type		output value type
1104b8e80941Smrg * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
1105b8e80941Smrg * \param dw_addr	address in dwords
1106b8e80941Smrg */
1107b8e80941Smrgstatic LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1108b8e80941Smrg			     LLVMTypeRef type, unsigned swizzle,
1109b8e80941Smrg			     LLVMValueRef dw_addr)
1110b8e80941Smrg{
1111b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1112b8e80941Smrg	LLVMValueRef value;
1113b8e80941Smrg
1114b8e80941Smrg	if (swizzle == ~0) {
1115b8e80941Smrg		LLVMValueRef values[TGSI_NUM_CHANNELS];
1116b8e80941Smrg
1117b8e80941Smrg		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1118b8e80941Smrg			values[chan] = lds_load(bld_base, type, chan, dw_addr);
1119b8e80941Smrg
1120b8e80941Smrg		return ac_build_gather_values(&ctx->ac, values,
1121b8e80941Smrg					      TGSI_NUM_CHANNELS);
1122b8e80941Smrg	}
1123b8e80941Smrg
1124b8e80941Smrg	/* Split 64-bit loads. */
1125b8e80941Smrg	if (llvm_type_is_64bit(ctx, type)) {
1126b8e80941Smrg		LLVMValueRef lo, hi;
1127848b8605Smrg
1128b8e80941Smrg		lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1129b8e80941Smrg		hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1130b8e80941Smrg		return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1131848b8605Smrg	}
1132b8e80941Smrg
1133b8e80941Smrg	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1134b8e80941Smrg			       LLVMConstInt(ctx->i32, swizzle, 0), "");
1135b8e80941Smrg
1136b8e80941Smrg	value = ac_lds_load(&ctx->ac, dw_addr);
1137b8e80941Smrg
1138b8e80941Smrg	return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1139848b8605Smrg}
1140848b8605Smrg
1141b8e80941Smrg/**
1142b8e80941Smrg * Store to LDS.
1143b8e80941Smrg *
1144b8e80941Smrg * \param swizzle	offset (typically 0..3)
1145b8e80941Smrg * \param dw_addr	address in dwords
1146b8e80941Smrg * \param value		value to store
1147b8e80941Smrg */
1148b8e80941Smrgstatic void lds_store(struct si_shader_context *ctx,
1149b8e80941Smrg		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
1150b8e80941Smrg		      LLVMValueRef value)
1151848b8605Smrg{
1152b8e80941Smrg	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1153b8e80941Smrg			       LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
1154848b8605Smrg
1155b8e80941Smrg	ac_lds_store(&ctx->ac, dw_addr, value);
1156b8e80941Smrg}
1157848b8605Smrg
/* Selects which ring get_tess_ring_descriptor() builds a descriptor for. */
enum si_tess_ring {
	/* Address from param_tcs_out_lds_layout, plus tess_offchip_ring_size. */
	TCS_FACTOR_RING,
	/* Address from param_tcs_out_lds_layout. */
	TESS_OFFCHIP_RING_TCS,
	/* Address from param_tes_offchip_addr. */
	TESS_OFFCHIP_RING_TES,
};
1163b8e80941Smrg
1164b8e80941Smrgstatic LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
1165b8e80941Smrg					     enum si_tess_ring ring)
1166b8e80941Smrg{
1167b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
1168b8e80941Smrg	unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr :
1169b8e80941Smrg							 ctx->param_tcs_out_lds_layout;
1170b8e80941Smrg	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1171b8e80941Smrg
1172b8e80941Smrg	/* TCS only receives high 13 bits of the address. */
1173b8e80941Smrg	if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
1174b8e80941Smrg		addr = LLVMBuildAnd(builder, addr,
1175b8e80941Smrg				    LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
1176b8e80941Smrg	}
1177b8e80941Smrg
1178b8e80941Smrg	if (ring == TCS_FACTOR_RING) {
1179b8e80941Smrg		unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
1180b8e80941Smrg		addr = LLVMBuildAdd(builder, addr,
1181b8e80941Smrg				    LLVMConstInt(ctx->i32, tf_offset, 0), "");
1182848b8605Smrg	}
1183b8e80941Smrg
1184b8e80941Smrg	LLVMValueRef desc[4];
1185b8e80941Smrg	desc[0] = addr;
1186b8e80941Smrg	desc[1] = LLVMConstInt(ctx->i32,
1187b8e80941Smrg			       S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
1188b8e80941Smrg	desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
1189b8e80941Smrg	desc[3] = LLVMConstInt(ctx->i32,
1190b8e80941Smrg			       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1191b8e80941Smrg			       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1192b8e80941Smrg			       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1193b8e80941Smrg			       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1194b8e80941Smrg			       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1195b8e80941Smrg			       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
1196b8e80941Smrg
1197b8e80941Smrg	return ac_build_gather_values(&ctx->ac, desc, 4);
1198848b8605Smrg}
1199848b8605Smrg
1200b8e80941Smrgstatic LLVMValueRef fetch_input_tcs(
1201b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
1202b8e80941Smrg	const struct tgsi_full_src_register *reg,
1203b8e80941Smrg	enum tgsi_opcode_type type, unsigned swizzle_in)
1204b8e80941Smrg{
1205b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1206b8e80941Smrg	LLVMValueRef dw_addr, stride;
1207b8e80941Smrg	unsigned swizzle = swizzle_in & 0xffff;
1208b8e80941Smrg	stride = get_tcs_in_vertex_dw_stride(ctx);
1209b8e80941Smrg	dw_addr = get_tcs_in_current_patch_offset(ctx);
1210b8e80941Smrg	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1211b8e80941Smrg
1212b8e80941Smrg	return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1213b8e80941Smrg}
1214848b8605Smrg
1215b8e80941Smrgstatic LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
1216b8e80941Smrg					     LLVMTypeRef type,
1217b8e80941Smrg					     LLVMValueRef vertex_index,
1218b8e80941Smrg					     LLVMValueRef param_index,
1219b8e80941Smrg					     unsigned const_index,
1220b8e80941Smrg					     unsigned location,
1221b8e80941Smrg					     unsigned driver_location,
1222b8e80941Smrg					     unsigned component,
1223b8e80941Smrg					     unsigned num_components,
1224b8e80941Smrg					     bool is_patch,
1225b8e80941Smrg					     bool is_compact,
1226b8e80941Smrg					     bool load_input)
1227b8e80941Smrg{
1228b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1229b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
1230b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1231b8e80941Smrg	LLVMValueRef dw_addr, stride;
1232b8e80941Smrg
1233b8e80941Smrg	driver_location = driver_location / 4;
1234b8e80941Smrg
1235b8e80941Smrg	if (load_input) {
1236b8e80941Smrg		stride = get_tcs_in_vertex_dw_stride(ctx);
1237b8e80941Smrg		dw_addr = get_tcs_in_current_patch_offset(ctx);
1238b8e80941Smrg	} else {
1239b8e80941Smrg		if (is_patch) {
1240b8e80941Smrg			stride = NULL;
1241b8e80941Smrg			dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1242b8e80941Smrg		} else {
1243b8e80941Smrg			stride = get_tcs_out_vertex_dw_stride(ctx);
1244b8e80941Smrg			dw_addr = get_tcs_out_current_patch_offset(ctx);
1245b8e80941Smrg		}
1246b8e80941Smrg	}
1247b8e80941Smrg
1248b8e80941Smrg	if (param_index) {
1249b8e80941Smrg		/* Add the constant index to the indirect index */
1250b8e80941Smrg		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1251b8e80941Smrg					   LLVMConstInt(ctx->i32, const_index, 0), "");
1252b8e80941Smrg	} else {
1253b8e80941Smrg		param_index = LLVMConstInt(ctx->i32, const_index, 0);
1254b8e80941Smrg	}
1255b8e80941Smrg
1256b8e80941Smrg	ubyte *names;
1257b8e80941Smrg	ubyte *indices;
1258b8e80941Smrg	if (load_input) {
1259b8e80941Smrg		names = info->input_semantic_name;
1260b8e80941Smrg		indices = info->input_semantic_index;
1261b8e80941Smrg	} else {
1262b8e80941Smrg		names = info->output_semantic_name;
1263b8e80941Smrg		indices = info->output_semantic_index;
1264b8e80941Smrg	}
1265b8e80941Smrg
1266b8e80941Smrg	dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1267b8e80941Smrg						      vertex_index, param_index,
1268b8e80941Smrg						      driver_location,
1269b8e80941Smrg						      names, indices,
1270b8e80941Smrg						      is_patch);
1271b8e80941Smrg
1272b8e80941Smrg	LLVMValueRef value[4];
1273b8e80941Smrg	for (unsigned i = 0; i < num_components; i++) {
1274b8e80941Smrg		unsigned offset = i;
1275b8e80941Smrg		if (llvm_type_is_64bit(ctx, type))
1276b8e80941Smrg			offset *= 2;
1277848b8605Smrg
1278b8e80941Smrg		offset += component;
1279b8e80941Smrg		value[i + component] = lds_load(bld_base, type, offset, dw_addr);
1280b8e80941Smrg	}
1281b8e80941Smrg
1282b8e80941Smrg	return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1283848b8605Smrg}
1284848b8605Smrg
1285b8e80941Smrgstatic LLVMValueRef fetch_output_tcs(
1286b8e80941Smrg		struct lp_build_tgsi_context *bld_base,
1287b8e80941Smrg		const struct tgsi_full_src_register *reg,
1288b8e80941Smrg		enum tgsi_opcode_type type, unsigned swizzle_in)
1289848b8605Smrg{
1290b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1291b8e80941Smrg	LLVMValueRef dw_addr, stride;
1292b8e80941Smrg	unsigned swizzle = (swizzle_in & 0xffff);
1293b8e80941Smrg
1294b8e80941Smrg	if (reg->Register.Dimension) {
1295b8e80941Smrg		stride = get_tcs_out_vertex_dw_stride(ctx);
1296b8e80941Smrg		dw_addr = get_tcs_out_current_patch_offset(ctx);
1297b8e80941Smrg		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1298b8e80941Smrg	} else {
1299b8e80941Smrg		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1300b8e80941Smrg		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1301b8e80941Smrg	}
1302848b8605Smrg
1303b8e80941Smrg	return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1304b8e80941Smrg}
1305848b8605Smrg
1306b8e80941Smrgstatic LLVMValueRef fetch_input_tes(
1307b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
1308b8e80941Smrg	const struct tgsi_full_src_register *reg,
1309b8e80941Smrg	enum tgsi_opcode_type type, unsigned swizzle_in)
1310b8e80941Smrg{
1311b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1312b8e80941Smrg	LLVMValueRef base, addr;
1313b8e80941Smrg	unsigned swizzle = (swizzle_in & 0xffff);
1314848b8605Smrg
1315b8e80941Smrg	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1316b8e80941Smrg	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1317848b8605Smrg
1318b8e80941Smrg	return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1319b8e80941Smrg			   ctx->tess_offchip_ring, base, addr, true);
1320b8e80941Smrg}
1321848b8605Smrg
1322b8e80941SmrgLLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
1323b8e80941Smrg				   LLVMTypeRef type,
1324b8e80941Smrg				   LLVMValueRef vertex_index,
1325b8e80941Smrg				   LLVMValueRef param_index,
1326b8e80941Smrg				   unsigned const_index,
1327b8e80941Smrg				   unsigned location,
1328b8e80941Smrg				   unsigned driver_location,
1329b8e80941Smrg				   unsigned component,
1330b8e80941Smrg				   unsigned num_components,
1331b8e80941Smrg				   bool is_patch,
1332b8e80941Smrg				   bool is_compact,
1333b8e80941Smrg				   bool load_input)
1334b8e80941Smrg{
1335b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1336b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
1337b8e80941Smrg	LLVMValueRef base, addr;
1338848b8605Smrg
1339b8e80941Smrg	driver_location = driver_location / 4;
1340848b8605Smrg
1341b8e80941Smrg	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1342848b8605Smrg
1343b8e80941Smrg	if (param_index) {
1344b8e80941Smrg		/* Add the constant index to the indirect index */
1345b8e80941Smrg		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1346b8e80941Smrg					   LLVMConstInt(ctx->i32, const_index, 0), "");
1347b8e80941Smrg	} else {
1348b8e80941Smrg		param_index = LLVMConstInt(ctx->i32, const_index, 0);
1349b8e80941Smrg	}
1350848b8605Smrg
1351b8e80941Smrg	addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1352b8e80941Smrg							       param_index, driver_location,
1353b8e80941Smrg							       info->input_semantic_name,
1354b8e80941Smrg							       info->input_semantic_index,
1355b8e80941Smrg							       is_patch);
1356b8e80941Smrg
1357b8e80941Smrg	/* TODO: This will generate rather ordinary llvm code, although it
1358b8e80941Smrg	 * should be easy for the optimiser to fix up. In future we might want
1359b8e80941Smrg	 * to refactor buffer_load(), but for now this maximises code sharing
1360b8e80941Smrg	 * between the NIR and TGSI backends.
1361b8e80941Smrg	 */
1362b8e80941Smrg	LLVMValueRef value[4];
1363b8e80941Smrg	for (unsigned i = 0; i < num_components; i++) {
1364b8e80941Smrg		unsigned offset = i;
1365b8e80941Smrg		if (llvm_type_is_64bit(ctx, type))
1366b8e80941Smrg			offset *= 2;
1367b8e80941Smrg
1368b8e80941Smrg		offset += component;
1369b8e80941Smrg		value[i + component] = buffer_load(&ctx->bld_base, type, offset,
1370b8e80941Smrg						   ctx->tess_offchip_ring, base, addr, true);
1371b8e80941Smrg	}
1372b8e80941Smrg
1373b8e80941Smrg	return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1374b8e80941Smrg}
1375848b8605Smrg
1376b8e80941Smrgstatic void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1377b8e80941Smrg			     const struct tgsi_full_instruction *inst,
1378b8e80941Smrg			     const struct tgsi_opcode_info *info,
1379b8e80941Smrg			     unsigned index,
1380b8e80941Smrg			     LLVMValueRef dst[4])
1381b8e80941Smrg{
1382b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1383b8e80941Smrg	const struct tgsi_full_dst_register *reg = &inst->Dst[index];
1384b8e80941Smrg	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1385b8e80941Smrg	unsigned chan_index;
1386b8e80941Smrg	LLVMValueRef dw_addr, stride;
1387b8e80941Smrg	LLVMValueRef buffer, base, buf_addr;
1388b8e80941Smrg	LLVMValueRef values[4];
1389b8e80941Smrg	bool skip_lds_store;
1390b8e80941Smrg	bool is_tess_factor = false, is_tess_inner = false;
1391b8e80941Smrg
1392b8e80941Smrg	/* Only handle per-patch and per-vertex outputs here.
1393b8e80941Smrg	 * Vectors will be lowered to scalars and this function will be called again.
1394b8e80941Smrg	 */
1395b8e80941Smrg	if (reg->Register.File != TGSI_FILE_OUTPUT ||
1396b8e80941Smrg	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1397b8e80941Smrg		si_llvm_emit_store(bld_base, inst, info, index, dst);
1398b8e80941Smrg		return;
1399b8e80941Smrg	}
1400848b8605Smrg
1401b8e80941Smrg	if (reg->Register.Dimension) {
1402b8e80941Smrg		stride = get_tcs_out_vertex_dw_stride(ctx);
1403b8e80941Smrg		dw_addr = get_tcs_out_current_patch_offset(ctx);
1404b8e80941Smrg		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1405b8e80941Smrg		skip_lds_store = !sh_info->reads_pervertex_outputs;
1406b8e80941Smrg	} else {
1407b8e80941Smrg		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1408b8e80941Smrg		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1409b8e80941Smrg		skip_lds_store = !sh_info->reads_perpatch_outputs;
1410b8e80941Smrg
1411b8e80941Smrg		if (!reg->Register.Indirect) {
1412b8e80941Smrg			int name = sh_info->output_semantic_name[reg->Register.Index];
1413b8e80941Smrg
1414b8e80941Smrg			/* Always write tess factors into LDS for the TCS epilog. */
1415b8e80941Smrg			if (name == TGSI_SEMANTIC_TESSINNER ||
1416b8e80941Smrg			    name == TGSI_SEMANTIC_TESSOUTER) {
1417b8e80941Smrg				/* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1418b8e80941Smrg				skip_lds_store = !sh_info->reads_tessfactor_outputs &&
1419b8e80941Smrg						 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1420b8e80941Smrg				is_tess_factor = true;
1421b8e80941Smrg				is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1422b8e80941Smrg			}
1423848b8605Smrg		}
1424b8e80941Smrg	}
1425848b8605Smrg
1426b8e80941Smrg	buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1427848b8605Smrg
1428b8e80941Smrg	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1429b8e80941Smrg	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1430848b8605Smrg
1431b8e80941Smrg	uint32_t writemask = reg->Register.WriteMask;
1432b8e80941Smrg	while (writemask) {
1433b8e80941Smrg		chan_index = u_bit_scan(&writemask);
1434b8e80941Smrg		LLVMValueRef value = dst[chan_index];
1435848b8605Smrg
1436b8e80941Smrg		if (inst->Instruction.Saturate)
1437b8e80941Smrg			value = ac_build_clamp(&ctx->ac, value);
1438848b8605Smrg
1439b8e80941Smrg		/* Skip LDS stores if there is no LDS read of this output. */
1440b8e80941Smrg		if (!skip_lds_store)
1441b8e80941Smrg			lds_store(ctx, chan_index, dw_addr, value);
1442848b8605Smrg
1443b8e80941Smrg		value = ac_to_integer(&ctx->ac, value);
1444b8e80941Smrg		values[chan_index] = value;
1445848b8605Smrg
1446b8e80941Smrg		if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
1447b8e80941Smrg			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1448b8e80941Smrg						    buf_addr, base,
1449b8e80941Smrg						    4 * chan_index, 1, 0, true, false);
1450b8e80941Smrg		}
1451848b8605Smrg
1452b8e80941Smrg		/* Write tess factors into VGPRs for the epilog. */
1453b8e80941Smrg		if (is_tess_factor &&
1454b8e80941Smrg		    ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1455b8e80941Smrg			if (!is_tess_inner) {
1456b8e80941Smrg				LLVMBuildStore(ctx->ac.builder, value, /* outer */
1457b8e80941Smrg					       ctx->invoc0_tess_factors[chan_index]);
1458b8e80941Smrg			} else if (chan_index < 2) {
1459b8e80941Smrg				LLVMBuildStore(ctx->ac.builder, value, /* inner */
1460b8e80941Smrg					       ctx->invoc0_tess_factors[4 + chan_index]);
1461848b8605Smrg			}
1462848b8605Smrg		}
1463848b8605Smrg	}
1464848b8605Smrg
1465b8e80941Smrg	if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
1466b8e80941Smrg		LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1467b8e80941Smrg		                                            values, 4);
1468b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1469b8e80941Smrg					    base, 0, 1, 0, true, false);
1470b8e80941Smrg	}
1471b8e80941Smrg}
1472848b8605Smrg
1473b8e80941Smrgstatic void si_nir_store_output_tcs(struct ac_shader_abi *abi,
1474b8e80941Smrg				    const struct nir_variable *var,
1475b8e80941Smrg				    LLVMValueRef vertex_index,
1476b8e80941Smrg				    LLVMValueRef param_index,
1477b8e80941Smrg				    unsigned const_index,
1478b8e80941Smrg				    LLVMValueRef src,
1479b8e80941Smrg				    unsigned writemask)
1480848b8605Smrg{
1481b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1482b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
1483b8e80941Smrg	const unsigned component = var->data.location_frac;
1484b8e80941Smrg	const bool is_patch = var->data.patch;
1485b8e80941Smrg	unsigned driver_location = var->data.driver_location;
1486b8e80941Smrg	LLVMValueRef dw_addr, stride;
1487b8e80941Smrg	LLVMValueRef buffer, base, addr;
1488b8e80941Smrg	LLVMValueRef values[4];
1489b8e80941Smrg	bool skip_lds_store;
1490b8e80941Smrg	bool is_tess_factor = false, is_tess_inner = false;
1491b8e80941Smrg
1492b8e80941Smrg	driver_location = driver_location / 4;
1493848b8605Smrg
1494b8e80941Smrg	if (param_index) {
1495b8e80941Smrg		/* Add the constant index to the indirect index */
1496b8e80941Smrg		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1497b8e80941Smrg					   LLVMConstInt(ctx->i32, const_index, 0), "");
1498b8e80941Smrg	} else {
1499b8e80941Smrg		if (const_index != 0)
1500b8e80941Smrg			param_index = LLVMConstInt(ctx->i32, const_index, 0);
1501848b8605Smrg	}
1502848b8605Smrg
1503b8e80941Smrg	if (!is_patch) {
1504b8e80941Smrg		stride = get_tcs_out_vertex_dw_stride(ctx);
1505b8e80941Smrg		dw_addr = get_tcs_out_current_patch_offset(ctx);
1506b8e80941Smrg		dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1507b8e80941Smrg							      vertex_index, param_index,
1508b8e80941Smrg							      driver_location,
1509b8e80941Smrg							      info->output_semantic_name,
1510b8e80941Smrg							      info->output_semantic_index,
1511b8e80941Smrg							      is_patch);
1512b8e80941Smrg
1513b8e80941Smrg		skip_lds_store = !info->reads_pervertex_outputs;
1514b8e80941Smrg	} else {
1515b8e80941Smrg		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1516b8e80941Smrg		dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
1517b8e80941Smrg							      vertex_index, param_index,
1518b8e80941Smrg							      driver_location,
1519b8e80941Smrg							      info->output_semantic_name,
1520b8e80941Smrg							      info->output_semantic_index,
1521b8e80941Smrg							      is_patch);
1522b8e80941Smrg
1523b8e80941Smrg		skip_lds_store = !info->reads_perpatch_outputs;
1524b8e80941Smrg
1525b8e80941Smrg		if (!param_index) {
1526b8e80941Smrg			int name = info->output_semantic_name[driver_location];
1527b8e80941Smrg
1528b8e80941Smrg			/* Always write tess factors into LDS for the TCS epilog. */
1529b8e80941Smrg			if (name == TGSI_SEMANTIC_TESSINNER ||
1530b8e80941Smrg			    name == TGSI_SEMANTIC_TESSOUTER) {
1531b8e80941Smrg				/* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1532b8e80941Smrg				skip_lds_store = !info->reads_tessfactor_outputs &&
1533b8e80941Smrg						 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1534b8e80941Smrg				is_tess_factor = true;
1535b8e80941Smrg				is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1536b8e80941Smrg			}
1537b8e80941Smrg		}
1538b8e80941Smrg	}
1539848b8605Smrg
1540b8e80941Smrg	buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1541b8e80941Smrg
1542b8e80941Smrg	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1543b8e80941Smrg
1544b8e80941Smrg	addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1545b8e80941Smrg							       param_index, driver_location,
1546b8e80941Smrg							       info->output_semantic_name,
1547b8e80941Smrg							       info->output_semantic_index,
1548b8e80941Smrg							       is_patch);
1549b8e80941Smrg
1550b8e80941Smrg	for (unsigned chan = 0; chan < 4; chan++) {
1551b8e80941Smrg		if (!(writemask & (1 << chan)))
1552848b8605Smrg			continue;
1553b8e80941Smrg		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1554b8e80941Smrg
1555b8e80941Smrg		/* Skip LDS stores if there is no LDS read of this output. */
1556b8e80941Smrg		if (!skip_lds_store)
1557b8e80941Smrg			lds_store(ctx, chan, dw_addr, value);
1558b8e80941Smrg
1559b8e80941Smrg		value = ac_to_integer(&ctx->ac, value);
1560b8e80941Smrg		values[chan] = value;
1561b8e80941Smrg
1562b8e80941Smrg		if (writemask != 0xF && !is_tess_factor) {
1563b8e80941Smrg			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1564b8e80941Smrg						    addr, base,
1565b8e80941Smrg						    4 * chan, 1, 0, true, false);
1566b8e80941Smrg		}
1567b8e80941Smrg
1568b8e80941Smrg		/* Write tess factors into VGPRs for the epilog. */
1569b8e80941Smrg		if (is_tess_factor &&
1570b8e80941Smrg		    ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1571b8e80941Smrg			if (!is_tess_inner) {
1572b8e80941Smrg				LLVMBuildStore(ctx->ac.builder, value, /* outer */
1573b8e80941Smrg					       ctx->invoc0_tess_factors[chan]);
1574b8e80941Smrg			} else if (chan < 2) {
1575b8e80941Smrg				LLVMBuildStore(ctx->ac.builder, value, /* inner */
1576b8e80941Smrg					       ctx->invoc0_tess_factors[4 + chan]);
1577b8e80941Smrg			}
1578b8e80941Smrg		}
1579b8e80941Smrg	}
1580b8e80941Smrg
1581b8e80941Smrg	if (writemask == 0xF && !is_tess_factor) {
1582b8e80941Smrg		LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1583b8e80941Smrg		                                            values, 4);
1584b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
1585b8e80941Smrg					    base, 0, 1, 0, true, false);
1586b8e80941Smrg	}
1587b8e80941Smrg}
1588b8e80941Smrg
1589b8e80941SmrgLLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1590b8e80941Smrg				   unsigned input_index,
1591b8e80941Smrg				   unsigned vtx_offset_param,
1592b8e80941Smrg				   LLVMTypeRef type,
1593b8e80941Smrg				   unsigned swizzle)
1594b8e80941Smrg{
1595b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1596b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1597b8e80941Smrg	struct si_shader *shader = ctx->shader;
1598b8e80941Smrg	LLVMValueRef vtx_offset, soffset;
1599b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
1600b8e80941Smrg	unsigned semantic_name = info->input_semantic_name[input_index];
1601b8e80941Smrg	unsigned semantic_index = info->input_semantic_index[input_index];
1602b8e80941Smrg	unsigned param;
1603b8e80941Smrg	LLVMValueRef value;
1604b8e80941Smrg
1605b8e80941Smrg	param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
1606b8e80941Smrg
1607b8e80941Smrg	/* GFX9 has the ESGS ring in LDS. */
1608b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
1609b8e80941Smrg		unsigned index = vtx_offset_param;
1610b8e80941Smrg
1611b8e80941Smrg		switch (index / 2) {
1612b8e80941Smrg		case 0:
1613b8e80941Smrg			vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset,
1614b8e80941Smrg						  index % 2 ? 16 : 0, 16);
1615848b8605Smrg			break;
1616b8e80941Smrg		case 1:
1617b8e80941Smrg			vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset,
1618b8e80941Smrg						  index % 2 ? 16 : 0, 16);
1619848b8605Smrg			break;
1620b8e80941Smrg		case 2:
1621b8e80941Smrg			vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset,
1622b8e80941Smrg						  index % 2 ? 16 : 0, 16);
1623848b8605Smrg			break;
1624848b8605Smrg		default:
1625b8e80941Smrg			assert(0);
1626b8e80941Smrg			return NULL;
1627848b8605Smrg		}
1628848b8605Smrg
1629b8e80941Smrg		vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1630b8e80941Smrg					  LLVMConstInt(ctx->i32, param * 4, 0), "");
1631b8e80941Smrg		return lds_load(bld_base, type, swizzle, vtx_offset);
1632b8e80941Smrg	}
1633848b8605Smrg
1634b8e80941Smrg	/* GFX6: input load from the ESGS ring in memory. */
1635b8e80941Smrg	if (swizzle == ~0) {
1636b8e80941Smrg		LLVMValueRef values[TGSI_NUM_CHANNELS];
1637b8e80941Smrg		unsigned chan;
1638b8e80941Smrg		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1639b8e80941Smrg			values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1640b8e80941Smrg							     type, chan);
1641848b8605Smrg		}
1642b8e80941Smrg		return ac_build_gather_values(&ctx->ac, values,
1643b8e80941Smrg					      TGSI_NUM_CHANNELS);
1644848b8605Smrg	}
1645848b8605Smrg
1646b8e80941Smrg	/* Get the vertex offset parameter on GFX6. */
1647b8e80941Smrg	LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
1648b8e80941Smrg
1649b8e80941Smrg	vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
1650b8e80941Smrg				  LLVMConstInt(ctx->i32, 4, 0), "");
1651b8e80941Smrg
1652b8e80941Smrg	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1653b8e80941Smrg
1654b8e80941Smrg	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1655b8e80941Smrg				     vtx_offset, soffset, 0, 1, 0, true, false);
1656b8e80941Smrg	if (llvm_type_is_64bit(ctx, type)) {
1657b8e80941Smrg		LLVMValueRef value2;
1658b8e80941Smrg		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1659b8e80941Smrg
1660b8e80941Smrg		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1661b8e80941Smrg					      ctx->i32_0, vtx_offset, soffset,
1662b8e80941Smrg					      0, 1, 0, true, false);
1663b8e80941Smrg		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1664848b8605Smrg	}
1665b8e80941Smrg	return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1666b8e80941Smrg}
1667848b8605Smrg
1668b8e80941Smrgstatic LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
1669b8e80941Smrg					 unsigned location,
1670b8e80941Smrg					 unsigned driver_location,
1671b8e80941Smrg					 unsigned component,
1672b8e80941Smrg					 unsigned num_components,
1673b8e80941Smrg					 unsigned vertex_index,
1674b8e80941Smrg					 unsigned const_index,
1675b8e80941Smrg					 LLVMTypeRef type)
1676b8e80941Smrg{
1677b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1678848b8605Smrg
1679b8e80941Smrg	LLVMValueRef value[4];
1680b8e80941Smrg	for (unsigned i = 0; i < num_components; i++) {
1681b8e80941Smrg		unsigned offset = i;
1682b8e80941Smrg		if (llvm_type_is_64bit(ctx, type))
1683b8e80941Smrg			offset *= 2;
1684848b8605Smrg
1685b8e80941Smrg		offset += component;
1686b8e80941Smrg		value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4,
1687b8e80941Smrg							     vertex_index, type, offset);
1688848b8605Smrg	}
1689848b8605Smrg
1690b8e80941Smrg	return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1691b8e80941Smrg}
1692b8e80941Smrg
1693b8e80941Smrgstatic LLVMValueRef fetch_input_gs(
1694b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
1695b8e80941Smrg	const struct tgsi_full_src_register *reg,
1696b8e80941Smrg	enum tgsi_opcode_type type,
1697b8e80941Smrg	unsigned swizzle_in)
1698b8e80941Smrg{
1699b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
1700b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
1701b8e80941Smrg	unsigned swizzle = swizzle_in & 0xffff;
1702b8e80941Smrg
1703b8e80941Smrg	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1704b8e80941Smrg	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1705b8e80941Smrg		return get_primitive_id(ctx, swizzle);
1706b8e80941Smrg
1707b8e80941Smrg	if (!reg->Register.Dimension)
1708b8e80941Smrg		return NULL;
1709b8e80941Smrg
1710b8e80941Smrg	return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1711b8e80941Smrg				     reg->Dimension.Index,
1712b8e80941Smrg				     tgsi2llvmtype(bld_base, type),
1713b8e80941Smrg				     swizzle);
1714b8e80941Smrg}
1715b8e80941Smrg
1716b8e80941Smrgstatic int lookup_interp_param_index(unsigned interpolate, unsigned location)
1717b8e80941Smrg{
1718b8e80941Smrg	switch (interpolate) {
1719b8e80941Smrg	case TGSI_INTERPOLATE_CONSTANT:
1720b8e80941Smrg		return 0;
1721b8e80941Smrg
1722b8e80941Smrg	case TGSI_INTERPOLATE_LINEAR:
1723b8e80941Smrg		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1724b8e80941Smrg			return SI_PARAM_LINEAR_SAMPLE;
1725b8e80941Smrg		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1726b8e80941Smrg			return SI_PARAM_LINEAR_CENTROID;
1727b8e80941Smrg		else
1728b8e80941Smrg			return SI_PARAM_LINEAR_CENTER;
1729b8e80941Smrg		break;
1730b8e80941Smrg	case TGSI_INTERPOLATE_COLOR:
1731b8e80941Smrg	case TGSI_INTERPOLATE_PERSPECTIVE:
1732b8e80941Smrg		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1733b8e80941Smrg			return SI_PARAM_PERSP_SAMPLE;
1734b8e80941Smrg		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1735b8e80941Smrg			return SI_PARAM_PERSP_CENTROID;
1736b8e80941Smrg		else
1737b8e80941Smrg			return SI_PARAM_PERSP_CENTER;
1738b8e80941Smrg		break;
1739b8e80941Smrg	default:
1740b8e80941Smrg		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1741b8e80941Smrg		return -1;
1742b8e80941Smrg	}
1743b8e80941Smrg}
1744b8e80941Smrg
1745b8e80941Smrgstatic LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1746b8e80941Smrg				       unsigned attr_index, unsigned chan,
1747b8e80941Smrg				       LLVMValueRef prim_mask,
1748b8e80941Smrg				       LLVMValueRef i, LLVMValueRef j)
1749b8e80941Smrg{
1750b8e80941Smrg	if (i || j) {
1751b8e80941Smrg		return ac_build_fs_interp(&ctx->ac,
1752b8e80941Smrg					  LLVMConstInt(ctx->i32, chan, 0),
1753b8e80941Smrg					  LLVMConstInt(ctx->i32, attr_index, 0),
1754b8e80941Smrg					  prim_mask, i, j);
1755b8e80941Smrg	}
1756b8e80941Smrg	return ac_build_fs_interp_mov(&ctx->ac,
1757b8e80941Smrg				      LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1758b8e80941Smrg				      LLVMConstInt(ctx->i32, chan, 0),
1759b8e80941Smrg				      LLVMConstInt(ctx->i32, attr_index, 0),
1760b8e80941Smrg				      prim_mask);
1761b8e80941Smrg}
1762b8e80941Smrg
1763b8e80941Smrg/**
1764b8e80941Smrg * Interpolate a fragment shader input.
1765b8e80941Smrg *
1766b8e80941Smrg * @param ctx		context
1767b8e80941Smrg * @param input_index		index of the input in hardware
1768b8e80941Smrg * @param semantic_name		TGSI_SEMANTIC_*
1769b8e80941Smrg * @param semantic_index	semantic index
1770b8e80941Smrg * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
1771b8e80941Smrg * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
1772b8e80941Smrg * @param interp_param		interpolation weights (i,j)
1773b8e80941Smrg * @param prim_mask		SI_PARAM_PRIM_MASK
1774b8e80941Smrg * @param face			SI_PARAM_FRONT_FACE
1775b8e80941Smrg * @param result		the return value (4 components)
1776b8e80941Smrg */
1777b8e80941Smrgstatic void interp_fs_input(struct si_shader_context *ctx,
1778b8e80941Smrg			    unsigned input_index,
1779b8e80941Smrg			    unsigned semantic_name,
1780b8e80941Smrg			    unsigned semantic_index,
1781b8e80941Smrg			    unsigned num_interp_inputs,
1782b8e80941Smrg			    unsigned colors_read_mask,
1783b8e80941Smrg			    LLVMValueRef interp_param,
1784b8e80941Smrg			    LLVMValueRef prim_mask,
1785b8e80941Smrg			    LLVMValueRef face,
1786b8e80941Smrg			    LLVMValueRef result[4])
1787b8e80941Smrg{
1788b8e80941Smrg	LLVMValueRef i = NULL, j = NULL;
1789b8e80941Smrg	unsigned chan;
1790b8e80941Smrg
1791b8e80941Smrg	/* fs.constant returns the param from the middle vertex, so it's not
1792b8e80941Smrg	 * really useful for flat shading. It's meant to be used for custom
1793b8e80941Smrg	 * interpolation (but the intrinsic can't fetch from the other two
1794b8e80941Smrg	 * vertices).
1795b8e80941Smrg	 *
1796b8e80941Smrg	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1797b8e80941Smrg	 * to do the right thing. The only reason we use fs.constant is that
1798b8e80941Smrg	 * fs.interp cannot be used on integers, because they can be equal
1799b8e80941Smrg	 * to NaN.
1800b8e80941Smrg	 *
1801b8e80941Smrg	 * When interp is false we will use fs.constant or for newer llvm,
1802b8e80941Smrg         * amdgcn.interp.mov.
1803b8e80941Smrg	 */
1804b8e80941Smrg	bool interp = interp_param != NULL;
1805b8e80941Smrg
1806b8e80941Smrg	if (interp) {
1807b8e80941Smrg		interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1808b8e80941Smrg						LLVMVectorType(ctx->f32, 2), "");
1809b8e80941Smrg
1810b8e80941Smrg		i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1811b8e80941Smrg						ctx->i32_0, "");
1812b8e80941Smrg		j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1813b8e80941Smrg						ctx->i32_1, "");
1814b8e80941Smrg	}
1815b8e80941Smrg
1816b8e80941Smrg	if (semantic_name == TGSI_SEMANTIC_COLOR &&
1817b8e80941Smrg	    ctx->shader->key.part.ps.prolog.color_two_side) {
1818b8e80941Smrg		LLVMValueRef is_face_positive;
1819b8e80941Smrg
1820b8e80941Smrg		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1821b8e80941Smrg		 * otherwise it's at offset "num_inputs".
1822b8e80941Smrg		 */
1823b8e80941Smrg		unsigned back_attr_offset = num_interp_inputs;
1824b8e80941Smrg		if (semantic_index == 1 && colors_read_mask & 0xf)
1825b8e80941Smrg			back_attr_offset += 1;
1826b8e80941Smrg
1827b8e80941Smrg		is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1828b8e80941Smrg						 face, ctx->i32_0, "");
1829b8e80941Smrg
1830b8e80941Smrg		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1831b8e80941Smrg			LLVMValueRef front, back;
1832b8e80941Smrg
1833b8e80941Smrg			front = si_build_fs_interp(ctx,
1834b8e80941Smrg						   input_index, chan,
1835b8e80941Smrg						   prim_mask, i, j);
1836b8e80941Smrg			back = si_build_fs_interp(ctx,
1837b8e80941Smrg						  back_attr_offset, chan,
1838b8e80941Smrg						  prim_mask, i, j);
1839b8e80941Smrg
1840b8e80941Smrg			result[chan] = LLVMBuildSelect(ctx->ac.builder,
1841b8e80941Smrg						is_face_positive,
1842b8e80941Smrg						front,
1843b8e80941Smrg						back,
1844b8e80941Smrg						"");
1845b8e80941Smrg		}
1846b8e80941Smrg	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
1847b8e80941Smrg		result[0] = si_build_fs_interp(ctx, input_index,
1848b8e80941Smrg					       0, prim_mask, i, j);
1849b8e80941Smrg		result[1] =
1850b8e80941Smrg		result[2] = LLVMConstReal(ctx->f32, 0.0f);
1851b8e80941Smrg		result[3] = LLVMConstReal(ctx->f32, 1.0f);
1852b8e80941Smrg	} else {
1853b8e80941Smrg		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1854b8e80941Smrg			result[chan] = si_build_fs_interp(ctx,
1855b8e80941Smrg							  input_index, chan,
1856b8e80941Smrg							  prim_mask, i, j);
1857b8e80941Smrg		}
1858b8e80941Smrg	}
1859b8e80941Smrg}
1860b8e80941Smrg
1861b8e80941Smrgvoid si_llvm_load_input_fs(
1862b8e80941Smrg	struct si_shader_context *ctx,
1863b8e80941Smrg	unsigned input_index,
1864b8e80941Smrg	LLVMValueRef out[4])
1865b8e80941Smrg{
1866b8e80941Smrg	struct si_shader *shader = ctx->shader;
1867b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
1868b8e80941Smrg	LLVMValueRef main_fn = ctx->main_fn;
1869b8e80941Smrg	LLVMValueRef interp_param = NULL;
1870b8e80941Smrg	int interp_param_idx;
1871b8e80941Smrg	enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1872b8e80941Smrg	unsigned semantic_index = info->input_semantic_index[input_index];
1873b8e80941Smrg	enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1874b8e80941Smrg	enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1875b8e80941Smrg
1876b8e80941Smrg	/* Get colors from input VGPRs (set by the prolog). */
1877b8e80941Smrg	if (semantic_name == TGSI_SEMANTIC_COLOR) {
1878b8e80941Smrg		unsigned colors_read = shader->selector->info.colors_read;
1879b8e80941Smrg		unsigned mask = colors_read >> (semantic_index * 4);
1880b8e80941Smrg		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1881b8e80941Smrg				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1882b8e80941Smrg		LLVMValueRef undef = LLVMGetUndef(ctx->f32);
1883b8e80941Smrg
1884b8e80941Smrg		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
1885b8e80941Smrg		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
1886b8e80941Smrg		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
1887b8e80941Smrg		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
1888b8e80941Smrg		return;
1889b8e80941Smrg	}
1890b8e80941Smrg
1891b8e80941Smrg	interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1892b8e80941Smrg	if (interp_param_idx == -1)
1893b8e80941Smrg		return;
1894b8e80941Smrg	else if (interp_param_idx) {
1895b8e80941Smrg		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1896b8e80941Smrg	}
1897b8e80941Smrg
1898b8e80941Smrg	interp_fs_input(ctx, input_index, semantic_name,
1899b8e80941Smrg			semantic_index, 0, /* this param is unused */
1900b8e80941Smrg			shader->selector->info.colors_read, interp_param,
1901b8e80941Smrg			ctx->abi.prim_mask,
1902b8e80941Smrg			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1903b8e80941Smrg			&out[0]);
1904b8e80941Smrg}
1905b8e80941Smrg
1906b8e80941Smrgstatic void declare_input_fs(
1907b8e80941Smrg	struct si_shader_context *ctx,
1908b8e80941Smrg	unsigned input_index,
1909b8e80941Smrg	const struct tgsi_full_declaration *decl,
1910b8e80941Smrg	LLVMValueRef out[4])
1911b8e80941Smrg{
1912b8e80941Smrg	si_llvm_load_input_fs(ctx, input_index, out);
1913b8e80941Smrg}
1914b8e80941Smrg
1915b8e80941SmrgLLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
1916b8e80941Smrg{
1917b8e80941Smrg	return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1918b8e80941Smrg}
1919b8e80941Smrg
1920b8e80941Smrgstatic LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
1921b8e80941Smrg{
1922b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1923b8e80941Smrg
1924b8e80941Smrg	/* For non-indexed draws, the base vertex set by the driver
1925b8e80941Smrg	 * (for direct draws) or the CP (for indirect draws) is the
1926b8e80941Smrg	 * first vertex ID, but GLSL expects 0 to be returned.
1927b8e80941Smrg	 */
1928b8e80941Smrg	LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn,
1929b8e80941Smrg					     ctx->param_vs_state_bits);
1930b8e80941Smrg	LLVMValueRef indexed;
1931b8e80941Smrg
1932b8e80941Smrg	indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1933b8e80941Smrg	indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1934b8e80941Smrg
1935b8e80941Smrg	return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex,
1936b8e80941Smrg			       ctx->i32_0, "");
1937b8e80941Smrg}
1938b8e80941Smrg
1939b8e80941Smrgstatic LLVMValueRef get_block_size(struct ac_shader_abi *abi)
1940b8e80941Smrg{
1941b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1942b8e80941Smrg
1943b8e80941Smrg	LLVMValueRef values[3];
1944b8e80941Smrg	LLVMValueRef result;
1945b8e80941Smrg	unsigned i;
1946b8e80941Smrg	unsigned *properties = ctx->shader->selector->info.properties;
1947b8e80941Smrg
1948b8e80941Smrg	if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1949b8e80941Smrg		unsigned sizes[3] = {
1950b8e80941Smrg			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1951b8e80941Smrg			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1952b8e80941Smrg			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1953b8e80941Smrg		};
1954b8e80941Smrg
1955b8e80941Smrg		for (i = 0; i < 3; ++i)
1956b8e80941Smrg			values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1957b8e80941Smrg
1958b8e80941Smrg		result = ac_build_gather_values(&ctx->ac, values, 3);
1959b8e80941Smrg	} else {
1960b8e80941Smrg		result = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1961b8e80941Smrg	}
1962b8e80941Smrg
1963b8e80941Smrg	return result;
1964b8e80941Smrg}
1965b8e80941Smrg
1966b8e80941Smrg/**
1967b8e80941Smrg * Load a dword from a constant buffer.
1968b8e80941Smrg */
1969b8e80941Smrgstatic LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1970b8e80941Smrg				      LLVMValueRef resource,
1971b8e80941Smrg				      LLVMValueRef offset)
1972b8e80941Smrg{
1973b8e80941Smrg	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1974b8e80941Smrg				    0, 0, 0, true, true);
1975b8e80941Smrg}
1976b8e80941Smrg
1977b8e80941Smrgstatic LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
1978b8e80941Smrg{
1979b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1980b8e80941Smrg	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1981b8e80941Smrg	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1982b8e80941Smrg	LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1983b8e80941Smrg
1984b8e80941Smrg	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1985b8e80941Smrg	LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), "");
1986b8e80941Smrg	LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1987b8e80941Smrg
1988b8e80941Smrg	LLVMValueRef pos[4] = {
1989b8e80941Smrg		buffer_load_const(ctx, resource, offset0),
1990b8e80941Smrg		buffer_load_const(ctx, resource, offset1),
1991b8e80941Smrg		LLVMConstReal(ctx->f32, 0),
1992b8e80941Smrg		LLVMConstReal(ctx->f32, 0)
1993b8e80941Smrg	};
1994b8e80941Smrg
1995b8e80941Smrg	return ac_build_gather_values(&ctx->ac, pos, 4);
1996b8e80941Smrg}
1997b8e80941Smrg
1998b8e80941Smrgstatic LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
1999b8e80941Smrg{
2000b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2001b8e80941Smrg	return ac_to_integer(&ctx->ac, abi->sample_coverage);
2002b8e80941Smrg}
2003b8e80941Smrg
2004b8e80941Smrgstatic LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
2005b8e80941Smrg{
2006b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2007b8e80941Smrg	LLVMValueRef coord[4] = {
2008b8e80941Smrg		LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
2009b8e80941Smrg		LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
2010b8e80941Smrg		ctx->ac.f32_0,
2011b8e80941Smrg		ctx->ac.f32_0
2012b8e80941Smrg	};
2013b8e80941Smrg
2014b8e80941Smrg	/* For triangles, the vector should be (u, v, 1-u-v). */
2015b8e80941Smrg	if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
2016b8e80941Smrg	    PIPE_PRIM_TRIANGLES) {
2017b8e80941Smrg		coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
2018b8e80941Smrg					 LLVMBuildFAdd(ctx->ac.builder,
2019b8e80941Smrg						       coord[0], coord[1], ""), "");
2020b8e80941Smrg	}
2021b8e80941Smrg	return ac_build_gather_values(&ctx->ac, coord, 4);
2022b8e80941Smrg}
2023b8e80941Smrg
2024b8e80941Smrgstatic LLVMValueRef load_tess_level(struct si_shader_context *ctx,
2025b8e80941Smrg				    unsigned semantic_name)
2026b8e80941Smrg{
2027b8e80941Smrg	LLVMValueRef base, addr;
2028b8e80941Smrg
2029b8e80941Smrg	int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
2030b8e80941Smrg
2031b8e80941Smrg	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2032b8e80941Smrg	addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
2033b8e80941Smrg					  LLVMConstInt(ctx->i32, param, 0));
2034b8e80941Smrg
2035b8e80941Smrg	return buffer_load(&ctx->bld_base, ctx->f32,
2036b8e80941Smrg			   ~0, ctx->tess_offchip_ring, base, addr, true);
2037b8e80941Smrg
2038b8e80941Smrg}
2039b8e80941Smrg
2040b8e80941Smrgstatic LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
2041b8e80941Smrg				       unsigned varying_id)
2042b8e80941Smrg{
2043b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2044b8e80941Smrg	unsigned semantic_name;
2045b8e80941Smrg
2046b8e80941Smrg	switch (varying_id) {
2047b8e80941Smrg	case VARYING_SLOT_TESS_LEVEL_INNER:
2048b8e80941Smrg		semantic_name = TGSI_SEMANTIC_TESSINNER;
2049b8e80941Smrg		break;
2050b8e80941Smrg	case VARYING_SLOT_TESS_LEVEL_OUTER:
2051b8e80941Smrg		semantic_name = TGSI_SEMANTIC_TESSOUTER;
2052b8e80941Smrg		break;
2053b8e80941Smrg	default:
2054b8e80941Smrg		unreachable("unknown tess level");
2055b8e80941Smrg	}
2056b8e80941Smrg
2057b8e80941Smrg	return load_tess_level(ctx, semantic_name);
2058b8e80941Smrg
2059b8e80941Smrg}
2060b8e80941Smrg
2061b8e80941Smrgstatic LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
2062b8e80941Smrg{
2063b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2064b8e80941Smrg	if (ctx->type == PIPE_SHADER_TESS_CTRL)
2065b8e80941Smrg		return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6);
2066b8e80941Smrg	else if (ctx->type == PIPE_SHADER_TESS_EVAL)
2067b8e80941Smrg		return get_num_tcs_out_vertices(ctx);
2068b8e80941Smrg	else
2069b8e80941Smrg		unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
2070b8e80941Smrg}
2071b8e80941Smrg
/**
 * Build the LLVM value for one TGSI system value declaration and store it
 * in ctx->system_values[index].
 *
 * \param ctx    shader context
 * \param index  slot in ctx->system_values; asserted to be below
 *               RADEON_LLVM_MAX_SYSTEM_VALUES
 * \param decl   TGSI declaration; decl->Semantic.Name selects the value
 *
 * An unknown semantic asserts and returns without writing the slot. Paths
 * that only assert (VERTEXID_NOBASE, INVOCATIONID on an unsupported shader
 * type) leave `value` at its initial 0, which is then stored.
 */
void si_load_system_value(struct si_shader_context *ctx,
			  unsigned index,
			  const struct tgsi_full_declaration *decl)
{
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* vertex_id + base_vertex */
		value = LLVMBuildAdd(ctx->ac.builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = get_base_vertex(&ctx->abi);
		break;

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			/* Unpacked from tcs_rel_ids: shift 8, 5 bits. */
			value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = ctx->abi.gs_invocation_id;
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* (x, y, z, 1/w) built from the SI_PARAM_POS_*_FLOAT arguments. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
				      LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)),
		};
		value = ac_build_gather_values(&ctx->ac, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = si_get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* (fract(pos.x), fract(pos.y), 0, 0) */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = ac_build_fract(&ctx->ac, pos[0], 32);
		pos[1] = ac_build_fract(&ctx->ac, pos[1], 32);
		value = ac_build_gather_values(&ctx->ac, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
		value = si_load_tess_coord(&ctx->abi);
		break;

	case TGSI_SEMANTIC_VERTICESIN:
		value = si_load_patch_vertices_in(&ctx->abi);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
		value = load_tess_level(ctx, decl->Semantic.Name);
		break;

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		/* Fetch the descriptor at slot SI_HS_CONST_DEFAULT_TESS_LEVELS
		 * of the rw_buffers list. */
		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
		/* Outer levels start at element 0, inner levels at element 4. */
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		/* Four consecutive elements, each offset scaled by 4. */
		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = ac_build_gather_values(&ctx->ac, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = ctx->abi.num_work_groups;
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
		value = get_block_size(&ctx->abi);
		break;

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		/* Workgroup IDs that are not set (NULL) read as 0. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->abi.workgroup_ids[i]) {
				values[i] = ctx->abi.workgroup_ids[i];
			}
		}
		value = ac_build_gather_values(&ctx->ac, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = ctx->abi.local_invocation_ids;
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		value = ac_build_load_helper_invocation(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		/* Hard-coded to 64. */
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* (1 << thread_id) as a 64-bit mask, returned as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
		value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* GE = ~0 << id, GT = ~1 << id; LT and LE are the bitwise
		 * complements of GE and GT respectively. The 64-bit result is
		 * returned as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
		value = LLVMBuildShl(ctx->ac.builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(ctx->ac.builder, value, "");
		value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_CS_USER_DATA:
		value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data);
		break;

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
2275b8e80941Smrg
2276b8e80941Smrgvoid si_declare_compute_memory(struct si_shader_context *ctx)
2277b8e80941Smrg{
2278b8e80941Smrg	struct si_shader_selector *sel = ctx->shader->selector;
2279b8e80941Smrg	unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
2280b8e80941Smrg
2281b8e80941Smrg	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
2282b8e80941Smrg	LLVMValueRef var;
2283b8e80941Smrg
2284b8e80941Smrg	assert(!ctx->ac.lds);
2285b8e80941Smrg
2286b8e80941Smrg	var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
2287b8e80941Smrg	                                  LLVMArrayType(ctx->i8, lds_size),
2288b8e80941Smrg	                                  "compute_lds",
2289b8e80941Smrg	                                  AC_ADDR_SPACE_LDS);
2290b8e80941Smrg	LLVMSetAlignment(var, 4);
2291b8e80941Smrg
2292b8e80941Smrg	ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
2293b8e80941Smrg}
2294b8e80941Smrg
/**
 * TGSI entry point for declaring compute shared memory.
 *
 * Asserts that the declaration's memory type is TGSI_MEMORY_TYPE_SHARED and
 * that its range covers a single register (Range.First == Range.Last), then
 * defers to si_declare_compute_memory(). Nothing else from `decl` is used.
 */
void si_tgsi_declare_compute_memory(struct si_shader_context *ctx,
				    const struct tgsi_full_declaration *decl)
{
	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);

	si_declare_compute_memory(ctx);
}
2303b8e80941Smrg
2304b8e80941Smrgstatic LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
2305b8e80941Smrg{
2306b8e80941Smrg	LLVMValueRef ptr =
2307b8e80941Smrg		LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2308b8e80941Smrg	struct si_shader_selector *sel = ctx->shader->selector;
2309b8e80941Smrg
2310b8e80941Smrg	/* Do the bounds checking with a descriptor, because
2311b8e80941Smrg	 * doing computation and manual bounds checking of 64-bit
2312b8e80941Smrg	 * addresses generates horrible VALU code with very high
2313b8e80941Smrg	 * VGPR usage and very low SIMD occupancy.
2314b8e80941Smrg	 */
2315b8e80941Smrg	ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
2316b8e80941Smrg
2317b8e80941Smrg	LLVMValueRef desc0, desc1;
2318b8e80941Smrg	desc0 = ptr;
2319b8e80941Smrg	desc1 = LLVMConstInt(ctx->i32,
2320b8e80941Smrg			     S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
2321b8e80941Smrg
2322b8e80941Smrg	LLVMValueRef desc_elems[] = {
2323b8e80941Smrg		desc0,
2324b8e80941Smrg		desc1,
2325b8e80941Smrg		LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
2326b8e80941Smrg		LLVMConstInt(ctx->i32,
2327b8e80941Smrg			S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2328b8e80941Smrg			S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2329b8e80941Smrg			S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2330b8e80941Smrg			S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2331b8e80941Smrg			S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
2332b8e80941Smrg			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
2333b8e80941Smrg	};
2334b8e80941Smrg
2335b8e80941Smrg	return ac_build_gather_values(&ctx->ac, desc_elems, 4);
2336b8e80941Smrg}
2337b8e80941Smrg
2338b8e80941Smrgstatic LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
2339b8e80941Smrg{
2340b8e80941Smrg	LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
2341b8e80941Smrg					     ctx->param_const_and_shader_buffers);
2342b8e80941Smrg
2343b8e80941Smrg	return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
2344b8e80941Smrg				     LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
2345b8e80941Smrg}
2346b8e80941Smrg
2347b8e80941Smrgstatic LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
2348b8e80941Smrg{
2349b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2350b8e80941Smrg	struct si_shader_selector *sel = ctx->shader->selector;
2351b8e80941Smrg
2352b8e80941Smrg	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2353b8e80941Smrg
2354b8e80941Smrg	if (sel->info.const_buffers_declared == 1 &&
2355b8e80941Smrg	    sel->info.shader_buffers_declared == 0) {
2356b8e80941Smrg		return load_const_buffer_desc_fast_path(ctx);
2357b8e80941Smrg	}
2358b8e80941Smrg
2359b8e80941Smrg	index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
2360b8e80941Smrg	index = LLVMBuildAdd(ctx->ac.builder, index,
2361b8e80941Smrg			     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2362b8e80941Smrg
2363b8e80941Smrg	return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2364b8e80941Smrg}
2365b8e80941Smrg
2366b8e80941Smrgstatic LLVMValueRef
2367b8e80941Smrgload_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
2368b8e80941Smrg{
2369b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2370b8e80941Smrg	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
2371b8e80941Smrg					     ctx->param_const_and_shader_buffers);
2372b8e80941Smrg
2373b8e80941Smrg	index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
2374b8e80941Smrg	index = LLVMBuildSub(ctx->ac.builder,
2375b8e80941Smrg			     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
2376b8e80941Smrg			     index, "");
2377b8e80941Smrg
2378b8e80941Smrg	return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
2379b8e80941Smrg}
2380b8e80941Smrg
2381b8e80941Smrgstatic LLVMValueRef fetch_constant(
2382b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
2383b8e80941Smrg	const struct tgsi_full_src_register *reg,
2384b8e80941Smrg	enum tgsi_opcode_type type,
2385b8e80941Smrg	unsigned swizzle_in)
2386b8e80941Smrg{
2387b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
2388b8e80941Smrg	struct si_shader_selector *sel = ctx->shader->selector;
2389b8e80941Smrg	const struct tgsi_ind_register *ireg = &reg->Indirect;
2390b8e80941Smrg	unsigned buf, idx;
2391b8e80941Smrg	unsigned swizzle = swizzle_in & 0xffff;
2392b8e80941Smrg
2393b8e80941Smrg	LLVMValueRef addr, bufp;
2394b8e80941Smrg
2395b8e80941Smrg	if (swizzle_in == LP_CHAN_ALL) {
2396b8e80941Smrg		unsigned chan;
2397b8e80941Smrg		LLVMValueRef values[4];
2398b8e80941Smrg		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
2399b8e80941Smrg			values[chan] = fetch_constant(bld_base, reg, type, chan);
2400b8e80941Smrg
2401b8e80941Smrg		return ac_build_gather_values(&ctx->ac, values, 4);
2402b8e80941Smrg	}
2403b8e80941Smrg
2404b8e80941Smrg	/* Split 64-bit loads. */
2405b8e80941Smrg	if (tgsi_type_is_64bit(type)) {
2406b8e80941Smrg		LLVMValueRef lo, hi;
2407b8e80941Smrg
2408b8e80941Smrg		lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
2409b8e80941Smrg		hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16));
2410b8e80941Smrg		return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
2411b8e80941Smrg						lo, hi);
2412b8e80941Smrg	}
2413b8e80941Smrg
2414b8e80941Smrg	idx = reg->Register.Index * 4 + swizzle;
2415b8e80941Smrg	if (reg->Register.Indirect) {
2416b8e80941Smrg		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
2417b8e80941Smrg	} else {
2418b8e80941Smrg		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
2419b8e80941Smrg	}
2420b8e80941Smrg
2421b8e80941Smrg	/* Fast path when user data SGPRs point to constant buffer 0 directly. */
2422b8e80941Smrg	if (sel->info.const_buffers_declared == 1 &&
2423b8e80941Smrg	    sel->info.shader_buffers_declared == 0) {
2424b8e80941Smrg		LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx);
2425b8e80941Smrg		LLVMValueRef result = buffer_load_const(ctx, desc, addr);
2426b8e80941Smrg		return bitcast(bld_base, type, result);
2427b8e80941Smrg	}
2428b8e80941Smrg
2429b8e80941Smrg	assert(reg->Register.Dimension);
2430b8e80941Smrg	buf = reg->Dimension.Index;
2431b8e80941Smrg
2432b8e80941Smrg	if (reg->Dimension.Indirect) {
2433b8e80941Smrg		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2434b8e80941Smrg		LLVMValueRef index;
2435b8e80941Smrg		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
2436b8e80941Smrg						      reg->Dimension.Index,
2437b8e80941Smrg						      ctx->num_const_buffers);
2438b8e80941Smrg		index = LLVMBuildAdd(ctx->ac.builder, index,
2439b8e80941Smrg				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2440b8e80941Smrg		bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2441b8e80941Smrg	} else
2442b8e80941Smrg		bufp = load_const_buffer_desc(ctx, buf);
2443b8e80941Smrg
2444b8e80941Smrg	return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
2445b8e80941Smrg}
2446b8e80941Smrg
2447b8e80941Smrg/* Initialize arguments for the shader export intrinsic */
2448b8e80941Smrgstatic void si_llvm_init_export_args(struct si_shader_context *ctx,
2449b8e80941Smrg				     LLVMValueRef *values,
2450b8e80941Smrg				     unsigned target,
2451b8e80941Smrg				     struct ac_export_args *args)
2452b8e80941Smrg{
2453b8e80941Smrg	LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2454b8e80941Smrg	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2455b8e80941Smrg	unsigned chan;
2456b8e80941Smrg	bool is_int8, is_int10;
2457b8e80941Smrg
2458b8e80941Smrg	/* Default is 0xf. Adjusted below depending on the format. */
2459b8e80941Smrg	args->enabled_channels = 0xf; /* writemask */
2460b8e80941Smrg
2461b8e80941Smrg	/* Specify whether the EXEC mask represents the valid mask */
2462b8e80941Smrg	args->valid_mask = 0;
2463b8e80941Smrg
2464b8e80941Smrg	/* Specify whether this is the last export */
2465b8e80941Smrg	args->done = 0;
2466b8e80941Smrg
2467b8e80941Smrg	/* Specify the target we are exporting */
2468b8e80941Smrg	args->target = target;
2469b8e80941Smrg
2470b8e80941Smrg	if (ctx->type == PIPE_SHADER_FRAGMENT) {
2471b8e80941Smrg		const struct si_shader_key *key = &ctx->shader->key;
2472b8e80941Smrg		unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2473b8e80941Smrg		int cbuf = target - V_008DFC_SQ_EXP_MRT;
2474b8e80941Smrg
2475b8e80941Smrg		assert(cbuf >= 0 && cbuf < 8);
2476b8e80941Smrg		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2477b8e80941Smrg		is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2478b8e80941Smrg		is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2479b8e80941Smrg	}
2480b8e80941Smrg
2481b8e80941Smrg	args->compr = false;
2482b8e80941Smrg	args->out[0] = f32undef;
2483b8e80941Smrg	args->out[1] = f32undef;
2484b8e80941Smrg	args->out[2] = f32undef;
2485b8e80941Smrg	args->out[3] = f32undef;
2486b8e80941Smrg
2487b8e80941Smrg	LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
2488b8e80941Smrg	LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
2489b8e80941Smrg			      unsigned bits, bool hi) = NULL;
2490b8e80941Smrg
2491b8e80941Smrg	switch (spi_shader_col_format) {
2492b8e80941Smrg	case V_028714_SPI_SHADER_ZERO:
2493b8e80941Smrg		args->enabled_channels = 0; /* writemask */
2494b8e80941Smrg		args->target = V_008DFC_SQ_EXP_NULL;
2495b8e80941Smrg		break;
2496b8e80941Smrg
2497b8e80941Smrg	case V_028714_SPI_SHADER_32_R:
2498b8e80941Smrg		args->enabled_channels = 1; /* writemask */
2499b8e80941Smrg		args->out[0] = values[0];
2500b8e80941Smrg		break;
2501b8e80941Smrg
2502b8e80941Smrg	case V_028714_SPI_SHADER_32_GR:
2503b8e80941Smrg		args->enabled_channels = 0x3; /* writemask */
2504b8e80941Smrg		args->out[0] = values[0];
2505b8e80941Smrg		args->out[1] = values[1];
2506b8e80941Smrg		break;
2507b8e80941Smrg
2508b8e80941Smrg	case V_028714_SPI_SHADER_32_AR:
2509b8e80941Smrg		args->enabled_channels = 0x9; /* writemask */
2510b8e80941Smrg		args->out[0] = values[0];
2511b8e80941Smrg		args->out[3] = values[3];
2512b8e80941Smrg		break;
2513b8e80941Smrg
2514b8e80941Smrg	case V_028714_SPI_SHADER_FP16_ABGR:
2515b8e80941Smrg		packf = ac_build_cvt_pkrtz_f16;
2516b8e80941Smrg		break;
2517b8e80941Smrg
2518b8e80941Smrg	case V_028714_SPI_SHADER_UNORM16_ABGR:
2519b8e80941Smrg		packf = ac_build_cvt_pknorm_u16;
2520b8e80941Smrg		break;
2521b8e80941Smrg
2522b8e80941Smrg	case V_028714_SPI_SHADER_SNORM16_ABGR:
2523b8e80941Smrg		packf = ac_build_cvt_pknorm_i16;
2524b8e80941Smrg		break;
2525b8e80941Smrg
2526b8e80941Smrg	case V_028714_SPI_SHADER_UINT16_ABGR:
2527b8e80941Smrg		packi = ac_build_cvt_pk_u16;
2528b8e80941Smrg		break;
2529b8e80941Smrg
2530b8e80941Smrg	case V_028714_SPI_SHADER_SINT16_ABGR:
2531b8e80941Smrg		packi = ac_build_cvt_pk_i16;
2532b8e80941Smrg		break;
2533b8e80941Smrg
2534b8e80941Smrg	case V_028714_SPI_SHADER_32_ABGR:
2535b8e80941Smrg		memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2536b8e80941Smrg		break;
2537b8e80941Smrg	}
2538b8e80941Smrg
2539b8e80941Smrg	/* Pack f16 or norm_i16/u16. */
2540b8e80941Smrg	if (packf) {
2541b8e80941Smrg		for (chan = 0; chan < 2; chan++) {
2542b8e80941Smrg			LLVMValueRef pack_args[2] = {
2543b8e80941Smrg				values[2 * chan],
2544b8e80941Smrg				values[2 * chan + 1]
2545b8e80941Smrg			};
2546b8e80941Smrg			LLVMValueRef packed;
2547b8e80941Smrg
2548b8e80941Smrg			packed = packf(&ctx->ac, pack_args);
2549b8e80941Smrg			args->out[chan] = ac_to_float(&ctx->ac, packed);
2550b8e80941Smrg		}
2551b8e80941Smrg		args->compr = 1; /* COMPR flag */
2552b8e80941Smrg	}
2553b8e80941Smrg	/* Pack i16/u16. */
2554b8e80941Smrg	if (packi) {
2555b8e80941Smrg		for (chan = 0; chan < 2; chan++) {
2556b8e80941Smrg			LLVMValueRef pack_args[2] = {
2557b8e80941Smrg				ac_to_integer(&ctx->ac, values[2 * chan]),
2558b8e80941Smrg				ac_to_integer(&ctx->ac, values[2 * chan + 1])
2559b8e80941Smrg			};
2560b8e80941Smrg			LLVMValueRef packed;
2561b8e80941Smrg
2562b8e80941Smrg			packed = packi(&ctx->ac, pack_args,
2563b8e80941Smrg				       is_int8 ? 8 : is_int10 ? 10 : 16,
2564b8e80941Smrg				       chan == 1);
2565b8e80941Smrg			args->out[chan] = ac_to_float(&ctx->ac, packed);
2566b8e80941Smrg		}
2567b8e80941Smrg		args->compr = 1; /* COMPR flag */
2568b8e80941Smrg	}
2569b8e80941Smrg}
2570b8e80941Smrg
2571b8e80941Smrgstatic void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2572b8e80941Smrg			  LLVMValueRef alpha)
2573b8e80941Smrg{
2574b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
2575b8e80941Smrg
2576b8e80941Smrg	if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2577b8e80941Smrg		static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2578b8e80941Smrg			[PIPE_FUNC_LESS] = LLVMRealOLT,
2579b8e80941Smrg			[PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2580b8e80941Smrg			[PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2581b8e80941Smrg			[PIPE_FUNC_GREATER] = LLVMRealOGT,
2582b8e80941Smrg			[PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2583b8e80941Smrg			[PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2584b8e80941Smrg		};
2585b8e80941Smrg		LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2586b8e80941Smrg		assert(cond);
2587b8e80941Smrg
2588b8e80941Smrg		LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2589b8e80941Smrg				SI_PARAM_ALPHA_REF);
2590b8e80941Smrg		LLVMValueRef alpha_pass =
2591b8e80941Smrg			LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2592b8e80941Smrg		ac_build_kill_if_false(&ctx->ac, alpha_pass);
2593b8e80941Smrg	} else {
2594b8e80941Smrg		ac_build_kill_if_false(&ctx->ac, ctx->i1false);
2595b8e80941Smrg	}
2596b8e80941Smrg}
2597b8e80941Smrg
2598b8e80941Smrgstatic LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2599b8e80941Smrg						  LLVMValueRef alpha,
2600b8e80941Smrg						  unsigned samplemask_param)
2601b8e80941Smrg{
2602b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
2603b8e80941Smrg	LLVMValueRef coverage;
2604b8e80941Smrg
2605b8e80941Smrg	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2606b8e80941Smrg	coverage = LLVMGetParam(ctx->main_fn,
2607b8e80941Smrg				samplemask_param);
2608b8e80941Smrg	coverage = ac_to_integer(&ctx->ac, coverage);
2609b8e80941Smrg
2610b8e80941Smrg	coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
2611b8e80941Smrg				   ctx->i32,
2612b8e80941Smrg				   &coverage, 1, AC_FUNC_ATTR_READNONE);
2613b8e80941Smrg
2614b8e80941Smrg	coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2615b8e80941Smrg				   ctx->f32, "");
2616b8e80941Smrg
2617b8e80941Smrg	coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2618b8e80941Smrg				 LLVMConstReal(ctx->f32,
2619b8e80941Smrg					1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2620b8e80941Smrg
2621b8e80941Smrg	return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2622b8e80941Smrg}
2623b8e80941Smrg
2624b8e80941Smrgstatic void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
2625b8e80941Smrg				    struct ac_export_args *pos, LLVMValueRef *out_elts)
2626b8e80941Smrg{
2627b8e80941Smrg	unsigned reg_index;
2628b8e80941Smrg	unsigned chan;
2629b8e80941Smrg	unsigned const_chan;
2630b8e80941Smrg	LLVMValueRef base_elt;
2631b8e80941Smrg	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2632b8e80941Smrg	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2633b8e80941Smrg						   SI_VS_CONST_CLIP_PLANES, 0);
2634b8e80941Smrg	LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
2635b8e80941Smrg
2636b8e80941Smrg	for (reg_index = 0; reg_index < 2; reg_index ++) {
2637b8e80941Smrg		struct ac_export_args *args = &pos[2 + reg_index];
2638b8e80941Smrg
2639b8e80941Smrg		args->out[0] =
2640b8e80941Smrg		args->out[1] =
2641b8e80941Smrg		args->out[2] =
2642b8e80941Smrg		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2643b8e80941Smrg
2644b8e80941Smrg		/* Compute dot products of position and user clip plane vectors */
2645b8e80941Smrg		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2646b8e80941Smrg			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2647b8e80941Smrg				LLVMValueRef addr =
2648b8e80941Smrg					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2649b8e80941Smrg								const_chan) * 4, 0);
2650b8e80941Smrg				base_elt = buffer_load_const(ctx, const_resource,
2651b8e80941Smrg							     addr);
2652b8e80941Smrg				args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
2653b8e80941Smrg								out_elts[const_chan], args->out[chan]);
2654b8e80941Smrg			}
2655b8e80941Smrg		}
2656b8e80941Smrg
2657b8e80941Smrg		args->enabled_channels = 0xf;
2658b8e80941Smrg		args->valid_mask = 0;
2659b8e80941Smrg		args->done = 0;
2660b8e80941Smrg		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2661b8e80941Smrg		args->compr = 0;
2662b8e80941Smrg	}
2663b8e80941Smrg}
2664b8e80941Smrg
2665b8e80941Smrgstatic void si_dump_streamout(struct pipe_stream_output_info *so)
2666b8e80941Smrg{
2667b8e80941Smrg	unsigned i;
2668b8e80941Smrg
2669b8e80941Smrg	if (so->num_outputs)
2670b8e80941Smrg		fprintf(stderr, "STREAMOUT\n");
2671b8e80941Smrg
2672b8e80941Smrg	for (i = 0; i < so->num_outputs; i++) {
2673b8e80941Smrg		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2674b8e80941Smrg				so->output[i].start_component;
2675b8e80941Smrg		fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2676b8e80941Smrg			i, so->output[i].output_buffer,
2677b8e80941Smrg			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2678b8e80941Smrg			so->output[i].register_index,
2679b8e80941Smrg			mask & 1 ? "x" : "",
2680b8e80941Smrg		        mask & 2 ? "y" : "",
2681b8e80941Smrg		        mask & 4 ? "z" : "",
2682b8e80941Smrg		        mask & 8 ? "w" : "");
2683b8e80941Smrg	}
2684b8e80941Smrg}
2685b8e80941Smrg
2686b8e80941Smrgstatic void emit_streamout_output(struct si_shader_context *ctx,
2687b8e80941Smrg				  LLVMValueRef const *so_buffers,
2688b8e80941Smrg				  LLVMValueRef const *so_write_offsets,
2689b8e80941Smrg				  struct pipe_stream_output *stream_out,
2690b8e80941Smrg				  struct si_shader_output_values *shader_out)
2691b8e80941Smrg{
2692b8e80941Smrg	unsigned buf_idx = stream_out->output_buffer;
2693b8e80941Smrg	unsigned start = stream_out->start_component;
2694b8e80941Smrg	unsigned num_comps = stream_out->num_components;
2695b8e80941Smrg	LLVMValueRef out[4];
2696b8e80941Smrg
2697b8e80941Smrg	assert(num_comps && num_comps <= 4);
2698b8e80941Smrg	if (!num_comps || num_comps > 4)
2699b8e80941Smrg		return;
2700b8e80941Smrg
2701b8e80941Smrg	/* Load the output as int. */
2702b8e80941Smrg	for (int j = 0; j < num_comps; j++) {
2703b8e80941Smrg		assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2704b8e80941Smrg
2705b8e80941Smrg		out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2706b8e80941Smrg	}
2707b8e80941Smrg
2708b8e80941Smrg	/* Pack the output. */
2709b8e80941Smrg	LLVMValueRef vdata = NULL;
2710b8e80941Smrg
2711b8e80941Smrg	switch (num_comps) {
2712b8e80941Smrg	case 1: /* as i32 */
2713b8e80941Smrg		vdata = out[0];
2714b8e80941Smrg		break;
2715b8e80941Smrg	case 2: /* as v2i32 */
2716b8e80941Smrg	case 3: /* as v4i32 (aligned to 4) */
2717b8e80941Smrg		out[3] = LLVMGetUndef(ctx->i32);
2718b8e80941Smrg		/* fall through */
2719b8e80941Smrg	case 4: /* as v4i32 */
2720b8e80941Smrg		vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
2721b8e80941Smrg		break;
2722b8e80941Smrg	}
2723b8e80941Smrg
2724b8e80941Smrg	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2725b8e80941Smrg				    vdata, num_comps,
2726b8e80941Smrg				    so_write_offsets[buf_idx],
2727b8e80941Smrg				    ctx->i32_0,
2728b8e80941Smrg				    stream_out->dst_offset * 4, 1, 1, true, false);
2729b8e80941Smrg}
2730b8e80941Smrg
2731b8e80941Smrg/**
2732b8e80941Smrg * Write streamout data to buffers for vertex stream @p stream (different
2733b8e80941Smrg * vertex streams can occur for GS copy shaders).
2734b8e80941Smrg */
2735b8e80941Smrgstatic void si_llvm_emit_streamout(struct si_shader_context *ctx,
2736b8e80941Smrg				   struct si_shader_output_values *outputs,
2737b8e80941Smrg				   unsigned noutput, unsigned stream)
2738b8e80941Smrg{
2739b8e80941Smrg	struct si_shader_selector *sel = ctx->shader->selector;
2740b8e80941Smrg	struct pipe_stream_output_info *so = &sel->so;
2741b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
2742b8e80941Smrg	int i;
2743b8e80941Smrg	struct lp_build_if_state if_ctx;
2744b8e80941Smrg
2745b8e80941Smrg	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2746b8e80941Smrg	LLVMValueRef so_vtx_count =
2747b8e80941Smrg		si_unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2748b8e80941Smrg
2749b8e80941Smrg	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2750b8e80941Smrg
2751b8e80941Smrg	/* can_emit = tid < so_vtx_count; */
2752b8e80941Smrg	LLVMValueRef can_emit =
2753b8e80941Smrg		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2754b8e80941Smrg
2755b8e80941Smrg	/* Emit the streamout code conditionally. This actually avoids
2756b8e80941Smrg	 * out-of-bounds buffer access. The hw tells us via the SGPR
2757b8e80941Smrg	 * (so_vtx_count) which threads are allowed to emit streamout data. */
2758b8e80941Smrg	lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2759b8e80941Smrg	{
2760b8e80941Smrg		/* The buffer offset is computed as follows:
2761b8e80941Smrg		 *   ByteOffset = streamout_offset[buffer_id]*4 +
2762b8e80941Smrg		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
2763b8e80941Smrg		 *                attrib_offset
2764b8e80941Smrg                 */
2765b8e80941Smrg
2766b8e80941Smrg		LLVMValueRef so_write_index =
2767b8e80941Smrg			LLVMGetParam(ctx->main_fn,
2768b8e80941Smrg				     ctx->param_streamout_write_index);
2769b8e80941Smrg
2770b8e80941Smrg		/* Compute (streamout_write_index + thread_id). */
2771b8e80941Smrg		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2772b8e80941Smrg
2773b8e80941Smrg		/* Load the descriptor and compute the write offset for each
2774b8e80941Smrg		 * enabled buffer. */
2775b8e80941Smrg		LLVMValueRef so_write_offset[4] = {};
2776b8e80941Smrg		LLVMValueRef so_buffers[4];
2777b8e80941Smrg		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2778b8e80941Smrg						    ctx->param_rw_buffers);
2779b8e80941Smrg
2780b8e80941Smrg		for (i = 0; i < 4; i++) {
2781b8e80941Smrg			if (!so->stride[i])
2782b8e80941Smrg				continue;
2783b8e80941Smrg
2784b8e80941Smrg			LLVMValueRef offset = LLVMConstInt(ctx->i32,
2785b8e80941Smrg							   SI_VS_STREAMOUT_BUF0 + i, 0);
2786b8e80941Smrg
2787b8e80941Smrg			so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2788b8e80941Smrg
2789b8e80941Smrg			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2790b8e80941Smrg							      ctx->param_streamout_offset[i]);
2791b8e80941Smrg			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2792b8e80941Smrg
2793b8e80941Smrg			so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
2794b8e80941Smrg							   LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
2795b8e80941Smrg							   so_offset);
2796b8e80941Smrg		}
2797b8e80941Smrg
2798b8e80941Smrg		/* Write streamout data. */
2799b8e80941Smrg		for (i = 0; i < so->num_outputs; i++) {
2800b8e80941Smrg			unsigned reg = so->output[i].register_index;
2801b8e80941Smrg
2802b8e80941Smrg			if (reg >= noutput)
2803b8e80941Smrg				continue;
2804b8e80941Smrg
2805b8e80941Smrg			if (stream != so->output[i].stream)
2806b8e80941Smrg				continue;
2807b8e80941Smrg
2808b8e80941Smrg			emit_streamout_output(ctx, so_buffers, so_write_offset,
2809b8e80941Smrg					      &so->output[i], &outputs[reg]);
2810b8e80941Smrg		}
2811b8e80941Smrg	}
2812b8e80941Smrg	lp_build_endif(&if_ctx);
2813b8e80941Smrg}
2814b8e80941Smrg
2815b8e80941Smrgstatic void si_export_param(struct si_shader_context *ctx, unsigned index,
2816b8e80941Smrg			    LLVMValueRef *values)
2817b8e80941Smrg{
2818b8e80941Smrg	struct ac_export_args args;
2819b8e80941Smrg
2820b8e80941Smrg	si_llvm_init_export_args(ctx, values,
2821b8e80941Smrg				 V_008DFC_SQ_EXP_PARAM + index, &args);
2822b8e80941Smrg	ac_build_export(&ctx->ac, &args);
2823b8e80941Smrg}
2824b8e80941Smrg
2825b8e80941Smrgstatic void si_build_param_exports(struct si_shader_context *ctx,
2826b8e80941Smrg				   struct si_shader_output_values *outputs,
2827b8e80941Smrg			           unsigned noutput)
2828b8e80941Smrg{
2829b8e80941Smrg	struct si_shader *shader = ctx->shader;
2830b8e80941Smrg	unsigned param_count = 0;
2831b8e80941Smrg
2832b8e80941Smrg	for (unsigned i = 0; i < noutput; i++) {
2833b8e80941Smrg		unsigned semantic_name = outputs[i].semantic_name;
2834b8e80941Smrg		unsigned semantic_index = outputs[i].semantic_index;
2835b8e80941Smrg
2836b8e80941Smrg		if (outputs[i].vertex_stream[0] != 0 &&
2837b8e80941Smrg		    outputs[i].vertex_stream[1] != 0 &&
2838b8e80941Smrg		    outputs[i].vertex_stream[2] != 0 &&
2839b8e80941Smrg		    outputs[i].vertex_stream[3] != 0)
2840b8e80941Smrg			continue;
2841b8e80941Smrg
2842b8e80941Smrg		switch (semantic_name) {
2843b8e80941Smrg		case TGSI_SEMANTIC_LAYER:
2844b8e80941Smrg		case TGSI_SEMANTIC_VIEWPORT_INDEX:
2845b8e80941Smrg		case TGSI_SEMANTIC_CLIPDIST:
2846b8e80941Smrg		case TGSI_SEMANTIC_COLOR:
2847b8e80941Smrg		case TGSI_SEMANTIC_BCOLOR:
2848b8e80941Smrg		case TGSI_SEMANTIC_PRIMID:
2849b8e80941Smrg		case TGSI_SEMANTIC_FOG:
2850b8e80941Smrg		case TGSI_SEMANTIC_TEXCOORD:
2851b8e80941Smrg		case TGSI_SEMANTIC_GENERIC:
2852b8e80941Smrg			break;
2853b8e80941Smrg		default:
2854b8e80941Smrg			continue;
2855b8e80941Smrg		}
2856b8e80941Smrg
2857b8e80941Smrg		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2858b8e80941Smrg		     semantic_index < SI_MAX_IO_GENERIC) &&
2859b8e80941Smrg		    shader->key.opt.kill_outputs &
2860b8e80941Smrg		    (1ull << si_shader_io_get_unique_index(semantic_name,
2861b8e80941Smrg							   semantic_index, true)))
2862b8e80941Smrg			continue;
2863b8e80941Smrg
2864b8e80941Smrg		si_export_param(ctx, param_count, outputs[i].values);
2865b8e80941Smrg
2866b8e80941Smrg		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2867b8e80941Smrg		shader->info.vs_output_param_offset[i] = param_count++;
2868b8e80941Smrg	}
2869b8e80941Smrg
2870b8e80941Smrg	shader->info.nr_param_exports = param_count;
2871b8e80941Smrg}
2872b8e80941Smrg
2873b8e80941Smrg/* Generate export instructions for hardware VS shader stage */
2874b8e80941Smrgstatic void si_llvm_export_vs(struct si_shader_context *ctx,
2875b8e80941Smrg			      struct si_shader_output_values *outputs,
2876b8e80941Smrg			      unsigned noutput)
2877b8e80941Smrg{
2878b8e80941Smrg	struct si_shader *shader = ctx->shader;
2879b8e80941Smrg	struct ac_export_args pos_args[4] = {};
2880b8e80941Smrg	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2881b8e80941Smrg	unsigned pos_idx;
2882b8e80941Smrg	int i;
2883b8e80941Smrg
2884b8e80941Smrg	/* Build position exports. */
2885b8e80941Smrg	for (i = 0; i < noutput; i++) {
2886b8e80941Smrg		switch (outputs[i].semantic_name) {
2887b8e80941Smrg		case TGSI_SEMANTIC_POSITION:
2888b8e80941Smrg			si_llvm_init_export_args(ctx, outputs[i].values,
2889b8e80941Smrg						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
2890b8e80941Smrg			break;
2891b8e80941Smrg		case TGSI_SEMANTIC_PSIZE:
2892b8e80941Smrg			psize_value = outputs[i].values[0];
2893b8e80941Smrg			break;
2894b8e80941Smrg		case TGSI_SEMANTIC_LAYER:
2895b8e80941Smrg			layer_value = outputs[i].values[0];
2896b8e80941Smrg			break;
2897b8e80941Smrg		case TGSI_SEMANTIC_VIEWPORT_INDEX:
2898b8e80941Smrg			viewport_index_value = outputs[i].values[0];
2899b8e80941Smrg			break;
2900b8e80941Smrg		case TGSI_SEMANTIC_EDGEFLAG:
2901b8e80941Smrg			edgeflag_value = outputs[i].values[0];
2902b8e80941Smrg			break;
2903b8e80941Smrg		case TGSI_SEMANTIC_CLIPDIST:
2904b8e80941Smrg			if (!shader->key.opt.clip_disable) {
2905b8e80941Smrg				unsigned index = 2 + outputs[i].semantic_index;
2906b8e80941Smrg				si_llvm_init_export_args(ctx, outputs[i].values,
2907b8e80941Smrg							 V_008DFC_SQ_EXP_POS + index,
2908b8e80941Smrg							 &pos_args[index]);
2909b8e80941Smrg			}
2910b8e80941Smrg			break;
2911b8e80941Smrg		case TGSI_SEMANTIC_CLIPVERTEX:
2912b8e80941Smrg			if (!shader->key.opt.clip_disable) {
2913b8e80941Smrg				si_llvm_emit_clipvertex(ctx, pos_args,
2914b8e80941Smrg							outputs[i].values);
2915b8e80941Smrg			}
2916b8e80941Smrg			break;
2917b8e80941Smrg		}
2918b8e80941Smrg	}
2919b8e80941Smrg
2920b8e80941Smrg	/* We need to add the position output manually if it's missing. */
2921b8e80941Smrg	if (!pos_args[0].out[0]) {
2922b8e80941Smrg		pos_args[0].enabled_channels = 0xf; /* writemask */
2923b8e80941Smrg		pos_args[0].valid_mask = 0; /* EXEC mask */
2924b8e80941Smrg		pos_args[0].done = 0; /* last export? */
2925b8e80941Smrg		pos_args[0].target = V_008DFC_SQ_EXP_POS;
2926b8e80941Smrg		pos_args[0].compr = 0; /* COMPR flag */
2927b8e80941Smrg		pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2928b8e80941Smrg		pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2929b8e80941Smrg		pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2930b8e80941Smrg		pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
2931b8e80941Smrg	}
2932b8e80941Smrg
2933b8e80941Smrg	/* Write the misc vector (point size, edgeflag, layer, viewport). */
2934b8e80941Smrg	if (shader->selector->info.writes_psize ||
2935b8e80941Smrg	    shader->selector->info.writes_edgeflag ||
2936b8e80941Smrg	    shader->selector->info.writes_viewport_index ||
2937b8e80941Smrg	    shader->selector->info.writes_layer) {
2938b8e80941Smrg		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2939b8e80941Smrg					       (shader->selector->info.writes_edgeflag << 1) |
2940b8e80941Smrg					       (shader->selector->info.writes_layer << 2);
2941b8e80941Smrg
2942b8e80941Smrg		pos_args[1].valid_mask = 0; /* EXEC mask */
2943b8e80941Smrg		pos_args[1].done = 0; /* last export? */
2944b8e80941Smrg		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2945b8e80941Smrg		pos_args[1].compr = 0; /* COMPR flag */
2946b8e80941Smrg		pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2947b8e80941Smrg		pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2948b8e80941Smrg		pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2949b8e80941Smrg		pos_args[1].out[3] = ctx->ac.f32_0; /* W */
2950b8e80941Smrg
2951b8e80941Smrg		if (shader->selector->info.writes_psize)
2952b8e80941Smrg			pos_args[1].out[0] = psize_value;
2953b8e80941Smrg
2954b8e80941Smrg		if (shader->selector->info.writes_edgeflag) {
2955b8e80941Smrg			/* The output is a float, but the hw expects an integer
2956b8e80941Smrg			 * with the first bit containing the edge flag. */
2957b8e80941Smrg			edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
2958b8e80941Smrg							 edgeflag_value,
2959b8e80941Smrg							 ctx->i32, "");
2960b8e80941Smrg			edgeflag_value = ac_build_umin(&ctx->ac,
2961b8e80941Smrg						      edgeflag_value,
2962b8e80941Smrg						      ctx->i32_1);
2963b8e80941Smrg
2964b8e80941Smrg			/* The LLVM intrinsic expects a float. */
2965b8e80941Smrg			pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
2966b8e80941Smrg		}
2967b8e80941Smrg
2968b8e80941Smrg		if (ctx->screen->info.chip_class >= GFX9) {
2969b8e80941Smrg			/* GFX9 has the layer in out.z[10:0] and the viewport
2970b8e80941Smrg			 * index in out.z[19:16].
2971b8e80941Smrg			 */
2972b8e80941Smrg			if (shader->selector->info.writes_layer)
2973b8e80941Smrg				pos_args[1].out[2] = layer_value;
2974b8e80941Smrg
2975b8e80941Smrg			if (shader->selector->info.writes_viewport_index) {
2976b8e80941Smrg				LLVMValueRef v = viewport_index_value;
2977b8e80941Smrg
2978b8e80941Smrg				v = ac_to_integer(&ctx->ac, v);
2979b8e80941Smrg				v = LLVMBuildShl(ctx->ac.builder, v,
2980b8e80941Smrg						 LLVMConstInt(ctx->i32, 16, 0), "");
2981b8e80941Smrg				v = LLVMBuildOr(ctx->ac.builder, v,
2982b8e80941Smrg						ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
2983b8e80941Smrg				pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2984b8e80941Smrg				pos_args[1].enabled_channels |= 1 << 2;
2985b8e80941Smrg			}
2986b8e80941Smrg		} else {
2987b8e80941Smrg			if (shader->selector->info.writes_layer)
2988b8e80941Smrg				pos_args[1].out[2] = layer_value;
2989b8e80941Smrg
2990b8e80941Smrg			if (shader->selector->info.writes_viewport_index) {
2991b8e80941Smrg				pos_args[1].out[3] = viewport_index_value;
2992b8e80941Smrg				pos_args[1].enabled_channels |= 1 << 3;
2993b8e80941Smrg			}
2994b8e80941Smrg		}
2995b8e80941Smrg	}
2996b8e80941Smrg
2997b8e80941Smrg	for (i = 0; i < 4; i++)
2998b8e80941Smrg		if (pos_args[i].out[0])
2999b8e80941Smrg			shader->info.nr_pos_exports++;
3000848b8605Smrg
3001848b8605Smrg	pos_idx = 0;
3002848b8605Smrg	for (i = 0; i < 4; i++) {
3003b8e80941Smrg		if (!pos_args[i].out[0])
3004b8e80941Smrg			continue;
3005b8e80941Smrg
3006b8e80941Smrg		/* Specify the target we are exporting */
3007b8e80941Smrg		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
3008b8e80941Smrg
3009b8e80941Smrg		if (pos_idx == shader->info.nr_pos_exports)
3010b8e80941Smrg			/* Specify that this is the last export */
3011b8e80941Smrg			pos_args[i].done = 1;
3012b8e80941Smrg
3013b8e80941Smrg		ac_build_export(&ctx->ac, &pos_args[i]);
3014b8e80941Smrg	}
3015b8e80941Smrg
3016b8e80941Smrg	/* Build parameter exports. */
3017b8e80941Smrg	si_build_param_exports(ctx, outputs, noutput);
3018b8e80941Smrg}
3019b8e80941Smrg
3020b8e80941Smrg/**
3021b8e80941Smrg * Forward all outputs from the vertex shader to the TES. This is only used
3022b8e80941Smrg * for the fixed function TCS.
3023b8e80941Smrg */
3024b8e80941Smrgstatic void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
3025b8e80941Smrg{
3026b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3027b8e80941Smrg	LLVMValueRef invocation_id, buffer, buffer_offset;
3028b8e80941Smrg	LLVMValueRef lds_vertex_stride, lds_base;
3029b8e80941Smrg	uint64_t inputs;
3030b8e80941Smrg
3031b8e80941Smrg	invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3032b8e80941Smrg	buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3033b8e80941Smrg	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3034b8e80941Smrg
3035b8e80941Smrg	lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
3036b8e80941Smrg	lds_base = get_tcs_in_current_patch_offset(ctx);
3037b8e80941Smrg	lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
3038b8e80941Smrg				 lds_base);
3039b8e80941Smrg
3040b8e80941Smrg	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
3041b8e80941Smrg	while (inputs) {
3042b8e80941Smrg		unsigned i = u_bit_scan64(&inputs);
3043b8e80941Smrg
3044b8e80941Smrg		LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
3045b8e80941Smrg		                            LLVMConstInt(ctx->i32, 4 * i, 0),
3046b8e80941Smrg		                             "");
3047b8e80941Smrg
3048b8e80941Smrg		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
3049b8e80941Smrg					      get_rel_patch_id(ctx),
3050b8e80941Smrg		                              invocation_id,
3051b8e80941Smrg		                              LLVMConstInt(ctx->i32, i, 0));
3052b8e80941Smrg
3053b8e80941Smrg		LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
3054b8e80941Smrg		                              lds_ptr);
3055b8e80941Smrg
3056b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
3057b8e80941Smrg					    buffer_offset, 0, 1, 0, true, false);
3058b8e80941Smrg	}
3059b8e80941Smrg}
3060b8e80941Smrg
3061b8e80941Smrgstatic void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
3062b8e80941Smrg				  LLVMValueRef rel_patch_id,
3063b8e80941Smrg				  LLVMValueRef invocation_id,
3064b8e80941Smrg				  LLVMValueRef tcs_out_current_patch_data_offset,
3065b8e80941Smrg				  LLVMValueRef invoc0_tf_outer[4],
3066b8e80941Smrg				  LLVMValueRef invoc0_tf_inner[2])
3067b8e80941Smrg{
3068b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3069b8e80941Smrg	struct si_shader *shader = ctx->shader;
3070b8e80941Smrg	unsigned tess_inner_index, tess_outer_index;
3071b8e80941Smrg	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
3072b8e80941Smrg	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
3073b8e80941Smrg	unsigned stride, outer_comps, inner_comps, i, offset;
3074b8e80941Smrg	struct lp_build_if_state if_ctx, inner_if_ctx;
3075b8e80941Smrg
3076b8e80941Smrg	/* Add a barrier before loading tess factors from LDS. */
3077b8e80941Smrg	if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
3078b8e80941Smrg		si_llvm_emit_barrier(NULL, bld_base, NULL);
3079b8e80941Smrg
3080b8e80941Smrg	/* Do this only for invocation 0, because the tess levels are per-patch,
3081b8e80941Smrg	 * not per-vertex.
3082b8e80941Smrg	 *
3083b8e80941Smrg	 * This can't jump, because invocation 0 executes this. It should
3084b8e80941Smrg	 * at least mask out the loads and stores for other invocations.
3085b8e80941Smrg	 */
3086b8e80941Smrg	lp_build_if(&if_ctx, &ctx->gallivm,
3087b8e80941Smrg		    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3088b8e80941Smrg				  invocation_id, ctx->i32_0, ""));
3089b8e80941Smrg
3090b8e80941Smrg	/* Determine the layout of one tess factor element in the buffer. */
3091b8e80941Smrg	switch (shader->key.part.tcs.epilog.prim_mode) {
3092b8e80941Smrg	case PIPE_PRIM_LINES:
3093b8e80941Smrg		stride = 2; /* 2 dwords, 1 vec2 store */
3094b8e80941Smrg		outer_comps = 2;
3095b8e80941Smrg		inner_comps = 0;
3096b8e80941Smrg		break;
3097b8e80941Smrg	case PIPE_PRIM_TRIANGLES:
3098b8e80941Smrg		stride = 4; /* 4 dwords, 1 vec4 store */
3099b8e80941Smrg		outer_comps = 3;
3100b8e80941Smrg		inner_comps = 1;
3101b8e80941Smrg		break;
3102b8e80941Smrg	case PIPE_PRIM_QUADS:
3103b8e80941Smrg		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
3104b8e80941Smrg		outer_comps = 4;
3105b8e80941Smrg		inner_comps = 2;
3106b8e80941Smrg		break;
3107b8e80941Smrg	default:
3108b8e80941Smrg		assert(0);
3109b8e80941Smrg		return;
3110b8e80941Smrg	}
3111b8e80941Smrg
3112b8e80941Smrg	for (i = 0; i < 4; i++) {
3113b8e80941Smrg		inner[i] = LLVMGetUndef(ctx->i32);
3114b8e80941Smrg		outer[i] = LLVMGetUndef(ctx->i32);
3115b8e80941Smrg	}
3116b8e80941Smrg
3117b8e80941Smrg	if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
3118b8e80941Smrg		/* Tess factors are in VGPRs. */
3119b8e80941Smrg		for (i = 0; i < outer_comps; i++)
3120b8e80941Smrg			outer[i] = out[i] = invoc0_tf_outer[i];
3121b8e80941Smrg		for (i = 0; i < inner_comps; i++)
3122b8e80941Smrg			inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
3123b8e80941Smrg	} else {
3124b8e80941Smrg		/* Load tess_inner and tess_outer from LDS.
3125b8e80941Smrg		 * Any invocation can write them, so we can't get them from a temporary.
3126b8e80941Smrg		 */
3127b8e80941Smrg		tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
3128b8e80941Smrg		tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
3129b8e80941Smrg
3130b8e80941Smrg		lds_base = tcs_out_current_patch_data_offset;
3131b8e80941Smrg		lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3132b8e80941Smrg					 LLVMConstInt(ctx->i32,
3133b8e80941Smrg						      tess_inner_index * 4, 0), "");
3134b8e80941Smrg		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3135b8e80941Smrg					 LLVMConstInt(ctx->i32,
3136b8e80941Smrg						      tess_outer_index * 4, 0), "");
3137b8e80941Smrg
3138b8e80941Smrg		for (i = 0; i < outer_comps; i++) {
3139b8e80941Smrg			outer[i] = out[i] =
3140b8e80941Smrg				lds_load(bld_base, ctx->ac.i32, i, lds_outer);
3141b8e80941Smrg		}
3142b8e80941Smrg		for (i = 0; i < inner_comps; i++) {
3143b8e80941Smrg			inner[i] = out[outer_comps+i] =
3144b8e80941Smrg				lds_load(bld_base, ctx->ac.i32, i, lds_inner);
3145b8e80941Smrg		}
3146b8e80941Smrg	}
3147b8e80941Smrg
3148b8e80941Smrg	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
3149b8e80941Smrg		/* For isolines, the hardware expects tess factors in the
3150b8e80941Smrg		 * reverse order from what GLSL / TGSI specify.
3151b8e80941Smrg		 */
3152b8e80941Smrg		LLVMValueRef tmp = out[0];
3153b8e80941Smrg		out[0] = out[1];
3154b8e80941Smrg		out[1] = tmp;
3155b8e80941Smrg	}
3156b8e80941Smrg
3157b8e80941Smrg	/* Convert the outputs to vectors for stores. */
3158b8e80941Smrg	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
3159b8e80941Smrg	vec1 = NULL;
3160b8e80941Smrg
3161b8e80941Smrg	if (stride > 4)
3162b8e80941Smrg		vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
3163b8e80941Smrg
3164b8e80941Smrg	/* Get the buffer. */
3165b8e80941Smrg	buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
3166b8e80941Smrg
3167b8e80941Smrg	/* Get the offset. */
3168b8e80941Smrg	tf_base = LLVMGetParam(ctx->main_fn,
3169b8e80941Smrg			       ctx->param_tcs_factor_offset);
3170b8e80941Smrg	byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3171b8e80941Smrg				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");
3172b8e80941Smrg
3173b8e80941Smrg	lp_build_if(&inner_if_ctx, &ctx->gallivm,
3174b8e80941Smrg		    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3175b8e80941Smrg				  rel_patch_id, ctx->i32_0, ""));
3176b8e80941Smrg
3177b8e80941Smrg	/* Store the dynamic HS control word. */
3178b8e80941Smrg	offset = 0;
3179b8e80941Smrg	if (ctx->screen->info.chip_class <= VI) {
3180b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buffer,
3181b8e80941Smrg					    LLVMConstInt(ctx->i32, 0x80000000, 0),
3182b8e80941Smrg					    1, ctx->i32_0, tf_base,
3183b8e80941Smrg					    offset, 1, 0, true, false);
3184b8e80941Smrg		offset += 4;
3185b8e80941Smrg	}
3186b8e80941Smrg
3187b8e80941Smrg	lp_build_endif(&inner_if_ctx);
3188b8e80941Smrg
3189b8e80941Smrg	/* Store the tessellation factors. */
3190b8e80941Smrg	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3191b8e80941Smrg				    MIN2(stride, 4), byteoffset, tf_base,
3192b8e80941Smrg				    offset, 1, 0, true, false);
3193b8e80941Smrg	offset += 16;
3194b8e80941Smrg	if (vec1)
3195b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3196b8e80941Smrg					    stride - 4, byteoffset, tf_base,
3197b8e80941Smrg					    offset, 1, 0, true, false);
3198b8e80941Smrg
3199b8e80941Smrg	/* Store the tess factors into the offchip buffer if TES reads them. */
3200b8e80941Smrg	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
3201b8e80941Smrg		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
3202b8e80941Smrg		LLVMValueRef tf_inner_offset;
3203b8e80941Smrg		unsigned param_outer, param_inner;
3204b8e80941Smrg
3205b8e80941Smrg		buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3206b8e80941Smrg		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3207b8e80941Smrg
3208b8e80941Smrg		param_outer = si_shader_io_get_unique_index_patch(
3209b8e80941Smrg				      TGSI_SEMANTIC_TESSOUTER, 0);
3210b8e80941Smrg		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3211b8e80941Smrg					LLVMConstInt(ctx->i32, param_outer, 0));
3212b8e80941Smrg
3213b8e80941Smrg		outer_vec = ac_build_gather_values(&ctx->ac, outer,
3214b8e80941Smrg						   util_next_power_of_two(outer_comps));
3215b8e80941Smrg
3216b8e80941Smrg		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
3217b8e80941Smrg					    outer_comps, tf_outer_offset,
3218b8e80941Smrg					    base, 0, 1, 0, true, false);
3219b8e80941Smrg		if (inner_comps) {
3220b8e80941Smrg			param_inner = si_shader_io_get_unique_index_patch(
3221b8e80941Smrg					      TGSI_SEMANTIC_TESSINNER, 0);
3222b8e80941Smrg			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3223b8e80941Smrg					LLVMConstInt(ctx->i32, param_inner, 0));
3224b8e80941Smrg
3225b8e80941Smrg			inner_vec = inner_comps == 1 ? inner[0] :
3226b8e80941Smrg				    ac_build_gather_values(&ctx->ac, inner, inner_comps);
3227b8e80941Smrg			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
3228b8e80941Smrg						    inner_comps, tf_inner_offset,
3229b8e80941Smrg						    base, 0, 1, 0, true, false);
3230b8e80941Smrg		}
3231b8e80941Smrg	}
3232b8e80941Smrg
3233b8e80941Smrg	lp_build_endif(&if_ctx);
3234b8e80941Smrg}
3235b8e80941Smrg
3236b8e80941Smrgstatic LLVMValueRef
3237b8e80941Smrgsi_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
3238b8e80941Smrg		    unsigned param, unsigned return_index)
3239b8e80941Smrg{
3240b8e80941Smrg	return LLVMBuildInsertValue(ctx->ac.builder, ret,
3241b8e80941Smrg				    LLVMGetParam(ctx->main_fn, param),
3242b8e80941Smrg				    return_index, "");
3243b8e80941Smrg}
3244b8e80941Smrg
3245b8e80941Smrgstatic LLVMValueRef
3246b8e80941Smrgsi_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
3247b8e80941Smrg			  unsigned param, unsigned return_index)
3248b8e80941Smrg{
3249b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
3250b8e80941Smrg	LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
3251b8e80941Smrg
3252b8e80941Smrg	return LLVMBuildInsertValue(builder, ret,
3253b8e80941Smrg				    ac_to_float(&ctx->ac, p),
3254b8e80941Smrg				    return_index, "");
3255b8e80941Smrg}
3256b8e80941Smrg
3257b8e80941Smrgstatic LLVMValueRef
3258b8e80941Smrgsi_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
3259b8e80941Smrg		    unsigned param, unsigned return_index)
3260b8e80941Smrg{
3261b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
3262b8e80941Smrg	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param);
3263b8e80941Smrg	ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
3264b8e80941Smrg	return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
3265b8e80941Smrg}
3266b8e80941Smrg
3267b8e80941Smrg/* This only writes the tessellation factor levels. */
3268b8e80941Smrgstatic void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
3269b8e80941Smrg				      unsigned max_outputs,
3270b8e80941Smrg				      LLVMValueRef *addrs)
3271b8e80941Smrg{
3272b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3273b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
3274b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
3275b8e80941Smrg	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
3276b8e80941Smrg
3277b8e80941Smrg	si_copy_tcs_inputs(bld_base);
3278b8e80941Smrg
3279b8e80941Smrg	rel_patch_id = get_rel_patch_id(ctx);
3280b8e80941Smrg	invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3281b8e80941Smrg	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
3282b8e80941Smrg
3283b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
3284b8e80941Smrg		LLVMBasicBlockRef blocks[2] = {
3285b8e80941Smrg			LLVMGetInsertBlock(builder),
3286b8e80941Smrg			ctx->merged_wrap_if_state.entry_block
3287b8e80941Smrg		};
3288b8e80941Smrg		LLVMValueRef values[2];
3289b8e80941Smrg
3290b8e80941Smrg		lp_build_endif(&ctx->merged_wrap_if_state);
3291b8e80941Smrg
3292b8e80941Smrg		values[0] = rel_patch_id;
3293b8e80941Smrg		values[1] = LLVMGetUndef(ctx->i32);
3294b8e80941Smrg		rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3295b8e80941Smrg
3296b8e80941Smrg		values[0] = tf_lds_offset;
3297b8e80941Smrg		values[1] = LLVMGetUndef(ctx->i32);
3298b8e80941Smrg		tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3299b8e80941Smrg
3300b8e80941Smrg		values[0] = invocation_id;
3301b8e80941Smrg		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
3302b8e80941Smrg		invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3303b8e80941Smrg	}
3304b8e80941Smrg
3305b8e80941Smrg	/* Return epilog parameters from this function. */
3306b8e80941Smrg	LLVMValueRef ret = ctx->return_value;
3307b8e80941Smrg	unsigned vgpr;
3308b8e80941Smrg
3309b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
3310b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3311b8e80941Smrg					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3312b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3313b8e80941Smrg					  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3314b8e80941Smrg		/* Tess offchip and tess factor offsets are at the beginning. */
3315b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3316b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3317b8e80941Smrg		vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
3318b8e80941Smrg	} else {
3319b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3320b8e80941Smrg					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
3321b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3322b8e80941Smrg					  GFX6_SGPR_TCS_OUT_LAYOUT);
3323b8e80941Smrg		/* Tess offchip and tess factor offsets are after user SGPRs. */
3324b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
3325b8e80941Smrg					  GFX6_TCS_NUM_USER_SGPR);
3326b8e80941Smrg		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
3327b8e80941Smrg					  GFX6_TCS_NUM_USER_SGPR + 1);
3328b8e80941Smrg		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
3329b8e80941Smrg	}
3330b8e80941Smrg
3331b8e80941Smrg	/* VGPRs */
3332b8e80941Smrg	rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
3333b8e80941Smrg	invocation_id = ac_to_float(&ctx->ac, invocation_id);
3334b8e80941Smrg	tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
3335b8e80941Smrg
3336b8e80941Smrg	/* Leave a hole corresponding to the two input VGPRs. This ensures that
3337b8e80941Smrg	 * the invocation_id output does not alias the tcs_rel_ids input,
3338b8e80941Smrg	 * which saves a V_MOV on gfx9.
3339b8e80941Smrg	 */
3340b8e80941Smrg	vgpr += 2;
3341b8e80941Smrg
3342b8e80941Smrg	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
3343b8e80941Smrg	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
3344b8e80941Smrg
3345b8e80941Smrg	if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
3346b8e80941Smrg		vgpr++; /* skip the tess factor LDS offset */
3347b8e80941Smrg		for (unsigned i = 0; i < 6; i++) {
3348b8e80941Smrg			LLVMValueRef value =
3349b8e80941Smrg				LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
3350b8e80941Smrg			value = ac_to_float(&ctx->ac, value);
3351b8e80941Smrg			ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
3352b8e80941Smrg		}
3353b8e80941Smrg	} else {
3354b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
3355b8e80941Smrg	}
3356b8e80941Smrg	ctx->return_value = ret;
3357b8e80941Smrg}
3358b8e80941Smrg
3359b8e80941Smrg/* Pass TCS inputs from LS to TCS on GFX9. */
3360b8e80941Smrgstatic void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
3361b8e80941Smrg{
3362b8e80941Smrg	LLVMValueRef ret = ctx->return_value;
3363b8e80941Smrg
3364b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, 0, 0);
3365b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, 1, 1);
3366b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3367b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3368b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3369b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3370b8e80941Smrg
3371b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3372b8e80941Smrg				  8 + SI_SGPR_RW_BUFFERS);
3373b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret,
3374b8e80941Smrg				  ctx->param_bindless_samplers_and_images,
3375b8e80941Smrg				  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3376b8e80941Smrg
3377b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
3378b8e80941Smrg				  8 + SI_SGPR_VS_STATE_BITS);
3379b8e80941Smrg
3380b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3381b8e80941Smrg				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3382b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
3383b8e80941Smrg				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
3384b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3385b8e80941Smrg				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3386b8e80941Smrg
3387b8e80941Smrg	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
3388b8e80941Smrg	ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3389b8e80941Smrg				   ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id),
3390b8e80941Smrg				   vgpr++, "");
3391b8e80941Smrg	ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3392b8e80941Smrg				   ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids),
3393b8e80941Smrg				   vgpr++, "");
3394b8e80941Smrg	ctx->return_value = ret;
3395b8e80941Smrg}
3396b8e80941Smrg
3397b8e80941Smrg/* Pass GS inputs from ES to GS on GFX9. */
3398b8e80941Smrgstatic void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
3399b8e80941Smrg{
3400b8e80941Smrg	LLVMValueRef ret = ctx->return_value;
3401b8e80941Smrg
3402b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, 0, 0);
3403b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, 1, 1);
3404b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3405b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3406b8e80941Smrg	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3407b8e80941Smrg
3408b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3409b8e80941Smrg				  8 + SI_SGPR_RW_BUFFERS);
3410b8e80941Smrg	ret = si_insert_input_ptr(ctx, ret,
3411b8e80941Smrg				  ctx->param_bindless_samplers_and_images,
3412b8e80941Smrg				  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3413b8e80941Smrg
3414b8e80941Smrg	unsigned vgpr;
3415b8e80941Smrg	if (ctx->type == PIPE_SHADER_VERTEX)
3416b8e80941Smrg		vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
3417b8e80941Smrg	else
3418b8e80941Smrg		vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
3419b8e80941Smrg
3420b8e80941Smrg	for (unsigned i = 0; i < 5; i++) {
3421b8e80941Smrg		unsigned param = ctx->param_gs_vtx01_offset + i;
3422b8e80941Smrg		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3423b8e80941Smrg	}
3424b8e80941Smrg	ctx->return_value = ret;
3425b8e80941Smrg}
3426b8e80941Smrg
3427b8e80941Smrgstatic void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
3428b8e80941Smrg				     unsigned max_outputs,
3429b8e80941Smrg				     LLVMValueRef *addrs)
3430b8e80941Smrg{
3431b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3432b8e80941Smrg	struct si_shader *shader = ctx->shader;
3433b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
3434b8e80941Smrg	unsigned i, chan;
3435b8e80941Smrg	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3436b8e80941Smrg					      ctx->param_rel_auto_id);
3437b8e80941Smrg	LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3438b8e80941Smrg	LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3439b8e80941Smrg						 vertex_dw_stride, "");
3440b8e80941Smrg
3441b8e80941Smrg	/* Write outputs to LDS. The next shader (TCS aka HS) will read
3442b8e80941Smrg	 * its inputs from it. */
3443b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
3444b8e80941Smrg		unsigned name = info->output_semantic_name[i];
3445b8e80941Smrg		unsigned index = info->output_semantic_index[i];
3446b8e80941Smrg
3447b8e80941Smrg		/* The ARB_shader_viewport_layer_array spec contains the
3448b8e80941Smrg		 * following issue:
3449b8e80941Smrg		 *
3450b8e80941Smrg		 *    2) What happens if gl_ViewportIndex or gl_Layer is
3451b8e80941Smrg		 *    written in the vertex shader and a geometry shader is
3452b8e80941Smrg		 *    present?
3453b8e80941Smrg		 *
3454b8e80941Smrg		 *    RESOLVED: The value written by the last vertex processing
3455b8e80941Smrg		 *    stage is used. If the last vertex processing stage
3456b8e80941Smrg		 *    (vertex, tessellation evaluation or geometry) does not
3457b8e80941Smrg		 *    statically assign to gl_ViewportIndex or gl_Layer, index
3458b8e80941Smrg		 *    or layer zero is assumed.
3459b8e80941Smrg		 *
3460b8e80941Smrg		 * So writes to those outputs in VS-as-LS are simply ignored.
3461b8e80941Smrg		 */
3462b8e80941Smrg		if (name == TGSI_SEMANTIC_LAYER ||
3463b8e80941Smrg		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3464b8e80941Smrg			continue;
3465b8e80941Smrg
3466b8e80941Smrg		int param = si_shader_io_get_unique_index(name, index, false);
3467b8e80941Smrg		LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3468b8e80941Smrg					LLVMConstInt(ctx->i32, param * 4, 0), "");
3469b8e80941Smrg
3470b8e80941Smrg		for (chan = 0; chan < 4; chan++) {
3471b8e80941Smrg			if (!(info->output_usagemask[i] & (1 << chan)))
3472b8e80941Smrg				continue;
3473b8e80941Smrg
3474b8e80941Smrg			lds_store(ctx, chan, dw_addr,
3475b8e80941Smrg				  LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
3476b8e80941Smrg		}
3477b8e80941Smrg	}
3478b8e80941Smrg
3479b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9)
3480b8e80941Smrg		si_set_ls_return_value_for_tcs(ctx);
3481b8e80941Smrg}
3482b8e80941Smrg
3483b8e80941Smrgstatic void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
3484b8e80941Smrg				     unsigned max_outputs,
3485b8e80941Smrg				     LLVMValueRef *addrs)
3486b8e80941Smrg{
3487b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3488b8e80941Smrg	struct si_shader *es = ctx->shader;
3489b8e80941Smrg	struct tgsi_shader_info *info = &es->selector->info;
3490b8e80941Smrg	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3491b8e80941Smrg					    ctx->param_es2gs_offset);
3492b8e80941Smrg	LLVMValueRef lds_base = NULL;
3493b8e80941Smrg	unsigned chan;
3494b8e80941Smrg	int i;
3495b8e80941Smrg
3496b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
3497b8e80941Smrg		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3498b8e80941Smrg		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3499b8e80941Smrg		LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3500b8e80941Smrg		vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3501b8e80941Smrg					 LLVMBuildMul(ctx->ac.builder, wave_idx,
3502b8e80941Smrg						      LLVMConstInt(ctx->i32, 64, false), ""), "");
3503b8e80941Smrg		lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3504b8e80941Smrg					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3505b8e80941Smrg	}
3506b8e80941Smrg
3507b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
3508b8e80941Smrg		int param;
3509b8e80941Smrg
3510b8e80941Smrg		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3511b8e80941Smrg		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3512b8e80941Smrg			continue;
3513b8e80941Smrg
3514b8e80941Smrg		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3515b8e80941Smrg						      info->output_semantic_index[i], false);
3516b8e80941Smrg
3517b8e80941Smrg		for (chan = 0; chan < 4; chan++) {
3518b8e80941Smrg			if (!(info->output_usagemask[i] & (1 << chan)))
3519b8e80941Smrg				continue;
3520b8e80941Smrg
3521b8e80941Smrg			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3522b8e80941Smrg			out_val = ac_to_integer(&ctx->ac, out_val);
3523b8e80941Smrg
3524b8e80941Smrg			/* GFX9 has the ESGS ring in LDS. */
3525b8e80941Smrg			if (ctx->screen->info.chip_class >= GFX9) {
3526b8e80941Smrg				lds_store(ctx, param * 4 + chan, lds_base, out_val);
3527b8e80941Smrg				continue;
3528b8e80941Smrg			}
3529b8e80941Smrg
3530b8e80941Smrg			ac_build_buffer_store_dword(&ctx->ac,
3531b8e80941Smrg						    ctx->esgs_ring,
3532b8e80941Smrg						    out_val, 1, NULL, soffset,
3533b8e80941Smrg						    (4 * param + chan) * 4,
3534b8e80941Smrg						    1, 1, true, true);
3535b8e80941Smrg		}
3536b8e80941Smrg	}
3537b8e80941Smrg
3538b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9)
3539b8e80941Smrg		si_set_es_return_value_for_gs(ctx);
3540b8e80941Smrg}
3541b8e80941Smrg
3542b8e80941Smrgstatic LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3543b8e80941Smrg{
3544b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9)
3545b8e80941Smrg		return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3546b8e80941Smrg	else
3547b8e80941Smrg		return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3548b8e80941Smrg}
3549b8e80941Smrg
3550b8e80941Smrgstatic void emit_gs_epilogue(struct si_shader_context *ctx)
3551b8e80941Smrg{
3552b8e80941Smrg	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3553b8e80941Smrg			 si_get_gs_wave_id(ctx));
3554b8e80941Smrg
3555b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9)
3556b8e80941Smrg		lp_build_endif(&ctx->merged_wrap_if_state);
3557b8e80941Smrg}
3558b8e80941Smrg
3559b8e80941Smrgstatic void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3560b8e80941Smrg				     unsigned max_outputs,
3561b8e80941Smrg				     LLVMValueRef *addrs)
3562b8e80941Smrg{
3563b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3564b8e80941Smrg	struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3565b8e80941Smrg
3566b8e80941Smrg	assert(info->num_outputs <= max_outputs);
3567b8e80941Smrg
3568b8e80941Smrg	emit_gs_epilogue(ctx);
3569b8e80941Smrg}
3570b8e80941Smrg
/** TGSI entry point for the GS epilogue; forwards to emit_gs_epilogue(). */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	emit_gs_epilogue(si_shader_context(bld_base));
}
3576b8e80941Smrg
3577b8e80941Smrgstatic void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3578b8e80941Smrg				     unsigned max_outputs,
3579b8e80941Smrg				     LLVMValueRef *addrs)
3580b8e80941Smrg{
3581b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3582b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
3583b8e80941Smrg	struct si_shader_output_values *outputs = NULL;
3584b8e80941Smrg	int i,j;
3585b8e80941Smrg
3586b8e80941Smrg	assert(!ctx->shader->is_gs_copy_shader);
3587b8e80941Smrg	assert(info->num_outputs <= max_outputs);
3588b8e80941Smrg
3589b8e80941Smrg	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3590b8e80941Smrg
3591b8e80941Smrg	/* Vertex color clamping.
3592b8e80941Smrg	 *
3593b8e80941Smrg	 * This uses a state constant loaded in a user data SGPR and
3594b8e80941Smrg	 * an IF statement is added that clamps all colors if the constant
3595b8e80941Smrg	 * is true.
3596b8e80941Smrg	 */
3597b8e80941Smrg	struct lp_build_if_state if_ctx;
3598b8e80941Smrg	LLVMValueRef cond = NULL;
3599b8e80941Smrg	LLVMValueRef addr, val;
3600b8e80941Smrg
3601b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
3602b8e80941Smrg		if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3603b8e80941Smrg		    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3604b8e80941Smrg			continue;
3605b8e80941Smrg
3606b8e80941Smrg		/* We've found a color. */
3607b8e80941Smrg		if (!cond) {
3608b8e80941Smrg			/* The state is in the first bit of the user SGPR. */
3609b8e80941Smrg			cond = LLVMGetParam(ctx->main_fn,
3610b8e80941Smrg					    ctx->param_vs_state_bits);
3611b8e80941Smrg			cond = LLVMBuildTrunc(ctx->ac.builder, cond,
3612b8e80941Smrg					      ctx->i1, "");
3613b8e80941Smrg			lp_build_if(&if_ctx, &ctx->gallivm, cond);
3614b8e80941Smrg		}
3615b8e80941Smrg
3616b8e80941Smrg		for (j = 0; j < 4; j++) {
3617b8e80941Smrg			addr = addrs[4 * i + j];
3618b8e80941Smrg			val = LLVMBuildLoad(ctx->ac.builder, addr, "");
3619b8e80941Smrg			val = ac_build_clamp(&ctx->ac, val);
3620b8e80941Smrg			LLVMBuildStore(ctx->ac.builder, val, addr);
3621b8e80941Smrg		}
3622b8e80941Smrg	}
3623b8e80941Smrg
3624b8e80941Smrg	if (cond)
3625b8e80941Smrg		lp_build_endif(&if_ctx);
3626b8e80941Smrg
3627b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
3628b8e80941Smrg		outputs[i].semantic_name = info->output_semantic_name[i];
3629b8e80941Smrg		outputs[i].semantic_index = info->output_semantic_index[i];
3630b8e80941Smrg
3631b8e80941Smrg		for (j = 0; j < 4; j++) {
3632b8e80941Smrg			outputs[i].values[j] =
3633b8e80941Smrg				LLVMBuildLoad(ctx->ac.builder,
3634b8e80941Smrg					      addrs[4 * i + j],
3635b8e80941Smrg					      "");
3636b8e80941Smrg			outputs[i].vertex_stream[j] =
3637b8e80941Smrg				(info->output_streams[i] >> (2 * j)) & 3;
3638b8e80941Smrg		}
3639b8e80941Smrg	}
3640b8e80941Smrg
3641b8e80941Smrg	if (ctx->shader->selector->so.num_outputs)
3642b8e80941Smrg		si_llvm_emit_streamout(ctx, outputs, i, 0);
3643b8e80941Smrg
3644b8e80941Smrg	/* Export PrimitiveID. */
3645b8e80941Smrg	if (ctx->shader->key.mono.u.vs_export_prim_id) {
3646b8e80941Smrg		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3647b8e80941Smrg		outputs[i].semantic_index = 0;
3648b8e80941Smrg		outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
3649b8e80941Smrg		for (j = 1; j < 4; j++)
3650b8e80941Smrg			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3651b8e80941Smrg
3652b8e80941Smrg		memset(outputs[i].vertex_stream, 0,
3653b8e80941Smrg		       sizeof(outputs[i].vertex_stream));
3654b8e80941Smrg		i++;
3655b8e80941Smrg	}
3656b8e80941Smrg
3657b8e80941Smrg	si_llvm_export_vs(ctx, outputs, i);
3658b8e80941Smrg	FREE(outputs);
3659b8e80941Smrg}
3660b8e80941Smrg
3661b8e80941Smrgstatic void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3662b8e80941Smrg{
3663b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3664b8e80941Smrg
3665b8e80941Smrg	ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3666b8e80941Smrg			      &ctx->outputs[0][0]);
3667b8e80941Smrg}
3668b8e80941Smrg
3669b8e80941Smrgstruct si_ps_exports {
3670b8e80941Smrg	unsigned num;
3671b8e80941Smrg	struct ac_export_args args[10];
3672b8e80941Smrg};
3673b8e80941Smrg
3674b8e80941Smrgstatic void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3675b8e80941Smrg			    LLVMValueRef depth, LLVMValueRef stencil,
3676b8e80941Smrg			    LLVMValueRef samplemask, struct si_ps_exports *exp)
3677b8e80941Smrg{
3678b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3679b8e80941Smrg	struct ac_export_args args;
3680b8e80941Smrg
3681b8e80941Smrg	ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3682b8e80941Smrg
3683b8e80941Smrg	memcpy(&exp->args[exp->num++], &args, sizeof(args));
3684b8e80941Smrg}
3685b8e80941Smrg
3686b8e80941Smrgstatic void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3687b8e80941Smrg				LLVMValueRef *color, unsigned index,
3688b8e80941Smrg				unsigned samplemask_param,
3689b8e80941Smrg				bool is_last, struct si_ps_exports *exp)
3690b8e80941Smrg{
3691b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3692b8e80941Smrg	int i;
3693b8e80941Smrg
3694b8e80941Smrg	/* Clamp color */
3695b8e80941Smrg	if (ctx->shader->key.part.ps.epilog.clamp_color)
3696b8e80941Smrg		for (i = 0; i < 4; i++)
3697b8e80941Smrg			color[i] = ac_build_clamp(&ctx->ac, color[i]);
3698b8e80941Smrg
3699b8e80941Smrg	/* Alpha to one */
3700b8e80941Smrg	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3701b8e80941Smrg		color[3] = ctx->ac.f32_1;
3702b8e80941Smrg
3703b8e80941Smrg	/* Alpha test */
3704b8e80941Smrg	if (index == 0 &&
3705b8e80941Smrg	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3706b8e80941Smrg		si_alpha_test(bld_base, color[3]);
3707b8e80941Smrg
3708b8e80941Smrg	/* Line & polygon smoothing */
3709b8e80941Smrg	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3710b8e80941Smrg		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3711b8e80941Smrg							 samplemask_param);
3712b8e80941Smrg
3713b8e80941Smrg	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3714b8e80941Smrg	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3715b8e80941Smrg		struct ac_export_args args[8];
3716b8e80941Smrg		int c, last = -1;
3717b8e80941Smrg
3718b8e80941Smrg		/* Get the export arguments, also find out what the last one is. */
3719b8e80941Smrg		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3720b8e80941Smrg			si_llvm_init_export_args(ctx, color,
3721b8e80941Smrg						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3722b8e80941Smrg			if (args[c].enabled_channels)
3723b8e80941Smrg				last = c;
3724b8e80941Smrg		}
3725b8e80941Smrg
3726b8e80941Smrg		/* Emit all exports. */
3727b8e80941Smrg		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3728b8e80941Smrg			if (is_last && last == c) {
3729b8e80941Smrg				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3730b8e80941Smrg				args[c].done = 1; /* DONE bit */
3731b8e80941Smrg			} else if (!args[c].enabled_channels)
3732b8e80941Smrg				continue; /* unnecessary NULL export */
3733b8e80941Smrg
3734b8e80941Smrg			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3735b8e80941Smrg		}
3736b8e80941Smrg	} else {
3737b8e80941Smrg		struct ac_export_args args;
3738b8e80941Smrg
3739b8e80941Smrg		/* Export */
3740b8e80941Smrg		si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
3741b8e80941Smrg					 &args);
3742b8e80941Smrg		if (is_last) {
3743b8e80941Smrg			args.valid_mask = 1; /* whether the EXEC mask is valid */
3744b8e80941Smrg			args.done = 1; /* DONE bit */
3745b8e80941Smrg		} else if (!args.enabled_channels)
3746b8e80941Smrg			return; /* unnecessary NULL export */
3747b8e80941Smrg
3748b8e80941Smrg		memcpy(&exp->args[exp->num++], &args, sizeof(args));
3749b8e80941Smrg	}
3750b8e80941Smrg}
3751b8e80941Smrg
3752b8e80941Smrgstatic void si_emit_ps_exports(struct si_shader_context *ctx,
3753b8e80941Smrg			       struct si_ps_exports *exp)
3754b8e80941Smrg{
3755b8e80941Smrg	for (unsigned i = 0; i < exp->num; i++)
3756b8e80941Smrg		ac_build_export(&ctx->ac, &exp->args[i]);
3757b8e80941Smrg}
3758b8e80941Smrg
3759b8e80941Smrg/**
3760b8e80941Smrg * Return PS outputs in this order:
3761b8e80941Smrg *
3762b8e80941Smrg * v[0:3] = color0.xyzw
3763b8e80941Smrg * v[4:7] = color1.xyzw
3764b8e80941Smrg * ...
3765b8e80941Smrg * vN+0 = Depth
3766b8e80941Smrg * vN+1 = Stencil
3767b8e80941Smrg * vN+2 = SampleMask
3768b8e80941Smrg * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3769b8e80941Smrg *
3770b8e80941Smrg * The alpha-ref SGPR is returned via its original location.
3771b8e80941Smrg */
3772b8e80941Smrgstatic void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3773b8e80941Smrg				      unsigned max_outputs,
3774b8e80941Smrg				      LLVMValueRef *addrs)
3775b8e80941Smrg{
3776b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3777b8e80941Smrg	struct si_shader *shader = ctx->shader;
3778b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
3779b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
3780b8e80941Smrg	unsigned i, j, first_vgpr, vgpr;
3781b8e80941Smrg
3782b8e80941Smrg	LLVMValueRef color[8][4] = {};
3783b8e80941Smrg	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3784b8e80941Smrg	LLVMValueRef ret;
3785b8e80941Smrg
3786b8e80941Smrg	if (ctx->postponed_kill)
3787b8e80941Smrg		ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3788b8e80941Smrg
3789b8e80941Smrg	/* Read the output values. */
3790b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
3791b8e80941Smrg		unsigned semantic_name = info->output_semantic_name[i];
3792b8e80941Smrg		unsigned semantic_index = info->output_semantic_index[i];
3793b8e80941Smrg
3794b8e80941Smrg		switch (semantic_name) {
3795b8e80941Smrg		case TGSI_SEMANTIC_COLOR:
3796b8e80941Smrg			assert(semantic_index < 8);
3797b8e80941Smrg			for (j = 0; j < 4; j++) {
3798b8e80941Smrg				LLVMValueRef ptr = addrs[4 * i + j];
3799b8e80941Smrg				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3800b8e80941Smrg				color[semantic_index][j] = result;
3801b8e80941Smrg			}
3802b8e80941Smrg			break;
3803b8e80941Smrg		case TGSI_SEMANTIC_POSITION:
3804b8e80941Smrg			depth = LLVMBuildLoad(builder,
3805b8e80941Smrg					      addrs[4 * i + 2], "");
3806b8e80941Smrg			break;
3807b8e80941Smrg		case TGSI_SEMANTIC_STENCIL:
3808b8e80941Smrg			stencil = LLVMBuildLoad(builder,
3809b8e80941Smrg						addrs[4 * i + 1], "");
3810b8e80941Smrg			break;
3811b8e80941Smrg		case TGSI_SEMANTIC_SAMPLEMASK:
3812b8e80941Smrg			samplemask = LLVMBuildLoad(builder,
3813b8e80941Smrg						   addrs[4 * i + 0], "");
3814b8e80941Smrg			break;
3815b8e80941Smrg		default:
3816b8e80941Smrg			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3817b8e80941Smrg				semantic_name);
3818b8e80941Smrg		}
3819b8e80941Smrg	}
3820b8e80941Smrg
3821b8e80941Smrg	/* Fill the return structure. */
3822b8e80941Smrg	ret = ctx->return_value;
3823b8e80941Smrg
3824b8e80941Smrg	/* Set SGPRs. */
3825b8e80941Smrg	ret = LLVMBuildInsertValue(builder, ret,
3826b8e80941Smrg				   ac_to_integer(&ctx->ac,
3827b8e80941Smrg                                                 LLVMGetParam(ctx->main_fn,
3828b8e80941Smrg                                                              SI_PARAM_ALPHA_REF)),
3829b8e80941Smrg				   SI_SGPR_ALPHA_REF, "");
3830b8e80941Smrg
3831b8e80941Smrg	/* Set VGPRs */
3832b8e80941Smrg	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3833b8e80941Smrg	for (i = 0; i < ARRAY_SIZE(color); i++) {
3834b8e80941Smrg		if (!color[i][0])
3835b8e80941Smrg			continue;
3836b8e80941Smrg
3837b8e80941Smrg		for (j = 0; j < 4; j++)
3838b8e80941Smrg			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3839b8e80941Smrg	}
3840b8e80941Smrg	if (depth)
3841b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3842b8e80941Smrg	if (stencil)
3843b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3844b8e80941Smrg	if (samplemask)
3845b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3846b8e80941Smrg
3847b8e80941Smrg	/* Add the input sample mask for smoothing at the end. */
3848b8e80941Smrg	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3849b8e80941Smrg		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3850b8e80941Smrg	ret = LLVMBuildInsertValue(builder, ret,
3851b8e80941Smrg				   LLVMGetParam(ctx->main_fn,
3852b8e80941Smrg						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3853b8e80941Smrg
3854b8e80941Smrg	ctx->return_value = ret;
3855b8e80941Smrg}
3856b8e80941Smrg
3857b8e80941Smrgstatic void membar_emit(
3858b8e80941Smrg		const struct lp_build_tgsi_action *action,
3859b8e80941Smrg		struct lp_build_tgsi_context *bld_base,
3860b8e80941Smrg		struct lp_build_emit_data *emit_data)
3861b8e80941Smrg{
3862b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3863b8e80941Smrg	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3864b8e80941Smrg	unsigned flags = LLVMConstIntGetZExtValue(src0);
3865b8e80941Smrg	unsigned waitcnt = NOOP_WAITCNT;
3866b8e80941Smrg
3867b8e80941Smrg	if (flags & TGSI_MEMBAR_THREAD_GROUP)
3868b8e80941Smrg		waitcnt &= VM_CNT & LGKM_CNT;
3869b8e80941Smrg
3870b8e80941Smrg	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3871b8e80941Smrg		     TGSI_MEMBAR_SHADER_BUFFER |
3872b8e80941Smrg		     TGSI_MEMBAR_SHADER_IMAGE))
3873b8e80941Smrg		waitcnt &= VM_CNT;
3874b8e80941Smrg
3875b8e80941Smrg	if (flags & TGSI_MEMBAR_SHARED)
3876b8e80941Smrg		waitcnt &= LGKM_CNT;
3877b8e80941Smrg
3878b8e80941Smrg	if (waitcnt != NOOP_WAITCNT)
3879b8e80941Smrg		ac_build_waitcnt(&ctx->ac, waitcnt);
3880b8e80941Smrg}
3881b8e80941Smrg
3882b8e80941Smrgstatic void clock_emit(
3883b8e80941Smrg		const struct lp_build_tgsi_action *action,
3884b8e80941Smrg		struct lp_build_tgsi_context *bld_base,
3885b8e80941Smrg		struct lp_build_emit_data *emit_data)
3886b8e80941Smrg{
3887b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3888b8e80941Smrg	LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac);
3889b8e80941Smrg
3890b8e80941Smrg	emit_data->output[0] =
3891b8e80941Smrg		LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3892b8e80941Smrg	emit_data->output[1] =
3893b8e80941Smrg		LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3894b8e80941Smrg}
3895b8e80941Smrg
3896b8e80941Smrgstatic void si_llvm_emit_ddxy(
3897b8e80941Smrg	const struct lp_build_tgsi_action *action,
3898b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
3899b8e80941Smrg	struct lp_build_emit_data *emit_data)
3900b8e80941Smrg{
3901b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3902b8e80941Smrg	unsigned opcode = emit_data->info->opcode;
3903b8e80941Smrg	LLVMValueRef val;
3904b8e80941Smrg	int idx;
3905b8e80941Smrg	unsigned mask;
3906b8e80941Smrg
3907b8e80941Smrg	if (opcode == TGSI_OPCODE_DDX_FINE)
3908b8e80941Smrg		mask = AC_TID_MASK_LEFT;
3909b8e80941Smrg	else if (opcode == TGSI_OPCODE_DDY_FINE)
3910b8e80941Smrg		mask = AC_TID_MASK_TOP;
3911b8e80941Smrg	else
3912b8e80941Smrg		mask = AC_TID_MASK_TOP_LEFT;
3913b8e80941Smrg
3914b8e80941Smrg	/* for DDX we want to next X pixel, DDY next Y pixel. */
3915b8e80941Smrg	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3916b8e80941Smrg
3917b8e80941Smrg	val = ac_to_integer(&ctx->ac, emit_data->args[0]);
3918b8e80941Smrg	val = ac_build_ddxy(&ctx->ac, mask, idx, val);
3919b8e80941Smrg	emit_data->output[emit_data->chan] = val;
3920b8e80941Smrg}
3921b8e80941Smrg
3922b8e80941Smrgstatic void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3923b8e80941Smrg				struct lp_build_tgsi_context *bld_base,
3924b8e80941Smrg				struct lp_build_emit_data *emit_data)
3925b8e80941Smrg{
3926b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
3927b8e80941Smrg	struct si_shader *shader = ctx->shader;
3928b8e80941Smrg	const struct tgsi_shader_info *info = &shader->selector->info;
3929b8e80941Smrg	LLVMValueRef interp_param;
3930b8e80941Smrg	const struct tgsi_full_instruction *inst = emit_data->inst;
3931b8e80941Smrg	const struct tgsi_full_src_register *input = &inst->Src[0];
3932b8e80941Smrg	int input_base, input_array_size;
3933b8e80941Smrg	int chan;
3934b8e80941Smrg	int i;
3935b8e80941Smrg	LLVMValueRef prim_mask = ctx->abi.prim_mask;
3936b8e80941Smrg	LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL;
3937b8e80941Smrg	int interp_param_idx;
3938b8e80941Smrg	unsigned interp;
3939b8e80941Smrg	unsigned location;
3940b8e80941Smrg
3941b8e80941Smrg	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3942b8e80941Smrg		/* offset is in second src, first two channels */
3943b8e80941Smrg		offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3944b8e80941Smrg					       TGSI_CHAN_X);
3945b8e80941Smrg		offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3946b8e80941Smrg					       TGSI_CHAN_Y);
3947b8e80941Smrg	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3948b8e80941Smrg		LLVMValueRef sample_position;
3949b8e80941Smrg		LLVMValueRef sample_id;
3950b8e80941Smrg		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3951b8e80941Smrg
3952b8e80941Smrg		/* fetch sample ID, then fetch its sample position,
3953b8e80941Smrg		 * and place into first two channels.
3954b8e80941Smrg		 */
3955b8e80941Smrg		sample_id = lp_build_emit_fetch(bld_base,
3956b8e80941Smrg						emit_data->inst, 1, TGSI_CHAN_X);
3957b8e80941Smrg		sample_id = ac_to_integer(&ctx->ac, sample_id);
3958b8e80941Smrg
3959b8e80941Smrg		/* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
3960b8e80941Smrg		 * Language 4.50 spec says about interpolateAtSample:
3961b8e80941Smrg		 *
3962b8e80941Smrg		 *    "Returns the value of the input interpolant variable at
3963b8e80941Smrg		 *     the location of sample number sample. If multisample
3964b8e80941Smrg		 *     buffers are not available, the input variable will be
3965b8e80941Smrg		 *     evaluated at the center of the pixel. If sample sample
3966b8e80941Smrg		 *     does not exist, the position used to interpolate the
3967b8e80941Smrg		 *     input variable is undefined."
3968b8e80941Smrg		 *
3969b8e80941Smrg		 * This means that sample_id values outside of the valid are
3970b8e80941Smrg		 * in fact valid input, and the usual mechanism for loading the
3971b8e80941Smrg		 * sample position doesn't work.
3972b8e80941Smrg		 */
3973b8e80941Smrg		if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
3974b8e80941Smrg			LLVMValueRef center[4] = {
3975b8e80941Smrg				LLVMConstReal(ctx->f32, 0.5),
3976b8e80941Smrg				LLVMConstReal(ctx->f32, 0.5),
3977b8e80941Smrg				ctx->ac.f32_0,
3978b8e80941Smrg				ctx->ac.f32_0,
3979b8e80941Smrg			};
3980b8e80941Smrg
3981b8e80941Smrg			sample_position = ac_build_gather_values(&ctx->ac, center, 4);
3982b8e80941Smrg		} else {
3983b8e80941Smrg			sample_position = load_sample_position(&ctx->abi, sample_id);
3984b8e80941Smrg		}
3985b8e80941Smrg
3986b8e80941Smrg		offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
3987b8e80941Smrg						   ctx->i32_0, "");
3988b8e80941Smrg
3989b8e80941Smrg		offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, "");
3990b8e80941Smrg		offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
3991b8e80941Smrg						   ctx->i32_1, "");
3992b8e80941Smrg		offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, "");
3993b8e80941Smrg	}
3994b8e80941Smrg
3995b8e80941Smrg	assert(input->Register.File == TGSI_FILE_INPUT);
3996b8e80941Smrg
3997b8e80941Smrg	if (input->Register.Indirect) {
3998b8e80941Smrg		unsigned array_id = input->Indirect.ArrayID;
3999b8e80941Smrg
4000b8e80941Smrg		if (array_id) {
4001b8e80941Smrg			input_base = info->input_array_first[array_id];
4002b8e80941Smrg			input_array_size = info->input_array_last[array_id] - input_base + 1;
4003b8e80941Smrg		} else {
4004b8e80941Smrg			input_base = inst->Src[0].Register.Index;
4005b8e80941Smrg			input_array_size = info->num_inputs - input_base;
4006b8e80941Smrg		}
4007b8e80941Smrg
4008b8e80941Smrg		array_idx = si_get_indirect_index(ctx, &input->Indirect,
4009b8e80941Smrg						  1, input->Register.Index - input_base);
4010b8e80941Smrg	} else {
4011b8e80941Smrg		input_base = inst->Src[0].Register.Index;
4012b8e80941Smrg		input_array_size = 1;
4013b8e80941Smrg		array_idx = ctx->i32_0;
4014b8e80941Smrg	}
4015b8e80941Smrg
4016b8e80941Smrg	interp = shader->selector->info.input_interpolate[input_base];
4017b8e80941Smrg
4018b8e80941Smrg	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4019b8e80941Smrg	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
4020b8e80941Smrg		location = TGSI_INTERPOLATE_LOC_CENTER;
4021b8e80941Smrg	else
4022b8e80941Smrg		location = TGSI_INTERPOLATE_LOC_CENTROID;
4023b8e80941Smrg
4024b8e80941Smrg	interp_param_idx = lookup_interp_param_index(interp, location);
4025b8e80941Smrg	if (interp_param_idx == -1)
4026b8e80941Smrg		return;
4027b8e80941Smrg	else if (interp_param_idx)
4028b8e80941Smrg		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
4029b8e80941Smrg	else
4030b8e80941Smrg		interp_param = NULL;
4031b8e80941Smrg
4032b8e80941Smrg	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4033b8e80941Smrg	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4034b8e80941Smrg		LLVMValueRef ij_out[2];
4035b8e80941Smrg		LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param);
4036b8e80941Smrg
4037b8e80941Smrg		/*
4038b8e80941Smrg		 * take the I then J parameters, and the DDX/Y for it, and
4039b8e80941Smrg		 * calculate the IJ inputs for the interpolator.
4040b8e80941Smrg		 * temp1 = ddx * offset/sample.x + I;
4041b8e80941Smrg		 * interp_param.I = ddy * offset/sample.y + temp1;
4042b8e80941Smrg		 * temp1 = ddx * offset/sample.x + J;
4043b8e80941Smrg		 * interp_param.J = ddy * offset/sample.y + temp1;
4044b8e80941Smrg		 */
4045b8e80941Smrg		for (i = 0; i < 2; i++) {
4046b8e80941Smrg			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
4047b8e80941Smrg			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
4048b8e80941Smrg			LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
4049b8e80941Smrg								      ddxy_out, ix_ll, "");
4050b8e80941Smrg			LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
4051b8e80941Smrg								      ddxy_out, iy_ll, "");
4052b8e80941Smrg			LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
4053b8e80941Smrg									 interp_param, ix_ll, "");
4054b8e80941Smrg			LLVMValueRef temp;
4055b8e80941Smrg
4056b8e80941Smrg			interp_el = ac_to_float(&ctx->ac, interp_el);
4057b8e80941Smrg
4058b8e80941Smrg			temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el);
4059b8e80941Smrg			ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp);
4060b8e80941Smrg		}
4061b8e80941Smrg		interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
4062b8e80941Smrg	}
4063b8e80941Smrg
4064b8e80941Smrg	if (interp_param)
4065b8e80941Smrg		interp_param = ac_to_float(&ctx->ac, interp_param);
4066b8e80941Smrg
4067b8e80941Smrg	for (chan = 0; chan < 4; chan++) {
4068b8e80941Smrg		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
4069b8e80941Smrg		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
4070b8e80941Smrg
4071b8e80941Smrg		for (unsigned idx = 0; idx < input_array_size; ++idx) {
4072b8e80941Smrg			LLVMValueRef v, i = NULL, j = NULL;
4073b8e80941Smrg
4074b8e80941Smrg			if (interp_param) {
4075b8e80941Smrg				i = LLVMBuildExtractElement(
4076b8e80941Smrg					ctx->ac.builder, interp_param, ctx->i32_0, "");
4077b8e80941Smrg				j = LLVMBuildExtractElement(
4078b8e80941Smrg					ctx->ac.builder, interp_param, ctx->i32_1, "");
4079b8e80941Smrg			}
4080b8e80941Smrg			v = si_build_fs_interp(ctx, input_base + idx, schan,
4081b8e80941Smrg					       prim_mask, i, j);
4082b8e80941Smrg
4083b8e80941Smrg			gather = LLVMBuildInsertElement(ctx->ac.builder,
4084b8e80941Smrg				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
4085b8e80941Smrg		}
4086b8e80941Smrg
4087b8e80941Smrg		emit_data->output[chan] = LLVMBuildExtractElement(
4088b8e80941Smrg			ctx->ac.builder, gather, array_idx, "");
4089b8e80941Smrg	}
4090b8e80941Smrg}
4091b8e80941Smrg
4092b8e80941Smrgstatic void vote_all_emit(
4093b8e80941Smrg	const struct lp_build_tgsi_action *action,
4094b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4095b8e80941Smrg	struct lp_build_emit_data *emit_data)
4096b8e80941Smrg{
4097b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4098b8e80941Smrg
4099b8e80941Smrg        LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
4100b8e80941Smrg	emit_data->output[emit_data->chan] =
4101b8e80941Smrg		LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4102b8e80941Smrg}
4103b8e80941Smrg
4104b8e80941Smrgstatic void vote_any_emit(
4105b8e80941Smrg	const struct lp_build_tgsi_action *action,
4106b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4107b8e80941Smrg	struct lp_build_emit_data *emit_data)
4108b8e80941Smrg{
4109b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4110b8e80941Smrg
4111b8e80941Smrg        LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
4112b8e80941Smrg	emit_data->output[emit_data->chan] =
4113b8e80941Smrg		LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4114b8e80941Smrg}
4115b8e80941Smrg
4116b8e80941Smrgstatic void vote_eq_emit(
4117b8e80941Smrg	const struct lp_build_tgsi_action *action,
4118b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4119b8e80941Smrg	struct lp_build_emit_data *emit_data)
4120b8e80941Smrg{
4121b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4122b8e80941Smrg
4123b8e80941Smrg        LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4124b8e80941Smrg	emit_data->output[emit_data->chan] =
4125b8e80941Smrg		LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4126b8e80941Smrg}
4127b8e80941Smrg
4128b8e80941Smrgstatic void ballot_emit(
4129b8e80941Smrg	const struct lp_build_tgsi_action *action,
4130b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4131b8e80941Smrg	struct lp_build_emit_data *emit_data)
4132b8e80941Smrg{
4133b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4134b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
4135b8e80941Smrg	LLVMValueRef tmp;
4136b8e80941Smrg
4137b8e80941Smrg	tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4138b8e80941Smrg	tmp = ac_build_ballot(&ctx->ac, tmp);
4139b8e80941Smrg	tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4140b8e80941Smrg
4141b8e80941Smrg	emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4142b8e80941Smrg	emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4143b8e80941Smrg}
4144b8e80941Smrg
4145b8e80941Smrgstatic void read_lane_emit(
4146b8e80941Smrg	const struct lp_build_tgsi_action *action,
4147b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4148b8e80941Smrg	struct lp_build_emit_data *emit_data)
4149b8e80941Smrg{
4150b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4151b8e80941Smrg
4152b8e80941Smrg	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) {
4153b8e80941Smrg		emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4154b8e80941Smrg							 0, emit_data->src_chan);
4155b8e80941Smrg
4156b8e80941Smrg		/* Always read the source invocation (= lane) from the X channel. */
4157b8e80941Smrg		emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4158b8e80941Smrg							 1, TGSI_CHAN_X);
4159b8e80941Smrg		emit_data->arg_count = 2;
4160b8e80941Smrg	}
4161b8e80941Smrg
4162b8e80941Smrg	/* We currently have no other way to prevent LLVM from lifting the icmp
4163b8e80941Smrg	 * calls to a dominating basic block.
4164b8e80941Smrg	 */
4165b8e80941Smrg	ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4166b8e80941Smrg
4167b8e80941Smrg	for (unsigned i = 0; i < emit_data->arg_count; ++i)
4168b8e80941Smrg		emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4169b8e80941Smrg
4170b8e80941Smrg	emit_data->output[emit_data->chan] =
4171b8e80941Smrg		ac_build_intrinsic(&ctx->ac, action->intr_name,
4172b8e80941Smrg				   ctx->i32, emit_data->args, emit_data->arg_count,
4173b8e80941Smrg				   AC_FUNC_ATTR_READNONE |
4174b8e80941Smrg				   AC_FUNC_ATTR_CONVERGENT);
4175b8e80941Smrg}
4176b8e80941Smrg
4177b8e80941Smrgstatic unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4178b8e80941Smrg				       struct lp_build_emit_data *emit_data)
4179b8e80941Smrg{
4180b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4181b8e80941Smrg	struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4182b8e80941Smrg	LLVMValueRef imm;
4183b8e80941Smrg	unsigned stream;
4184b8e80941Smrg
4185b8e80941Smrg	assert(src0.File == TGSI_FILE_IMMEDIATE);
4186b8e80941Smrg
4187b8e80941Smrg	imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4188b8e80941Smrg	stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4189b8e80941Smrg	return stream;
4190b8e80941Smrg}
4191b8e80941Smrg
4192b8e80941Smrg/* Emit one vertex from the geometry shader */
4193b8e80941Smrgstatic void si_llvm_emit_vertex(struct ac_shader_abi *abi,
4194b8e80941Smrg				unsigned stream,
4195b8e80941Smrg				LLVMValueRef *addrs)
4196b8e80941Smrg{
4197b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4198b8e80941Smrg	struct tgsi_shader_info *info = &ctx->shader->selector->info;
4199b8e80941Smrg	struct si_shader *shader = ctx->shader;
4200b8e80941Smrg	struct lp_build_if_state if_state;
4201b8e80941Smrg	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4202b8e80941Smrg					    ctx->param_gs2vs_offset);
4203b8e80941Smrg	LLVMValueRef gs_next_vertex;
4204b8e80941Smrg	LLVMValueRef can_emit;
4205b8e80941Smrg	unsigned chan, offset;
4206b8e80941Smrg	int i;
4207b8e80941Smrg
4208b8e80941Smrg	/* Write vertex attribute values to GSVS ring */
4209b8e80941Smrg	gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
4210b8e80941Smrg				       ctx->gs_next_vertex[stream],
4211b8e80941Smrg				       "");
4212b8e80941Smrg
4213b8e80941Smrg	/* If this thread has already emitted the declared maximum number of
4214b8e80941Smrg	 * vertices, skip the write: excessive vertex emissions are not
4215b8e80941Smrg	 * supposed to have any effect.
4216b8e80941Smrg	 *
4217b8e80941Smrg	 * If the shader has no writes to memory, kill it instead. This skips
4218b8e80941Smrg	 * further memory loads and may allow LLVM to skip to the end
4219b8e80941Smrg	 * altogether.
4220b8e80941Smrg	 */
4221b8e80941Smrg	can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
4222b8e80941Smrg				 LLVMConstInt(ctx->i32,
4223b8e80941Smrg					      shader->selector->gs_max_out_vertices, 0), "");
4224b8e80941Smrg
4225b8e80941Smrg	bool use_kill = !info->writes_memory;
4226b8e80941Smrg	if (use_kill) {
4227b8e80941Smrg		ac_build_kill_if_false(&ctx->ac, can_emit);
4228b8e80941Smrg	} else {
4229b8e80941Smrg		lp_build_if(&if_state, &ctx->gallivm, can_emit);
4230b8e80941Smrg	}
4231b8e80941Smrg
4232b8e80941Smrg	offset = 0;
4233b8e80941Smrg	for (i = 0; i < info->num_outputs; i++) {
4234b8e80941Smrg		for (chan = 0; chan < 4; chan++) {
4235b8e80941Smrg			if (!(info->output_usagemask[i] & (1 << chan)) ||
4236b8e80941Smrg			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4237b8e80941Smrg				continue;
4238b8e80941Smrg
4239b8e80941Smrg			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
4240b8e80941Smrg			LLVMValueRef voffset =
4241b8e80941Smrg				LLVMConstInt(ctx->i32, offset *
4242b8e80941Smrg					     shader->selector->gs_max_out_vertices, 0);
4243b8e80941Smrg			offset++;
4244b8e80941Smrg
4245b8e80941Smrg			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
4246b8e80941Smrg			voffset = LLVMBuildMul(ctx->ac.builder, voffset,
4247b8e80941Smrg					       LLVMConstInt(ctx->i32, 4, 0), "");
4248b8e80941Smrg
4249b8e80941Smrg			out_val = ac_to_integer(&ctx->ac, out_val);
4250b8e80941Smrg
4251b8e80941Smrg			ac_build_buffer_store_dword(&ctx->ac,
4252b8e80941Smrg						    ctx->gsvs_ring[stream],
4253b8e80941Smrg						    out_val, 1,
4254b8e80941Smrg						    voffset, soffset, 0,
4255b8e80941Smrg						    1, 1, true, true);
4256b8e80941Smrg		}
4257b8e80941Smrg	}
4258b8e80941Smrg
4259b8e80941Smrg	gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
4260b8e80941Smrg	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4261b8e80941Smrg
4262b8e80941Smrg	/* Signal vertex emission if vertex data was written. */
4263b8e80941Smrg	if (offset) {
4264b8e80941Smrg		ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4265b8e80941Smrg				 si_get_gs_wave_id(ctx));
4266b8e80941Smrg	}
4267b8e80941Smrg
4268b8e80941Smrg	if (!use_kill)
4269b8e80941Smrg		lp_build_endif(&if_state);
4270b8e80941Smrg}
4271b8e80941Smrg
4272b8e80941Smrg/* Emit one vertex from the geometry shader */
4273b8e80941Smrgstatic void si_tgsi_emit_vertex(
4274b8e80941Smrg	const struct lp_build_tgsi_action *action,
4275b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4276b8e80941Smrg	struct lp_build_emit_data *emit_data)
4277b8e80941Smrg{
4278b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4279b8e80941Smrg	unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4280b8e80941Smrg
4281b8e80941Smrg	si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4282b8e80941Smrg}
4283b8e80941Smrg
4284b8e80941Smrg/* Cut one primitive from the geometry shader */
4285b8e80941Smrgstatic void si_llvm_emit_primitive(struct ac_shader_abi *abi,
4286b8e80941Smrg				   unsigned stream)
4287b8e80941Smrg{
4288b8e80941Smrg	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4289b8e80941Smrg
4290b8e80941Smrg	/* Signal primitive cut */
4291b8e80941Smrg	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4292b8e80941Smrg			 si_get_gs_wave_id(ctx));
4293b8e80941Smrg}
4294b8e80941Smrg
4295b8e80941Smrg/* Cut one primitive from the geometry shader */
4296b8e80941Smrgstatic void si_tgsi_emit_primitive(
4297b8e80941Smrg	const struct lp_build_tgsi_action *action,
4298b8e80941Smrg	struct lp_build_tgsi_context *bld_base,
4299b8e80941Smrg	struct lp_build_emit_data *emit_data)
4300b8e80941Smrg{
4301b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4302b8e80941Smrg
4303b8e80941Smrg	si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data));
4304b8e80941Smrg}
4305b8e80941Smrg
4306b8e80941Smrgstatic void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4307b8e80941Smrg				 struct lp_build_tgsi_context *bld_base,
4308b8e80941Smrg				 struct lp_build_emit_data *emit_data)
4309b8e80941Smrg{
4310b8e80941Smrg	struct si_shader_context *ctx = si_shader_context(bld_base);
4311b8e80941Smrg
4312b8e80941Smrg	/* SI only (thanks to a hw bug workaround):
4313b8e80941Smrg	 * The real barrier instruction isn’t needed, because an entire patch
4314b8e80941Smrg	 * always fits into a single wave.
4315b8e80941Smrg	 */
4316b8e80941Smrg	if (ctx->screen->info.chip_class == SI &&
4317b8e80941Smrg	    ctx->type == PIPE_SHADER_TESS_CTRL) {
4318b8e80941Smrg		ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT);
4319b8e80941Smrg		return;
4320b8e80941Smrg	}
4321b8e80941Smrg
4322b8e80941Smrg	ac_build_s_barrier(&ctx->ac);
4323b8e80941Smrg}
4324b8e80941Smrg
4325b8e80941Smrgstatic void si_create_function(struct si_shader_context *ctx,
4326b8e80941Smrg			       const char *name,
4327b8e80941Smrg			       LLVMTypeRef *returns, unsigned num_returns,
4328b8e80941Smrg			       struct si_function_info *fninfo,
4329b8e80941Smrg			       unsigned max_workgroup_size)
4330b8e80941Smrg{
4331b8e80941Smrg	int i;
4332b8e80941Smrg
4333b8e80941Smrg	si_llvm_create_func(ctx, name, returns, num_returns,
4334b8e80941Smrg			    fninfo->types, fninfo->num_params);
4335b8e80941Smrg	ctx->return_value = LLVMGetUndef(ctx->return_type);
4336b8e80941Smrg
4337b8e80941Smrg	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4338b8e80941Smrg		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4339b8e80941Smrg
4340b8e80941Smrg		/* The combination of:
4341b8e80941Smrg		 * - noalias
4342b8e80941Smrg		 * - dereferenceable
4343b8e80941Smrg		 * - invariant.load
4344b8e80941Smrg		 * allows the optimization passes to move loads and reduces
4345b8e80941Smrg		 * SGPR spilling significantly.
4346b8e80941Smrg		 */
4347b8e80941Smrg		ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4348b8e80941Smrg				     AC_FUNC_ATTR_INREG);
4349b8e80941Smrg
4350b8e80941Smrg		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4351b8e80941Smrg			ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4352b8e80941Smrg					     AC_FUNC_ATTR_NOALIAS);
4353b8e80941Smrg			ac_add_attr_dereferenceable(P, UINT64_MAX);
4354b8e80941Smrg		}
4355b8e80941Smrg	}
4356b8e80941Smrg
4357b8e80941Smrg	for (i = 0; i < fninfo->num_params; ++i) {
4358b8e80941Smrg		if (fninfo->assign[i])
4359b8e80941Smrg			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4360b8e80941Smrg	}
4361b8e80941Smrg
4362b8e80941Smrg	if (ctx->screen->info.address32_hi) {
4363b8e80941Smrg		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4364b8e80941Smrg						     "amdgpu-32bit-address-high-bits",
4365b8e80941Smrg						     ctx->screen->info.address32_hi);
4366b8e80941Smrg	}
4367b8e80941Smrg
4368b8e80941Smrg	if (max_workgroup_size) {
4369b8e80941Smrg		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4370b8e80941Smrg						     "amdgpu-max-work-group-size",
4371b8e80941Smrg						     max_workgroup_size);
4372b8e80941Smrg	}
4373b8e80941Smrg	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4374b8e80941Smrg					   "no-signed-zeros-fp-math",
4375b8e80941Smrg					   "true");
4376b8e80941Smrg
4377b8e80941Smrg	if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
4378b8e80941Smrg		/* These were copied from some LLVM test. */
4379b8e80941Smrg		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4380b8e80941Smrg						   "less-precise-fpmad",
4381b8e80941Smrg						   "true");
4382b8e80941Smrg		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4383b8e80941Smrg						   "no-infs-fp-math",
4384b8e80941Smrg						   "true");
4385b8e80941Smrg		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4386b8e80941Smrg						   "no-nans-fp-math",
4387b8e80941Smrg						   "true");
4388b8e80941Smrg		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4389b8e80941Smrg						   "unsafe-fp-math",
4390b8e80941Smrg						   "true");
4391b8e80941Smrg	}
4392b8e80941Smrg}
4393b8e80941Smrg
4394b8e80941Smrgstatic void declare_streamout_params(struct si_shader_context *ctx,
4395b8e80941Smrg				     struct pipe_stream_output_info *so,
4396b8e80941Smrg				     struct si_function_info *fninfo)
4397b8e80941Smrg{
4398b8e80941Smrg	int i;
4399b8e80941Smrg
4400b8e80941Smrg	/* Streamout SGPRs. */
4401b8e80941Smrg	if (so->num_outputs) {
4402b8e80941Smrg		if (ctx->type != PIPE_SHADER_TESS_EVAL)
4403b8e80941Smrg			ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4404b8e80941Smrg		else
4405b8e80941Smrg			ctx->param_streamout_config = fninfo->num_params - 1;
4406b8e80941Smrg
4407b8e80941Smrg		ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4408b8e80941Smrg	}
4409b8e80941Smrg	/* A streamout buffer offset is loaded if the stride is non-zero. */
4410b8e80941Smrg	for (i = 0; i < 4; i++) {
4411b8e80941Smrg		if (!so->stride[i])
4412b8e80941Smrg			continue;
4413b8e80941Smrg
4414b8e80941Smrg		ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4415b8e80941Smrg	}
4416b8e80941Smrg}
4417b8e80941Smrg
4418b8e80941Smrgstatic unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4419b8e80941Smrg{
4420b8e80941Smrg	switch (shader->selector->type) {
4421b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
4422b8e80941Smrg		/* Return this so that LLVM doesn't remove s_barrier
4423b8e80941Smrg		 * instructions on chips where we use s_barrier. */
4424b8e80941Smrg		return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4425b8e80941Smrg
4426b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
4427b8e80941Smrg		return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4428b8e80941Smrg
4429b8e80941Smrg	case PIPE_SHADER_COMPUTE:
4430b8e80941Smrg		break; /* see below */
4431b8e80941Smrg
4432b8e80941Smrg	default:
4433b8e80941Smrg		return 0;
4434b8e80941Smrg	}
4435b8e80941Smrg
4436b8e80941Smrg	const unsigned *properties = shader->selector->info.properties;
4437b8e80941Smrg	unsigned max_work_group_size =
4438b8e80941Smrg	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4439b8e80941Smrg	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4440b8e80941Smrg	               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4441b8e80941Smrg
4442b8e80941Smrg	if (!max_work_group_size) {
4443b8e80941Smrg		/* This is a variable group size compute shader,
4444b8e80941Smrg		 * compile it for the maximum possible group size.
4445b8e80941Smrg		 */
4446b8e80941Smrg		max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4447b8e80941Smrg	}
4448b8e80941Smrg	return max_work_group_size;
4449b8e80941Smrg}
4450b8e80941Smrg
4451b8e80941Smrgstatic void declare_const_and_shader_buffers(struct si_shader_context *ctx,
4452b8e80941Smrg					     struct si_function_info *fninfo,
4453b8e80941Smrg					     bool assign_params)
4454b8e80941Smrg{
4455b8e80941Smrg	LLVMTypeRef const_shader_buf_type;
4456b8e80941Smrg
4457b8e80941Smrg	if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4458b8e80941Smrg	    ctx->shader->selector->info.shader_buffers_declared == 0)
4459b8e80941Smrg		const_shader_buf_type = ctx->f32;
4460b8e80941Smrg	else
4461b8e80941Smrg		const_shader_buf_type = ctx->v4i32;
4462b8e80941Smrg
4463b8e80941Smrg	unsigned const_and_shader_buffers =
4464b8e80941Smrg		add_arg(fninfo, ARG_SGPR,
4465b8e80941Smrg			ac_array_in_const32_addr_space(const_shader_buf_type));
4466b8e80941Smrg
4467b8e80941Smrg	if (assign_params)
4468b8e80941Smrg		ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4469b8e80941Smrg}
4470b8e80941Smrg
4471b8e80941Smrgstatic void declare_samplers_and_images(struct si_shader_context *ctx,
4472b8e80941Smrg					struct si_function_info *fninfo,
4473b8e80941Smrg					bool assign_params)
4474b8e80941Smrg{
4475b8e80941Smrg	unsigned samplers_and_images =
4476b8e80941Smrg		add_arg(fninfo, ARG_SGPR,
4477b8e80941Smrg			ac_array_in_const32_addr_space(ctx->v8i32));
4478b8e80941Smrg
4479b8e80941Smrg	if (assign_params)
4480b8e80941Smrg		ctx->param_samplers_and_images = samplers_and_images;
4481b8e80941Smrg}
4482b8e80941Smrg
/**
 * Declare the two per-stage descriptor pointer parameters, in this order:
 * constant/shader buffers first, then samplers/images.
 *
 * \param ctx            shader context
 * \param fninfo         function parameter list being built
 * \param assign_params  forwarded to both helpers; controls whether the
 *                       parameter indices are recorded in \p ctx
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    struct si_function_info *fninfo,
					    bool assign_params)
{
	/* Buffers come first ... */
	declare_const_and_shader_buffers(ctx, fninfo, assign_params);

	/* ... followed by samplers and images. */
	declare_samplers_and_images(ctx, fninfo, assign_params);
}
4490b8e80941Smrg
4491b8e80941Smrgstatic void declare_global_desc_pointers(struct si_shader_context *ctx,
4492b8e80941Smrg					 struct si_function_info *fninfo)
4493b8e80941Smrg{
4494b8e80941Smrg	ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4495b8e80941Smrg		ac_array_in_const32_addr_space(ctx->v4i32));
4496b8e80941Smrg	ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4497b8e80941Smrg		ac_array_in_const32_addr_space(ctx->v8i32));
4498b8e80941Smrg}
4499b8e80941Smrg
4500b8e80941Smrgstatic void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4501b8e80941Smrg					    struct si_function_info *fninfo)
4502b8e80941Smrg{
4503b8e80941Smrg	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4504b8e80941Smrg	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4505b8e80941Smrg	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4506b8e80941Smrg	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4507b8e80941Smrg}
4508b8e80941Smrg
4509b8e80941Smrgstatic void declare_vs_input_vgprs(struct si_shader_context *ctx,
4510b8e80941Smrg				   struct si_function_info *fninfo,
4511b8e80941Smrg				   unsigned *num_prolog_vgprs)
4512b8e80941Smrg{
4513b8e80941Smrg	struct si_shader *shader = ctx->shader;
4514b8e80941Smrg
4515b8e80941Smrg	add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4516b8e80941Smrg	if (shader->key.as_ls) {
4517b8e80941Smrg		ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4518b8e80941Smrg		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4519b8e80941Smrg	} else {
4520b8e80941Smrg		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4521b8e80941Smrg		ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4522b8e80941Smrg	}
4523b8e80941Smrg	add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4524b8e80941Smrg
4525b8e80941Smrg	if (!shader->is_gs_copy_shader) {
4526b8e80941Smrg		/* Vertex load indices. */
4527b8e80941Smrg		ctx->param_vertex_index0 = fninfo->num_params;
4528b8e80941Smrg		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4529b8e80941Smrg			add_arg(fninfo, ARG_VGPR, ctx->i32);
4530b8e80941Smrg		*num_prolog_vgprs += shader->selector->info.num_inputs;
4531b8e80941Smrg	}
4532b8e80941Smrg}
4533b8e80941Smrg
4534b8e80941Smrgstatic void declare_vs_blit_inputs(struct si_shader_context *ctx,
4535b8e80941Smrg				   struct si_function_info *fninfo,
4536b8e80941Smrg				   unsigned vs_blit_property)
4537b8e80941Smrg{
4538b8e80941Smrg	ctx->param_vs_blit_inputs = fninfo->num_params;
4539b8e80941Smrg	add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4540b8e80941Smrg	add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4541b8e80941Smrg	add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
4542b8e80941Smrg
4543b8e80941Smrg	if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4544b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
4545b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
4546b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
4547b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
4548b8e80941Smrg	} else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4549b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4550b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4551b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4552b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4553b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4554b8e80941Smrg		add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4555b8e80941Smrg	}
4556b8e80941Smrg}
4557b8e80941Smrg
4558b8e80941Smrgstatic void declare_tes_input_vgprs(struct si_shader_context *ctx,
4559b8e80941Smrg				    struct si_function_info *fninfo)
4560b8e80941Smrg{
4561b8e80941Smrg	ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4562b8e80941Smrg	ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4563b8e80941Smrg	ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4564b8e80941Smrg	add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id);
4565b8e80941Smrg}
4566b8e80941Smrg
4567b8e80941Smrgenum {
4568b8e80941Smrg	/* Convenient merged shader definitions. */
4569b8e80941Smrg	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4570b8e80941Smrg	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4571b8e80941Smrg};
4572b8e80941Smrg
4573b8e80941Smrgstatic void create_function(struct si_shader_context *ctx)
4574b8e80941Smrg{
4575b8e80941Smrg	struct si_shader *shader = ctx->shader;
4576b8e80941Smrg	struct si_function_info fninfo;
4577b8e80941Smrg	LLVMTypeRef returns[16+32*4];
4578b8e80941Smrg	unsigned i, num_return_sgprs;
4579b8e80941Smrg	unsigned num_returns = 0;
4580b8e80941Smrg	unsigned num_prolog_vgprs = 0;
4581b8e80941Smrg	unsigned type = ctx->type;
4582b8e80941Smrg	unsigned vs_blit_property =
4583b8e80941Smrg		shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4584b8e80941Smrg
4585b8e80941Smrg	si_init_function_info(&fninfo);
4586b8e80941Smrg
4587b8e80941Smrg	/* Set MERGED shaders. */
4588b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
4589b8e80941Smrg		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4590b8e80941Smrg			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4591b8e80941Smrg		else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4592b8e80941Smrg			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4593b8e80941Smrg	}
4594b8e80941Smrg
4595b8e80941Smrg	LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4596b8e80941Smrg
4597b8e80941Smrg	switch (type) {
4598b8e80941Smrg	case PIPE_SHADER_VERTEX:
4599b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4600b8e80941Smrg
4601b8e80941Smrg		if (vs_blit_property) {
4602b8e80941Smrg			declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
4603b8e80941Smrg
4604b8e80941Smrg			/* VGPRs */
4605b8e80941Smrg			declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4606b8e80941Smrg			break;
4607b8e80941Smrg		}
4608b8e80941Smrg
4609b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4610b8e80941Smrg		declare_vs_specific_input_sgprs(ctx, &fninfo);
4611b8e80941Smrg		ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4612b8e80941Smrg			ac_array_in_const32_addr_space(ctx->v4i32));
4613b8e80941Smrg
4614b8e80941Smrg		if (shader->key.as_es) {
4615b8e80941Smrg			ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4616b8e80941Smrg		} else if (shader->key.as_ls) {
4617b8e80941Smrg			/* no extra parameters */
4618b8e80941Smrg		} else {
4619b8e80941Smrg			if (shader->is_gs_copy_shader) {
4620b8e80941Smrg				fninfo.num_params = ctx->param_vs_state_bits + 1;
4621b8e80941Smrg				fninfo.num_sgpr_params = fninfo.num_params;
4622b8e80941Smrg			}
4623b8e80941Smrg
4624b8e80941Smrg			/* The locations of the other parameters are assigned dynamically. */
4625b8e80941Smrg			declare_streamout_params(ctx, &shader->selector->so,
4626b8e80941Smrg						 &fninfo);
4627b8e80941Smrg		}
4628b8e80941Smrg
4629b8e80941Smrg		/* VGPRs */
4630b8e80941Smrg		declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4631b8e80941Smrg		break;
4632b8e80941Smrg
4633b8e80941Smrg	case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4634b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4635b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4636b8e80941Smrg		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4637b8e80941Smrg		ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4638b8e80941Smrg		ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4639b8e80941Smrg		ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4640b8e80941Smrg		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4641b8e80941Smrg		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4642b8e80941Smrg
4643b8e80941Smrg		/* VGPRs */
4644b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4645b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4646b8e80941Smrg
4647b8e80941Smrg		/* param_tcs_offchip_offset and param_tcs_factor_offset are
4648b8e80941Smrg		 * placed after the user SGPRs.
4649b8e80941Smrg		 */
4650b8e80941Smrg		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4651b8e80941Smrg			returns[num_returns++] = ctx->i32; /* SGPRs */
4652b8e80941Smrg		for (i = 0; i < 11; i++)
4653b8e80941Smrg			returns[num_returns++] = ctx->f32; /* VGPRs */
4654b8e80941Smrg		break;
4655b8e80941Smrg
4656b8e80941Smrg	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4657b8e80941Smrg		/* Merged stages have 8 system SGPRs at the beginning. */
4658b8e80941Smrg		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
4659b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo,
4660b8e80941Smrg						ctx->type == PIPE_SHADER_TESS_CTRL);
4661b8e80941Smrg		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4662b8e80941Smrg		ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4663b8e80941Smrg		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4664b8e80941Smrg		ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4665b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4666b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4667b8e80941Smrg
4668b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4669b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo,
4670b8e80941Smrg						ctx->type == PIPE_SHADER_VERTEX);
4671b8e80941Smrg		declare_vs_specific_input_sgprs(ctx, &fninfo);
4672b8e80941Smrg
4673b8e80941Smrg		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4674b8e80941Smrg		ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4675b8e80941Smrg		ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4676b8e80941Smrg		ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4677b8e80941Smrg			ac_array_in_const32_addr_space(ctx->v4i32));
4678b8e80941Smrg
4679b8e80941Smrg		/* VGPRs (first TCS, then VS) */
4680b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4681b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4682b8e80941Smrg
4683b8e80941Smrg		if (ctx->type == PIPE_SHADER_VERTEX) {
4684b8e80941Smrg			declare_vs_input_vgprs(ctx, &fninfo,
4685b8e80941Smrg					       &num_prolog_vgprs);
4686b8e80941Smrg
4687b8e80941Smrg			/* LS return values are inputs to the TCS main shader part. */
4688b8e80941Smrg			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4689b8e80941Smrg				returns[num_returns++] = ctx->i32; /* SGPRs */
4690b8e80941Smrg			for (i = 0; i < 2; i++)
4691b8e80941Smrg				returns[num_returns++] = ctx->f32; /* VGPRs */
4692b8e80941Smrg		} else {
4693b8e80941Smrg			/* TCS return values are inputs to the TCS epilog.
4694b8e80941Smrg			 *
4695b8e80941Smrg			 * param_tcs_offchip_offset, param_tcs_factor_offset,
4696b8e80941Smrg			 * param_tcs_offchip_layout, and param_rw_buffers
4697b8e80941Smrg			 * should be passed to the epilog.
4698b8e80941Smrg			 */
4699b8e80941Smrg			for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
4700b8e80941Smrg				returns[num_returns++] = ctx->i32; /* SGPRs */
4701b8e80941Smrg			for (i = 0; i < 11; i++)
4702b8e80941Smrg				returns[num_returns++] = ctx->f32; /* VGPRs */
4703b8e80941Smrg		}
4704b8e80941Smrg		break;
4705b8e80941Smrg
4706b8e80941Smrg	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4707b8e80941Smrg		/* Merged stages have 8 system SGPRs at the beginning. */
4708b8e80941Smrg		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
4709b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo,
4710b8e80941Smrg						ctx->type == PIPE_SHADER_GEOMETRY);
4711b8e80941Smrg		ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4712b8e80941Smrg		ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4713b8e80941Smrg		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4714b8e80941Smrg		ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4715b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4716b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4717b8e80941Smrg
4718b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4719b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo,
4720b8e80941Smrg						(ctx->type == PIPE_SHADER_VERTEX ||
4721b8e80941Smrg						 ctx->type == PIPE_SHADER_TESS_EVAL));
4722b8e80941Smrg		if (ctx->type == PIPE_SHADER_VERTEX) {
4723b8e80941Smrg			declare_vs_specific_input_sgprs(ctx, &fninfo);
4724b8e80941Smrg		} else {
4725b8e80941Smrg			ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4726b8e80941Smrg			ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4727b8e80941Smrg			ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4728b8e80941Smrg			/* Declare as many input SGPRs as the VS has. */
4729b8e80941Smrg		}
4730b8e80941Smrg
4731b8e80941Smrg		if (ctx->type == PIPE_SHADER_VERTEX) {
4732b8e80941Smrg			ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4733b8e80941Smrg				ac_array_in_const32_addr_space(ctx->v4i32));
4734b8e80941Smrg		}
4735b8e80941Smrg
4736b8e80941Smrg		/* VGPRs (first GS, then VS/TES) */
4737b8e80941Smrg		ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4738b8e80941Smrg		ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4739b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4740b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4741b8e80941Smrg		ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4742b8e80941Smrg
4743b8e80941Smrg		if (ctx->type == PIPE_SHADER_VERTEX) {
4744b8e80941Smrg			declare_vs_input_vgprs(ctx, &fninfo,
4745b8e80941Smrg					       &num_prolog_vgprs);
4746b8e80941Smrg		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4747b8e80941Smrg			declare_tes_input_vgprs(ctx, &fninfo);
4748b8e80941Smrg		}
4749b8e80941Smrg
4750b8e80941Smrg		if (ctx->type == PIPE_SHADER_VERTEX ||
4751b8e80941Smrg		    ctx->type == PIPE_SHADER_TESS_EVAL) {
4752b8e80941Smrg			unsigned num_user_sgprs;
4753b8e80941Smrg
4754b8e80941Smrg			if (ctx->type == PIPE_SHADER_VERTEX)
4755b8e80941Smrg				num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
4756b8e80941Smrg			else
4757b8e80941Smrg				num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
4758b8e80941Smrg
4759b8e80941Smrg			/* ES return values are inputs to GS. */
4760b8e80941Smrg			for (i = 0; i < 8 + num_user_sgprs; i++)
4761b8e80941Smrg				returns[num_returns++] = ctx->i32; /* SGPRs */
4762b8e80941Smrg			for (i = 0; i < 5; i++)
4763b8e80941Smrg				returns[num_returns++] = ctx->f32; /* VGPRs */
4764b8e80941Smrg		}
4765b8e80941Smrg		break;
4766b8e80941Smrg
4767b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
4768b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4769b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4770b8e80941Smrg		ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4771b8e80941Smrg		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4772b8e80941Smrg		ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4773b8e80941Smrg
4774b8e80941Smrg		if (shader->key.as_es) {
4775b8e80941Smrg			ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4776b8e80941Smrg			add_arg(&fninfo, ARG_SGPR, ctx->i32);
4777b8e80941Smrg			ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4778b8e80941Smrg		} else {
4779b8e80941Smrg			add_arg(&fninfo, ARG_SGPR, ctx->i32);
4780b8e80941Smrg			declare_streamout_params(ctx, &shader->selector->so,
4781b8e80941Smrg						 &fninfo);
4782b8e80941Smrg			ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4783b8e80941Smrg		}
4784b8e80941Smrg
4785b8e80941Smrg		/* VGPRs */
4786b8e80941Smrg		declare_tes_input_vgprs(ctx, &fninfo);
4787b8e80941Smrg		break;
4788b8e80941Smrg
4789b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
4790b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4791b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4792b8e80941Smrg		ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4793b8e80941Smrg		ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4794b8e80941Smrg
4795b8e80941Smrg		/* VGPRs */
4796b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4797b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4798b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4799b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4800b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4801b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4802b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4803b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4804b8e80941Smrg		break;
4805b8e80941Smrg
4806b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
4807b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4808b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4809b8e80941Smrg		add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4810b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32,
4811b8e80941Smrg				       &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK);
4812b8e80941Smrg
4813b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4814b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4815b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4816b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4817b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4818b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4819b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4820b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4821b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4822b8e80941Smrg				       &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4823b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4824b8e80941Smrg				       &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4825b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4826b8e80941Smrg				       &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4827b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4828b8e80941Smrg				       &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4829b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4830b8e80941Smrg				       &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4831b8e80941Smrg		shader->info.face_vgpr_index = 20;
4832b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4833b8e80941Smrg				       &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4834b8e80941Smrg		shader->info.ancillary_vgpr_index = 21;
4835b8e80941Smrg		add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4836b8e80941Smrg				       &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4837b8e80941Smrg		add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4838b8e80941Smrg
4839b8e80941Smrg		/* Color inputs from the prolog. */
4840b8e80941Smrg		if (shader->selector->info.colors_read) {
4841b8e80941Smrg			unsigned num_color_elements =
4842b8e80941Smrg				util_bitcount(shader->selector->info.colors_read);
4843b8e80941Smrg
4844b8e80941Smrg			assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4845b8e80941Smrg			for (i = 0; i < num_color_elements; i++)
4846b8e80941Smrg				add_arg(&fninfo, ARG_VGPR, ctx->f32);
4847b8e80941Smrg
4848b8e80941Smrg			num_prolog_vgprs += num_color_elements;
4849b8e80941Smrg		}
4850b8e80941Smrg
4851b8e80941Smrg		/* Outputs for the epilog. */
4852b8e80941Smrg		num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4853b8e80941Smrg		num_returns =
4854b8e80941Smrg			num_return_sgprs +
4855b8e80941Smrg			util_bitcount(shader->selector->info.colors_written) * 4 +
4856b8e80941Smrg			shader->selector->info.writes_z +
4857b8e80941Smrg			shader->selector->info.writes_stencil +
4858b8e80941Smrg			shader->selector->info.writes_samplemask +
4859b8e80941Smrg			1 /* SampleMaskIn */;
4860b8e80941Smrg
4861b8e80941Smrg		num_returns = MAX2(num_returns,
4862b8e80941Smrg				   num_return_sgprs +
4863b8e80941Smrg				   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4864b8e80941Smrg
4865b8e80941Smrg		for (i = 0; i < num_return_sgprs; i++)
4866b8e80941Smrg			returns[i] = ctx->i32;
4867b8e80941Smrg		for (; i < num_returns; i++)
4868b8e80941Smrg			returns[i] = ctx->f32;
4869b8e80941Smrg		break;
4870b8e80941Smrg
4871b8e80941Smrg	case PIPE_SHADER_COMPUTE:
4872b8e80941Smrg		declare_global_desc_pointers(ctx, &fninfo);
4873b8e80941Smrg		declare_per_stage_desc_pointers(ctx, &fninfo, true);
4874b8e80941Smrg		if (shader->selector->info.uses_grid_size)
4875b8e80941Smrg			add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups);
4876b8e80941Smrg		if (shader->selector->info.uses_block_size &&
4877b8e80941Smrg		    shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
4878b8e80941Smrg			ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4879b8e80941Smrg
4880b8e80941Smrg		unsigned cs_user_data_dwords =
4881b8e80941Smrg			shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS];
4882b8e80941Smrg		if (cs_user_data_dwords) {
4883b8e80941Smrg			ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR,
4884b8e80941Smrg							  LLVMVectorType(ctx->i32, cs_user_data_dwords));
4885b8e80941Smrg		}
4886b8e80941Smrg
4887b8e80941Smrg		for (i = 0; i < 3; i++) {
4888b8e80941Smrg			ctx->abi.workgroup_ids[i] = NULL;
4889b8e80941Smrg			if (shader->selector->info.uses_block_id[i])
4890b8e80941Smrg				add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]);
4891b8e80941Smrg		}
4892b8e80941Smrg
4893b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids);
4894b8e80941Smrg		break;
4895b8e80941Smrg	default:
4896b8e80941Smrg		assert(0 && "unimplemented shader");
4897b8e80941Smrg		return;
4898b8e80941Smrg	}
4899b8e80941Smrg
4900b8e80941Smrg	si_create_function(ctx, "main", returns, num_returns, &fninfo,
4901b8e80941Smrg			   si_get_max_workgroup_size(shader));
4902b8e80941Smrg
4903b8e80941Smrg	/* Reserve register locations for VGPR inputs the PS prolog may need. */
4904b8e80941Smrg	if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
4905b8e80941Smrg		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4906b8e80941Smrg						     "InitialPSInputAddr",
4907b8e80941Smrg						     S_0286D0_PERSP_SAMPLE_ENA(1) |
4908b8e80941Smrg						     S_0286D0_PERSP_CENTER_ENA(1) |
4909b8e80941Smrg						     S_0286D0_PERSP_CENTROID_ENA(1) |
4910b8e80941Smrg						     S_0286D0_LINEAR_SAMPLE_ENA(1) |
4911b8e80941Smrg						     S_0286D0_LINEAR_CENTER_ENA(1) |
4912b8e80941Smrg						     S_0286D0_LINEAR_CENTROID_ENA(1) |
4913b8e80941Smrg						     S_0286D0_FRONT_FACE_ENA(1) |
4914b8e80941Smrg						     S_0286D0_ANCILLARY_ENA(1) |
4915b8e80941Smrg						     S_0286D0_POS_FIXED_PT_ENA(1));
4916b8e80941Smrg	}
4917b8e80941Smrg
4918b8e80941Smrg	shader->info.num_input_sgprs = 0;
4919b8e80941Smrg	shader->info.num_input_vgprs = 0;
4920b8e80941Smrg
4921b8e80941Smrg	for (i = 0; i < fninfo.num_sgpr_params; ++i)
4922b8e80941Smrg		shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4923b8e80941Smrg
4924b8e80941Smrg	for (; i < fninfo.num_params; ++i)
4925b8e80941Smrg		shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4926b8e80941Smrg
4927b8e80941Smrg	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4928b8e80941Smrg	shader->info.num_input_vgprs -= num_prolog_vgprs;
4929b8e80941Smrg
4930b8e80941Smrg	if (shader->key.as_ls ||
4931b8e80941Smrg	    ctx->type == PIPE_SHADER_TESS_CTRL ||
4932b8e80941Smrg	    /* GFX9 has the ESGS ring buffer in LDS. */
4933b8e80941Smrg	    type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
4934b8e80941Smrg		ac_declare_lds_as_pointer(&ctx->ac);
4935b8e80941Smrg}
4936b8e80941Smrg
4937b8e80941Smrg/**
4938b8e80941Smrg * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4939b8e80941Smrg * for later use.
4940b8e80941Smrg */
4941b8e80941Smrgstatic void preload_ring_buffers(struct si_shader_context *ctx)
4942b8e80941Smrg{
4943b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
4944b8e80941Smrg
4945b8e80941Smrg	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4946b8e80941Smrg					    ctx->param_rw_buffers);
4947b8e80941Smrg
4948b8e80941Smrg	if (ctx->screen->info.chip_class <= VI &&
4949b8e80941Smrg	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4950b8e80941Smrg		unsigned ring =
4951b8e80941Smrg			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4952b8e80941Smrg							     : SI_ES_RING_ESGS;
4953b8e80941Smrg		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4954b8e80941Smrg
4955b8e80941Smrg		ctx->esgs_ring =
4956b8e80941Smrg			ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4957b8e80941Smrg	}
4958b8e80941Smrg
4959b8e80941Smrg	if (ctx->shader->is_gs_copy_shader) {
4960b8e80941Smrg		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4961b8e80941Smrg
4962b8e80941Smrg		ctx->gsvs_ring[0] =
4963b8e80941Smrg			ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4964b8e80941Smrg	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4965b8e80941Smrg		const struct si_shader_selector *sel = ctx->shader->selector;
4966b8e80941Smrg		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4967b8e80941Smrg		LLVMValueRef base_ring;
4968b8e80941Smrg
4969b8e80941Smrg		base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4970b8e80941Smrg
4971b8e80941Smrg		/* The conceptual layout of the GSVS ring is
4972b8e80941Smrg		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
4973b8e80941Smrg		 * but the real memory layout is swizzled across
4974b8e80941Smrg		 * threads:
4975b8e80941Smrg		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4976b8e80941Smrg		 *   t16v0c0 ..
4977b8e80941Smrg		 * Override the buffer descriptor accordingly.
4978b8e80941Smrg		 */
4979b8e80941Smrg		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4980b8e80941Smrg		uint64_t stream_offset = 0;
4981b8e80941Smrg
4982b8e80941Smrg		for (unsigned stream = 0; stream < 4; ++stream) {
4983b8e80941Smrg			unsigned num_components;
4984b8e80941Smrg			unsigned stride;
4985b8e80941Smrg			unsigned num_records;
4986b8e80941Smrg			LLVMValueRef ring, tmp;
4987b8e80941Smrg
4988b8e80941Smrg			num_components = sel->info.num_stream_output_components[stream];
4989b8e80941Smrg			if (!num_components)
4990b8e80941Smrg				continue;
4991b8e80941Smrg
4992b8e80941Smrg			stride = 4 * num_components * sel->gs_max_out_vertices;
4993b8e80941Smrg
4994b8e80941Smrg			/* Limit on the stride field for <= CIK. */
4995b8e80941Smrg			assert(stride < (1 << 14));
4996b8e80941Smrg
4997b8e80941Smrg			num_records = 64;
4998b8e80941Smrg
4999b8e80941Smrg			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
5000b8e80941Smrg			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
5001b8e80941Smrg			tmp = LLVMBuildAdd(builder, tmp,
5002b8e80941Smrg					   LLVMConstInt(ctx->i64,
5003b8e80941Smrg							stream_offset, 0), "");
5004b8e80941Smrg			stream_offset += stride * 64;
5005b8e80941Smrg
5006b8e80941Smrg			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
5007b8e80941Smrg			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
5008b8e80941Smrg			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
5009b8e80941Smrg			tmp = LLVMBuildOr(builder, tmp,
5010b8e80941Smrg				LLVMConstInt(ctx->i32,
5011b8e80941Smrg					     S_008F04_STRIDE(stride) |
5012b8e80941Smrg					     S_008F04_SWIZZLE_ENABLE(1), 0), "");
5013b8e80941Smrg			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
5014b8e80941Smrg			ring = LLVMBuildInsertElement(builder, ring,
5015b8e80941Smrg					LLVMConstInt(ctx->i32, num_records, 0),
5016b8e80941Smrg					LLVMConstInt(ctx->i32, 2, 0), "");
5017b8e80941Smrg			ring = LLVMBuildInsertElement(builder, ring,
5018b8e80941Smrg				LLVMConstInt(ctx->i32,
5019b8e80941Smrg					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5020b8e80941Smrg					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5021b8e80941Smrg					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5022b8e80941Smrg					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
5023b8e80941Smrg					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5024b8e80941Smrg					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
5025b8e80941Smrg					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
5026b8e80941Smrg					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
5027b8e80941Smrg					     S_008F0C_ADD_TID_ENABLE(1),
5028b8e80941Smrg					     0),
5029b8e80941Smrg				LLVMConstInt(ctx->i32, 3, 0), "");
5030b8e80941Smrg
5031b8e80941Smrg			ctx->gsvs_ring[stream] = ring;
5032b8e80941Smrg		}
5033b8e80941Smrg	} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
5034b8e80941Smrg		ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
5035b8e80941Smrg	}
5036b8e80941Smrg}
5037b8e80941Smrg
5038b8e80941Smrgstatic void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5039b8e80941Smrg					 LLVMValueRef param_rw_buffers,
5040b8e80941Smrg					 unsigned param_pos_fixed_pt)
5041b8e80941Smrg{
5042b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
5043b8e80941Smrg	LLVMValueRef slot, desc, offset, row, bit, address[2];
5044b8e80941Smrg
5045b8e80941Smrg	/* Use the fixed-point gl_FragCoord input.
5046b8e80941Smrg	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5047b8e80941Smrg	 * per coordinate to get the repeating effect.
5048b8e80941Smrg	 */
5049b8e80941Smrg	address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5050b8e80941Smrg	address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5051b8e80941Smrg
5052b8e80941Smrg	/* Load the buffer descriptor. */
5053b8e80941Smrg	slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
5054b8e80941Smrg	desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
5055b8e80941Smrg
5056b8e80941Smrg	/* The stipple pattern is 32x32, each row has 32 bits. */
5057b8e80941Smrg	offset = LLVMBuildMul(builder, address[1],
5058b8e80941Smrg			      LLVMConstInt(ctx->i32, 4, 0), "");
5059b8e80941Smrg	row = buffer_load_const(ctx, desc, offset);
5060b8e80941Smrg	row = ac_to_integer(&ctx->ac, row);
5061b8e80941Smrg	bit = LLVMBuildLShr(builder, row, address[0], "");
5062b8e80941Smrg	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5063b8e80941Smrg	ac_build_kill_if_false(&ctx->ac, bit);
5064b8e80941Smrg}
5065b8e80941Smrg
5066b8e80941Smrgvoid si_shader_binary_read_config(struct ac_shader_binary *binary,
5067b8e80941Smrg				  struct si_shader_config *conf,
5068b8e80941Smrg				  unsigned symbol_offset)
5069b8e80941Smrg{
5070b8e80941Smrg	unsigned i;
5071b8e80941Smrg	const unsigned char *config =
5072b8e80941Smrg		ac_shader_binary_config_start(binary, symbol_offset);
5073b8e80941Smrg	bool really_needs_scratch = false;
5074b8e80941Smrg
5075b8e80941Smrg	/* LLVM adds SGPR spills to the scratch size.
5076b8e80941Smrg	 * Find out if we really need the scratch buffer.
5077b8e80941Smrg	 */
5078b8e80941Smrg	for (i = 0; i < binary->reloc_count; i++) {
5079b8e80941Smrg		const struct ac_shader_reloc *reloc = &binary->relocs[i];
5080b8e80941Smrg
5081b8e80941Smrg		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5082b8e80941Smrg		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5083b8e80941Smrg			really_needs_scratch = true;
5084b8e80941Smrg			break;
5085b8e80941Smrg		}
5086b8e80941Smrg	}
5087b8e80941Smrg
5088b8e80941Smrg	/* XXX: We may be able to emit some of these values directly rather than
5089b8e80941Smrg	 * extracting fields to be emitted later.
5090b8e80941Smrg	 */
5091b8e80941Smrg
5092b8e80941Smrg	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5093b8e80941Smrg		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5094b8e80941Smrg		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5095b8e80941Smrg		switch (reg) {
5096b8e80941Smrg		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5097b8e80941Smrg		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5098b8e80941Smrg		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5099b8e80941Smrg		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
5100b8e80941Smrg		case R_00B848_COMPUTE_PGM_RSRC1:
5101b8e80941Smrg			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5102b8e80941Smrg			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5103b8e80941Smrg			conf->float_mode =  G_00B028_FLOAT_MODE(value);
5104b8e80941Smrg			conf->rsrc1 = value;
5105b8e80941Smrg			break;
5106b8e80941Smrg		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5107b8e80941Smrg			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5108b8e80941Smrg			break;
5109b8e80941Smrg		case R_00B84C_COMPUTE_PGM_RSRC2:
5110b8e80941Smrg			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5111b8e80941Smrg			conf->rsrc2 = value;
5112b8e80941Smrg			break;
5113b8e80941Smrg		case R_0286CC_SPI_PS_INPUT_ENA:
5114b8e80941Smrg			conf->spi_ps_input_ena = value;
5115b8e80941Smrg			break;
5116b8e80941Smrg		case R_0286D0_SPI_PS_INPUT_ADDR:
5117b8e80941Smrg			conf->spi_ps_input_addr = value;
5118b8e80941Smrg			break;
5119b8e80941Smrg		case R_0286E8_SPI_TMPRING_SIZE:
5120b8e80941Smrg		case R_00B860_COMPUTE_TMPRING_SIZE:
5121b8e80941Smrg			/* WAVESIZE is in units of 256 dwords. */
5122b8e80941Smrg			if (really_needs_scratch)
5123b8e80941Smrg				conf->scratch_bytes_per_wave =
5124b8e80941Smrg					G_00B860_WAVESIZE(value) * 256 * 4;
5125b8e80941Smrg			break;
5126b8e80941Smrg		case 0x4: /* SPILLED_SGPRS */
5127b8e80941Smrg			conf->spilled_sgprs = value;
5128b8e80941Smrg			break;
5129b8e80941Smrg		case 0x8: /* SPILLED_VGPRS */
5130b8e80941Smrg			conf->spilled_vgprs = value;
5131b8e80941Smrg			break;
5132b8e80941Smrg		default:
5133b8e80941Smrg			{
5134b8e80941Smrg				static bool printed;
5135b8e80941Smrg
5136b8e80941Smrg				if (!printed) {
5137b8e80941Smrg					fprintf(stderr, "Warning: LLVM emitted unknown "
5138b8e80941Smrg						"config register: 0x%x\n", reg);
5139b8e80941Smrg					printed = true;
5140b8e80941Smrg				}
5141b8e80941Smrg			}
5142b8e80941Smrg			break;
5143b8e80941Smrg		}
5144b8e80941Smrg	}
5145b8e80941Smrg
5146b8e80941Smrg	if (!conf->spi_ps_input_addr)
5147b8e80941Smrg		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5148b8e80941Smrg}
5149b8e80941Smrg
5150b8e80941Smrgvoid si_shader_apply_scratch_relocs(struct si_shader *shader,
5151b8e80941Smrg				    uint64_t scratch_va)
5152b8e80941Smrg{
5153b8e80941Smrg	unsigned i;
5154b8e80941Smrg	uint32_t scratch_rsrc_dword0 = scratch_va;
5155b8e80941Smrg	uint32_t scratch_rsrc_dword1 =
5156b8e80941Smrg		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5157b8e80941Smrg
5158b8e80941Smrg	/* Enable scratch coalescing. */
5159b8e80941Smrg	scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5160b8e80941Smrg
5161b8e80941Smrg	for (i = 0 ; i < shader->binary.reloc_count; i++) {
5162b8e80941Smrg		const struct ac_shader_reloc *reloc =
5163b8e80941Smrg					&shader->binary.relocs[i];
5164b8e80941Smrg		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5165b8e80941Smrg			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5166b8e80941Smrg			&scratch_rsrc_dword0, 4);
5167b8e80941Smrg		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5168b8e80941Smrg			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5169b8e80941Smrg			&scratch_rsrc_dword1, 4);
5170b8e80941Smrg		}
5171b8e80941Smrg	}
5172b8e80941Smrg}
5173b8e80941Smrg
5174b8e80941Smrg/* For the UMR disassembler. */
5175b8e80941Smrg#define DEBUGGER_END_OF_CODE_MARKER	0xbf9f0000 /* invalid instruction */
5176b8e80941Smrg#define DEBUGGER_NUM_MARKERS		5
5177b8e80941Smrg
5178b8e80941Smrgstatic unsigned si_get_shader_binary_size(const struct si_shader *shader)
5179b8e80941Smrg{
5180b8e80941Smrg	unsigned size = shader->binary.code_size;
5181b8e80941Smrg
5182b8e80941Smrg	if (shader->prolog)
5183b8e80941Smrg		size += shader->prolog->binary.code_size;
5184b8e80941Smrg	if (shader->previous_stage)
5185b8e80941Smrg		size += shader->previous_stage->binary.code_size;
5186b8e80941Smrg	if (shader->prolog2)
5187b8e80941Smrg		size += shader->prolog2->binary.code_size;
5188b8e80941Smrg	if (shader->epilog)
5189b8e80941Smrg		size += shader->epilog->binary.code_size;
5190b8e80941Smrg	return size + DEBUGGER_NUM_MARKERS * 4;
5191b8e80941Smrg}
5192b8e80941Smrg
5193b8e80941Smrgint si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5194b8e80941Smrg{
5195b8e80941Smrg	const struct ac_shader_binary *prolog =
5196b8e80941Smrg		shader->prolog ? &shader->prolog->binary : NULL;
5197b8e80941Smrg	const struct ac_shader_binary *previous_stage =
5198b8e80941Smrg		shader->previous_stage ? &shader->previous_stage->binary : NULL;
5199b8e80941Smrg	const struct ac_shader_binary *prolog2 =
5200b8e80941Smrg		shader->prolog2 ? &shader->prolog2->binary : NULL;
5201b8e80941Smrg	const struct ac_shader_binary *epilog =
5202b8e80941Smrg		shader->epilog ? &shader->epilog->binary : NULL;
5203b8e80941Smrg	const struct ac_shader_binary *mainb = &shader->binary;
5204b8e80941Smrg	unsigned bo_size = si_get_shader_binary_size(shader) +
5205b8e80941Smrg			   (!epilog ? mainb->rodata_size : 0);
5206b8e80941Smrg	unsigned char *ptr;
5207b8e80941Smrg
5208b8e80941Smrg	assert(!prolog || !prolog->rodata_size);
5209b8e80941Smrg	assert(!previous_stage || !previous_stage->rodata_size);
5210b8e80941Smrg	assert(!prolog2 || !prolog2->rodata_size);
5211b8e80941Smrg	assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5212b8e80941Smrg	       !mainb->rodata_size);
5213b8e80941Smrg	assert(!epilog || !epilog->rodata_size);
5214b8e80941Smrg
5215b8e80941Smrg	si_resource_reference(&shader->bo, NULL);
5216b8e80941Smrg	shader->bo = si_aligned_buffer_create(&sscreen->b,
5217b8e80941Smrg					      sscreen->cpdma_prefetch_writes_memory ?
5218b8e80941Smrg						0 : SI_RESOURCE_FLAG_READ_ONLY,
5219b8e80941Smrg                                              PIPE_USAGE_IMMUTABLE,
5220b8e80941Smrg                                              align(bo_size, SI_CPDMA_ALIGNMENT),
5221b8e80941Smrg                                              256);
5222b8e80941Smrg	if (!shader->bo)
5223b8e80941Smrg		return -ENOMEM;
5224b8e80941Smrg
5225b8e80941Smrg	/* Upload. */
5226b8e80941Smrg	ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5227b8e80941Smrg					PIPE_TRANSFER_READ_WRITE |
5228b8e80941Smrg					PIPE_TRANSFER_UNSYNCHRONIZED |
5229b8e80941Smrg					RADEON_TRANSFER_TEMPORARY);
5230b8e80941Smrg
5231b8e80941Smrg	/* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5232b8e80941Smrg	 * endian-independent. */
5233b8e80941Smrg	if (prolog) {
5234b8e80941Smrg		memcpy(ptr, prolog->code, prolog->code_size);
5235b8e80941Smrg		ptr += prolog->code_size;
5236b8e80941Smrg	}
5237b8e80941Smrg	if (previous_stage) {
5238b8e80941Smrg		memcpy(ptr, previous_stage->code, previous_stage->code_size);
5239b8e80941Smrg		ptr += previous_stage->code_size;
5240b8e80941Smrg	}
5241b8e80941Smrg	if (prolog2) {
5242b8e80941Smrg		memcpy(ptr, prolog2->code, prolog2->code_size);
5243b8e80941Smrg		ptr += prolog2->code_size;
5244b8e80941Smrg	}
5245b8e80941Smrg
5246b8e80941Smrg	memcpy(ptr, mainb->code, mainb->code_size);
5247b8e80941Smrg	ptr += mainb->code_size;
5248b8e80941Smrg
5249b8e80941Smrg	if (epilog) {
5250b8e80941Smrg		memcpy(ptr, epilog->code, epilog->code_size);
5251b8e80941Smrg		ptr += epilog->code_size;
5252b8e80941Smrg	} else if (mainb->rodata_size > 0) {
5253b8e80941Smrg		memcpy(ptr, mainb->rodata, mainb->rodata_size);
5254b8e80941Smrg		ptr += mainb->rodata_size;
5255b8e80941Smrg	}
5256b8e80941Smrg
5257b8e80941Smrg	/* Add end-of-code markers for the UMR disassembler. */
5258b8e80941Smrg	uint32_t *ptr32 = (uint32_t*)ptr;
5259b8e80941Smrg	for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
5260b8e80941Smrg		ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
5261b8e80941Smrg
5262b8e80941Smrg	sscreen->ws->buffer_unmap(shader->bo->buf);
5263b8e80941Smrg	return 0;
5264b8e80941Smrg}
5265b8e80941Smrg
5266b8e80941Smrgstatic void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5267b8e80941Smrg				       struct pipe_debug_callback *debug,
5268b8e80941Smrg				       const char *name, FILE *file)
5269b8e80941Smrg{
5270b8e80941Smrg	char *line, *p;
5271b8e80941Smrg	unsigned i, count;
5272b8e80941Smrg
5273b8e80941Smrg	if (binary->disasm_string) {
5274b8e80941Smrg		fprintf(file, "Shader %s disassembly:\n", name);
5275b8e80941Smrg		fprintf(file, "%s", binary->disasm_string);
5276b8e80941Smrg
5277b8e80941Smrg		if (debug && debug->debug_message) {
5278b8e80941Smrg			/* Very long debug messages are cut off, so send the
5279b8e80941Smrg			 * disassembly one line at a time. This causes more
5280b8e80941Smrg			 * overhead, but on the plus side it simplifies
5281b8e80941Smrg			 * parsing of resulting logs.
5282b8e80941Smrg			 */
5283b8e80941Smrg			pipe_debug_message(debug, SHADER_INFO,
5284b8e80941Smrg					   "Shader Disassembly Begin");
5285b8e80941Smrg
5286b8e80941Smrg			line = binary->disasm_string;
5287b8e80941Smrg			while (*line) {
5288b8e80941Smrg				p = util_strchrnul(line, '\n');
5289b8e80941Smrg				count = p - line;
5290b8e80941Smrg
5291b8e80941Smrg				if (count) {
5292b8e80941Smrg					pipe_debug_message(debug, SHADER_INFO,
5293b8e80941Smrg							   "%.*s", count, line);
5294b8e80941Smrg				}
5295b8e80941Smrg
5296b8e80941Smrg				if (!*p)
5297b8e80941Smrg					break;
5298b8e80941Smrg				line = p + 1;
5299b8e80941Smrg			}
5300b8e80941Smrg
5301b8e80941Smrg			pipe_debug_message(debug, SHADER_INFO,
5302b8e80941Smrg					   "Shader Disassembly End");
5303b8e80941Smrg		}
5304b8e80941Smrg	} else {
5305b8e80941Smrg		fprintf(file, "Shader %s binary:\n", name);
5306b8e80941Smrg		for (i = 0; i < binary->code_size; i += 4) {
5307b8e80941Smrg			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5308b8e80941Smrg				binary->code[i + 3], binary->code[i + 2],
5309b8e80941Smrg				binary->code[i + 1], binary->code[i]);
5310b8e80941Smrg		}
5311b8e80941Smrg	}
5312b8e80941Smrg}
5313b8e80941Smrg
5314b8e80941Smrgstatic void si_calculate_max_simd_waves(struct si_shader *shader)
5315b8e80941Smrg{
5316b8e80941Smrg	struct si_screen *sscreen = shader->selector->screen;
5317b8e80941Smrg	struct si_shader_config *conf = &shader->config;
5318b8e80941Smrg	unsigned num_inputs = shader->selector->info.num_inputs;
5319b8e80941Smrg	unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
5320b8e80941Smrg	unsigned lds_per_wave = 0;
5321b8e80941Smrg	unsigned max_simd_waves;
5322b8e80941Smrg
5323b8e80941Smrg	max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
5324b8e80941Smrg
5325b8e80941Smrg	/* Compute LDS usage for PS. */
5326b8e80941Smrg	switch (shader->selector->type) {
5327b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
5328b8e80941Smrg		/* The minimum usage per wave is (num_inputs * 48). The maximum
5329b8e80941Smrg		 * usage is (num_inputs * 48 * 16).
5330b8e80941Smrg		 * We can get anything in between and it varies between waves.
5331b8e80941Smrg		 *
5332b8e80941Smrg		 * The 48 bytes per input for a single primitive is equal to
5333b8e80941Smrg		 * 4 bytes/component * 4 components/input * 3 points.
5334b8e80941Smrg		 *
5335b8e80941Smrg		 * Other stages don't know the size at compile time or don't
5336b8e80941Smrg		 * allocate LDS per wave, but instead they do it per thread group.
5337b8e80941Smrg		 */
5338b8e80941Smrg		lds_per_wave = conf->lds_size * lds_increment +
5339b8e80941Smrg			       align(num_inputs * 48, lds_increment);
5340b8e80941Smrg		break;
5341b8e80941Smrg	case PIPE_SHADER_COMPUTE:
5342b8e80941Smrg		if (shader->selector) {
5343b8e80941Smrg			unsigned max_workgroup_size =
5344b8e80941Smrg				si_get_max_workgroup_size(shader);
5345b8e80941Smrg			lds_per_wave = (conf->lds_size * lds_increment) /
5346b8e80941Smrg				       DIV_ROUND_UP(max_workgroup_size, 64);
5347b8e80941Smrg		}
5348b8e80941Smrg		break;
5349b8e80941Smrg	}
5350b8e80941Smrg
5351b8e80941Smrg	/* Compute the per-SIMD wave counts. */
5352b8e80941Smrg	if (conf->num_sgprs) {
5353b8e80941Smrg		max_simd_waves =
5354b8e80941Smrg			MIN2(max_simd_waves,
5355b8e80941Smrg			     ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs);
5356b8e80941Smrg	}
5357b8e80941Smrg
5358b8e80941Smrg	if (conf->num_vgprs)
5359b8e80941Smrg		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5360b8e80941Smrg
5361b8e80941Smrg	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5362b8e80941Smrg	 * 16KB makes some SIMDs unoccupied). */
5363b8e80941Smrg	if (lds_per_wave)
5364b8e80941Smrg		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5365b8e80941Smrg
5366b8e80941Smrg	conf->max_simd_waves = max_simd_waves;
5367b8e80941Smrg}
5368b8e80941Smrg
5369b8e80941Smrgvoid si_shader_dump_stats_for_shader_db(const struct si_shader *shader,
5370b8e80941Smrg					struct pipe_debug_callback *debug)
5371b8e80941Smrg{
5372b8e80941Smrg	const struct si_shader_config *conf = &shader->config;
5373b8e80941Smrg
5374b8e80941Smrg	pipe_debug_message(debug, SHADER_INFO,
5375b8e80941Smrg			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5376b8e80941Smrg			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5377b8e80941Smrg			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
5378b8e80941Smrg			   conf->num_sgprs, conf->num_vgprs,
5379b8e80941Smrg			   si_get_shader_binary_size(shader),
5380b8e80941Smrg			   conf->lds_size, conf->scratch_bytes_per_wave,
5381b8e80941Smrg			   conf->max_simd_waves, conf->spilled_sgprs,
5382b8e80941Smrg			   conf->spilled_vgprs, conf->private_mem_vgprs);
5383b8e80941Smrg}
5384b8e80941Smrg
5385b8e80941Smrgstatic void si_shader_dump_stats(struct si_screen *sscreen,
5386b8e80941Smrg				 const struct si_shader *shader,
5387b8e80941Smrg			         unsigned processor,
5388b8e80941Smrg				 FILE *file,
5389b8e80941Smrg				 bool check_debug_option)
5390b8e80941Smrg{
5391b8e80941Smrg	const struct si_shader_config *conf = &shader->config;
5392b8e80941Smrg
5393b8e80941Smrg	if (!check_debug_option ||
5394b8e80941Smrg	    si_can_dump_shader(sscreen, processor)) {
5395b8e80941Smrg		if (processor == PIPE_SHADER_FRAGMENT) {
5396b8e80941Smrg			fprintf(file, "*** SHADER CONFIG ***\n"
5397b8e80941Smrg				"SPI_PS_INPUT_ADDR = 0x%04x\n"
5398b8e80941Smrg				"SPI_PS_INPUT_ENA  = 0x%04x\n",
5399b8e80941Smrg				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5400b8e80941Smrg		}
5401b8e80941Smrg
5402b8e80941Smrg		fprintf(file, "*** SHADER STATS ***\n"
5403b8e80941Smrg			"SGPRS: %d\n"
5404b8e80941Smrg			"VGPRS: %d\n"
5405b8e80941Smrg		        "Spilled SGPRs: %d\n"
5406b8e80941Smrg			"Spilled VGPRs: %d\n"
5407b8e80941Smrg			"Private memory VGPRs: %d\n"
5408b8e80941Smrg			"Code Size: %d bytes\n"
5409b8e80941Smrg			"LDS: %d blocks\n"
5410b8e80941Smrg			"Scratch: %d bytes per wave\n"
5411b8e80941Smrg			"Max Waves: %d\n"
5412b8e80941Smrg			"********************\n\n\n",
5413b8e80941Smrg			conf->num_sgprs, conf->num_vgprs,
5414b8e80941Smrg			conf->spilled_sgprs, conf->spilled_vgprs,
5415b8e80941Smrg			conf->private_mem_vgprs,
5416b8e80941Smrg			si_get_shader_binary_size(shader),
5417b8e80941Smrg			conf->lds_size, conf->scratch_bytes_per_wave,
5418b8e80941Smrg			conf->max_simd_waves);
5419b8e80941Smrg	}
5420b8e80941Smrg}
5421b8e80941Smrg
5422b8e80941Smrgconst char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5423b8e80941Smrg{
5424b8e80941Smrg	switch (processor) {
5425b8e80941Smrg	case PIPE_SHADER_VERTEX:
5426b8e80941Smrg		if (shader->key.as_es)
5427b8e80941Smrg			return "Vertex Shader as ES";
5428b8e80941Smrg		else if (shader->key.as_ls)
5429b8e80941Smrg			return "Vertex Shader as LS";
5430b8e80941Smrg		else
5431b8e80941Smrg			return "Vertex Shader as VS";
5432b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
5433b8e80941Smrg		return "Tessellation Control Shader";
5434b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
5435b8e80941Smrg		if (shader->key.as_es)
5436b8e80941Smrg			return "Tessellation Evaluation Shader as ES";
5437b8e80941Smrg		else
5438b8e80941Smrg			return "Tessellation Evaluation Shader as VS";
5439b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
5440b8e80941Smrg		if (shader->is_gs_copy_shader)
5441b8e80941Smrg			return "GS Copy Shader as VS";
5442b8e80941Smrg		else
5443b8e80941Smrg			return "Geometry Shader";
5444b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
5445b8e80941Smrg		return "Pixel Shader";
5446b8e80941Smrg	case PIPE_SHADER_COMPUTE:
5447b8e80941Smrg		return "Compute Shader";
5448b8e80941Smrg	default:
5449b8e80941Smrg		return "Unknown Shader";
5450b8e80941Smrg	}
5451b8e80941Smrg}
5452b8e80941Smrg
5453b8e80941Smrgvoid si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5454b8e80941Smrg		    struct pipe_debug_callback *debug, unsigned processor,
5455b8e80941Smrg		    FILE *file, bool check_debug_option)
5456b8e80941Smrg{
5457b8e80941Smrg	if (!check_debug_option ||
5458b8e80941Smrg	    si_can_dump_shader(sscreen, processor))
5459b8e80941Smrg		si_dump_shader_key(processor, shader, file);
5460b8e80941Smrg
5461b8e80941Smrg	if (!check_debug_option && shader->binary.llvm_ir_string) {
5462b8e80941Smrg		if (shader->previous_stage &&
5463b8e80941Smrg		    shader->previous_stage->binary.llvm_ir_string) {
5464b8e80941Smrg			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5465b8e80941Smrg				si_get_shader_name(shader, processor));
5466b8e80941Smrg			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5467b8e80941Smrg		}
5468b8e80941Smrg
5469b8e80941Smrg		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5470b8e80941Smrg			si_get_shader_name(shader, processor));
5471b8e80941Smrg		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5472b8e80941Smrg	}
5473b8e80941Smrg
5474b8e80941Smrg	if (!check_debug_option ||
5475b8e80941Smrg	    (si_can_dump_shader(sscreen, processor) &&
5476b8e80941Smrg	     !(sscreen->debug_flags & DBG(NO_ASM)))) {
5477b8e80941Smrg		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5478b8e80941Smrg
5479b8e80941Smrg		if (shader->prolog)
5480b8e80941Smrg			si_shader_dump_disassembly(&shader->prolog->binary,
5481b8e80941Smrg						   debug, "prolog", file);
5482b8e80941Smrg		if (shader->previous_stage)
5483b8e80941Smrg			si_shader_dump_disassembly(&shader->previous_stage->binary,
5484b8e80941Smrg						   debug, "previous stage", file);
5485b8e80941Smrg		if (shader->prolog2)
5486b8e80941Smrg			si_shader_dump_disassembly(&shader->prolog2->binary,
5487b8e80941Smrg						   debug, "prolog2", file);
5488b8e80941Smrg
5489b8e80941Smrg		si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5490b8e80941Smrg
5491b8e80941Smrg		if (shader->epilog)
5492b8e80941Smrg			si_shader_dump_disassembly(&shader->epilog->binary,
5493b8e80941Smrg						   debug, "epilog", file);
5494b8e80941Smrg		fprintf(file, "\n");
5495b8e80941Smrg	}
5496b8e80941Smrg
5497b8e80941Smrg	si_shader_dump_stats(sscreen, shader, processor, file,
5498b8e80941Smrg			     check_debug_option);
5499b8e80941Smrg}
5500b8e80941Smrg
5501b8e80941Smrgstatic int si_compile_llvm(struct si_screen *sscreen,
5502b8e80941Smrg			   struct ac_shader_binary *binary,
5503b8e80941Smrg			   struct si_shader_config *conf,
5504b8e80941Smrg			   struct ac_llvm_compiler *compiler,
5505b8e80941Smrg			   LLVMModuleRef mod,
5506b8e80941Smrg			   struct pipe_debug_callback *debug,
5507b8e80941Smrg			   unsigned processor,
5508b8e80941Smrg			   const char *name,
5509b8e80941Smrg			   bool less_optimized)
5510b8e80941Smrg{
5511b8e80941Smrg	int r = 0;
5512b8e80941Smrg	unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5513b8e80941Smrg
5514b8e80941Smrg	if (si_can_dump_shader(sscreen, processor)) {
5515b8e80941Smrg		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5516b8e80941Smrg
5517b8e80941Smrg		if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5518b8e80941Smrg			fprintf(stderr, "%s LLVM IR:\n\n", name);
5519b8e80941Smrg			ac_dump_module(mod);
5520b8e80941Smrg			fprintf(stderr, "\n");
5521b8e80941Smrg		}
5522b8e80941Smrg	}
5523b8e80941Smrg
5524b8e80941Smrg	if (sscreen->record_llvm_ir) {
5525b8e80941Smrg		char *ir = LLVMPrintModuleToString(mod);
5526b8e80941Smrg		binary->llvm_ir_string = strdup(ir);
5527b8e80941Smrg		LLVMDisposeMessage(ir);
5528b8e80941Smrg	}
5529b8e80941Smrg
5530b8e80941Smrg	if (!si_replace_shader(count, binary)) {
5531b8e80941Smrg		r = si_llvm_compile(mod, binary, compiler, debug,
5532b8e80941Smrg				    less_optimized);
5533b8e80941Smrg		if (r)
5534b8e80941Smrg			return r;
5535b8e80941Smrg	}
5536b8e80941Smrg
5537b8e80941Smrg	si_shader_binary_read_config(binary, conf, 0);
5538b8e80941Smrg
5539b8e80941Smrg	/* Enable 64-bit and 16-bit denormals, because there is no performance
5540b8e80941Smrg	 * cost.
5541b8e80941Smrg	 *
5542b8e80941Smrg	 * If denormals are enabled, all floating-point output modifiers are
5543b8e80941Smrg	 * ignored.
5544b8e80941Smrg	 *
5545b8e80941Smrg	 * Don't enable denormals for 32-bit floats, because:
5546b8e80941Smrg	 * - Floating-point output modifiers would be ignored by the hw.
5547b8e80941Smrg	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5548b8e80941Smrg	 *   have to stop using those.
5549b8e80941Smrg	 * - SI & CI would be very slow.
5550b8e80941Smrg	 */
5551b8e80941Smrg	conf->float_mode |= V_00B028_FP_64_DENORMS;
5552b8e80941Smrg
5553b8e80941Smrg	FREE(binary->config);
5554b8e80941Smrg	FREE(binary->global_symbol_offsets);
5555b8e80941Smrg	binary->config = NULL;
5556b8e80941Smrg	binary->global_symbol_offsets = NULL;
5557b8e80941Smrg
5558b8e80941Smrg	/* Some shaders can't have rodata because their binaries can be
5559b8e80941Smrg	 * concatenated.
5560b8e80941Smrg	 */
5561b8e80941Smrg	if (binary->rodata_size &&
5562b8e80941Smrg	    (processor == PIPE_SHADER_VERTEX ||
5563b8e80941Smrg	     processor == PIPE_SHADER_TESS_CTRL ||
5564b8e80941Smrg	     processor == PIPE_SHADER_TESS_EVAL ||
5565b8e80941Smrg	     processor == PIPE_SHADER_FRAGMENT)) {
5566b8e80941Smrg		fprintf(stderr, "radeonsi: The shader can't have rodata.");
5567b8e80941Smrg		return -EINVAL;
5568b8e80941Smrg	}
5569b8e80941Smrg
5570b8e80941Smrg	return r;
5571b8e80941Smrg}
5572b8e80941Smrg
5573b8e80941Smrgstatic void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5574b8e80941Smrg{
5575b8e80941Smrg	if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5576b8e80941Smrg		LLVMBuildRetVoid(ctx->ac.builder);
5577b8e80941Smrg	else
5578b8e80941Smrg		LLVMBuildRet(ctx->ac.builder, ret);
5579b8e80941Smrg}
5580b8e80941Smrg
/* Generate code for the hardware VS shader stage to go with a geometry shader
 *
 * The generated shader reads the geometry shader's outputs back from the
 * GSVS ring and, per vertex stream, performs streamout and (for stream 0
 * only) the VS exports.
 *
 * \param sscreen      screen used for compilation, dumping and upload
 * \param compiler     LLVM compiler handed to the shader context
 * \param gs_selector  selector of the geometry shader this copy shader
 *                     belongs to; its info and streamout state drive the
 *                     generated code
 * \param debug        debug callback passed to compile and dump helpers
 *
 * \return a newly allocated si_shader marked as GS copy shader, or NULL if
 *         allocation, compilation or upload failed.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   struct ac_llvm_compiler *compiler,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	LLVMBuilderRef builder;
	struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;


	shader = CALLOC_STRUCT(si_shader);
	if (!shader)
		return NULL;

	/* We can leave the fence as permanently signaled because the GS copy
	 * shader only becomes visible globally after it has been compiled. */
	util_queue_fence_init(&shader->ready);

	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader is built as a vertex shader; ctx.type is switched
	 * to GEOMETRY further down, just before optimization and compilation.
	 */
	si_init_shader_ctx(&ctx, sscreen, compiler);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = ctx.ac.builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* vertex_id * 4; used as the voffset of every ring load below
	 * (presumably 4 bytes per dword - confirm against the ring layout).
	 */
	LLVMValueRef voffset =
		LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
			     LLVMConstInt(ctx.i32, 4, 0), "");

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	/* Without streamout outputs only stream 0 is processed, so the ID is
	 * the constant 0; otherwise it is unpacked from the streamout config
	 * parameter (arguments 24, 2 - presumably a 2-bit field at bit 24).
	 */
	if (gs_selector->so.num_outputs)
		stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* output_streams packs one 2-bit stream index per channel. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	/* Switch on the stream ID. "end" is both the default target and the
	 * block every per-stream case branches to when it is done.
	 */
	end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		/* Streams without any output components get no case. */
		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams other than 0 only matter when streamout is used. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		/* 'offset' counts the channels loaded so far for this stream;
		 * channels that are unused or belong to another stream are
		 * set to undef and do not advance it.
		 */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is clamped and exported. */
		if (stream == 0) {
			/* Vertex color clamping.
			 *
			 * This uses a state constant loaded in a user data SGPR and
			 * an IF statement is added that clamps all colors if the constant
			 * is true.
			 */
			struct lp_build_if_state if_ctx;
			LLVMValueRef v[2], cond = NULL;
			LLVMBasicBlockRef blocks[2];

			/* NOTE(review): this 'i' shadows the function-level
			 * 'int i' declared at the top.
			 */
			for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
				if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
				    gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
					continue;

				/* We've found a color. */
				/* The IF is opened lazily, once, on the first
				 * color output; 'cond' doubles as the "IF is
				 * open" flag checked after the loop.
				 */
				if (!cond) {
					/* The state is in the first bit of the user SGPR. */
					cond = LLVMGetParam(ctx.main_fn,
							    ctx.param_vs_state_bits);
					cond = LLVMBuildTrunc(ctx.ac.builder, cond,
							      ctx.i1, "");
					lp_build_if(&if_ctx, &ctx.gallivm, cond);
					/* Remember blocks for Phi. */
					blocks[0] = if_ctx.true_block;
					blocks[1] = if_ctx.entry_block;
				}

				for (unsigned j = 0; j < 4; j++) {
					/* Insert clamp into the true block. */
					v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]);
					v[1] = outputs[i].values[j];

					/* Insert Phi into the endif block. */
					/* The builder is moved back to the true
					 * block afterwards so the next clamp
					 * lands there again.
					 */
					LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block);
					outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks);
					LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block);
				}
			}
			if (cond)
				lp_build_endif(&if_ctx);

			si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
		}

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(ctx.ac.builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.compiler,
			    ctx.ac.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader", false);
	/* Dump and upload only after a successful compile; an upload failure
	 * also counts as failure of the whole function.
	 */
	if (!r) {
		if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	/* On failure only the si_shader structure itself is freed here. */
	if (r != 0) {
		FREE(shader);
		shader = NULL;
	} else {
		si_fix_resource_usage(sscreen, shader);
	}
	return shader;
}
5768b8e80941Smrg
5769b8e80941Smrgstatic void si_dump_shader_key_vs(const struct si_shader_key *key,
5770b8e80941Smrg				  const struct si_vs_prolog_bits *prolog,
5771b8e80941Smrg				  const char *prefix, FILE *f)
5772b8e80941Smrg{
5773b8e80941Smrg	fprintf(f, "  %s.instance_divisor_is_one = %u\n",
5774b8e80941Smrg		prefix, prolog->instance_divisor_is_one);
5775b8e80941Smrg	fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
5776b8e80941Smrg		prefix, prolog->instance_divisor_is_fetched);
5777b8e80941Smrg	fprintf(f, "  %s.ls_vgpr_fix = %u\n",
5778b8e80941Smrg		prefix, prolog->ls_vgpr_fix);
5779b8e80941Smrg
5780b8e80941Smrg	fprintf(f, "  mono.vs.fix_fetch = {");
5781b8e80941Smrg	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5782b8e80941Smrg		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5783b8e80941Smrg	fprintf(f, "}\n");
5784b8e80941Smrg}
5785b8e80941Smrg
5786b8e80941Smrgstatic void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5787b8e80941Smrg			       FILE *f)
5788b8e80941Smrg{
5789b8e80941Smrg	const struct si_shader_key *key = &shader->key;
5790b8e80941Smrg
5791b8e80941Smrg	fprintf(f, "SHADER KEY\n");
5792b8e80941Smrg
5793b8e80941Smrg	switch (processor) {
5794b8e80941Smrg	case PIPE_SHADER_VERTEX:
5795b8e80941Smrg		si_dump_shader_key_vs(key, &key->part.vs.prolog,
5796b8e80941Smrg				      "part.vs.prolog", f);
5797b8e80941Smrg		fprintf(f, "  as_es = %u\n", key->as_es);
5798b8e80941Smrg		fprintf(f, "  as_ls = %u\n", key->as_ls);
5799b8e80941Smrg		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5800b8e80941Smrg			key->mono.u.vs_export_prim_id);
5801b8e80941Smrg		break;
5802b8e80941Smrg
5803b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
5804b8e80941Smrg		if (shader->selector->screen->info.chip_class >= GFX9) {
5805b8e80941Smrg			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5806b8e80941Smrg					      "part.tcs.ls_prolog", f);
5807b8e80941Smrg		}
5808b8e80941Smrg		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5809b8e80941Smrg		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5810b8e80941Smrg		break;
5811b8e80941Smrg
5812b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
5813b8e80941Smrg		fprintf(f, "  as_es = %u\n", key->as_es);
5814b8e80941Smrg		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5815b8e80941Smrg			key->mono.u.vs_export_prim_id);
5816b8e80941Smrg		break;
5817b8e80941Smrg
5818b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
5819b8e80941Smrg		if (shader->is_gs_copy_shader)
5820b8e80941Smrg			break;
5821b8e80941Smrg
5822b8e80941Smrg		if (shader->selector->screen->info.chip_class >= GFX9 &&
5823b8e80941Smrg		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5824b8e80941Smrg			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5825b8e80941Smrg					      "part.gs.vs_prolog", f);
5826b8e80941Smrg		}
5827b8e80941Smrg		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5828b8e80941Smrg		break;
5829b8e80941Smrg
5830b8e80941Smrg	case PIPE_SHADER_COMPUTE:
5831b8e80941Smrg		break;
5832848b8605Smrg
5833b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
5834b8e80941Smrg		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5835b8e80941Smrg		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5836b8e80941Smrg		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5837b8e80941Smrg		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5838b8e80941Smrg		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5839b8e80941Smrg		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5840b8e80941Smrg		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5841b8e80941Smrg		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5842b8e80941Smrg		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5843b8e80941Smrg		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5844b8e80941Smrg		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5845b8e80941Smrg		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5846b8e80941Smrg		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5847b8e80941Smrg		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5848b8e80941Smrg		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5849b8e80941Smrg		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5850b8e80941Smrg		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5851b8e80941Smrg		break;
5852848b8605Smrg
5853b8e80941Smrg	default:
5854b8e80941Smrg		assert(0);
5855b8e80941Smrg	}
5856b8e80941Smrg
5857b8e80941Smrg	if ((processor == PIPE_SHADER_GEOMETRY ||
5858b8e80941Smrg	     processor == PIPE_SHADER_TESS_EVAL ||
5859b8e80941Smrg	     processor == PIPE_SHADER_VERTEX) &&
5860b8e80941Smrg	    !key->as_es && !key->as_ls) {
5861b8e80941Smrg		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5862b8e80941Smrg		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
5863848b8605Smrg	}
5864848b8605Smrg}
5865848b8605Smrg
5866b8e80941Smrgstatic void si_init_shader_ctx(struct si_shader_context *ctx,
5867b8e80941Smrg			       struct si_screen *sscreen,
5868b8e80941Smrg			       struct ac_llvm_compiler *compiler)
5869848b8605Smrg{
5870b8e80941Smrg	struct lp_build_tgsi_context *bld_base;
5871848b8605Smrg
5872b8e80941Smrg	si_llvm_context_init(ctx, sscreen, compiler);
5873848b8605Smrg
5874b8e80941Smrg	bld_base = &ctx->bld_base;
5875b8e80941Smrg	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5876848b8605Smrg
5877b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic;
5878b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic;
5879b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic;
5880848b8605Smrg
5881b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5882848b8605Smrg
5883b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5884848b8605Smrg
5885b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5886b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5887b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5888b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5889b8e80941Smrg
5890b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5891b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5892b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5893b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5894b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5895b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5896b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5897b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5898b8e80941Smrg
5899b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
5900b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive;
5901b8e80941Smrg	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5902b8e80941Smrg}
5903848b8605Smrg
5904b8e80941Smrgstatic void si_optimize_vs_outputs(struct si_shader_context *ctx)
5905b8e80941Smrg{
5906b8e80941Smrg	struct si_shader *shader = ctx->shader;
5907b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
5908848b8605Smrg
5909b8e80941Smrg	if ((ctx->type != PIPE_SHADER_VERTEX &&
5910b8e80941Smrg	     ctx->type != PIPE_SHADER_TESS_EVAL) ||
5911b8e80941Smrg	    shader->key.as_ls ||
5912b8e80941Smrg	    shader->key.as_es)
5913b8e80941Smrg		return;
5914848b8605Smrg
5915b8e80941Smrg	ac_optimize_vs_outputs(&ctx->ac,
5916b8e80941Smrg			       ctx->main_fn,
5917b8e80941Smrg			       shader->info.vs_output_param_offset,
5918b8e80941Smrg			       info->num_outputs,
5919b8e80941Smrg			       &shader->info.nr_param_exports);
5920848b8605Smrg}
5921848b8605Smrg
5922b8e80941Smrgstatic void si_init_exec_from_input(struct si_shader_context *ctx,
5923b8e80941Smrg				    unsigned param, unsigned bitoffset)
5924848b8605Smrg{
5925b8e80941Smrg	LLVMValueRef args[] = {
5926b8e80941Smrg		LLVMGetParam(ctx->main_fn, param),
5927b8e80941Smrg		LLVMConstInt(ctx->i32, bitoffset, 0),
5928b8e80941Smrg	};
5929b8e80941Smrg	ac_build_intrinsic(&ctx->ac,
5930b8e80941Smrg			   "llvm.amdgcn.init.exec.from.input",
5931b8e80941Smrg			   ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
5932b8e80941Smrg}
5933848b8605Smrg
5934b8e80941Smrgstatic bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5935b8e80941Smrg			       const struct si_vs_prolog_bits *key)
5936b8e80941Smrg{
5937b8e80941Smrg	/* VGPR initialization fixup for Vega10 and Raven is always done in the
5938b8e80941Smrg	 * VS prolog. */
5939b8e80941Smrg	return sel->vs_needs_prolog || key->ls_vgpr_fix;
5940848b8605Smrg}
5941848b8605Smrg
5942b8e80941Smrgstatic bool si_compile_tgsi_main(struct si_shader_context *ctx)
5943848b8605Smrg{
5944b8e80941Smrg	struct si_shader *shader = ctx->shader;
5945b8e80941Smrg	struct si_shader_selector *sel = shader->selector;
5946b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5947b8e80941Smrg
5948b8e80941Smrg	// TODO clean all this up!
5949b8e80941Smrg	switch (ctx->type) {
5950b8e80941Smrg	case PIPE_SHADER_VERTEX:
5951b8e80941Smrg		ctx->load_input = declare_input_vs;
5952b8e80941Smrg		if (shader->key.as_ls)
5953b8e80941Smrg			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
5954b8e80941Smrg		else if (shader->key.as_es)
5955b8e80941Smrg			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5956b8e80941Smrg		else
5957b8e80941Smrg			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5958b8e80941Smrg		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5959b8e80941Smrg		ctx->abi.load_base_vertex = get_base_vertex;
5960b8e80941Smrg		break;
5961b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
5962b8e80941Smrg		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5963b8e80941Smrg		ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
5964b8e80941Smrg		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5965b8e80941Smrg		bld_base->emit_store = store_output_tcs;
5966b8e80941Smrg		ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
5967b8e80941Smrg		ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
5968b8e80941Smrg		ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
5969b8e80941Smrg		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5970b8e80941Smrg		break;
5971b8e80941Smrg	case PIPE_SHADER_TESS_EVAL:
5972b8e80941Smrg		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5973b8e80941Smrg		ctx->abi.load_tess_varyings = si_nir_load_input_tes;
5974b8e80941Smrg		ctx->abi.load_tess_coord = si_load_tess_coord;
5975b8e80941Smrg		ctx->abi.load_tess_level = si_load_tess_level;
5976b8e80941Smrg		ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
5977b8e80941Smrg		if (shader->key.as_es)
5978b8e80941Smrg			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5979b8e80941Smrg		else
5980b8e80941Smrg			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5981b8e80941Smrg		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5982b8e80941Smrg		break;
5983b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
5984b8e80941Smrg		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5985b8e80941Smrg		ctx->abi.load_inputs = si_nir_load_input_gs;
5986b8e80941Smrg		ctx->abi.emit_vertex = si_llvm_emit_vertex;
5987b8e80941Smrg		ctx->abi.emit_primitive = si_llvm_emit_primitive;
5988b8e80941Smrg		ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
5989b8e80941Smrg		bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
5990b8e80941Smrg		break;
5991b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
5992b8e80941Smrg		ctx->load_input = declare_input_fs;
5993b8e80941Smrg		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
5994b8e80941Smrg		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5995b8e80941Smrg		ctx->abi.lookup_interp_param = si_nir_lookup_interp_param;
5996b8e80941Smrg		ctx->abi.load_sample_position = load_sample_position;
5997b8e80941Smrg		ctx->abi.load_sample_mask_in = load_sample_mask_in;
5998b8e80941Smrg		ctx->abi.emit_kill = si_llvm_emit_kill;
5999b8e80941Smrg		break;
6000b8e80941Smrg	case PIPE_SHADER_COMPUTE:
6001b8e80941Smrg		ctx->abi.load_local_group_size = get_block_size;
6002b8e80941Smrg		break;
6003b8e80941Smrg	default:
6004b8e80941Smrg		assert(!"Unsupported shader type");
6005b8e80941Smrg		return false;
6006b8e80941Smrg	}
6007848b8605Smrg
6008b8e80941Smrg	ctx->abi.load_ubo = load_ubo;
6009b8e80941Smrg	ctx->abi.load_ssbo = load_ssbo;
6010848b8605Smrg
6011b8e80941Smrg	create_function(ctx);
6012b8e80941Smrg	preload_ring_buffers(ctx);
6013848b8605Smrg
6014b8e80941Smrg	/* For GFX9 merged shaders:
6015b8e80941Smrg	 * - Set EXEC for the first shader. If the prolog is present, set
6016b8e80941Smrg	 *   EXEC there instead.
6017b8e80941Smrg	 * - Add a barrier before the second shader.
6018b8e80941Smrg	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
6019b8e80941Smrg	 *   an if-statement. This is required for correctness in geometry
6020b8e80941Smrg	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
6021b8e80941Smrg	 *   GS_CUT messages.
6022b8e80941Smrg	 *
6023b8e80941Smrg	 * For monolithic merged shaders, the first shader is wrapped in an
6024b8e80941Smrg	 * if-block together with its prolog in si_build_wrapper_function.
6025b8e80941Smrg	 */
6026b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
6027b8e80941Smrg		if (!shader->is_monolithic &&
6028b8e80941Smrg		    sel->info.num_instructions > 1 && /* not empty shader */
6029b8e80941Smrg		    (shader->key.as_es || shader->key.as_ls) &&
6030b8e80941Smrg		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
6031b8e80941Smrg		     (ctx->type == PIPE_SHADER_VERTEX &&
6032b8e80941Smrg		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
6033b8e80941Smrg			si_init_exec_from_input(ctx,
6034b8e80941Smrg						ctx->param_merged_wave_info, 0);
6035b8e80941Smrg		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
6036b8e80941Smrg			   ctx->type == PIPE_SHADER_GEOMETRY) {
6037b8e80941Smrg			if (!shader->is_monolithic)
6038b8e80941Smrg				ac_init_exec_full_mask(&ctx->ac);
6039b8e80941Smrg
6040b8e80941Smrg			LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
6041b8e80941Smrg			LLVMValueRef ena =
6042b8e80941Smrg				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
6043b8e80941Smrg					    ac_get_thread_id(&ctx->ac), num_threads, "");
6044b8e80941Smrg			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
6045b8e80941Smrg
6046b8e80941Smrg			/* The barrier must execute for all shaders in a
6047b8e80941Smrg			 * threadgroup.
6048b8e80941Smrg			 *
6049b8e80941Smrg			 * Execute the barrier inside the conditional block,
6050b8e80941Smrg			 * so that empty waves can jump directly to s_endpgm,
6051b8e80941Smrg			 * which will also signal the barrier.
6052b8e80941Smrg			 *
6053b8e80941Smrg			 * If the shader is TCS and the TCS epilog is present
6054b8e80941Smrg			 * and contains a barrier, it will wait there and then
6055b8e80941Smrg			 * reach s_endpgm.
6056b8e80941Smrg			 */
6057b8e80941Smrg			si_llvm_emit_barrier(NULL, bld_base, NULL);
6058b8e80941Smrg		}
6059b8e80941Smrg	}
6060848b8605Smrg
6061b8e80941Smrg	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
6062b8e80941Smrg	    sel->tcs_info.tessfactors_are_def_in_all_invocs) {
6063b8e80941Smrg		for (unsigned i = 0; i < 6; i++) {
6064b8e80941Smrg			ctx->invoc0_tess_factors[i] =
6065b8e80941Smrg				ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
6066b8e80941Smrg		}
6067b8e80941Smrg	}
6068b8e80941Smrg
6069b8e80941Smrg	if (ctx->type == PIPE_SHADER_GEOMETRY) {
6070b8e80941Smrg		int i;
6071b8e80941Smrg		for (i = 0; i < 4; i++) {
6072b8e80941Smrg			ctx->gs_next_vertex[i] =
6073b8e80941Smrg				ac_build_alloca(&ctx->ac, ctx->i32, "");
6074b8e80941Smrg		}
6075b8e80941Smrg	}
6076848b8605Smrg
6077b8e80941Smrg	if (sel->force_correct_derivs_after_kill) {
6078b8e80941Smrg		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
6079b8e80941Smrg		/* true = don't kill. */
6080b8e80941Smrg		LLVMBuildStore(ctx->ac.builder, ctx->i1true,
6081b8e80941Smrg			       ctx->postponed_kill);
6082b8e80941Smrg	}
6083848b8605Smrg
6084b8e80941Smrg	if (sel->tokens) {
6085b8e80941Smrg		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6086b8e80941Smrg			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6087b8e80941Smrg			return false;
6088b8e80941Smrg		}
6089b8e80941Smrg	} else {
6090b8e80941Smrg		if (!si_nir_build_llvm(ctx, sel->nir)) {
6091b8e80941Smrg			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
6092b8e80941Smrg			return false;
6093848b8605Smrg		}
6094848b8605Smrg	}
6095848b8605Smrg
6096b8e80941Smrg	si_llvm_build_ret(ctx, ctx->return_value);
6097b8e80941Smrg	return true;
6098848b8605Smrg}
6099848b8605Smrg
6100b8e80941Smrg/**
6101b8e80941Smrg * Compute the VS prolog key, which contains all the information needed to
6102b8e80941Smrg * build the VS prolog function, and set shader->info bits where needed.
6103b8e80941Smrg *
6104b8e80941Smrg * \param info             Shader info of the vertex shader.
6105b8e80941Smrg * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
6106b8e80941Smrg * \param prolog_key       Key of the VS prolog
6107b8e80941Smrg * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
6108b8e80941Smrg * \param key              Output shader part key.
6109b8e80941Smrg */
6110b8e80941Smrgstatic void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
6111b8e80941Smrg				 unsigned num_input_sgprs,
6112b8e80941Smrg				 const struct si_vs_prolog_bits *prolog_key,
6113b8e80941Smrg				 struct si_shader *shader_out,
6114b8e80941Smrg				 union si_shader_part_key *key)
6115848b8605Smrg{
6116b8e80941Smrg	memset(key, 0, sizeof(*key));
6117b8e80941Smrg	key->vs_prolog.states = *prolog_key;
6118b8e80941Smrg	key->vs_prolog.num_input_sgprs = num_input_sgprs;
6119b8e80941Smrg	key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6120b8e80941Smrg	key->vs_prolog.as_ls = shader_out->key.as_ls;
6121b8e80941Smrg	key->vs_prolog.as_es = shader_out->key.as_es;
6122b8e80941Smrg
6123b8e80941Smrg	if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
6124b8e80941Smrg		key->vs_prolog.as_ls = 1;
6125b8e80941Smrg		key->vs_prolog.num_merged_next_stage_vgprs = 2;
6126b8e80941Smrg	} else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
6127b8e80941Smrg		key->vs_prolog.as_es = 1;
6128b8e80941Smrg		key->vs_prolog.num_merged_next_stage_vgprs = 5;
6129b8e80941Smrg	}
6130848b8605Smrg
6131b8e80941Smrg	/* Enable loading the InstanceID VGPR. */
6132b8e80941Smrg	uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
6133848b8605Smrg
6134b8e80941Smrg	if ((key->vs_prolog.states.instance_divisor_is_one |
6135b8e80941Smrg	     key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
6136b8e80941Smrg		shader_out->info.uses_instanceid = true;
6137b8e80941Smrg}
6138848b8605Smrg
6139b8e80941Smrg/**
6140b8e80941Smrg * Compute the PS prolog key, which contains all the information needed to
6141b8e80941Smrg * build the PS prolog function, and set related bits in shader->config.
6142b8e80941Smrg */
6143b8e80941Smrgstatic void si_get_ps_prolog_key(struct si_shader *shader,
6144b8e80941Smrg				 union si_shader_part_key *key,
6145b8e80941Smrg				 bool separate_prolog)
6146b8e80941Smrg{
6147b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
6148b8e80941Smrg
6149b8e80941Smrg	memset(key, 0, sizeof(*key));
6150b8e80941Smrg	key->ps_prolog.states = shader->key.part.ps.prolog;
6151b8e80941Smrg	key->ps_prolog.colors_read = info->colors_read;
6152b8e80941Smrg	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6153b8e80941Smrg	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6154b8e80941Smrg	key->ps_prolog.wqm = info->uses_derivatives &&
6155b8e80941Smrg		(key->ps_prolog.colors_read ||
6156b8e80941Smrg		 key->ps_prolog.states.force_persp_sample_interp ||
6157b8e80941Smrg		 key->ps_prolog.states.force_linear_sample_interp ||
6158b8e80941Smrg		 key->ps_prolog.states.force_persp_center_interp ||
6159b8e80941Smrg		 key->ps_prolog.states.force_linear_center_interp ||
6160b8e80941Smrg		 key->ps_prolog.states.bc_optimize_for_persp ||
6161b8e80941Smrg		 key->ps_prolog.states.bc_optimize_for_linear);
6162b8e80941Smrg	key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
6163b8e80941Smrg
6164b8e80941Smrg	if (info->colors_read) {
6165b8e80941Smrg		unsigned *color = shader->selector->color_attr_index;
6166b8e80941Smrg
6167b8e80941Smrg		if (shader->key.part.ps.prolog.color_two_side) {
6168b8e80941Smrg			/* BCOLORs are stored after the last input. */
6169b8e80941Smrg			key->ps_prolog.num_interp_inputs = info->num_inputs;
6170b8e80941Smrg			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6171b8e80941Smrg			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6172b8e80941Smrg		}
6173848b8605Smrg
6174b8e80941Smrg		for (unsigned i = 0; i < 2; i++) {
6175b8e80941Smrg			unsigned interp = info->input_interpolate[color[i]];
6176b8e80941Smrg			unsigned location = info->input_interpolate_loc[color[i]];
6177848b8605Smrg
6178b8e80941Smrg			if (!(info->colors_read & (0xf << i*4)))
6179848b8605Smrg				continue;
6180848b8605Smrg
6181b8e80941Smrg			key->ps_prolog.color_attr_index[i] = color[i];
6182848b8605Smrg
6183b8e80941Smrg			if (shader->key.part.ps.prolog.flatshade_colors &&
6184b8e80941Smrg			    interp == TGSI_INTERPOLATE_COLOR)
6185b8e80941Smrg				interp = TGSI_INTERPOLATE_CONSTANT;
6186b8e80941Smrg
6187b8e80941Smrg			switch (interp) {
6188b8e80941Smrg			case TGSI_INTERPOLATE_CONSTANT:
6189b8e80941Smrg				key->ps_prolog.color_interp_vgpr_index[i] = -1;
6190b8e80941Smrg				break;
6191b8e80941Smrg			case TGSI_INTERPOLATE_PERSPECTIVE:
6192b8e80941Smrg			case TGSI_INTERPOLATE_COLOR:
6193b8e80941Smrg				/* Force the interpolation location for colors here. */
6194b8e80941Smrg				if (shader->key.part.ps.prolog.force_persp_sample_interp)
6195b8e80941Smrg					location = TGSI_INTERPOLATE_LOC_SAMPLE;
6196b8e80941Smrg				if (shader->key.part.ps.prolog.force_persp_center_interp)
6197b8e80941Smrg					location = TGSI_INTERPOLATE_LOC_CENTER;
6198b8e80941Smrg
6199b8e80941Smrg				switch (location) {
6200b8e80941Smrg				case TGSI_INTERPOLATE_LOC_SAMPLE:
6201b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] = 0;
6202b8e80941Smrg					shader->config.spi_ps_input_ena |=
6203b8e80941Smrg						S_0286CC_PERSP_SAMPLE_ENA(1);
6204b8e80941Smrg					break;
6205b8e80941Smrg				case TGSI_INTERPOLATE_LOC_CENTER:
6206b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] = 2;
6207b8e80941Smrg					shader->config.spi_ps_input_ena |=
6208b8e80941Smrg						S_0286CC_PERSP_CENTER_ENA(1);
6209b8e80941Smrg					break;
6210b8e80941Smrg				case TGSI_INTERPOLATE_LOC_CENTROID:
6211b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] = 4;
6212b8e80941Smrg					shader->config.spi_ps_input_ena |=
6213b8e80941Smrg						S_0286CC_PERSP_CENTROID_ENA(1);
6214b8e80941Smrg					break;
6215b8e80941Smrg				default:
6216b8e80941Smrg					assert(0);
6217b8e80941Smrg				}
6218b8e80941Smrg				break;
6219b8e80941Smrg			case TGSI_INTERPOLATE_LINEAR:
6220b8e80941Smrg				/* Force the interpolation location for colors here. */
6221b8e80941Smrg				if (shader->key.part.ps.prolog.force_linear_sample_interp)
6222b8e80941Smrg					location = TGSI_INTERPOLATE_LOC_SAMPLE;
6223b8e80941Smrg				if (shader->key.part.ps.prolog.force_linear_center_interp)
6224b8e80941Smrg					location = TGSI_INTERPOLATE_LOC_CENTER;
6225b8e80941Smrg
6226b8e80941Smrg				/* The VGPR assignment for non-monolithic shaders
6227b8e80941Smrg				 * works because InitialPSInputAddr is set on the
6228b8e80941Smrg				 * main shader and PERSP_PULL_MODEL is never used.
6229b8e80941Smrg				 */
6230b8e80941Smrg				switch (location) {
6231b8e80941Smrg				case TGSI_INTERPOLATE_LOC_SAMPLE:
6232b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] =
6233b8e80941Smrg						separate_prolog ? 6 : 9;
6234b8e80941Smrg					shader->config.spi_ps_input_ena |=
6235b8e80941Smrg						S_0286CC_LINEAR_SAMPLE_ENA(1);
6236b8e80941Smrg					break;
6237b8e80941Smrg				case TGSI_INTERPOLATE_LOC_CENTER:
6238b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] =
6239b8e80941Smrg						separate_prolog ? 8 : 11;
6240b8e80941Smrg					shader->config.spi_ps_input_ena |=
6241b8e80941Smrg						S_0286CC_LINEAR_CENTER_ENA(1);
6242b8e80941Smrg					break;
6243b8e80941Smrg				case TGSI_INTERPOLATE_LOC_CENTROID:
6244b8e80941Smrg					key->ps_prolog.color_interp_vgpr_index[i] =
6245b8e80941Smrg						separate_prolog ? 10 : 13;
6246b8e80941Smrg					shader->config.spi_ps_input_ena |=
6247b8e80941Smrg						S_0286CC_LINEAR_CENTROID_ENA(1);
6248b8e80941Smrg					break;
6249b8e80941Smrg				default:
6250b8e80941Smrg					assert(0);
6251848b8605Smrg				}
6252b8e80941Smrg				break;
6253b8e80941Smrg			default:
6254b8e80941Smrg				assert(0);
6255848b8605Smrg			}
6256848b8605Smrg		}
6257848b8605Smrg	}
6258b8e80941Smrg}
6259848b8605Smrg
6260b8e80941Smrg/**
6261b8e80941Smrg * Check whether a PS prolog is required based on the key.
6262b8e80941Smrg */
6263b8e80941Smrgstatic bool si_need_ps_prolog(const union si_shader_part_key *key)
6264b8e80941Smrg{
6265b8e80941Smrg	return key->ps_prolog.colors_read ||
6266b8e80941Smrg	       key->ps_prolog.states.force_persp_sample_interp ||
6267b8e80941Smrg	       key->ps_prolog.states.force_linear_sample_interp ||
6268b8e80941Smrg	       key->ps_prolog.states.force_persp_center_interp ||
6269b8e80941Smrg	       key->ps_prolog.states.force_linear_center_interp ||
6270b8e80941Smrg	       key->ps_prolog.states.bc_optimize_for_persp ||
6271b8e80941Smrg	       key->ps_prolog.states.bc_optimize_for_linear ||
6272b8e80941Smrg	       key->ps_prolog.states.poly_stipple ||
6273b8e80941Smrg	       key->ps_prolog.states.samplemask_log_ps_iter;
6274b8e80941Smrg}
6275848b8605Smrg
6276b8e80941Smrg/**
6277b8e80941Smrg * Compute the PS epilog key, which contains all the information needed to
6278b8e80941Smrg * build the PS epilog function.
6279b8e80941Smrg */
6280b8e80941Smrgstatic void si_get_ps_epilog_key(struct si_shader *shader,
6281b8e80941Smrg				 union si_shader_part_key *key)
6282b8e80941Smrg{
6283b8e80941Smrg	struct tgsi_shader_info *info = &shader->selector->info;
6284b8e80941Smrg	memset(key, 0, sizeof(*key));
6285b8e80941Smrg	key->ps_epilog.colors_written = info->colors_written;
6286b8e80941Smrg	key->ps_epilog.writes_z = info->writes_z;
6287b8e80941Smrg	key->ps_epilog.writes_stencil = info->writes_stencil;
6288b8e80941Smrg	key->ps_epilog.writes_samplemask = info->writes_samplemask;
6289b8e80941Smrg	key->ps_epilog.states = shader->key.part.ps.epilog;
6290b8e80941Smrg}
6291848b8605Smrg
6292b8e80941Smrg/**
6293b8e80941Smrg * Build the GS prolog function. Rotate the input vertices for triangle strips
6294b8e80941Smrg * with adjacency.
6295b8e80941Smrg */
6296b8e80941Smrgstatic void si_build_gs_prolog_function(struct si_shader_context *ctx,
6297b8e80941Smrg					union si_shader_part_key *key)
6298b8e80941Smrg{
6299b8e80941Smrg	unsigned num_sgprs, num_vgprs;
6300b8e80941Smrg	struct si_function_info fninfo;
6301b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
6302b8e80941Smrg	LLVMTypeRef returns[48];
6303b8e80941Smrg	LLVMValueRef func, ret;
6304848b8605Smrg
6305b8e80941Smrg	si_init_function_info(&fninfo);
6306848b8605Smrg
6307b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
6308b8e80941Smrg		if (key->gs_prolog.states.gfx9_prev_is_vs)
6309b8e80941Smrg			num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
6310848b8605Smrg		else
6311b8e80941Smrg			num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
6312b8e80941Smrg		num_vgprs = 5; /* ES inputs are not needed by GS */
6313b8e80941Smrg	} else {
6314b8e80941Smrg		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6315b8e80941Smrg		num_vgprs = 8;
6316b8e80941Smrg	}
6317848b8605Smrg
6318b8e80941Smrg	for (unsigned i = 0; i < num_sgprs; ++i) {
6319b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
6320b8e80941Smrg		returns[i] = ctx->i32;
6321848b8605Smrg	}
6322848b8605Smrg
6323b8e80941Smrg	for (unsigned i = 0; i < num_vgprs; ++i) {
6324b8e80941Smrg		add_arg(&fninfo, ARG_VGPR, ctx->i32);
6325b8e80941Smrg		returns[num_sgprs + i] = ctx->f32;
6326b8e80941Smrg	}
6327848b8605Smrg
6328b8e80941Smrg	/* Create the function. */
6329b8e80941Smrg	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6330b8e80941Smrg			   &fninfo, 0);
6331b8e80941Smrg	func = ctx->main_fn;
6332848b8605Smrg
6333b8e80941Smrg	/* Set the full EXEC mask for the prolog, because we are only fiddling
6334b8e80941Smrg	 * with registers here. The main shader part will set the correct EXEC
6335b8e80941Smrg	 * mask.
6336b8e80941Smrg	 */
6337b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6338b8e80941Smrg		ac_init_exec_full_mask(&ctx->ac);
6339848b8605Smrg
6340b8e80941Smrg	/* Copy inputs to outputs. This should be no-op, as the registers match,
6341b8e80941Smrg	 * but it will prevent the compiler from overwriting them unintentionally.
6342b8e80941Smrg	 */
6343b8e80941Smrg	ret = ctx->return_value;
6344b8e80941Smrg	for (unsigned i = 0; i < num_sgprs; i++) {
6345b8e80941Smrg		LLVMValueRef p = LLVMGetParam(func, i);
6346b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6347b8e80941Smrg	}
6348b8e80941Smrg	for (unsigned i = 0; i < num_vgprs; i++) {
6349b8e80941Smrg		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6350b8e80941Smrg		p = ac_to_float(&ctx->ac, p);
6351b8e80941Smrg		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6352848b8605Smrg	}
6353848b8605Smrg
6354b8e80941Smrg	if (key->gs_prolog.states.tri_strip_adj_fix) {
6355b8e80941Smrg		/* Remap the input vertices for every other primitive. */
6356b8e80941Smrg		const unsigned gfx6_vtx_params[6] = {
6357b8e80941Smrg			num_sgprs,
6358b8e80941Smrg			num_sgprs + 1,
6359b8e80941Smrg			num_sgprs + 3,
6360b8e80941Smrg			num_sgprs + 4,
6361b8e80941Smrg			num_sgprs + 5,
6362b8e80941Smrg			num_sgprs + 6
6363b8e80941Smrg		};
6364b8e80941Smrg		const unsigned gfx9_vtx_params[3] = {
6365b8e80941Smrg			num_sgprs,
6366b8e80941Smrg			num_sgprs + 1,
6367b8e80941Smrg			num_sgprs + 4,
6368b8e80941Smrg		};
6369b8e80941Smrg		LLVMValueRef vtx_in[6], vtx_out[6];
6370b8e80941Smrg		LLVMValueRef prim_id, rotate;
6371848b8605Smrg
6372b8e80941Smrg		if (ctx->screen->info.chip_class >= GFX9) {
6373b8e80941Smrg			for (unsigned i = 0; i < 3; i++) {
6374b8e80941Smrg				vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6375b8e80941Smrg				vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6376b8e80941Smrg			}
6377b8e80941Smrg		} else {
6378b8e80941Smrg			for (unsigned i = 0; i < 6; i++)
6379b8e80941Smrg				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6380b8e80941Smrg		}
6381848b8605Smrg
6382b8e80941Smrg		prim_id = LLVMGetParam(func, num_sgprs + 2);
6383b8e80941Smrg		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6384848b8605Smrg
6385b8e80941Smrg		for (unsigned i = 0; i < 6; ++i) {
6386b8e80941Smrg			LLVMValueRef base, rotated;
6387b8e80941Smrg			base = vtx_in[i];
6388b8e80941Smrg			rotated = vtx_in[(i + 4) % 6];
6389b8e80941Smrg			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6390b8e80941Smrg		}
6391848b8605Smrg
6392b8e80941Smrg		if (ctx->screen->info.chip_class >= GFX9) {
6393b8e80941Smrg			for (unsigned i = 0; i < 3; i++) {
6394b8e80941Smrg				LLVMValueRef hi, out;
6395848b8605Smrg
6396b8e80941Smrg				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6397b8e80941Smrg						  LLVMConstInt(ctx->i32, 16, 0), "");
6398b8e80941Smrg				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6399b8e80941Smrg				out = ac_to_float(&ctx->ac, out);
6400b8e80941Smrg				ret = LLVMBuildInsertValue(builder, ret, out,
6401b8e80941Smrg							   gfx9_vtx_params[i], "");
6402b8e80941Smrg			}
6403b8e80941Smrg		} else {
6404b8e80941Smrg			for (unsigned i = 0; i < 6; i++) {
6405b8e80941Smrg				LLVMValueRef out;
6406848b8605Smrg
6407b8e80941Smrg				out = ac_to_float(&ctx->ac, vtx_out[i]);
6408b8e80941Smrg				ret = LLVMBuildInsertValue(builder, ret, out,
6409b8e80941Smrg							   gfx6_vtx_params[i], "");
6410b8e80941Smrg			}
6411b8e80941Smrg		}
6412848b8605Smrg	}
6413848b8605Smrg
6414b8e80941Smrg	LLVMBuildRet(builder, ret);
6415b8e80941Smrg}
6416b8e80941Smrg
6417b8e80941Smrg/**
6418b8e80941Smrg * Given a list of shader part functions, build a wrapper function that
6419b8e80941Smrg * runs them in sequence to form a monolithic shader.
6420b8e80941Smrg */
6421b8e80941Smrgstatic void si_build_wrapper_function(struct si_shader_context *ctx,
6422b8e80941Smrg				      LLVMValueRef *parts,
6423b8e80941Smrg				      unsigned num_parts,
6424b8e80941Smrg				      unsigned main_part,
6425b8e80941Smrg				      unsigned next_shader_first_part)
6426b8e80941Smrg{
6427b8e80941Smrg	LLVMBuilderRef builder = ctx->ac.builder;
6428b8e80941Smrg	/* PS epilog has one arg per color component; gfx9 merged shader
6429b8e80941Smrg	 * prologs need to forward 32 user SGPRs.
6430b8e80941Smrg	 */
6431b8e80941Smrg	struct si_function_info fninfo;
6432b8e80941Smrg	LLVMValueRef initial[64], out[64];
6433b8e80941Smrg	LLVMTypeRef function_type;
6434b8e80941Smrg	unsigned num_first_params;
6435b8e80941Smrg	unsigned num_out, initial_num_out;
6436b8e80941Smrg	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6437b8e80941Smrg	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6438b8e80941Smrg	unsigned num_sgprs, num_vgprs;
6439b8e80941Smrg	unsigned gprs;
6440b8e80941Smrg	struct lp_build_if_state if_state;
6441b8e80941Smrg
6442b8e80941Smrg	si_init_function_info(&fninfo);
6443b8e80941Smrg
6444b8e80941Smrg	for (unsigned i = 0; i < num_parts; ++i) {
6445b8e80941Smrg		ac_add_function_attr(ctx->ac.context, parts[i], -1,
6446b8e80941Smrg				     AC_FUNC_ATTR_ALWAYSINLINE);
6447b8e80941Smrg		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6448848b8605Smrg	}
6449848b8605Smrg
6450b8e80941Smrg	/* The parameters of the wrapper function correspond to those of the
6451b8e80941Smrg	 * first part in terms of SGPRs and VGPRs, but we use the types of the
6452b8e80941Smrg	 * main part to get the right types. This is relevant for the
6453b8e80941Smrg	 * dereferenceable attribute on descriptor table pointers.
6454b8e80941Smrg	 */
6455b8e80941Smrg	num_sgprs = 0;
6456b8e80941Smrg	num_vgprs = 0;
6457848b8605Smrg
6458b8e80941Smrg	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6459b8e80941Smrg	num_first_params = LLVMCountParamTypes(function_type);
6460848b8605Smrg
6461b8e80941Smrg	for (unsigned i = 0; i < num_first_params; ++i) {
6462b8e80941Smrg		LLVMValueRef param = LLVMGetParam(parts[0], i);
6463848b8605Smrg
6464b8e80941Smrg		if (ac_is_sgpr_param(param)) {
6465b8e80941Smrg			assert(num_vgprs == 0);
6466b8e80941Smrg			num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6467848b8605Smrg		} else {
6468b8e80941Smrg			num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6469848b8605Smrg		}
6470848b8605Smrg	}
6471848b8605Smrg
6472b8e80941Smrg	gprs = 0;
6473b8e80941Smrg	while (gprs < num_sgprs + num_vgprs) {
6474b8e80941Smrg		LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6475b8e80941Smrg		LLVMTypeRef type = LLVMTypeOf(param);
6476b8e80941Smrg		unsigned size = ac_get_type_size(type) / 4;
6477848b8605Smrg
6478b8e80941Smrg		add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6479848b8605Smrg
6480b8e80941Smrg		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6481b8e80941Smrg		assert(gprs + size <= num_sgprs + num_vgprs &&
6482b8e80941Smrg		       (gprs >= num_sgprs || gprs + size <= num_sgprs));
6483848b8605Smrg
6484b8e80941Smrg		gprs += size;
6485848b8605Smrg	}
6486848b8605Smrg
6487b8e80941Smrg	si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6488b8e80941Smrg			   si_get_max_workgroup_size(ctx->shader));
6489848b8605Smrg
6490b8e80941Smrg	if (is_merged_shader(ctx))
6491b8e80941Smrg		ac_init_exec_full_mask(&ctx->ac);
6492848b8605Smrg
6493b8e80941Smrg	/* Record the arguments of the function as if they were an output of
6494b8e80941Smrg	 * a previous part.
6495b8e80941Smrg	 */
6496b8e80941Smrg	num_out = 0;
6497b8e80941Smrg	num_out_sgpr = 0;
6498b8e80941Smrg
6499b8e80941Smrg	for (unsigned i = 0; i < fninfo.num_params; ++i) {
6500b8e80941Smrg		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6501b8e80941Smrg		LLVMTypeRef param_type = LLVMTypeOf(param);
6502b8e80941Smrg		LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6503b8e80941Smrg		unsigned size = ac_get_type_size(param_type) / 4;
6504b8e80941Smrg
6505b8e80941Smrg		if (size == 1) {
6506b8e80941Smrg			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6507b8e80941Smrg				param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
6508b8e80941Smrg				param_type = ctx->i32;
6509b8e80941Smrg			}
6510848b8605Smrg
6511b8e80941Smrg			if (param_type != out_type)
6512b8e80941Smrg				param = LLVMBuildBitCast(builder, param, out_type, "");
6513b8e80941Smrg			out[num_out++] = param;
6514b8e80941Smrg		} else {
6515b8e80941Smrg			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6516848b8605Smrg
6517b8e80941Smrg			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6518b8e80941Smrg				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6519b8e80941Smrg				param_type = ctx->i64;
6520848b8605Smrg			}
6521b8e80941Smrg
6522b8e80941Smrg			if (param_type != vector_type)
6523b8e80941Smrg				param = LLVMBuildBitCast(builder, param, vector_type, "");
6524b8e80941Smrg
6525b8e80941Smrg			for (unsigned j = 0; j < size; ++j)
6526b8e80941Smrg				out[num_out++] = LLVMBuildExtractElement(
6527b8e80941Smrg					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6528848b8605Smrg		}
6529848b8605Smrg
6530b8e80941Smrg		if (i < fninfo.num_sgpr_params)
6531b8e80941Smrg			num_out_sgpr = num_out;
6532b8e80941Smrg	}
6533b8e80941Smrg
6534b8e80941Smrg	memcpy(initial, out, sizeof(out));
6535b8e80941Smrg	initial_num_out = num_out;
6536b8e80941Smrg	initial_num_out_sgpr = num_out_sgpr;
6537b8e80941Smrg
6538b8e80941Smrg	/* Now chain the parts. */
6539b8e80941Smrg	for (unsigned part = 0; part < num_parts; ++part) {
6540b8e80941Smrg		LLVMValueRef in[48];
6541b8e80941Smrg		LLVMValueRef ret;
6542b8e80941Smrg		LLVMTypeRef ret_type;
6543b8e80941Smrg		unsigned out_idx = 0;
6544b8e80941Smrg		unsigned num_params = LLVMCountParams(parts[part]);
6545b8e80941Smrg
6546b8e80941Smrg		/* Merged shaders are executed conditionally depending
6547b8e80941Smrg		 * on the number of enabled threads passed in the input SGPRs. */
6548b8e80941Smrg		if (is_merged_shader(ctx) && part == 0) {
6549b8e80941Smrg			LLVMValueRef ena, count = initial[3];
6550b8e80941Smrg
6551b8e80941Smrg			count = LLVMBuildAnd(builder, count,
6552b8e80941Smrg					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
6553b8e80941Smrg			ena = LLVMBuildICmp(builder, LLVMIntULT,
6554b8e80941Smrg					    ac_get_thread_id(&ctx->ac), count, "");
6555b8e80941Smrg			lp_build_if(&if_state, &ctx->gallivm, ena);
6556b8e80941Smrg		}
6557848b8605Smrg
6558b8e80941Smrg		/* Derive arguments for the next part from outputs of the
6559b8e80941Smrg		 * previous one.
6560b8e80941Smrg		 */
6561b8e80941Smrg		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6562b8e80941Smrg			LLVMValueRef param;
6563b8e80941Smrg			LLVMTypeRef param_type;
6564b8e80941Smrg			bool is_sgpr;
6565b8e80941Smrg			unsigned param_size;
6566b8e80941Smrg			LLVMValueRef arg = NULL;
6567b8e80941Smrg
6568b8e80941Smrg			param = LLVMGetParam(parts[part], param_idx);
6569b8e80941Smrg			param_type = LLVMTypeOf(param);
6570b8e80941Smrg			param_size = ac_get_type_size(param_type) / 4;
6571b8e80941Smrg			is_sgpr = ac_is_sgpr_param(param);
6572b8e80941Smrg
6573b8e80941Smrg			if (is_sgpr) {
6574b8e80941Smrg				ac_add_function_attr(ctx->ac.context, parts[part],
6575b8e80941Smrg						     param_idx + 1, AC_FUNC_ATTR_INREG);
6576b8e80941Smrg			} else if (out_idx < num_out_sgpr) {
6577b8e80941Smrg				/* Skip returned SGPRs the current part doesn't
6578b8e80941Smrg				 * declare on the input. */
6579b8e80941Smrg				out_idx = num_out_sgpr;
6580b8e80941Smrg			}
6581848b8605Smrg
6582b8e80941Smrg			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6583848b8605Smrg
6584b8e80941Smrg			if (param_size == 1)
6585b8e80941Smrg				arg = out[out_idx];
6586b8e80941Smrg			else
6587b8e80941Smrg				arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
6588b8e80941Smrg
6589b8e80941Smrg			if (LLVMTypeOf(arg) != param_type) {
6590b8e80941Smrg				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6591b8e80941Smrg					if (LLVMGetPointerAddressSpace(param_type) ==
6592b8e80941Smrg					    AC_ADDR_SPACE_CONST_32BIT) {
6593b8e80941Smrg						arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
6594b8e80941Smrg						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6595b8e80941Smrg					} else {
6596b8e80941Smrg						arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6597b8e80941Smrg						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6598b8e80941Smrg					}
6599b8e80941Smrg				} else {
6600b8e80941Smrg					arg = LLVMBuildBitCast(builder, arg, param_type, "");
6601b8e80941Smrg				}
6602848b8605Smrg			}
6603848b8605Smrg
6604b8e80941Smrg			in[param_idx] = arg;
6605b8e80941Smrg			out_idx += param_size;
6606848b8605Smrg		}
6607848b8605Smrg
6608b8e80941Smrg		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6609848b8605Smrg
6610b8e80941Smrg		if (is_merged_shader(ctx) &&
6611b8e80941Smrg		    part + 1 == next_shader_first_part) {
6612b8e80941Smrg			lp_build_endif(&if_state);
6613848b8605Smrg
6614b8e80941Smrg			/* The second half of the merged shader should use
6615b8e80941Smrg			 * the inputs from the toplevel (wrapper) function,
6616b8e80941Smrg			 * not the return value from the last call.
6617b8e80941Smrg			 *
6618b8e80941Smrg			 * That's because the last call was executed condi-
6619b8e80941Smrg			 * tionally, so we can't consume it in the main
6620b8e80941Smrg			 * block.
6621b8e80941Smrg			 */
6622b8e80941Smrg			memcpy(out, initial, sizeof(initial));
6623b8e80941Smrg			num_out = initial_num_out;
6624b8e80941Smrg			num_out_sgpr = initial_num_out_sgpr;
6625b8e80941Smrg			continue;
6626b8e80941Smrg		}
6627848b8605Smrg
6628b8e80941Smrg		/* Extract the returned GPRs. */
6629b8e80941Smrg		ret_type = LLVMTypeOf(ret);
6630b8e80941Smrg		num_out = 0;
6631b8e80941Smrg		num_out_sgpr = 0;
6632848b8605Smrg
6633b8e80941Smrg		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6634b8e80941Smrg			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6635848b8605Smrg
6636b8e80941Smrg			unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6637848b8605Smrg
6638b8e80941Smrg			for (unsigned i = 0; i < ret_size; ++i) {
6639b8e80941Smrg				LLVMValueRef val =
6640b8e80941Smrg					LLVMBuildExtractValue(builder, ret, i, "");
6641848b8605Smrg
6642b8e80941Smrg				assert(num_out < ARRAY_SIZE(out));
6643b8e80941Smrg				out[num_out++] = val;
6644848b8605Smrg
6645b8e80941Smrg				if (LLVMTypeOf(val) == ctx->i32) {
6646b8e80941Smrg					assert(num_out_sgpr + 1 == num_out);
6647b8e80941Smrg					num_out_sgpr = num_out;
6648b8e80941Smrg				}
6649b8e80941Smrg			}
6650848b8605Smrg		}
6651b8e80941Smrg	}
6652848b8605Smrg
6653b8e80941Smrg	LLVMBuildRetVoid(builder);
6654b8e80941Smrg}
6655848b8605Smrg
6656b8e80941Smrgstatic bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
6657b8e80941Smrg				    struct si_shader_selector *sel)
6658b8e80941Smrg{
6659b8e80941Smrg	if (!compiler->low_opt_passes)
6660b8e80941Smrg		return false;
6661b8e80941Smrg
6662b8e80941Smrg	/* Assume a slow CPU. */
6663b8e80941Smrg	assert(!sel->screen->info.has_dedicated_vram &&
6664b8e80941Smrg	       sel->screen->info.chip_class <= VI);
6665b8e80941Smrg
6666b8e80941Smrg	/* For a crazy dEQP test containing 2597 memory opcodes, mostly
6667b8e80941Smrg	 * buffer stores. */
6668b8e80941Smrg	return sel->type == PIPE_SHADER_COMPUTE &&
6669b8e80941Smrg	       sel->info.num_memory_instructions > 1000;
6670848b8605Smrg}
6671848b8605Smrg
/**
 * Compile the main part of a shader to machine code.
 *
 * The main function is built by si_compile_tgsi_main(). When
 * shader->is_monolithic is set, the stage-specific prolog/epilog functions
 * (and, on GFX9 and later, the VS-as-LS main part for TCS or the ES main part
 * for GS) are built as well and combined by si_build_wrapper_function().
 * The module is then optimized and handed to si_compile_llvm().
 *
 * For compute shaders, the resulting SGPR/VGPR usage is checked against
 * limits computed here; on violation the process is aborted unless the
 * SI_PASS_BAD_SHADERS option is set.
 *
 * \param sscreen	screen
 * \param compiler	LLVM compiler passed on to si_init_shader_ctx() and
 *			si_compile_llvm()
 * \param shader	shader to compile; its binary, config and info are
 *			written
 * \param debug		debug callback; this function checks it for NULL
 *			before reading debug->debug_message
 * \return		0 on success, -1 if building a main function fails,
 *			otherwise the non-zero result of si_compile_llvm()
 */
6672b8e80941Smrgint si_compile_tgsi_shader(struct si_screen *sscreen,
6673b8e80941Smrg			   struct ac_llvm_compiler *compiler,
6674b8e80941Smrg			   struct si_shader *shader,
6675b8e80941Smrg			   struct pipe_debug_callback *debug)
6676848b8605Smrg{
6677b8e80941Smrg	struct si_shader_selector *sel = shader->selector;
6678b8e80941Smrg	struct si_shader_context ctx;
6679b8e80941Smrg	int r = -1;
6680b8e80941Smrg
6681b8e80941Smrg	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
6682b8e80941Smrg	 * conversion fails. */
6683b8e80941Smrg	if (si_can_dump_shader(sscreen, sel->info.processor) &&
6684b8e80941Smrg	    !(sscreen->debug_flags & DBG(NO_TGSI))) {
6685b8e80941Smrg		if (sel->tokens)
6686b8e80941Smrg			tgsi_dump(sel->tokens, 0);
6687b8e80941Smrg		else
6688b8e80941Smrg			nir_print_shader(sel->nir, stderr);
6689b8e80941Smrg		si_dump_streamout(&sel->so);
6690848b8605Smrg	}
6691848b8605Smrg
6692b8e80941Smrg	si_init_shader_ctx(&ctx, sscreen, compiler);
6693b8e80941Smrg	si_llvm_context_set_tgsi(&ctx, shader);
6694848b8605Smrg
	/* Fill every byte of the VS output parameter offset table with
	 * AC_EXP_PARAM_UNDEFINED before compiling. */
6695b8e80941Smrg	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6696b8e80941Smrg	       sizeof(shader->info.vs_output_param_offset));
6697848b8605Smrg
6698b8e80941Smrg	shader->info.uses_instanceid = sel->info.uses_instanceid;
6699848b8605Smrg
6700b8e80941Smrg	if (!si_compile_tgsi_main(&ctx)) {
6701b8e80941Smrg		si_llvm_dispose(&ctx);
6702b8e80941Smrg		return -1;
6703b8e80941Smrg	}
6704848b8605Smrg
	/* Monolithic shaders: build the extra parts around the main function
	 * and merge them with si_build_wrapper_function(). ctx.main_fn is
	 * saved into parts[] after every build call because the next build
	 * call replaces it. */
6705b8e80941Smrg	if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6706b8e80941Smrg		LLVMValueRef parts[2];
6707b8e80941Smrg		bool need_prolog = sel->vs_needs_prolog;
6708848b8605Smrg
6709b8e80941Smrg		parts[1] = ctx.main_fn;
6710848b8605Smrg
6711b8e80941Smrg		if (need_prolog) {
6712b8e80941Smrg			union si_shader_part_key prolog_key;
6713b8e80941Smrg			si_get_vs_prolog_key(&sel->info,
6714b8e80941Smrg					     shader->info.num_input_sgprs,
6715b8e80941Smrg					     &shader->key.part.vs.prolog,
6716b8e80941Smrg					     shader, &prolog_key);
6717b8e80941Smrg			si_build_vs_prolog_function(&ctx, &prolog_key);
6718b8e80941Smrg			parts[0] = ctx.main_fn;
6719b8e80941Smrg		}
6720848b8605Smrg
		/* Without a prolog, parts[0] is never written: the array
		 * passed starts at parts[1] and holds a single part. */
6721b8e80941Smrg		si_build_wrapper_function(&ctx, parts + !need_prolog,
6722b8e80941Smrg					  1 + need_prolog, need_prolog, 0);
6723b8e80941Smrg	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6724b8e80941Smrg		if (sscreen->info.chip_class >= GFX9) {
6725b8e80941Smrg			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			/* parts[] order: LS prolog (optional), LS main,
			 * TCS main, TCS epilog. */
6726b8e80941Smrg			LLVMValueRef parts[4];
6727b8e80941Smrg			bool vs_needs_prolog =
6728b8e80941Smrg				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6729b8e80941Smrg
6730b8e80941Smrg			/* TCS main part */
6731b8e80941Smrg			parts[2] = ctx.main_fn;
6732b8e80941Smrg
6733b8e80941Smrg			/* TCS epilog */
6734b8e80941Smrg			union si_shader_part_key tcs_epilog_key;
6735b8e80941Smrg			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6736b8e80941Smrg			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6737b8e80941Smrg			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6738b8e80941Smrg			parts[3] = ctx.main_fn;
6739b8e80941Smrg
6740b8e80941Smrg			/* VS as LS main part */
6741b8e80941Smrg			struct si_shader shader_ls = {};
6742b8e80941Smrg			shader_ls.selector = ls;
6743b8e80941Smrg			shader_ls.key.as_ls = 1;
6744b8e80941Smrg			shader_ls.key.mono = shader->key.mono;
6745b8e80941Smrg			shader_ls.key.opt = shader->key.opt;
6746b8e80941Smrg			shader_ls.is_monolithic = true;
6747b8e80941Smrg			si_llvm_context_set_tgsi(&ctx, &shader_ls);
6748b8e80941Smrg
6749b8e80941Smrg			if (!si_compile_tgsi_main(&ctx)) {
6750b8e80941Smrg				si_llvm_dispose(&ctx);
6751b8e80941Smrg				return -1;
6752b8e80941Smrg			}
6753b8e80941Smrg			shader->info.uses_instanceid |= ls->info.uses_instanceid;
6754b8e80941Smrg			parts[1] = ctx.main_fn;
6755b8e80941Smrg
6756b8e80941Smrg			/* LS prolog */
6757b8e80941Smrg			if (vs_needs_prolog) {
6758b8e80941Smrg				union si_shader_part_key vs_prolog_key;
6759b8e80941Smrg				si_get_vs_prolog_key(&ls->info,
6760b8e80941Smrg						     shader_ls.info.num_input_sgprs,
6761b8e80941Smrg						     &shader->key.part.tcs.ls_prolog,
6762b8e80941Smrg						     shader, &vs_prolog_key);
6763b8e80941Smrg				vs_prolog_key.vs_prolog.is_monolithic = true;
6764b8e80941Smrg				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6765b8e80941Smrg				parts[0] = ctx.main_fn;
6766b8e80941Smrg			}
6767848b8605Smrg
			/* NOTE(review): si_llvm_context_set_tgsi(&ctx, &shader_ls)
			 * above presumably pointed ctx.shader/ctx.type at the
			 * temporary LS shader - confirm. */
6768b8e80941Smrg			/* Reset the shader context. */
6769b8e80941Smrg			ctx.shader = shader;
6770b8e80941Smrg			ctx.type = PIPE_SHADER_TESS_CTRL;
6771848b8605Smrg
6772b8e80941Smrg			si_build_wrapper_function(&ctx,
6773b8e80941Smrg						  parts + !vs_needs_prolog,
6774b8e80941Smrg						  4 - !vs_needs_prolog, vs_needs_prolog,
6775b8e80941Smrg						  vs_needs_prolog ? 2 : 1);
6776b8e80941Smrg		} else {
			/* Before GFX9: TCS main followed by the TCS epilog. */
6777b8e80941Smrg			LLVMValueRef parts[2];
6778b8e80941Smrg			union si_shader_part_key epilog_key;
6779848b8605Smrg
6780b8e80941Smrg			parts[0] = ctx.main_fn;
6781848b8605Smrg
6782b8e80941Smrg			memset(&epilog_key, 0, sizeof(epilog_key));
6783b8e80941Smrg			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6784b8e80941Smrg			si_build_tcs_epilog_function(&ctx, &epilog_key);
6785b8e80941Smrg			parts[1] = ctx.main_fn;
6786848b8605Smrg
6787b8e80941Smrg			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6788848b8605Smrg		}
6789b8e80941Smrg	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6790b8e80941Smrg		if (ctx.screen->info.chip_class >= GFX9) {
6791b8e80941Smrg			struct si_shader_selector *es = shader->key.part.gs.es;
6792b8e80941Smrg			LLVMValueRef es_prolog = NULL;
6793b8e80941Smrg			LLVMValueRef es_main = NULL;
6794b8e80941Smrg			LLVMValueRef gs_prolog = NULL;
6795b8e80941Smrg			LLVMValueRef gs_main = ctx.main_fn;
6796b8e80941Smrg
6797b8e80941Smrg			/* GS prolog */
6798b8e80941Smrg			union si_shader_part_key gs_prolog_key;
6799b8e80941Smrg			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6800b8e80941Smrg			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6801b8e80941Smrg			gs_prolog_key.gs_prolog.is_monolithic = true;
6802b8e80941Smrg			si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6803b8e80941Smrg			gs_prolog = ctx.main_fn;
6804b8e80941Smrg
6805b8e80941Smrg			/* ES main part */
6806b8e80941Smrg			struct si_shader shader_es = {};
6807b8e80941Smrg			shader_es.selector = es;
6808b8e80941Smrg			shader_es.key.as_es = 1;
6809b8e80941Smrg			shader_es.key.mono = shader->key.mono;
6810b8e80941Smrg			shader_es.key.opt = shader->key.opt;
6811b8e80941Smrg			shader_es.is_monolithic = true;
6812b8e80941Smrg			si_llvm_context_set_tgsi(&ctx, &shader_es);
6813b8e80941Smrg
6814b8e80941Smrg			if (!si_compile_tgsi_main(&ctx)) {
6815b8e80941Smrg				si_llvm_dispose(&ctx);
6816b8e80941Smrg				return -1;
6817b8e80941Smrg			}
6818b8e80941Smrg			shader->info.uses_instanceid |= es->info.uses_instanceid;
6819b8e80941Smrg			es_main = ctx.main_fn;
6820b8e80941Smrg
6821b8e80941Smrg			/* ES prolog */
6822b8e80941Smrg			if (es->vs_needs_prolog) {
6823b8e80941Smrg				union si_shader_part_key vs_prolog_key;
6824b8e80941Smrg				si_get_vs_prolog_key(&es->info,
6825b8e80941Smrg						     shader_es.info.num_input_sgprs,
6826b8e80941Smrg						     &shader->key.part.gs.vs_prolog,
6827b8e80941Smrg						     shader, &vs_prolog_key);
6828b8e80941Smrg				vs_prolog_key.vs_prolog.is_monolithic = true;
6829b8e80941Smrg				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6830b8e80941Smrg				es_prolog = ctx.main_fn;
6831b8e80941Smrg			}
6832848b8605Smrg
6833b8e80941Smrg			/* Reset the shader context. */
6834b8e80941Smrg			ctx.shader = shader;
6835b8e80941Smrg			ctx.type = PIPE_SHADER_GEOMETRY;
6836848b8605Smrg
6837b8e80941Smrg			/* Prepare the array of shader parts. */
6838b8e80941Smrg			LLVMValueRef parts[4];
6839b8e80941Smrg			unsigned num_parts = 0, main_part, next_first_part;
6840848b8605Smrg
6841b8e80941Smrg			if (es_prolog)
6842b8e80941Smrg				parts[num_parts++] = es_prolog;
6843848b8605Smrg
			/* main_part and next_first_part record the indices of
			 * the ES main part and the GS prolog within parts[]. */
6844b8e80941Smrg			parts[main_part = num_parts++] = es_main;
6845b8e80941Smrg			parts[next_first_part = num_parts++] = gs_prolog;
6846b8e80941Smrg			parts[num_parts++] = gs_main;
6847848b8605Smrg
6848b8e80941Smrg			si_build_wrapper_function(&ctx, parts, num_parts,
6849b8e80941Smrg						  main_part, next_first_part);
6850b8e80941Smrg		} else {
			/* Before GFX9: GS prolog followed by the GS main part. */
6851b8e80941Smrg			LLVMValueRef parts[2];
6852b8e80941Smrg			union si_shader_part_key prolog_key;
6853848b8605Smrg
6854b8e80941Smrg			parts[1] = ctx.main_fn;
6855b8e80941Smrg
6856b8e80941Smrg			memset(&prolog_key, 0, sizeof(prolog_key));
6857b8e80941Smrg			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6858b8e80941Smrg			si_build_gs_prolog_function(&ctx, &prolog_key);
6859b8e80941Smrg			parts[0] = ctx.main_fn;
6860b8e80941Smrg
6861b8e80941Smrg			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6862b8e80941Smrg		}
6863b8e80941Smrg	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		/* parts[] order: PS prolog (optional), PS main, PS epilog. */
6864b8e80941Smrg		LLVMValueRef parts[3];
6865b8e80941Smrg		union si_shader_part_key prolog_key;
6866b8e80941Smrg		union si_shader_part_key epilog_key;
6867b8e80941Smrg		bool need_prolog;
6868b8e80941Smrg
6869b8e80941Smrg		si_get_ps_prolog_key(shader, &prolog_key, false);
6870b8e80941Smrg		need_prolog = si_need_ps_prolog(&prolog_key);
6871b8e80941Smrg
6872b8e80941Smrg		parts[need_prolog ? 1 : 0] = ctx.main_fn;
6873b8e80941Smrg
6874b8e80941Smrg		if (need_prolog) {
6875b8e80941Smrg			si_build_ps_prolog_function(&ctx, &prolog_key);
6876b8e80941Smrg			parts[0] = ctx.main_fn;
6877b8e80941Smrg		}
6878848b8605Smrg
6879b8e80941Smrg		si_get_ps_epilog_key(shader, &epilog_key);
6880b8e80941Smrg		si_build_ps_epilog_function(&ctx, &epilog_key);
6881b8e80941Smrg		parts[need_prolog ? 2 : 1] = ctx.main_fn;
6882848b8605Smrg
6883b8e80941Smrg		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6884b8e80941Smrg					  need_prolog ? 1 : 0, 0);
6885b8e80941Smrg	}
6886848b8605Smrg
6887b8e80941Smrg	si_llvm_optimize_module(&ctx);
6888848b8605Smrg
6889b8e80941Smrg	/* Post-optimization transformations and analysis. */
6890b8e80941Smrg	si_optimize_vs_outputs(&ctx);
6891848b8605Smrg
	/* The private memory count is only gathered when somebody can
	 * observe it: a debug message callback or shader dumping. */
6892b8e80941Smrg	if ((debug && debug->debug_message) ||
6893b8e80941Smrg	    si_can_dump_shader(sscreen, ctx.type)) {
6894b8e80941Smrg		ctx.shader->config.private_mem_vgprs =
6895b8e80941Smrg			ac_count_scratch_private_memory(ctx.main_fn);
6896848b8605Smrg	}
6897848b8605Smrg
6898b8e80941Smrg	/* Make sure the input is a pointer and not integer followed by inttoptr. */
6899b8e80941Smrg	assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
6900b8e80941Smrg	       LLVMPointerTypeKind);
6901848b8605Smrg
6902b8e80941Smrg	/* Compile to bytecode. */
6903b8e80941Smrg	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
6904b8e80941Smrg			    ctx.ac.module, debug, ctx.type,
6905b8e80941Smrg			    si_get_shader_name(shader, ctx.type),
6906b8e80941Smrg			    si_should_optimize_less(compiler, shader->selector));
	/* NOTE(review): ctx.type and is_merged_shader(&ctx) are still read
	 * further down; presumably si_llvm_dispose() leaves those plain
	 * fields intact - confirm. */
6907b8e80941Smrg	si_llvm_dispose(&ctx);
6908b8e80941Smrg	if (r) {
6909b8e80941Smrg		fprintf(stderr, "LLVM failed to compile shader\n");
6910b8e80941Smrg		return r;
6911b8e80941Smrg	}
6912b8e80941Smrg
6913b8e80941Smrg	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6914b8e80941Smrg	 * LLVM 3.9svn has this bug.
6915848b8605Smrg	 */
6916b8e80941Smrg	if (sel->type == PIPE_SHADER_COMPUTE) {
6917b8e80941Smrg		unsigned wave_size = 64;
6918b8e80941Smrg		unsigned max_vgprs = 256;
6919b8e80941Smrg		unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
6920b8e80941Smrg		unsigned max_sgprs_per_wave = 128;
6921b8e80941Smrg		unsigned max_block_threads = si_get_max_workgroup_size(shader);
		/* Waves needed to hold the largest workgroup, rounded up.
		 * NOTE(review): the division by 4 presumably reflects 4 SIMDs
		 * per CU - confirm. */
6922b8e80941Smrg		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6923b8e80941Smrg		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6924b8e80941Smrg
		/* Split the register budgets between the waves computed above;
		 * SGPRs are additionally capped at max_sgprs_per_wave. */
6925b8e80941Smrg		max_vgprs = max_vgprs / min_waves_per_simd;
6926b8e80941Smrg		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6927b8e80941Smrg
6928b8e80941Smrg		if (shader->config.num_sgprs > max_sgprs ||
6929b8e80941Smrg		    shader->config.num_vgprs > max_vgprs) {
6930b8e80941Smrg			fprintf(stderr, "LLVM failed to compile a shader correctly: "
6931b8e80941Smrg				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6932b8e80941Smrg				shader->config.num_sgprs, shader->config.num_vgprs,
6933b8e80941Smrg				max_sgprs, max_vgprs);
6934b8e80941Smrg
6935b8e80941Smrg			/* Just terminate the process, because dependent
6936b8e80941Smrg			 * shaders can hang due to bad input data, but use
6937b8e80941Smrg			 * the env var to allow shader-db to work.
6938b8e80941Smrg			 */
6939b8e80941Smrg			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6940b8e80941Smrg				abort();
6941b8e80941Smrg		}
6942b8e80941Smrg	}
6943848b8605Smrg
6944b8e80941Smrg	/* Add the scratch offset to input SGPRs. */
6945b8e80941Smrg	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx))
6946b8e80941Smrg		shader->info.num_input_sgprs += 1; /* scratch byte offset */
6947b8e80941Smrg
6948b8e80941Smrg	/* Calculate the number of fragment input VGPRs. */
6949b8e80941Smrg	if (ctx.type == PIPE_SHADER_FRAGMENT) {
6950b8e80941Smrg		shader->info.num_input_vgprs = 0;
6951b8e80941Smrg		shader->info.face_vgpr_index = -1;
6952b8e80941Smrg		shader->info.ancillary_vgpr_index = -1;
6953b8e80941Smrg
		/* Add up the VGPRs of every input enabled in
		 * spi_ps_input_addr, in the order tested below. The running
		 * total at FRONT_FACE and ANCILLARY is recorded as that
		 * input's VGPR index; -1 means the input is not enabled. */
6954b8e80941Smrg		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6955b8e80941Smrg			shader->info.num_input_vgprs += 2;
6956b8e80941Smrg		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6957b8e80941Smrg			shader->info.num_input_vgprs += 2;
6958b8e80941Smrg		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6959b8e80941Smrg			shader->info.num_input_vgprs += 2;
6960b8e80941Smrg		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6961b8e80941Smrg			shader->info.num_input_vgprs += 3;
6962b8e80941Smrg		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6963b8e80941Smrg			shader->info.num_input_vgprs += 2;
6964b8e80941Smrg		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6965b8e80941Smrg			shader->info.num_input_vgprs += 2;
6966b8e80941Smrg		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6967b8e80941Smrg			shader->info.num_input_vgprs += 2;
6968b8e80941Smrg		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6969b8e80941Smrg			shader->info.num_input_vgprs += 1;
6970b8e80941Smrg		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6971b8e80941Smrg			shader->info.num_input_vgprs += 1;
6972b8e80941Smrg		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6973b8e80941Smrg			shader->info.num_input_vgprs += 1;
6974b8e80941Smrg		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6975b8e80941Smrg			shader->info.num_input_vgprs += 1;
6976b8e80941Smrg		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6977b8e80941Smrg			shader->info.num_input_vgprs += 1;
6978b8e80941Smrg		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6979b8e80941Smrg			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6980b8e80941Smrg			shader->info.num_input_vgprs += 1;
6981b8e80941Smrg		}
6982b8e80941Smrg		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
6983b8e80941Smrg			shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
6984b8e80941Smrg			shader->info.num_input_vgprs += 1;
6985b8e80941Smrg		}
6986b8e80941Smrg		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6987b8e80941Smrg			shader->info.num_input_vgprs += 1;
6988b8e80941Smrg		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6989b8e80941Smrg			shader->info.num_input_vgprs += 1;
6990b8e80941Smrg	}
6991b8e80941Smrg
6992b8e80941Smrg	si_calculate_max_simd_waves(shader);
6993b8e80941Smrg	si_shader_dump_stats_for_shader_db(shader, debug);
6994b8e80941Smrg	return 0;
6995b8e80941Smrg}
6996848b8605Smrg
6997b8e80941Smrg/**
6998b8e80941Smrg * Create, compile and return a shader part (prolog or epilog).
6999b8e80941Smrg *
7000b8e80941Smrg * \param sscreen	screen
7001b8e80941Smrg * \param list		list of shader parts of the same category
7002b8e80941Smrg * \param type		shader type
7003b8e80941Smrg * \param key		shader part key
7004b8e80941Smrg * \param prolog	whether the part being requested is a prolog
7005b8e80941Smrg * \param tm		LLVM target machine
7006b8e80941Smrg * \param debug		debug callback
7007b8e80941Smrg * \param build		the callback responsible for building the main function
7008b8e80941Smrg * \return		non-NULL on success
7009b8e80941Smrg */
7010b8e80941Smrgstatic struct si_shader_part *
7011b8e80941Smrgsi_get_shader_part(struct si_screen *sscreen,
7012b8e80941Smrg		   struct si_shader_part **list,
7013b8e80941Smrg		   enum pipe_shader_type type,
7014b8e80941Smrg		   bool prolog,
7015b8e80941Smrg		   union si_shader_part_key *key,
7016b8e80941Smrg		   struct ac_llvm_compiler *compiler,
7017b8e80941Smrg		   struct pipe_debug_callback *debug,
7018b8e80941Smrg		   void (*build)(struct si_shader_context *,
7019b8e80941Smrg				 union si_shader_part_key *),
7020b8e80941Smrg		   const char *name)
7021b8e80941Smrg{
7022b8e80941Smrg	struct si_shader_part *result;
7023848b8605Smrg
7024b8e80941Smrg	mtx_lock(&sscreen->shader_parts_mutex);
7025848b8605Smrg
7026b8e80941Smrg	/* Find existing. */
7027b8e80941Smrg	for (result = *list; result; result = result->next) {
7028b8e80941Smrg		if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7029b8e80941Smrg			mtx_unlock(&sscreen->shader_parts_mutex);
7030b8e80941Smrg			return result;
7031848b8605Smrg		}
7032848b8605Smrg	}
7033848b8605Smrg
7034b8e80941Smrg	/* Compile a new one. */
7035b8e80941Smrg	result = CALLOC_STRUCT(si_shader_part);
7036b8e80941Smrg	result->key = *key;
7037848b8605Smrg
7038b8e80941Smrg	struct si_shader shader = {};
7039b8e80941Smrg	struct si_shader_context ctx;
7040848b8605Smrg
7041b8e80941Smrg	si_init_shader_ctx(&ctx, sscreen, compiler);
7042b8e80941Smrg	ctx.shader = &shader;
7043b8e80941Smrg	ctx.type = type;
7044848b8605Smrg
7045b8e80941Smrg	switch (type) {
7046b8e80941Smrg	case PIPE_SHADER_VERTEX:
7047b8e80941Smrg		shader.key.as_ls = key->vs_prolog.as_ls;
7048b8e80941Smrg		shader.key.as_es = key->vs_prolog.as_es;
7049b8e80941Smrg		break;
7050b8e80941Smrg	case PIPE_SHADER_TESS_CTRL:
7051b8e80941Smrg		assert(!prolog);
7052b8e80941Smrg		shader.key.part.tcs.epilog = key->tcs_epilog.states;
7053b8e80941Smrg		break;
7054b8e80941Smrg	case PIPE_SHADER_GEOMETRY:
7055b8e80941Smrg		assert(prolog);
7056b8e80941Smrg		break;
7057b8e80941Smrg	case PIPE_SHADER_FRAGMENT:
7058b8e80941Smrg		if (prolog)
7059b8e80941Smrg			shader.key.part.ps.prolog = key->ps_prolog.states;
7060b8e80941Smrg		else
7061b8e80941Smrg			shader.key.part.ps.epilog = key->ps_epilog.states;
7062b8e80941Smrg		break;
7063b8e80941Smrg	default:
7064b8e80941Smrg		unreachable("bad shader part");
7065b8e80941Smrg	}
7066848b8605Smrg
7067b8e80941Smrg	build(&ctx, key);
7068848b8605Smrg
7069b8e80941Smrg	/* Compile. */
7070b8e80941Smrg	si_llvm_optimize_module(&ctx);
7071b8e80941Smrg
7072b8e80941Smrg	if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
7073b8e80941Smrg			    ctx.ac.module, debug, ctx.type, name, false)) {
7074b8e80941Smrg		FREE(result);
7075b8e80941Smrg		result = NULL;
7076b8e80941Smrg		goto out;
7077b8e80941Smrg	}
7078848b8605Smrg
7079b8e80941Smrg	result->next = *list;
7080b8e80941Smrg	*list = result;
7081848b8605Smrg
7082b8e80941Smrgout:
7083b8e80941Smrg	si_llvm_dispose(&ctx);
7084b8e80941Smrg	mtx_unlock(&sscreen->shader_parts_mutex);
7085b8e80941Smrg	return result;
7086848b8605Smrg}
7087848b8605Smrg
7088b8e80941Smrgstatic LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
7089848b8605Smrg{
7090b8e80941Smrg	LLVMValueRef ptr[2], list;
7091b8e80941Smrg	bool merged_shader = is_merged_shader(ctx);
7092b8e80941Smrg
7093b8e80941Smrg	ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
7094b8e80941Smrg	list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
7095b8e80941Smrg				 ac_array_in_const32_addr_space(ctx->v4i32), "");
7096b8e80941Smrg	return list;
7097848b8605Smrg}
7098848b8605Smrg
7099b8e80941Smrg/**
7100b8e80941Smrg * Build the vertex shader prolog function.
7101b8e80941Smrg *
7102b8e80941Smrg * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7103b8e80941Smrg * All inputs are returned unmodified. The vertex load indices are
7104b8e80941Smrg * stored after them, which will be used by the API VS for fetching inputs.
7105b8e80941Smrg *
7106b8e80941Smrg * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7107b8e80941Smrg *   input_v0,
7108b8e80941Smrg *   input_v1,
7109b8e80941Smrg *   input_v2,
7110b8e80941Smrg *   input_v3,
7111b8e80941Smrg *   (VertexID + BaseVertex),
7112b8e80941Smrg *   (InstanceID + StartInstance),
7113b8e80941Smrg *   (InstanceID / 2 + StartInstance)
 *
 * \param ctx	shader context; the new function is created with
 *		si_create_function() and left in ctx->main_fn
 * \param key	shader part key; only key->vs_prolog is read
7114b8e80941Smrg */
7115b8e80941Smrgstatic void si_build_vs_prolog_function(struct si_shader_context *ctx,
7116b8e80941Smrg					union si_shader_part_key *key)
7117848b8605Smrg{
7118b8e80941Smrg	struct si_function_info fninfo;
7119b8e80941Smrg	LLVMTypeRef *returns;
7120b8e80941Smrg	LLVMValueRef ret, func;
7121b8e80941Smrg	int num_returns, i;
	/* The VS system-value VGPRs follow the merged next-stage VGPRs. */
7122b8e80941Smrg	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
7123b8e80941Smrg	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	/* NOTE(review): 9 entries cover at most 5 merged next-stage VGPRs
	 * plus the 4 VS VGPRs - confirm the upper bound of
	 * num_merged_next_stage_vgprs. */
7124b8e80941Smrg	LLVMValueRef input_vgprs[9];
7125b8e80941Smrg	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
7126b8e80941Smrg				      num_input_vgprs;
	/* Parameter index offset applied to the user SGPRs read below when
	 * merged next-stage VGPRs are present. */
7127b8e80941Smrg	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
7128b8e80941Smrg
7129b8e80941Smrg	si_init_function_info(&fninfo);
7130b8e80941Smrg
7131b8e80941Smrg	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7132b8e80941Smrg	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
7133b8e80941Smrg			 sizeof(LLVMTypeRef));
7134b8e80941Smrg	num_returns = 0;
7135b8e80941Smrg
7136b8e80941Smrg	/* Declare input and output SGPRs. */
7137b8e80941Smrg	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7138b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7139b8e80941Smrg		returns[num_returns++] = ctx->i32;
7140b8e80941Smrg	}
7141848b8605Smrg
7142b8e80941Smrg	/* Preloaded VGPRs (outputs must be floats) */
7143b8e80941Smrg	for (i = 0; i < num_input_vgprs; i++) {
7144b8e80941Smrg		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
7145b8e80941Smrg		returns[num_returns++] = ctx->f32;
7146b8e80941Smrg	}
7147848b8605Smrg
7148b8e80941Smrg	/* Vertex load indices. */
7149b8e80941Smrg	for (i = 0; i <= key->vs_prolog.last_input; i++)
7150b8e80941Smrg		returns[num_returns++] = ctx->f32;
7151848b8605Smrg
7152b8e80941Smrg	/* Create the function. */
7153b8e80941Smrg	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
7154b8e80941Smrg	func = ctx->main_fn;
7155848b8605Smrg
7156b8e80941Smrg	if (key->vs_prolog.num_merged_next_stage_vgprs) {
		/* Only the non-monolithic prolog initializes EXEC from
		 * input parameter 3 itself. */
7157b8e80941Smrg		if (!key->vs_prolog.is_monolithic)
7158b8e80941Smrg			si_init_exec_from_input(ctx, 3, 0);
7159848b8605Smrg
7160b8e80941Smrg		if (key->vs_prolog.as_ls &&
7161b8e80941Smrg		    ctx->screen->has_ls_vgpr_init_bug) {
7162b8e80941Smrg			/* If there are no HS threads, SPI loads the LS VGPRs
7163b8e80941Smrg			 * starting at VGPR 0. Shift them back to where they
7164b8e80941Smrg			 * belong.
7165b8e80941Smrg			 */
7166b8e80941Smrg			LLVMValueRef has_hs_threads =
7167b8e80941Smrg				LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
7168b8e80941Smrg				    si_unpack_param(ctx, 3, 8, 8),
7169b8e80941Smrg				    ctx->i32_0, "");
7170b8e80941Smrg
			/* Walk from the highest register down so that every
			 * source input_vgprs[i - 1] is read before it is
			 * itself overwritten. */
7171b8e80941Smrg			for (i = 4; i > 0; --i) {
7172b8e80941Smrg				input_vgprs[i + 1] =
7173b8e80941Smrg					LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
7174b8e80941Smrg						        input_vgprs[i + 1],
7175b8e80941Smrg						        input_vgprs[i - 1], "");
7176848b8605Smrg			}
7177848b8605Smrg		}
7178b8e80941Smrg	}
7179848b8605Smrg
	/* InstanceID is taken 2 VGPRs after VertexID for LS, 1 otherwise. */
7180b8e80941Smrg	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
7181b8e80941Smrg	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
7182848b8605Smrg
7183b8e80941Smrg	/* Copy inputs to outputs. This should be no-op, as the registers match,
7184b8e80941Smrg	 * but it will prevent the compiler from overwriting them unintentionally.
7185b8e80941Smrg	 */
7186b8e80941Smrg	ret = ctx->return_value;
7187b8e80941Smrg	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7188b8e80941Smrg		LLVMValueRef p = LLVMGetParam(func, i);
7189b8e80941Smrg		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7190b8e80941Smrg	}
7191b8e80941Smrg	for (i = 0; i < num_input_vgprs; i++) {
7192b8e80941Smrg		LLVMValueRef p = input_vgprs[i];
7193b8e80941Smrg		p = ac_to_float(&ctx->ac, p);
7194b8e80941Smrg		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
7195b8e80941Smrg					   key->vs_prolog.num_input_sgprs + i, "");
7196b8e80941Smrg	}
7197848b8605Smrg
7198b8e80941Smrg	/* Compute vertex load indices from instance divisors. */
7199b8e80941Smrg	LLVMValueRef instance_divisor_constbuf = NULL;
7200848b8605Smrg
	/* The divisor buffer is loaded once, and only if at least one input
	 * has its instance_divisor_is_fetched bit set. */
7201b8e80941Smrg	if (key->vs_prolog.states.instance_divisor_is_fetched) {
7202b8e80941Smrg		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7203b8e80941Smrg		LLVMValueRef buf_index =
7204b8e80941Smrg			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
7205b8e80941Smrg		instance_divisor_constbuf =
7206b8e80941Smrg			ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
7207848b8605Smrg	}
7208848b8605Smrg
7209b8e80941Smrg	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		/* Bit i of each mask describes vertex input i. */
7210b8e80941Smrg		bool divisor_is_one =
7211b8e80941Smrg			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
7212b8e80941Smrg		bool divisor_is_fetched =
7213b8e80941Smrg			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
7214b8e80941Smrg		LLVMValueRef index = NULL;
7215b8e80941Smrg
7216b8e80941Smrg		if (divisor_is_one) {
7217b8e80941Smrg			index = ctx->abi.instance_id;
7218b8e80941Smrg		} else if (divisor_is_fetched) {
7219b8e80941Smrg			LLVMValueRef udiv_factors[4];
7220b8e80941Smrg
			/* Four dwords per input, read from byte offset
			 * i*16 + j*4 of the divisor buffer. */
7221b8e80941Smrg			for (unsigned j = 0; j < 4; j++) {
7222b8e80941Smrg				udiv_factors[j] =
7223b8e80941Smrg					buffer_load_const(ctx, instance_divisor_constbuf,
7224b8e80941Smrg							  LLVMConstInt(ctx->i32, i*16 + j*4, 0));
7225b8e80941Smrg				udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
7226b8e80941Smrg			}
7227b8e80941Smrg			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
7228b8e80941Smrg			 * Such InstanceID might not be achievable in a reasonable time though.
7229b8e80941Smrg			 */
7230b8e80941Smrg			index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
7231b8e80941Smrg						       udiv_factors[0], udiv_factors[1],
7232b8e80941Smrg						       udiv_factors[2], udiv_factors[3]);
7233b8e80941Smrg		}
7234848b8605Smrg
7235b8e80941Smrg		if (divisor_is_one || divisor_is_fetched) {
7236b8e80941Smrg			/* Add StartInstance. */
7237b8e80941Smrg			index = LLVMBuildAdd(ctx->ac.builder, index,
7238b8e80941Smrg					     LLVMGetParam(ctx->main_fn, user_sgpr_base +
7239b8e80941Smrg							  SI_SGPR_START_INSTANCE), "");
7240b8e80941Smrg		} else {
7241b8e80941Smrg			/* VertexID + BaseVertex */
7242b8e80941Smrg			index = LLVMBuildAdd(ctx->ac.builder,
7243b8e80941Smrg					     ctx->abi.vertex_id,
7244b8e80941Smrg					     LLVMGetParam(func, user_sgpr_base +
7245b8e80941Smrg								SI_SGPR_BASE_VERTEX), "");
7246848b8605Smrg		}
7247b8e80941Smrg
		/* NOTE(review): fninfo.num_params presumably equals
		 * num_input_sgprs + num_input_vgprs here, so the load indices
		 * land right after the copied inputs - confirm against
		 * add_arg(). */
7248b8e80941Smrg		index = ac_to_float(&ctx->ac, index);
7249b8e80941Smrg		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
7250b8e80941Smrg					   fninfo.num_params + i, "");
7251848b8605Smrg	}
7252848b8605Smrg
7253b8e80941Smrg	si_llvm_build_ret(ctx, ret);
7254b8e80941Smrg}
7255b8e80941Smrg
7256b8e80941Smrgstatic bool si_get_vs_prolog(struct si_screen *sscreen,
7257b8e80941Smrg			     struct ac_llvm_compiler *compiler,
7258b8e80941Smrg			     struct si_shader *shader,
7259b8e80941Smrg			     struct pipe_debug_callback *debug,
7260b8e80941Smrg			     struct si_shader *main_part,
7261b8e80941Smrg			     const struct si_vs_prolog_bits *key)
7262b8e80941Smrg{
7263b8e80941Smrg	struct si_shader_selector *vs = main_part->selector;
7264b8e80941Smrg
7265b8e80941Smrg	if (!si_vs_needs_prolog(vs, key))
7266b8e80941Smrg		return true;
7267b8e80941Smrg
7268b8e80941Smrg	/* Get the prolog. */
7269b8e80941Smrg	union si_shader_part_key prolog_key;
7270b8e80941Smrg	si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7271b8e80941Smrg			     key, shader, &prolog_key);
7272b8e80941Smrg
7273b8e80941Smrg	shader->prolog =
7274b8e80941Smrg		si_get_shader_part(sscreen, &sscreen->vs_prologs,
7275b8e80941Smrg				   PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
7276b8e80941Smrg				   debug, si_build_vs_prolog_function,
7277b8e80941Smrg				   "Vertex Shader Prolog");
7278b8e80941Smrg	return shader->prolog != NULL;
7279848b8605Smrg}
7280848b8605Smrg
7281b8e80941Smrg/**
7282b8e80941Smrg * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7283b8e80941Smrg */
7284b8e80941Smrgstatic bool si_shader_select_vs_parts(struct si_screen *sscreen,
7285b8e80941Smrg				      struct ac_llvm_compiler *compiler,
7286b8e80941Smrg				      struct si_shader *shader,
7287b8e80941Smrg				      struct pipe_debug_callback *debug)
7288848b8605Smrg{
7289b8e80941Smrg	return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
7290b8e80941Smrg				&shader->key.part.vs.prolog);
7291b8e80941Smrg}
7292848b8605Smrg
7293b8e80941Smrg/**
7294b8e80941Smrg * Compile the TCS epilog function. This writes tessellation factors to memory
7295b8e80941Smrg * based on the output primitive type of the tessellator (determined by TES).
7296b8e80941Smrg */
7297b8e80941Smrgstatic void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7298b8e80941Smrg					 union si_shader_part_key *key)
7299b8e80941Smrg{
7300b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7301b8e80941Smrg	struct si_function_info fninfo;
7302b8e80941Smrg	LLVMValueRef func;
7303b8e80941Smrg
7304b8e80941Smrg	si_init_function_info(&fninfo);
7305b8e80941Smrg
7306b8e80941Smrg	if (ctx->screen->info.chip_class >= GFX9) {
7307b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7308b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7309b8e80941Smrg		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7310b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
7311b8e80941Smrg		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7312b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7313b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7314b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7315b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7316b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7317b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7318b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7319b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7320b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7321b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7322b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7323b8e80941Smrg		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7324b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7325b8e80941Smrg		ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7326b8e80941Smrg	} else {
7327b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7328b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7329b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7330b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7331b8e80941Smrg		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7332b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7333b8e80941Smrg		ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7334b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7335b8e80941Smrg		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7336b8e80941Smrg		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7337b8e80941Smrg	}
7338848b8605Smrg
7339b8e80941Smrg	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7340b8e80941Smrg	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7341b8e80941Smrg	unsigned tess_factors_idx =
7342b8e80941Smrg		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7343b8e80941Smrg	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7344b8e80941Smrg	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7345b8e80941Smrg
7346b8e80941Smrg	for (unsigned i = 0; i < 6; i++)
7347b8e80941Smrg		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
7348b8e80941Smrg
7349b8e80941Smrg	/* Create the function. */
7350b8e80941Smrg	si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7351b8e80941Smrg			   ctx->screen->info.chip_class >= CIK ? 128 : 64);
7352b8e80941Smrg	ac_declare_lds_as_pointer(&ctx->ac);
7353b8e80941Smrg	func = ctx->main_fn;
7354b8e80941Smrg
7355b8e80941Smrg	LLVMValueRef invoc0_tess_factors[6];
7356b8e80941Smrg	for (unsigned i = 0; i < 6; i++)
7357b8e80941Smrg		invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
7358b8e80941Smrg
7359b8e80941Smrg	si_write_tess_factors(bld_base,
7360b8e80941Smrg			      LLVMGetParam(func, tess_factors_idx),
7361b8e80941Smrg			      LLVMGetParam(func, tess_factors_idx + 1),
7362b8e80941Smrg			      LLVMGetParam(func, tess_factors_idx + 2),
7363b8e80941Smrg			      invoc0_tess_factors, invoc0_tess_factors + 4);
7364b8e80941Smrg
7365b8e80941Smrg	LLVMBuildRetVoid(ctx->ac.builder);
7366b8e80941Smrg}
7367848b8605Smrg
7368b8e80941Smrg/**
7369b8e80941Smrg * Select and compile (or reuse) TCS parts (epilog).
7370b8e80941Smrg */
7371b8e80941Smrgstatic bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7372b8e80941Smrg				       struct ac_llvm_compiler *compiler,
7373b8e80941Smrg				       struct si_shader *shader,
7374b8e80941Smrg				       struct pipe_debug_callback *debug)
7375b8e80941Smrg{
7376b8e80941Smrg	if (sscreen->info.chip_class >= GFX9) {
7377b8e80941Smrg		struct si_shader *ls_main_part =
7378b8e80941Smrg			shader->key.part.tcs.ls->main_shader_part_ls;
7379848b8605Smrg
7380b8e80941Smrg		if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
7381b8e80941Smrg				      &shader->key.part.tcs.ls_prolog))
7382b8e80941Smrg			return false;
7383848b8605Smrg
7384b8e80941Smrg		shader->previous_stage = ls_main_part;
7385848b8605Smrg	}
7386b8e80941Smrg
7387b8e80941Smrg	/* Get the epilog. */
7388b8e80941Smrg	union si_shader_part_key epilog_key;
7389b8e80941Smrg	memset(&epilog_key, 0, sizeof(epilog_key));
7390b8e80941Smrg	epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7391b8e80941Smrg
7392b8e80941Smrg	shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7393b8e80941Smrg					    PIPE_SHADER_TESS_CTRL, false,
7394b8e80941Smrg					    &epilog_key, compiler, debug,
7395b8e80941Smrg					    si_build_tcs_epilog_function,
7396b8e80941Smrg					    "Tessellation Control Shader Epilog");
7397b8e80941Smrg	return shader->epilog != NULL;
7398848b8605Smrg}
7399848b8605Smrg
7400b8e80941Smrg/**
7401b8e80941Smrg * Select and compile (or reuse) GS parts (prolog).
7402b8e80941Smrg */
7403b8e80941Smrgstatic bool si_shader_select_gs_parts(struct si_screen *sscreen,
7404b8e80941Smrg				      struct ac_llvm_compiler *compiler,
7405b8e80941Smrg				      struct si_shader *shader,
7406b8e80941Smrg				      struct pipe_debug_callback *debug)
7407848b8605Smrg{
7408b8e80941Smrg	if (sscreen->info.chip_class >= GFX9) {
7409b8e80941Smrg		struct si_shader *es_main_part =
7410b8e80941Smrg			shader->key.part.gs.es->main_shader_part_es;
7411848b8605Smrg
7412b8e80941Smrg		if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7413b8e80941Smrg		    !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
7414b8e80941Smrg				      &shader->key.part.gs.vs_prolog))
7415b8e80941Smrg			return false;
7416848b8605Smrg
7417b8e80941Smrg		shader->previous_stage = es_main_part;
7418b8e80941Smrg	}
7419848b8605Smrg
7420b8e80941Smrg	if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7421b8e80941Smrg		return true;
7422848b8605Smrg
7423b8e80941Smrg	union si_shader_part_key prolog_key;
7424b8e80941Smrg	memset(&prolog_key, 0, sizeof(prolog_key));
7425b8e80941Smrg	prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7426848b8605Smrg
7427b8e80941Smrg	shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7428b8e80941Smrg					    PIPE_SHADER_GEOMETRY, true,
7429b8e80941Smrg					    &prolog_key, compiler, debug,
7430b8e80941Smrg					    si_build_gs_prolog_function,
7431b8e80941Smrg					    "Geometry Shader Prolog");
7432b8e80941Smrg	return shader->prolog2 != NULL;
7433b8e80941Smrg}
7434848b8605Smrg
7435b8e80941Smrg/**
7436b8e80941Smrg * Build the pixel shader prolog function. This handles:
7437b8e80941Smrg * - two-side color selection and interpolation
7438b8e80941Smrg * - overriding interpolation parameters for the API PS
7439b8e80941Smrg * - polygon stippling
7440b8e80941Smrg *
7441b8e80941Smrg * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7442b8e80941Smrg * overriden by other states. (e.g. per-sample interpolation)
7443b8e80941Smrg * Interpolated colors are stored after the preloaded VGPRs.
7444b8e80941Smrg */
7445b8e80941Smrgstatic void si_build_ps_prolog_function(struct si_shader_context *ctx,
7446b8e80941Smrg					union si_shader_part_key *key)
7447b8e80941Smrg{
7448b8e80941Smrg	struct si_function_info fninfo;
7449b8e80941Smrg	LLVMValueRef ret, func;
7450b8e80941Smrg	int num_returns, i, num_color_channels;
7451848b8605Smrg
7452b8e80941Smrg	assert(si_need_ps_prolog(key));
7453848b8605Smrg
7454b8e80941Smrg	si_init_function_info(&fninfo);
7455848b8605Smrg
7456b8e80941Smrg	/* Declare inputs. */
7457b8e80941Smrg	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7458b8e80941Smrg		add_arg(&fninfo, ARG_SGPR, ctx->i32);
7459848b8605Smrg
7460b8e80941Smrg	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7461b8e80941Smrg		add_arg(&fninfo, ARG_VGPR, ctx->f32);
7462848b8605Smrg
7463b8e80941Smrg	/* Declare outputs (same as inputs + add colors if needed) */
7464b8e80941Smrg	num_returns = fninfo.num_params;
7465b8e80941Smrg	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7466b8e80941Smrg	for (i = 0; i < num_color_channels; i++)
7467b8e80941Smrg		fninfo.types[num_returns++] = ctx->f32;
7468848b8605Smrg
7469b8e80941Smrg	/* Create the function. */
7470b8e80941Smrg	si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7471b8e80941Smrg			   &fninfo, 0);
7472b8e80941Smrg	func = ctx->main_fn;
7473848b8605Smrg
7474b8e80941Smrg	/* Copy inputs to outputs. This should be no-op, as the registers match,
7475b8e80941Smrg	 * but it will prevent the compiler from overwriting them unintentionally.
7476b8e80941Smrg	 */
7477b8e80941Smrg	ret = ctx->return_value;
7478b8e80941Smrg	for (i = 0; i < fninfo.num_params; i++) {
7479b8e80941Smrg		LLVMValueRef p = LLVMGetParam(func, i);
7480b8e80941Smrg		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7481848b8605Smrg	}
7482848b8605Smrg
7483b8e80941Smrg	/* Polygon stippling. */
7484b8e80941Smrg	if (key->ps_prolog.states.poly_stipple) {
7485b8e80941Smrg		/* POS_FIXED_PT is always last. */
7486b8e80941Smrg		unsigned pos = key->ps_prolog.num_input_sgprs +
7487b8e80941Smrg			       key->ps_prolog.num_input_vgprs - 1;
7488b8e80941Smrg		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7489848b8605Smrg
7490b8e80941Smrg		si_llvm_emit_polygon_stipple(ctx, list, pos);
7491848b8605Smrg	}
7492848b8605Smrg
7493b8e80941Smrg	if (key->ps_prolog.states.bc_optimize_for_persp ||
7494b8e80941Smrg	    key->ps_prolog.states.bc_optimize_for_linear) {
7495b8e80941Smrg		unsigned i, base = key->ps_prolog.num_input_sgprs;
7496b8e80941Smrg		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7497b8e80941Smrg
7498b8e80941Smrg		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7499b8e80941Smrg		 * The hw doesn't compute CENTROID if the whole wave only
7500b8e80941Smrg		 * contains fully-covered quads.
7501b8e80941Smrg		 *
7502b8e80941Smrg		 * PRIM_MASK is after user SGPRs.
7503b8e80941Smrg		 */
7504b8e80941Smrg		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7505b8e80941Smrg		bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7506b8e80941Smrg					    LLVMConstInt(ctx->i32, 31, 0), "");
7507b8e80941Smrg		bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7508b8e80941Smrg					     ctx->i1, "");
7509b8e80941Smrg
7510b8e80941Smrg		if (key->ps_prolog.states.bc_optimize_for_persp) {
7511b8e80941Smrg			/* Read PERSP_CENTER. */
7512b8e80941Smrg			for (i = 0; i < 2; i++)
7513b8e80941Smrg				center[i] = LLVMGetParam(func, base + 2 + i);
7514b8e80941Smrg			/* Read PERSP_CENTROID. */
7515b8e80941Smrg			for (i = 0; i < 2; i++)
7516b8e80941Smrg				centroid[i] = LLVMGetParam(func, base + 4 + i);
7517b8e80941Smrg			/* Select PERSP_CENTROID. */
7518b8e80941Smrg			for (i = 0; i < 2; i++) {
7519b8e80941Smrg				tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7520b8e80941Smrg						      center[i], centroid[i], "");
7521b8e80941Smrg				ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7522b8e80941Smrg							   tmp, base + 4 + i, "");
7523b8e80941Smrg			}
7524b8e80941Smrg		}
7525b8e80941Smrg		if (key->ps_prolog.states.bc_optimize_for_linear) {
7526b8e80941Smrg			/* Read LINEAR_CENTER. */
7527b8e80941Smrg			for (i = 0; i < 2; i++)
7528b8e80941Smrg				center[i] = LLVMGetParam(func, base + 8 + i);
7529b8e80941Smrg			/* Read LINEAR_CENTROID. */
7530b8e80941Smrg			for (i = 0; i < 2; i++)
7531b8e80941Smrg				centroid[i] = LLVMGetParam(func, base + 10 + i);
7532b8e80941Smrg			/* Select LINEAR_CENTROID. */
7533b8e80941Smrg			for (i = 0; i < 2; i++) {
7534b8e80941Smrg				tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7535b8e80941Smrg						      center[i], centroid[i], "");
7536b8e80941Smrg				ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7537b8e80941Smrg							   tmp, base + 10 + i, "");
7538b8e80941Smrg			}
7539848b8605Smrg		}
7540848b8605Smrg	}
7541848b8605Smrg
7542b8e80941Smrg	/* Force per-sample interpolation. */
7543b8e80941Smrg	if (key->ps_prolog.states.force_persp_sample_interp) {
7544b8e80941Smrg		unsigned i, base = key->ps_prolog.num_input_sgprs;
7545b8e80941Smrg		LLVMValueRef persp_sample[2];
7546b8e80941Smrg
7547b8e80941Smrg		/* Read PERSP_SAMPLE. */
7548b8e80941Smrg		for (i = 0; i < 2; i++)
7549b8e80941Smrg			persp_sample[i] = LLVMGetParam(func, base + i);
7550b8e80941Smrg		/* Overwrite PERSP_CENTER. */
7551b8e80941Smrg		for (i = 0; i < 2; i++)
7552b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7553b8e80941Smrg						   persp_sample[i], base + 2 + i, "");
7554b8e80941Smrg		/* Overwrite PERSP_CENTROID. */
7555b8e80941Smrg		for (i = 0; i < 2; i++)
7556b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7557b8e80941Smrg						   persp_sample[i], base + 4 + i, "");
7558b8e80941Smrg	}
7559b8e80941Smrg	if (key->ps_prolog.states.force_linear_sample_interp) {
7560b8e80941Smrg		unsigned i, base = key->ps_prolog.num_input_sgprs;
7561b8e80941Smrg		LLVMValueRef linear_sample[2];
7562b8e80941Smrg
7563b8e80941Smrg		/* Read LINEAR_SAMPLE. */
7564b8e80941Smrg		for (i = 0; i < 2; i++)
7565b8e80941Smrg			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7566b8e80941Smrg		/* Overwrite LINEAR_CENTER. */
7567b8e80941Smrg		for (i = 0; i < 2; i++)
7568b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7569b8e80941Smrg						   linear_sample[i], base + 8 + i, "");
7570b8e80941Smrg		/* Overwrite LINEAR_CENTROID. */
7571b8e80941Smrg		for (i = 0; i < 2; i++)
7572b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7573b8e80941Smrg						   linear_sample[i], base + 10 + i, "");
7574848b8605Smrg	}
7575848b8605Smrg
7576b8e80941Smrg	/* Force center interpolation. */
7577b8e80941Smrg	if (key->ps_prolog.states.force_persp_center_interp) {
7578b8e80941Smrg		unsigned i, base = key->ps_prolog.num_input_sgprs;
7579b8e80941Smrg		LLVMValueRef persp_center[2];
7580b8e80941Smrg
7581b8e80941Smrg		/* Read PERSP_CENTER. */
7582b8e80941Smrg		for (i = 0; i < 2; i++)
7583b8e80941Smrg			persp_center[i] = LLVMGetParam(func, base + 2 + i);
7584b8e80941Smrg		/* Overwrite PERSP_SAMPLE. */
7585b8e80941Smrg		for (i = 0; i < 2; i++)
7586b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7587b8e80941Smrg						   persp_center[i], base + i, "");
7588b8e80941Smrg		/* Overwrite PERSP_CENTROID. */
7589b8e80941Smrg		for (i = 0; i < 2; i++)
7590b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7591b8e80941Smrg						   persp_center[i], base + 4 + i, "");
7592b8e80941Smrg	}
7593b8e80941Smrg	if (key->ps_prolog.states.force_linear_center_interp) {
7594b8e80941Smrg		unsigned i, base = key->ps_prolog.num_input_sgprs;
7595b8e80941Smrg		LLVMValueRef linear_center[2];
7596b8e80941Smrg
7597b8e80941Smrg		/* Read LINEAR_CENTER. */
7598b8e80941Smrg		for (i = 0; i < 2; i++)
7599b8e80941Smrg			linear_center[i] = LLVMGetParam(func, base + 8 + i);
7600b8e80941Smrg		/* Overwrite LINEAR_SAMPLE. */
7601b8e80941Smrg		for (i = 0; i < 2; i++)
7602b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7603b8e80941Smrg						   linear_center[i], base + 6 + i, "");
7604b8e80941Smrg		/* Overwrite LINEAR_CENTROID. */
7605b8e80941Smrg		for (i = 0; i < 2; i++)
7606b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7607b8e80941Smrg						   linear_center[i], base + 10 + i, "");
7608848b8605Smrg	}
7609848b8605Smrg
7610b8e80941Smrg	/* Interpolate colors. */
7611b8e80941Smrg	unsigned color_out_idx = 0;
7612b8e80941Smrg	for (i = 0; i < 2; i++) {
7613b8e80941Smrg		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7614b8e80941Smrg		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7615b8e80941Smrg				     key->ps_prolog.face_vgpr_index;
7616b8e80941Smrg		LLVMValueRef interp[2], color[4];
7617b8e80941Smrg		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7618848b8605Smrg
7619b8e80941Smrg		if (!writemask)
7620b8e80941Smrg			continue;
7621848b8605Smrg
7622b8e80941Smrg		/* If the interpolation qualifier is not CONSTANT (-1). */
7623b8e80941Smrg		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7624b8e80941Smrg			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7625b8e80941Smrg					       key->ps_prolog.color_interp_vgpr_index[i];
7626b8e80941Smrg
7627b8e80941Smrg			/* Get the (i,j) updated by bc_optimize handling. */
7628b8e80941Smrg			interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7629b8e80941Smrg							  interp_vgpr, "");
7630b8e80941Smrg			interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7631b8e80941Smrg							  interp_vgpr + 1, "");
7632b8e80941Smrg			interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
7633b8e80941Smrg		}
7634848b8605Smrg
7635b8e80941Smrg		/* Use the absolute location of the input. */
7636b8e80941Smrg		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7637848b8605Smrg
7638b8e80941Smrg		if (key->ps_prolog.states.color_two_side) {
7639b8e80941Smrg			face = LLVMGetParam(func, face_vgpr);
7640b8e80941Smrg			face = ac_to_integer(&ctx->ac, face);
7641b8e80941Smrg		}
7642848b8605Smrg
7643b8e80941Smrg		interp_fs_input(ctx,
7644b8e80941Smrg				key->ps_prolog.color_attr_index[i],
7645b8e80941Smrg				TGSI_SEMANTIC_COLOR, i,
7646b8e80941Smrg				key->ps_prolog.num_interp_inputs,
7647b8e80941Smrg				key->ps_prolog.colors_read, interp_ij,
7648b8e80941Smrg				prim_mask, face, color);
7649b8e80941Smrg
7650b8e80941Smrg		while (writemask) {
7651b8e80941Smrg			unsigned chan = u_bit_scan(&writemask);
7652b8e80941Smrg			ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7653b8e80941Smrg						   fninfo.num_params + color_out_idx++, "");
7654b8e80941Smrg		}
7655b8e80941Smrg	}
7656b8e80941Smrg
7657b8e80941Smrg	/* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7658b8e80941Smrg	 * says:
7659b8e80941Smrg	 *
7660b8e80941Smrg	 *    "When per-sample shading is active due to the use of a fragment
7661b8e80941Smrg	 *     input qualified by sample or due to the use of the gl_SampleID
7662b8e80941Smrg	 *     or gl_SamplePosition variables, only the bit for the current
7663b8e80941Smrg	 *     sample is set in gl_SampleMaskIn. When state specifies multiple
7664b8e80941Smrg	 *     fragment shader invocations for a given fragment, the sample
7665b8e80941Smrg	 *     mask for any single fragment shader invocation may specify a
7666b8e80941Smrg	 *     subset of the covered samples for the fragment. In this case,
7667b8e80941Smrg	 *     the bit corresponding to each covered sample will be set in
7668b8e80941Smrg	 *     exactly one fragment shader invocation."
7669b8e80941Smrg	 *
7670b8e80941Smrg	 * The samplemask loaded by hardware is always the coverage of the
7671b8e80941Smrg	 * entire pixel/fragment, so mask bits out based on the sample ID.
7672b8e80941Smrg	 */
7673b8e80941Smrg	if (key->ps_prolog.states.samplemask_log_ps_iter) {
7674b8e80941Smrg		/* The bit pattern matches that used by fixed function fragment
7675b8e80941Smrg		 * processing. */
7676b8e80941Smrg		static const uint16_t ps_iter_masks[] = {
7677b8e80941Smrg			0xffff, /* not used */
7678b8e80941Smrg			0x5555,
7679b8e80941Smrg			0x1111,
7680b8e80941Smrg			0x0101,
7681b8e80941Smrg			0x0001,
7682b8e80941Smrg		};
7683b8e80941Smrg		assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7684b8e80941Smrg
7685b8e80941Smrg		uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7686b8e80941Smrg		unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7687b8e80941Smrg					  key->ps_prolog.ancillary_vgpr_index;
7688b8e80941Smrg		LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4);
7689b8e80941Smrg		LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7690b8e80941Smrg
7691b8e80941Smrg		samplemask = ac_to_integer(&ctx->ac, samplemask);
7692b8e80941Smrg		samplemask = LLVMBuildAnd(
7693b8e80941Smrg			ctx->ac.builder,
7694b8e80941Smrg			samplemask,
7695b8e80941Smrg			LLVMBuildShl(ctx->ac.builder,
7696b8e80941Smrg				     LLVMConstInt(ctx->i32, ps_iter_mask, false),
7697b8e80941Smrg				     sampleid, ""),
7698b8e80941Smrg			"");
7699b8e80941Smrg		samplemask = ac_to_float(&ctx->ac, samplemask);
7700b8e80941Smrg
7701b8e80941Smrg		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7702b8e80941Smrg					   ancillary_vgpr + 1, "");
7703b8e80941Smrg	}
7704848b8605Smrg
7705b8e80941Smrg	/* Tell LLVM to insert WQM instruction sequence when needed. */
7706b8e80941Smrg	if (key->ps_prolog.wqm) {
7707b8e80941Smrg		LLVMAddTargetDependentFunctionAttr(func,
7708b8e80941Smrg						   "amdgpu-ps-wqm-outputs", "");
7709b8e80941Smrg	}
7710848b8605Smrg
7711b8e80941Smrg	si_llvm_build_ret(ctx, ret);
7712b8e80941Smrg}
7713848b8605Smrg
7714b8e80941Smrg/**
7715b8e80941Smrg * Build the pixel shader epilog function. This handles everything that must be
7716b8e80941Smrg * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7717b8e80941Smrg */
7718b8e80941Smrgstatic void si_build_ps_epilog_function(struct si_shader_context *ctx,
7719b8e80941Smrg					union si_shader_part_key *key)
7720b8e80941Smrg{
7721b8e80941Smrg	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7722b8e80941Smrg	struct si_function_info fninfo;
7723b8e80941Smrg	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7724b8e80941Smrg	int i;
7725b8e80941Smrg	struct si_ps_exports exp = {};
7726b8e80941Smrg
7727b8e80941Smrg	si_init_function_info(&fninfo);
7728b8e80941Smrg
7729b8e80941Smrg	/* Declare input SGPRs. */
7730b8e80941Smrg	ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7731b8e80941Smrg	ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7732b8e80941Smrg	ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7733b8e80941Smrg	ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7734b8e80941Smrg	add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7735b8e80941Smrg
7736b8e80941Smrg	/* Declare input VGPRs. */
7737b8e80941Smrg	unsigned required_num_params =
7738b8e80941Smrg		     fninfo.num_sgpr_params +
7739b8e80941Smrg		     util_bitcount(key->ps_epilog.colors_written) * 4 +
7740b8e80941Smrg		     key->ps_epilog.writes_z +
7741b8e80941Smrg		     key->ps_epilog.writes_stencil +
7742b8e80941Smrg		     key->ps_epilog.writes_samplemask;
7743b8e80941Smrg
7744b8e80941Smrg	required_num_params = MAX2(required_num_params,
7745b8e80941Smrg				   fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7746b8e80941Smrg
7747b8e80941Smrg	while (fninfo.num_params < required_num_params)
7748b8e80941Smrg		add_arg(&fninfo, ARG_VGPR, ctx->f32);
7749b8e80941Smrg
7750b8e80941Smrg	/* Create the function. */
7751b8e80941Smrg	si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7752b8e80941Smrg	/* Disable elimination of unused inputs. */
7753b8e80941Smrg	ac_llvm_add_target_dep_function_attr(ctx->main_fn,
7754b8e80941Smrg					     "InitialPSInputAddr", 0xffffff);
7755b8e80941Smrg
7756b8e80941Smrg	/* Process colors. */
7757b8e80941Smrg	unsigned vgpr = fninfo.num_sgpr_params;
7758b8e80941Smrg	unsigned colors_written = key->ps_epilog.colors_written;
7759b8e80941Smrg	int last_color_export = -1;
7760b8e80941Smrg
7761b8e80941Smrg	/* Find the last color export. */
7762b8e80941Smrg	if (!key->ps_epilog.writes_z &&
7763b8e80941Smrg	    !key->ps_epilog.writes_stencil &&
7764b8e80941Smrg	    !key->ps_epilog.writes_samplemask) {
7765b8e80941Smrg		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7766b8e80941Smrg
7767b8e80941Smrg		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7768b8e80941Smrg		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7769b8e80941Smrg			/* Just set this if any of the colorbuffers are enabled. */
7770b8e80941Smrg			if (spi_format &
7771b8e80941Smrg			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7772b8e80941Smrg				last_color_export = 0;
7773b8e80941Smrg		} else {
7774b8e80941Smrg			for (i = 0; i < 8; i++)
7775b8e80941Smrg				if (colors_written & (1 << i) &&
7776b8e80941Smrg				    (spi_format >> (i * 4)) & 0xf)
7777b8e80941Smrg					last_color_export = i;
7778848b8605Smrg		}
7779848b8605Smrg	}
7780848b8605Smrg
7781b8e80941Smrg	while (colors_written) {
7782b8e80941Smrg		LLVMValueRef color[4];
7783b8e80941Smrg		int mrt = u_bit_scan(&colors_written);
7784848b8605Smrg
7785b8e80941Smrg		for (i = 0; i < 4; i++)
7786b8e80941Smrg			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7787848b8605Smrg
7788b8e80941Smrg		si_export_mrt_color(bld_base, color, mrt,
7789b8e80941Smrg				    fninfo.num_params - 1,
7790b8e80941Smrg				    mrt == last_color_export, &exp);
7791b8e80941Smrg	}
7792848b8605Smrg
7793b8e80941Smrg	/* Process depth, stencil, samplemask. */
7794b8e80941Smrg	if (key->ps_epilog.writes_z)
7795b8e80941Smrg		depth = LLVMGetParam(ctx->main_fn, vgpr++);
7796b8e80941Smrg	if (key->ps_epilog.writes_stencil)
7797b8e80941Smrg		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7798b8e80941Smrg	if (key->ps_epilog.writes_samplemask)
7799b8e80941Smrg		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7800848b8605Smrg
7801b8e80941Smrg	if (depth || stencil || samplemask)
7802b8e80941Smrg		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7803b8e80941Smrg	else if (last_color_export == -1)
7804b8e80941Smrg		ac_build_export_null(&ctx->ac);
7805848b8605Smrg
7806b8e80941Smrg	if (exp.num)
7807b8e80941Smrg		si_emit_ps_exports(ctx, &exp);
7808b8e80941Smrg
7809b8e80941Smrg	/* Compile. */
7810b8e80941Smrg	LLVMBuildRetVoid(ctx->ac.builder);
7811848b8605Smrg}
7812848b8605Smrg
7813b8e80941Smrg/**
7814b8e80941Smrg * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7815b8e80941Smrg */
7816b8e80941Smrgstatic bool si_shader_select_ps_parts(struct si_screen *sscreen,
7817b8e80941Smrg				      struct ac_llvm_compiler *compiler,
7818b8e80941Smrg				      struct si_shader *shader,
7819b8e80941Smrg				      struct pipe_debug_callback *debug)
7820848b8605Smrg{
7821b8e80941Smrg	union si_shader_part_key prolog_key;
7822b8e80941Smrg	union si_shader_part_key epilog_key;
7823b8e80941Smrg
7824b8e80941Smrg	/* Get the prolog. */
7825b8e80941Smrg	si_get_ps_prolog_key(shader, &prolog_key, true);
7826b8e80941Smrg
7827b8e80941Smrg	/* The prolog is a no-op if these aren't set. */
7828b8e80941Smrg	if (si_need_ps_prolog(&prolog_key)) {
7829b8e80941Smrg		shader->prolog =
7830b8e80941Smrg			si_get_shader_part(sscreen, &sscreen->ps_prologs,
7831b8e80941Smrg					   PIPE_SHADER_FRAGMENT, true,
7832b8e80941Smrg					   &prolog_key, compiler, debug,
7833b8e80941Smrg					   si_build_ps_prolog_function,
7834b8e80941Smrg					   "Fragment Shader Prolog");
7835b8e80941Smrg		if (!shader->prolog)
7836b8e80941Smrg			return false;
7837848b8605Smrg	}
7838848b8605Smrg
7839b8e80941Smrg	/* Get the epilog. */
7840b8e80941Smrg	si_get_ps_epilog_key(shader, &epilog_key);
7841b8e80941Smrg
7842b8e80941Smrg	shader->epilog =
7843b8e80941Smrg		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7844b8e80941Smrg				   PIPE_SHADER_FRAGMENT, false,
7845b8e80941Smrg				   &epilog_key, compiler, debug,
7846b8e80941Smrg				   si_build_ps_epilog_function,
7847b8e80941Smrg				   "Fragment Shader Epilog");
7848b8e80941Smrg	if (!shader->epilog)
7849b8e80941Smrg		return false;
7850b8e80941Smrg
7851b8e80941Smrg	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
7852b8e80941Smrg	if (shader->key.part.ps.prolog.poly_stipple) {
7853b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7854b8e80941Smrg		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7855b8e80941Smrg	}
7856848b8605Smrg
7857b8e80941Smrg	/* Set up the enable bits for per-sample shading if needed. */
7858b8e80941Smrg	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7859b8e80941Smrg	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7860b8e80941Smrg	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7861b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7862b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7863b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7864b8e80941Smrg	}
7865b8e80941Smrg	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7866b8e80941Smrg	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7867b8e80941Smrg	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7868b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7869b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7870b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7871b8e80941Smrg	}
7872b8e80941Smrg	if (shader->key.part.ps.prolog.force_persp_center_interp &&
7873b8e80941Smrg	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7874b8e80941Smrg	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7875b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7876b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7877b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7878b8e80941Smrg	}
7879b8e80941Smrg	if (shader->key.part.ps.prolog.force_linear_center_interp &&
7880b8e80941Smrg	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7881b8e80941Smrg	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7882b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7883b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7884b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7885b8e80941Smrg	}
7886848b8605Smrg
7887b8e80941Smrg	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7888b8e80941Smrg	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7889b8e80941Smrg	    !(shader->config.spi_ps_input_ena & 0xf)) {
7890b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7891b8e80941Smrg		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7892b8e80941Smrg	}
7893848b8605Smrg
7894b8e80941Smrg	/* At least one pair of interpolation weights must be enabled. */
7895b8e80941Smrg	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7896b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7897b8e80941Smrg		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7898b8e80941Smrg	}
7899848b8605Smrg
7900b8e80941Smrg	/* Samplemask fixup requires the sample ID. */
7901b8e80941Smrg	if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7902b8e80941Smrg		shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
7903b8e80941Smrg		assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
7904b8e80941Smrg	}
7905848b8605Smrg
7906b8e80941Smrg	/* The sample mask input is always enabled, because the API shader always
7907b8e80941Smrg	 * passes it through to the epilog. Disable it here if it's unused.
7908b8e80941Smrg	 */
7909b8e80941Smrg	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7910b8e80941Smrg	    !shader->selector->info.reads_samplemask)
7911b8e80941Smrg		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7912848b8605Smrg
7913b8e80941Smrg	return true;
7914b8e80941Smrg}
7915848b8605Smrg
/**
 * LDS size workaround for workgroups of more than one wavefront.
 *
 * This function currently does nothing: \p lds_size is never modified.
 *
 * \param sscreen   screen whose chip family the workaround would depend on
 *                  (unused while the workaround is disabled)
 * \param lds_size  in/out LDS size (left untouched while the workaround is
 *                  disabled)
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
				      unsigned *lds_size)
{
	/* If tessellation is all offchip and on-chip GS isn't used, this
	 * workaround is not needed.
	 *
	 * The disabled workaround addressed an SPI barrier management bug:
	 * make sure at least 4k of LDS is in use, which applies to workgroup
	 * sizes of more than one wavefront. It raised *lds_size to at least 8
	 * on CHIP_BONAIRE, CHIP_KABINI and CHIP_MULLINS. That code sat behind
	 * an unconditional return and could never execute, so it is described
	 * here instead of being kept as unreachable statements.
	 */
	(void)sscreen;
	(void)lds_size;
}
7933848b8605Smrg
7934b8e80941Smrgstatic void si_fix_resource_usage(struct si_screen *sscreen,
7935b8e80941Smrg				  struct si_shader *shader)
7936b8e80941Smrg{
7937b8e80941Smrg	unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7938848b8605Smrg
7939b8e80941Smrg	shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7940848b8605Smrg
7941b8e80941Smrg	if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7942b8e80941Smrg	    si_get_max_workgroup_size(shader) > 64) {
7943b8e80941Smrg		si_multiwave_lds_size_workaround(sscreen,
7944b8e80941Smrg						 &shader->config.lds_size);
7945848b8605Smrg	}
7946b8e80941Smrg}
7947848b8605Smrg
7948b8e80941Smrgint si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
7949b8e80941Smrg		     struct si_shader *shader,
7950b8e80941Smrg		     struct pipe_debug_callback *debug)
7951b8e80941Smrg{
7952b8e80941Smrg	struct si_shader_selector *sel = shader->selector;
7953b8e80941Smrg	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7954b8e80941Smrg	int r;
7955848b8605Smrg
7956b8e80941Smrg	/* LS, ES, VS are compiled on demand if the main part hasn't been
7957b8e80941Smrg	 * compiled for that stage.
7958b8e80941Smrg	 *
7959b8e80941Smrg	 * Vertex shaders are compiled on demand when a vertex fetch
7960b8e80941Smrg	 * workaround must be applied.
7961b8e80941Smrg	 */
7962b8e80941Smrg	if (shader->is_monolithic) {
7963b8e80941Smrg		/* Monolithic shader (compiled as a whole, has many variants,
7964b8e80941Smrg		 * may take a long time to compile).
7965b8e80941Smrg		 */
7966b8e80941Smrg		r = si_compile_tgsi_shader(sscreen, compiler, shader, debug);
7967b8e80941Smrg		if (r)
7968b8e80941Smrg			return r;
7969b8e80941Smrg	} else {
7970b8e80941Smrg		/* The shader consists of several parts:
7971b8e80941Smrg		 *
7972b8e80941Smrg		 * - the middle part is the user shader, it has 1 variant only
7973b8e80941Smrg		 *   and it was compiled during the creation of the shader
7974b8e80941Smrg		 *   selector
7975b8e80941Smrg		 * - the prolog part is inserted at the beginning
7976b8e80941Smrg		 * - the epilog part is inserted at the end
7977b8e80941Smrg		 *
7978b8e80941Smrg		 * The prolog and epilog have many (but simple) variants.
7979b8e80941Smrg		 *
7980b8e80941Smrg		 * Starting with gfx9, geometry and tessellation control
7981b8e80941Smrg		 * shaders also contain the prolog and user shader parts of
7982b8e80941Smrg		 * the previous shader stage.
7983b8e80941Smrg		 */
7984848b8605Smrg
7985b8e80941Smrg		if (!mainp)
7986b8e80941Smrg			return -1;
7987b8e80941Smrg
7988b8e80941Smrg		/* Copy the compiled TGSI shader data over. */
7989b8e80941Smrg		shader->is_binary_shared = true;
7990b8e80941Smrg		shader->binary = mainp->binary;
7991b8e80941Smrg		shader->config = mainp->config;
7992b8e80941Smrg		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7993b8e80941Smrg		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7994b8e80941Smrg		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7995b8e80941Smrg		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
7996b8e80941Smrg		memcpy(shader->info.vs_output_param_offset,
7997b8e80941Smrg		       mainp->info.vs_output_param_offset,
7998b8e80941Smrg		       sizeof(mainp->info.vs_output_param_offset));
7999b8e80941Smrg		shader->info.uses_instanceid = mainp->info.uses_instanceid;
8000b8e80941Smrg		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8001b8e80941Smrg		shader->info.nr_param_exports = mainp->info.nr_param_exports;
8002b8e80941Smrg
8003b8e80941Smrg		/* Select prologs and/or epilogs. */
8004b8e80941Smrg		switch (sel->type) {
8005b8e80941Smrg		case PIPE_SHADER_VERTEX:
8006b8e80941Smrg			if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
8007b8e80941Smrg				return -1;
8008b8e80941Smrg			break;
8009b8e80941Smrg		case PIPE_SHADER_TESS_CTRL:
8010b8e80941Smrg			if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
8011b8e80941Smrg				return -1;
8012b8e80941Smrg			break;
8013b8e80941Smrg		case PIPE_SHADER_TESS_EVAL:
8014b8e80941Smrg			break;
8015b8e80941Smrg		case PIPE_SHADER_GEOMETRY:
8016b8e80941Smrg			if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
8017b8e80941Smrg				return -1;
8018b8e80941Smrg			break;
8019b8e80941Smrg		case PIPE_SHADER_FRAGMENT:
8020b8e80941Smrg			if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
8021b8e80941Smrg				return -1;
8022848b8605Smrg
8023b8e80941Smrg			/* Make sure we have at least as many VGPRs as there
8024b8e80941Smrg			 * are allocated inputs.
8025b8e80941Smrg			 */
8026b8e80941Smrg			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8027b8e80941Smrg							shader->info.num_input_vgprs);
8028b8e80941Smrg			break;
8029b8e80941Smrg		}
8030848b8605Smrg
8031b8e80941Smrg		/* Update SGPR and VGPR counts. */
8032b8e80941Smrg		if (shader->prolog) {
8033b8e80941Smrg			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8034b8e80941Smrg							shader->prolog->config.num_sgprs);
8035b8e80941Smrg			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8036b8e80941Smrg							shader->prolog->config.num_vgprs);
8037b8e80941Smrg		}
8038b8e80941Smrg		if (shader->previous_stage) {
8039b8e80941Smrg			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8040b8e80941Smrg							shader->previous_stage->config.num_sgprs);
8041b8e80941Smrg			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8042b8e80941Smrg							shader->previous_stage->config.num_vgprs);
8043b8e80941Smrg			shader->config.spilled_sgprs =
8044b8e80941Smrg				MAX2(shader->config.spilled_sgprs,
8045b8e80941Smrg				     shader->previous_stage->config.spilled_sgprs);
8046b8e80941Smrg			shader->config.spilled_vgprs =
8047b8e80941Smrg				MAX2(shader->config.spilled_vgprs,
8048b8e80941Smrg				     shader->previous_stage->config.spilled_vgprs);
8049b8e80941Smrg			shader->config.private_mem_vgprs =
8050b8e80941Smrg				MAX2(shader->config.private_mem_vgprs,
8051b8e80941Smrg				     shader->previous_stage->config.private_mem_vgprs);
8052b8e80941Smrg			shader->config.scratch_bytes_per_wave =
8053b8e80941Smrg				MAX2(shader->config.scratch_bytes_per_wave,
8054b8e80941Smrg				     shader->previous_stage->config.scratch_bytes_per_wave);
8055b8e80941Smrg			shader->info.uses_instanceid |=
8056b8e80941Smrg				shader->previous_stage->info.uses_instanceid;
8057848b8605Smrg		}
8058b8e80941Smrg		if (shader->prolog2) {
8059b8e80941Smrg			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8060b8e80941Smrg							shader->prolog2->config.num_sgprs);
8061b8e80941Smrg			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8062b8e80941Smrg							shader->prolog2->config.num_vgprs);
8063b8e80941Smrg		}
8064b8e80941Smrg		if (shader->epilog) {
8065b8e80941Smrg			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8066b8e80941Smrg							shader->epilog->config.num_sgprs);
8067b8e80941Smrg			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8068b8e80941Smrg							shader->epilog->config.num_vgprs);
8069b8e80941Smrg		}
8070b8e80941Smrg		si_calculate_max_simd_waves(shader);
8071848b8605Smrg	}
8072848b8605Smrg
8073b8e80941Smrg	si_fix_resource_usage(sscreen, shader);
8074b8e80941Smrg	si_shader_dump(sscreen, shader, debug, sel->info.processor,
8075b8e80941Smrg		       stderr, true);
8076848b8605Smrg
8077b8e80941Smrg	/* Upload. */
8078b8e80941Smrg	r = si_shader_binary_upload(sscreen, shader);
8079b8e80941Smrg	if (r) {
8080b8e80941Smrg		fprintf(stderr, "LLVM failed to upload shader\n");
8081b8e80941Smrg		return r;
8082b8e80941Smrg	}
8083848b8605Smrg
8084b8e80941Smrg	return 0;
8085848b8605Smrg}
8086848b8605Smrg
8087b8e80941Smrgvoid si_shader_destroy(struct si_shader *shader)
8088848b8605Smrg{
8089b8e80941Smrg	if (shader->scratch_bo)
8090b8e80941Smrg		si_resource_reference(&shader->scratch_bo, NULL);
8091b8e80941Smrg
8092b8e80941Smrg	si_resource_reference(&shader->bo, NULL);
8093b8e80941Smrg
8094b8e80941Smrg	if (!shader->is_binary_shared)
8095b8e80941Smrg		ac_shader_binary_clean(&shader->binary);
8096848b8605Smrg
8097b8e80941Smrg	free(shader->shader_log);
8098848b8605Smrg}
8099