1848b8605Smrg/* 2848b8605Smrg * Copyright 2012 Advanced Micro Devices, Inc. 3b8e80941Smrg * All Rights Reserved. 4848b8605Smrg * 5848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a 6848b8605Smrg * copy of this software and associated documentation files (the "Software"), 7848b8605Smrg * to deal in the Software without restriction, including without limitation 8848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub 9848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom 10848b8605Smrg * the Software is furnished to do so, subject to the following conditions: 11848b8605Smrg * 12848b8605Smrg * The above copyright notice and this permission notice (including the next 13848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the 14848b8605Smrg * Software. 15848b8605Smrg * 16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19848b8605Smrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE. 23848b8605Smrg */ 24848b8605Smrg 25848b8605Smrg#include "util/u_memory.h" 26b8e80941Smrg#include "util/u_string.h" 27b8e80941Smrg#include "tgsi/tgsi_build.h" 28848b8605Smrg#include "tgsi/tgsi_util.h" 29848b8605Smrg#include "tgsi/tgsi_dump.h" 30848b8605Smrg 31b8e80941Smrg#include "ac_exp_param.h" 32b8e80941Smrg#include "ac_shader_util.h" 33b8e80941Smrg#include "ac_llvm_util.h" 34b8e80941Smrg#include "si_shader_internal.h" 35848b8605Smrg#include "si_pipe.h" 36848b8605Smrg#include "sid.h" 37848b8605Smrg 38b8e80941Smrg#include "compiler/nir/nir.h" 39b8e80941Smrg 40b8e80941Smrgstatic const char *scratch_rsrc_dword0_symbol = 41b8e80941Smrg "SCRATCH_RSRC_DWORD0"; 42b8e80941Smrg 43b8e80941Smrgstatic const char *scratch_rsrc_dword1_symbol = 44b8e80941Smrg "SCRATCH_RSRC_DWORD1"; 45848b8605Smrg 46848b8605Smrgstruct si_shader_output_values 47848b8605Smrg{ 48848b8605Smrg LLVMValueRef values[4]; 49b8e80941Smrg unsigned semantic_name; 50b8e80941Smrg unsigned semantic_index; 51b8e80941Smrg ubyte vertex_stream[4]; 52848b8605Smrg}; 53848b8605Smrg 54b8e80941Smrg/** 55b8e80941Smrg * Used to collect types and other info about arguments of the LLVM function 56b8e80941Smrg * before the function is created. 57b8e80941Smrg */ 58b8e80941Smrgstruct si_function_info { 59b8e80941Smrg LLVMTypeRef types[100]; 60b8e80941Smrg LLVMValueRef *assign[100]; 61b8e80941Smrg unsigned num_sgpr_params; 62b8e80941Smrg unsigned num_params; 63b8e80941Smrg}; 64b8e80941Smrg 65b8e80941Smrgenum si_arg_regfile { 66b8e80941Smrg ARG_SGPR, 67b8e80941Smrg ARG_VGPR 68848b8605Smrg}; 69848b8605Smrg 70b8e80941Smrgstatic void si_init_shader_ctx(struct si_shader_context *ctx, 71b8e80941Smrg struct si_screen *sscreen, 72b8e80941Smrg struct ac_llvm_compiler *compiler); 73b8e80941Smrg 74b8e80941Smrgstatic void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 75b8e80941Smrg struct lp_build_tgsi_context *bld_base, 76b8e80941Smrg struct lp_build_emit_data *emit_data); 77b8e80941Smrg 78b8e80941Smrgstatic void si_dump_shader_key(unsigned processor, const struct si_shader *shader, 79b8e80941Smrg FILE *f); 80b8e80941Smrg 81b8e80941Smrgstatic void si_build_vs_prolog_function(struct si_shader_context *ctx, 82b8e80941Smrg union si_shader_part_key *key); 83b8e80941Smrgstatic void si_build_tcs_epilog_function(struct si_shader_context *ctx, 84b8e80941Smrg union si_shader_part_key *key); 85b8e80941Smrgstatic void si_build_ps_prolog_function(struct si_shader_context *ctx, 86b8e80941Smrg union si_shader_part_key *key); 87b8e80941Smrgstatic void si_build_ps_epilog_function(struct si_shader_context *ctx, 88b8e80941Smrg union si_shader_part_key *key); 89b8e80941Smrgstatic void si_fix_resource_usage(struct si_screen *sscreen, 90b8e80941Smrg struct si_shader *shader); 91b8e80941Smrg 92b8e80941Smrg/* Ideally pass the sample mask input to the PS epilog as v14, which 93b8e80941Smrg * is its usual location, so that the shader doesn't have to add v_mov. 94b8e80941Smrg */ 95b8e80941Smrg#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 96b8e80941Smrg 97b8e80941Smrgstatic bool llvm_type_is_64bit(struct si_shader_context *ctx, 98b8e80941Smrg LLVMTypeRef type) 99b8e80941Smrg{ 100b8e80941Smrg if (type == ctx->ac.i64 || type == ctx->ac.f64) 101b8e80941Smrg return true; 102b8e80941Smrg 103b8e80941Smrg return false; 104b8e80941Smrg} 105b8e80941Smrg 106b8e80941Smrgstatic bool is_merged_shader(struct si_shader_context *ctx) 107848b8605Smrg{ 108b8e80941Smrg if (ctx->screen->info.chip_class <= VI) 109b8e80941Smrg return false; 110b8e80941Smrg 111b8e80941Smrg return ctx->shader->key.as_ls || 112b8e80941Smrg ctx->shader->key.as_es || 113b8e80941Smrg ctx->type == PIPE_SHADER_TESS_CTRL || 114b8e80941Smrg ctx->type == PIPE_SHADER_GEOMETRY; 115848b8605Smrg} 116848b8605Smrg 117b8e80941Smrgstatic void si_init_function_info(struct si_function_info *fninfo) 118b8e80941Smrg{ 119b8e80941Smrg fninfo->num_params = 0; 120b8e80941Smrg fninfo->num_sgpr_params = 0; 121b8e80941Smrg} 122848b8605Smrg 123b8e80941Smrgstatic unsigned add_arg_assign(struct si_function_info *fninfo, 124b8e80941Smrg enum si_arg_regfile regfile, LLVMTypeRef type, 125b8e80941Smrg LLVMValueRef *assign) 126b8e80941Smrg{ 127b8e80941Smrg assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); 128848b8605Smrg 129b8e80941Smrg unsigned idx = fninfo->num_params++; 130b8e80941Smrg assert(idx < ARRAY_SIZE(fninfo->types)); 131848b8605Smrg 132b8e80941Smrg if (regfile == ARG_SGPR) 133b8e80941Smrg fninfo->num_sgpr_params = fninfo->num_params; 134848b8605Smrg 135b8e80941Smrg fninfo->types[idx] = type; 136b8e80941Smrg fninfo->assign[idx] = assign; 137b8e80941Smrg return idx; 138b8e80941Smrg} 139848b8605Smrg 140b8e80941Smrgstatic unsigned add_arg(struct si_function_info *fninfo, 141b8e80941Smrg enum si_arg_regfile regfile, LLVMTypeRef type) 142b8e80941Smrg{ 143b8e80941Smrg return add_arg_assign(fninfo, regfile, type, NULL); 144b8e80941Smrg} 145848b8605Smrg 146b8e80941Smrgstatic void add_arg_assign_checked(struct si_function_info *fninfo, 147b8e80941Smrg enum si_arg_regfile regfile, LLVMTypeRef type, 148b8e80941Smrg LLVMValueRef *assign, unsigned idx) 149b8e80941Smrg{ 150b8e80941Smrg MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign); 151b8e80941Smrg assert(actual == idx); 152b8e80941Smrg} 153848b8605Smrg 154b8e80941Smrgstatic void add_arg_checked(struct si_function_info *fninfo, 155b8e80941Smrg enum si_arg_regfile regfile, LLVMTypeRef type, 156b8e80941Smrg unsigned idx) 157b8e80941Smrg{ 158b8e80941Smrg add_arg_assign_checked(fninfo, regfile, type, NULL, idx); 159b8e80941Smrg} 160848b8605Smrg 161848b8605Smrg/** 162b8e80941Smrg * Returns a unique index for a per-patch semantic name and index. The index 163b8e80941Smrg * must be less than 32, so that a 32-bit bitmask of used inputs or outputs 164b8e80941Smrg * can be calculated. 165848b8605Smrg */ 166b8e80941Smrgunsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) 167848b8605Smrg{ 168b8e80941Smrg switch (semantic_name) { 169b8e80941Smrg case TGSI_SEMANTIC_TESSOUTER: 170b8e80941Smrg return 0; 171b8e80941Smrg case TGSI_SEMANTIC_TESSINNER: 172b8e80941Smrg return 1; 173b8e80941Smrg case TGSI_SEMANTIC_PATCH: 174b8e80941Smrg assert(index < 30); 175b8e80941Smrg return 2 + index; 176848b8605Smrg 177b8e80941Smrg default: 178b8e80941Smrg assert(!"invalid semantic name"); 179b8e80941Smrg return 0; 180b8e80941Smrg } 181b8e80941Smrg} 182848b8605Smrg 183b8e80941Smrg/** 184b8e80941Smrg * Returns a unique index for a semantic name and index. The index must be 185b8e80941Smrg * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 186b8e80941Smrg * calculated. 187b8e80941Smrg */ 188b8e80941Smrgunsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, 189b8e80941Smrg unsigned is_varying) 190b8e80941Smrg{ 191b8e80941Smrg switch (semantic_name) { 192b8e80941Smrg case TGSI_SEMANTIC_POSITION: 193b8e80941Smrg return 0; 194b8e80941Smrg case TGSI_SEMANTIC_GENERIC: 195b8e80941Smrg /* Since some shader stages use the the highest used IO index 196b8e80941Smrg * to determine the size to allocate for inputs/outputs 197b8e80941Smrg * (in LDS, tess and GS rings). GENERIC should be placed right 198b8e80941Smrg * after POSITION to make that size as small as possible. 199b8e80941Smrg */ 200b8e80941Smrg if (index < SI_MAX_IO_GENERIC) 201b8e80941Smrg return 1 + index; 202b8e80941Smrg 203b8e80941Smrg assert(!"invalid generic index"); 204b8e80941Smrg return 0; 205b8e80941Smrg case TGSI_SEMANTIC_PSIZE: 206b8e80941Smrg return SI_MAX_IO_GENERIC + 1; 207b8e80941Smrg case TGSI_SEMANTIC_CLIPDIST: 208b8e80941Smrg assert(index <= 1); 209b8e80941Smrg return SI_MAX_IO_GENERIC + 2 + index; 210b8e80941Smrg case TGSI_SEMANTIC_FOG: 211b8e80941Smrg return SI_MAX_IO_GENERIC + 4; 212b8e80941Smrg case TGSI_SEMANTIC_LAYER: 213b8e80941Smrg return SI_MAX_IO_GENERIC + 5; 214b8e80941Smrg case TGSI_SEMANTIC_VIEWPORT_INDEX: 215b8e80941Smrg return SI_MAX_IO_GENERIC + 6; 216b8e80941Smrg case TGSI_SEMANTIC_PRIMID: 217b8e80941Smrg return SI_MAX_IO_GENERIC + 7; 218b8e80941Smrg case TGSI_SEMANTIC_COLOR: 219b8e80941Smrg assert(index < 2); 220b8e80941Smrg return SI_MAX_IO_GENERIC + 8 + index; 221b8e80941Smrg case TGSI_SEMANTIC_BCOLOR: 222b8e80941Smrg assert(index < 2); 223b8e80941Smrg /* If it's a varying, COLOR and BCOLOR alias. */ 224b8e80941Smrg if (is_varying) 225b8e80941Smrg return SI_MAX_IO_GENERIC + 8 + index; 226b8e80941Smrg else 227b8e80941Smrg return SI_MAX_IO_GENERIC + 10 + index; 228b8e80941Smrg case TGSI_SEMANTIC_TEXCOORD: 229b8e80941Smrg assert(index < 8); 230b8e80941Smrg STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63); 231b8e80941Smrg return SI_MAX_IO_GENERIC + 12 + index; 232b8e80941Smrg case TGSI_SEMANTIC_CLIPVERTEX: 233b8e80941Smrg return 63; 234b8e80941Smrg default: 235b8e80941Smrg fprintf(stderr, "invalid semantic name = %u\n", semantic_name); 236b8e80941Smrg assert(!"invalid semantic name"); 237b8e80941Smrg return 0; 238b8e80941Smrg } 239848b8605Smrg} 240848b8605Smrg 241b8e80941Smrg/** 242b8e80941Smrg * Get the value of a shader input parameter and extract a bitfield. 243b8e80941Smrg */ 244b8e80941Smrgstatic LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, 245b8e80941Smrg LLVMValueRef value, unsigned rshift, 246b8e80941Smrg unsigned bitwidth) 247848b8605Smrg{ 248b8e80941Smrg if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) 249b8e80941Smrg value = ac_to_integer(&ctx->ac, value); 250848b8605Smrg 251b8e80941Smrg if (rshift) 252b8e80941Smrg value = LLVMBuildLShr(ctx->ac.builder, value, 253b8e80941Smrg LLVMConstInt(ctx->i32, rshift, 0), ""); 254848b8605Smrg 255b8e80941Smrg if (rshift + bitwidth < 32) { 256b8e80941Smrg unsigned mask = (1 << bitwidth) - 1; 257b8e80941Smrg value = LLVMBuildAnd(ctx->ac.builder, value, 258b8e80941Smrg LLVMConstInt(ctx->i32, mask, 0), ""); 259b8e80941Smrg } 260848b8605Smrg 261b8e80941Smrg return value; 262848b8605Smrg} 263848b8605Smrg 264b8e80941SmrgLLVMValueRef si_unpack_param(struct si_shader_context *ctx, 265b8e80941Smrg unsigned param, unsigned rshift, 266b8e80941Smrg unsigned bitwidth) 267848b8605Smrg{ 268b8e80941Smrg LLVMValueRef value = LLVMGetParam(ctx->main_fn, param); 269848b8605Smrg 270b8e80941Smrg return unpack_llvm_param(ctx, value, rshift, bitwidth); 271b8e80941Smrg} 272848b8605Smrg 273b8e80941Smrgstatic LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) 274b8e80941Smrg{ 275b8e80941Smrg switch (ctx->type) { 276b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 277b8e80941Smrg return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8); 278b8e80941Smrg 279b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 280b8e80941Smrg return LLVMGetParam(ctx->main_fn, 281b8e80941Smrg ctx->param_tes_rel_patch_id); 282b8e80941Smrg 283b8e80941Smrg default: 284b8e80941Smrg assert(0); 285b8e80941Smrg return NULL; 286848b8605Smrg } 287b8e80941Smrg} 288b8e80941Smrg 289b8e80941Smrg/* Tessellation shaders pass outputs to the next shader using LDS. 290b8e80941Smrg * 291b8e80941Smrg * LS outputs = TCS inputs 292b8e80941Smrg * TCS outputs = TES inputs 293b8e80941Smrg * 294b8e80941Smrg * The LDS layout is: 295b8e80941Smrg * - TCS inputs for patch 0 296b8e80941Smrg * - TCS inputs for patch 1 297b8e80941Smrg * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) 298b8e80941Smrg * - ... 299b8e80941Smrg * - TCS outputs for patch 0 = get_tcs_out_patch0_offset 300b8e80941Smrg * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset 301b8e80941Smrg * - TCS outputs for patch 1 302b8e80941Smrg * - Per-patch TCS outputs for patch 1 303b8e80941Smrg * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) 304b8e80941Smrg * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) 305b8e80941Smrg * - ... 306b8e80941Smrg * 307b8e80941Smrg * All three shaders VS(LS), TCS, TES share the same LDS space. 308b8e80941Smrg */ 309848b8605Smrg 310b8e80941Smrgstatic LLVMValueRef 311b8e80941Smrgget_tcs_in_patch_stride(struct si_shader_context *ctx) 312b8e80941Smrg{ 313b8e80941Smrg return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13); 314848b8605Smrg} 315848b8605Smrg 316b8e80941Smrgstatic unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) 317848b8605Smrg{ 318b8e80941Smrg assert(ctx->type == PIPE_SHADER_TESS_CTRL); 319848b8605Smrg 320b8e80941Smrg if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) 321b8e80941Smrg return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; 322848b8605Smrg 323b8e80941Smrg return util_last_bit64(ctx->shader->selector->outputs_written) * 4; 324b8e80941Smrg} 325848b8605Smrg 326b8e80941Smrgstatic LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) 327b8e80941Smrg{ 328b8e80941Smrg unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); 329848b8605Smrg 330b8e80941Smrg return LLVMConstInt(ctx->i32, stride, 0); 331b8e80941Smrg} 332848b8605Smrg 333b8e80941Smrgstatic LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) 334b8e80941Smrg{ 335b8e80941Smrg if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) 336b8e80941Smrg return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13); 337b8e80941Smrg 338b8e80941Smrg const struct tgsi_shader_info *info = &ctx->shader->selector->info; 339b8e80941Smrg unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; 340b8e80941Smrg unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); 341b8e80941Smrg unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); 342b8e80941Smrg unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + 343b8e80941Smrg num_patch_outputs * 4; 344b8e80941Smrg return LLVMConstInt(ctx->i32, patch_dw_stride, 0); 345b8e80941Smrg} 346848b8605Smrg 347b8e80941Smrgstatic LLVMValueRef 348b8e80941Smrgget_tcs_out_patch0_offset(struct si_shader_context *ctx) 349b8e80941Smrg{ 350b8e80941Smrg return LLVMBuildMul(ctx->ac.builder, 351b8e80941Smrg si_unpack_param(ctx, 352b8e80941Smrg ctx->param_tcs_out_lds_offsets, 353b8e80941Smrg 0, 16), 354b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), ""); 355b8e80941Smrg} 356848b8605Smrg 357b8e80941Smrgstatic LLVMValueRef 358b8e80941Smrgget_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) 359b8e80941Smrg{ 360b8e80941Smrg return LLVMBuildMul(ctx->ac.builder, 361b8e80941Smrg si_unpack_param(ctx, 362b8e80941Smrg ctx->param_tcs_out_lds_offsets, 363b8e80941Smrg 16, 16), 364b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), ""); 365b8e80941Smrg} 366848b8605Smrg 367b8e80941Smrgstatic LLVMValueRef 368b8e80941Smrgget_tcs_in_current_patch_offset(struct si_shader_context *ctx) 369b8e80941Smrg{ 370b8e80941Smrg LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); 371b8e80941Smrg LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 372b8e80941Smrg 373b8e80941Smrg return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); 374848b8605Smrg} 375848b8605Smrg 376b8e80941Smrgstatic LLVMValueRef 377b8e80941Smrgget_tcs_out_current_patch_offset(struct si_shader_context *ctx) 378848b8605Smrg{ 379b8e80941Smrg LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); 380b8e80941Smrg LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 381b8e80941Smrg LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 382848b8605Smrg 383b8e80941Smrg return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); 384b8e80941Smrg} 385b8e80941Smrg 386b8e80941Smrgstatic LLVMValueRef 387b8e80941Smrgget_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) 388b8e80941Smrg{ 389b8e80941Smrg LLVMValueRef patch0_patch_data_offset = 390b8e80941Smrg get_tcs_out_patch0_patch_data_offset(ctx); 391b8e80941Smrg LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 392b8e80941Smrg LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 393848b8605Smrg 394b8e80941Smrg return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); 395848b8605Smrg} 396848b8605Smrg 397b8e80941Smrgstatic LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) 398b8e80941Smrg{ 399b8e80941Smrg unsigned tcs_out_vertices = 400b8e80941Smrg ctx->shader->selector ? 401b8e80941Smrg ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; 402848b8605Smrg 403b8e80941Smrg /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ 404b8e80941Smrg if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) 405b8e80941Smrg return LLVMConstInt(ctx->i32, tcs_out_vertices, 0); 406848b8605Smrg 407b8e80941Smrg return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6); 408b8e80941Smrg} 409b8e80941Smrg 410b8e80941Smrgstatic LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) 411b8e80941Smrg{ 412b8e80941Smrg unsigned stride; 413b8e80941Smrg 414b8e80941Smrg switch (ctx->type) { 415b8e80941Smrg case PIPE_SHADER_VERTEX: 416b8e80941Smrg stride = ctx->shader->selector->lshs_vertex_stride / 4; 417b8e80941Smrg return LLVMConstInt(ctx->i32, stride, 0); 418b8e80941Smrg 419b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 420b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9 && 421b8e80941Smrg ctx->shader->is_monolithic) { 422b8e80941Smrg stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; 423b8e80941Smrg return LLVMConstInt(ctx->i32, stride, 0); 424848b8605Smrg } 425b8e80941Smrg return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); 426b8e80941Smrg 427b8e80941Smrg default: 428b8e80941Smrg assert(0); 429b8e80941Smrg return NULL; 430848b8605Smrg } 431b8e80941Smrg} 432848b8605Smrg 433b8e80941Smrg/* Bitcast <4 x float> to <2 x double>, extract the component, and convert 434b8e80941Smrg * to float. */ 435b8e80941Smrgstatic LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, 436b8e80941Smrg LLVMValueRef vec4, 437b8e80941Smrg unsigned double_index) 438b8e80941Smrg{ 439b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 440b8e80941Smrg LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); 441b8e80941Smrg LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, 442b8e80941Smrg LLVMVectorType(f64, 2), ""); 443b8e80941Smrg LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); 444b8e80941Smrg LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, ""); 445b8e80941Smrg return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); 446848b8605Smrg} 447848b8605Smrg 448b8e80941Smrgstatic LLVMValueRef unpack_sint16(struct si_shader_context *ctx, 449b8e80941Smrg LLVMValueRef i32, unsigned index) 450848b8605Smrg{ 451b8e80941Smrg assert(index <= 1); 452848b8605Smrg 453b8e80941Smrg if (index == 1) 454b8e80941Smrg return LLVMBuildAShr(ctx->ac.builder, i32, 455b8e80941Smrg LLVMConstInt(ctx->i32, 16, 0), ""); 456848b8605Smrg 457b8e80941Smrg return LLVMBuildSExt(ctx->ac.builder, 458b8e80941Smrg LLVMBuildTrunc(ctx->ac.builder, i32, 459b8e80941Smrg ctx->ac.i16, ""), 460b8e80941Smrg ctx->i32, ""); 461b8e80941Smrg} 462848b8605Smrg 463b8e80941Smrgvoid si_llvm_load_input_vs( 464b8e80941Smrg struct si_shader_context *ctx, 465b8e80941Smrg unsigned input_index, 466b8e80941Smrg LLVMValueRef out[4]) 467b8e80941Smrg{ 468b8e80941Smrg const struct tgsi_shader_info *info = &ctx->shader->selector->info; 469b8e80941Smrg unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; 470b8e80941Smrg 471b8e80941Smrg if (vs_blit_property) { 472b8e80941Smrg LLVMValueRef vertex_id = ctx->abi.vertex_id; 473b8e80941Smrg LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, 474b8e80941Smrg LLVMIntULE, vertex_id, 475b8e80941Smrg ctx->i32_1, ""); 476b8e80941Smrg /* Use LLVMIntNE, because we have 3 vertices and only 477b8e80941Smrg * the middle one should use y2. 478b8e80941Smrg */ 479b8e80941Smrg LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, 480b8e80941Smrg LLVMIntNE, vertex_id, 481b8e80941Smrg ctx->i32_1, ""); 482b8e80941Smrg 483b8e80941Smrg if (input_index == 0) { 484b8e80941Smrg /* Position: */ 485b8e80941Smrg LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, 486b8e80941Smrg ctx->param_vs_blit_inputs); 487b8e80941Smrg LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, 488b8e80941Smrg ctx->param_vs_blit_inputs + 1); 489b8e80941Smrg 490b8e80941Smrg LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); 491b8e80941Smrg LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); 492b8e80941Smrg LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); 493b8e80941Smrg LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); 494b8e80941Smrg 495b8e80941Smrg LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, 496b8e80941Smrg x1, x2, ""); 497b8e80941Smrg LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, 498b8e80941Smrg y1, y2, ""); 499b8e80941Smrg 500b8e80941Smrg out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, ""); 501b8e80941Smrg out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, ""); 502b8e80941Smrg out[2] = LLVMGetParam(ctx->main_fn, 503b8e80941Smrg ctx->param_vs_blit_inputs + 2); 504b8e80941Smrg out[3] = ctx->ac.f32_1; 505b8e80941Smrg return; 506b8e80941Smrg } 507848b8605Smrg 508b8e80941Smrg /* Color or texture coordinates: */ 509b8e80941Smrg assert(input_index == 1); 510b8e80941Smrg 511b8e80941Smrg if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 512b8e80941Smrg for (int i = 0; i < 4; i++) { 513b8e80941Smrg out[i] = LLVMGetParam(ctx->main_fn, 514b8e80941Smrg ctx->param_vs_blit_inputs + 3 + i); 515b8e80941Smrg } 516b8e80941Smrg } else { 517b8e80941Smrg assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); 518b8e80941Smrg LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, 519b8e80941Smrg ctx->param_vs_blit_inputs + 3); 520b8e80941Smrg LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, 521b8e80941Smrg ctx->param_vs_blit_inputs + 4); 522b8e80941Smrg LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, 523b8e80941Smrg ctx->param_vs_blit_inputs + 5); 524b8e80941Smrg LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, 525b8e80941Smrg ctx->param_vs_blit_inputs + 6); 526b8e80941Smrg 527b8e80941Smrg out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, 528b8e80941Smrg x1, x2, ""); 529b8e80941Smrg out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, 530b8e80941Smrg y1, y2, ""); 531b8e80941Smrg out[2] = LLVMGetParam(ctx->main_fn, 532b8e80941Smrg ctx->param_vs_blit_inputs + 7); 533b8e80941Smrg out[3] = LLVMGetParam(ctx->main_fn, 534b8e80941Smrg ctx->param_vs_blit_inputs + 8); 535848b8605Smrg } 536848b8605Smrg return; 537848b8605Smrg } 538848b8605Smrg 539b8e80941Smrg unsigned chan; 540b8e80941Smrg unsigned fix_fetch; 541b8e80941Smrg unsigned num_fetches; 542b8e80941Smrg unsigned fetch_stride; 543b8e80941Smrg unsigned num_channels; 544848b8605Smrg 545b8e80941Smrg LLVMValueRef t_list_ptr; 546b8e80941Smrg LLVMValueRef t_offset; 547b8e80941Smrg LLVMValueRef t_list; 548b8e80941Smrg LLVMValueRef vertex_index; 549b8e80941Smrg LLVMValueRef input[3]; 550848b8605Smrg 551b8e80941Smrg /* Load the T list */ 552b8e80941Smrg t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); 553848b8605Smrg 554b8e80941Smrg t_offset = LLVMConstInt(ctx->i32, input_index, 0); 555848b8605Smrg 556b8e80941Smrg t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); 557848b8605Smrg 558b8e80941Smrg vertex_index = LLVMGetParam(ctx->main_fn, 559b8e80941Smrg ctx->param_vertex_index0 + 560b8e80941Smrg input_index); 561848b8605Smrg 562b8e80941Smrg fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; 563b8e80941Smrg 564b8e80941Smrg /* Do multiple loads for special formats. */ 565b8e80941Smrg switch (fix_fetch) { 566b8e80941Smrg case SI_FIX_FETCH_RG_64_FLOAT: 567b8e80941Smrg num_fetches = 1; /* 1 2-dword or 4-dword load */ 568b8e80941Smrg fetch_stride = 0; 569b8e80941Smrg if (util_last_bit(info->input_usage_mask[input_index]) >= 2) 570b8e80941Smrg num_channels = 4; /* 2 doubles in 4 dwords */ 571848b8605Smrg else 572b8e80941Smrg num_channels = 2; /* 1 double in 2 dwords */ 573848b8605Smrg break; 574b8e80941Smrg case SI_FIX_FETCH_RGB_64_FLOAT: 575b8e80941Smrg num_fetches = 3; /* 3 2-dword loads */ 576b8e80941Smrg fetch_stride = 8; 577b8e80941Smrg num_channels = 2; 578b8e80941Smrg break; 579b8e80941Smrg case SI_FIX_FETCH_RGBA_64_FLOAT: 580b8e80941Smrg num_fetches = 2; /* 2 4-dword loads */ 581b8e80941Smrg fetch_stride = 16; 582b8e80941Smrg num_channels = 4; 583b8e80941Smrg break; 584b8e80941Smrg case SI_FIX_FETCH_RGB_8: 585b8e80941Smrg case SI_FIX_FETCH_RGB_8_INT: 586b8e80941Smrg num_fetches = 3; 587b8e80941Smrg fetch_stride = 1; 588b8e80941Smrg num_channels = 1; 589b8e80941Smrg break; 590b8e80941Smrg case SI_FIX_FETCH_RGB_16: 591b8e80941Smrg case SI_FIX_FETCH_RGB_16_INT: 592b8e80941Smrg num_fetches = 3; 593b8e80941Smrg fetch_stride = 2; 594b8e80941Smrg num_channels = 1; 595848b8605Smrg break; 596848b8605Smrg default: 597b8e80941Smrg num_fetches = 1; 598b8e80941Smrg fetch_stride = 0; 599b8e80941Smrg num_channels = util_last_bit(info->input_usage_mask[input_index]); 600848b8605Smrg } 601848b8605Smrg 602b8e80941Smrg for (unsigned i = 0; i < num_fetches; i++) { 603b8e80941Smrg LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); 604848b8605Smrg 605b8e80941Smrg input[i] = ac_build_buffer_load_format(&ctx->ac, t_list, 606b8e80941Smrg vertex_index, voffset, 607b8e80941Smrg num_channels, false, true); 608b8e80941Smrg input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels); 609b8e80941Smrg } 610848b8605Smrg 611b8e80941Smrg /* Break up the vec4 into individual components */ 612b8e80941Smrg for (chan = 0; chan < 4; chan++) { 613b8e80941Smrg LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); 614b8e80941Smrg out[chan] = LLVMBuildExtractElement(ctx->ac.builder, 615b8e80941Smrg input[0], llvm_chan, ""); 616b8e80941Smrg } 617848b8605Smrg 618b8e80941Smrg switch (fix_fetch) { 619b8e80941Smrg case SI_FIX_FETCH_A2_SNORM: 620b8e80941Smrg case SI_FIX_FETCH_A2_SSCALED: 621b8e80941Smrg case SI_FIX_FETCH_A2_SINT: { 622b8e80941Smrg /* The hardware returns an unsigned value; convert it to a 623b8e80941Smrg * signed one. 624b8e80941Smrg */ 625b8e80941Smrg LLVMValueRef tmp = out[3]; 626b8e80941Smrg LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); 627848b8605Smrg 628b8e80941Smrg /* First, recover the sign-extended signed integer value. */ 629b8e80941Smrg if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) 630b8e80941Smrg tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); 631b8e80941Smrg else 632b8e80941Smrg tmp = ac_to_integer(&ctx->ac, tmp); 633848b8605Smrg 634b8e80941Smrg /* For the integer-like cases, do a natural sign extension. 635b8e80941Smrg * 636b8e80941Smrg * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 637b8e80941Smrg * and happen to contain 0, 1, 2, 3 as the two LSBs of the 638b8e80941Smrg * exponent. 639b8e80941Smrg */ 640b8e80941Smrg tmp = LLVMBuildShl(ctx->ac.builder, tmp, 641b8e80941Smrg fix_fetch == SI_FIX_FETCH_A2_SNORM ? 642b8e80941Smrg LLVMConstInt(ctx->i32, 7, 0) : c30, ""); 643b8e80941Smrg tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); 644b8e80941Smrg 645b8e80941Smrg /* Convert back to the right type. */ 646b8e80941Smrg if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { 647b8e80941Smrg LLVMValueRef clamp; 648b8e80941Smrg LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); 649b8e80941Smrg tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); 650b8e80941Smrg clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); 651b8e80941Smrg tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); 652b8e80941Smrg } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { 653b8e80941Smrg tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); 654848b8605Smrg } 655848b8605Smrg 656b8e80941Smrg out[3] = tmp; 657848b8605Smrg break; 658b8e80941Smrg } 659b8e80941Smrg case SI_FIX_FETCH_RGBA_32_UNORM: 660b8e80941Smrg case SI_FIX_FETCH_RGBX_32_UNORM: 661b8e80941Smrg for (chan = 0; chan < 4; chan++) { 662b8e80941Smrg out[chan] = ac_to_integer(&ctx->ac, out[chan]); 663b8e80941Smrg out[chan] = LLVMBuildUIToFP(ctx->ac.builder, 664b8e80941Smrg out[chan], ctx->f32, ""); 665b8e80941Smrg out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], 666b8e80941Smrg LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); 667b8e80941Smrg } 668b8e80941Smrg /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ 669b8e80941Smrg if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) 670b8e80941Smrg out[3] = LLVMConstReal(ctx->f32, 1); 671848b8605Smrg break; 672b8e80941Smrg case SI_FIX_FETCH_RGBA_32_SNORM: 673b8e80941Smrg case SI_FIX_FETCH_RGBX_32_SNORM: 674b8e80941Smrg case SI_FIX_FETCH_RGBA_32_FIXED: 675b8e80941Smrg case SI_FIX_FETCH_RGBX_32_FIXED: { 676b8e80941Smrg double scale; 677b8e80941Smrg if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) 678b8e80941Smrg scale = 1.0 / 0x10000; 679b8e80941Smrg else 680b8e80941Smrg scale = 1.0 / INT_MAX; 681848b8605Smrg 682b8e80941Smrg for (chan = 0; chan < 4; chan++) { 683b8e80941Smrg out[chan] = ac_to_integer(&ctx->ac, out[chan]); 684b8e80941Smrg out[chan] = LLVMBuildSIToFP(ctx->ac.builder, 685b8e80941Smrg out[chan], ctx->f32, ""); 686b8e80941Smrg out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], 687b8e80941Smrg LLVMConstReal(ctx->f32, scale), ""); 688b8e80941Smrg } 689b8e80941Smrg /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ 690b8e80941Smrg if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || 691b8e80941Smrg fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) 692b8e80941Smrg out[3] = LLVMConstReal(ctx->f32, 1); 693848b8605Smrg break; 694b8e80941Smrg } 695b8e80941Smrg case SI_FIX_FETCH_RGBA_32_USCALED: 696b8e80941Smrg for (chan = 0; chan < 4; chan++) { 697b8e80941Smrg out[chan] = ac_to_integer(&ctx->ac, out[chan]); 698b8e80941Smrg out[chan] = LLVMBuildUIToFP(ctx->ac.builder, 699b8e80941Smrg out[chan], ctx->f32, ""); 700b8e80941Smrg } 701b8e80941Smrg break; 702b8e80941Smrg case SI_FIX_FETCH_RGBA_32_SSCALED: 703b8e80941Smrg for (chan = 0; chan < 4; chan++) { 704b8e80941Smrg out[chan] = ac_to_integer(&ctx->ac, out[chan]); 705b8e80941Smrg out[chan] = LLVMBuildSIToFP(ctx->ac.builder, 706b8e80941Smrg out[chan], ctx->f32, ""); 707b8e80941Smrg } 708b8e80941Smrg break; 709b8e80941Smrg case SI_FIX_FETCH_RG_64_FLOAT: 710b8e80941Smrg for (chan = 0; chan < 2; chan++) 711b8e80941Smrg out[chan] = extract_double_to_float(ctx, input[0], chan); 712848b8605Smrg 713b8e80941Smrg out[2] = LLVMConstReal(ctx->f32, 0); 714b8e80941Smrg out[3] = LLVMConstReal(ctx->f32, 1); 715b8e80941Smrg break; 716b8e80941Smrg case SI_FIX_FETCH_RGB_64_FLOAT: 717b8e80941Smrg for (chan = 0; chan < 3; chan++) 718b8e80941Smrg out[chan] = extract_double_to_float(ctx, input[chan], 0); 719848b8605Smrg 720b8e80941Smrg out[3] = LLVMConstReal(ctx->f32, 1); 721b8e80941Smrg break; 722b8e80941Smrg case SI_FIX_FETCH_RGBA_64_FLOAT: 723b8e80941Smrg for (chan = 0; chan < 4; chan++) { 724b8e80941Smrg out[chan] = extract_double_to_float(ctx, input[chan / 2], 725b8e80941Smrg chan % 2); 726b8e80941Smrg } 727b8e80941Smrg break; 728b8e80941Smrg case SI_FIX_FETCH_RGB_8: 729b8e80941Smrg case SI_FIX_FETCH_RGB_8_INT: 730b8e80941Smrg case SI_FIX_FETCH_RGB_16: 731b8e80941Smrg case SI_FIX_FETCH_RGB_16_INT: 732b8e80941Smrg for (chan = 0; chan < 3; chan++) { 733b8e80941Smrg out[chan] = LLVMBuildExtractElement(ctx->ac.builder, 734b8e80941Smrg input[chan], 735b8e80941Smrg ctx->i32_0, ""); 736b8e80941Smrg } 737b8e80941Smrg if (fix_fetch == SI_FIX_FETCH_RGB_8 || 738b8e80941Smrg fix_fetch == SI_FIX_FETCH_RGB_16) { 739b8e80941Smrg out[3] = LLVMConstReal(ctx->f32, 1); 740b8e80941Smrg } else { 741b8e80941Smrg out[3] = ac_to_float(&ctx->ac, ctx->i32_1); 742b8e80941Smrg } 743848b8605Smrg break; 744848b8605Smrg } 745b8e80941Smrg} 746848b8605Smrg 747b8e80941Smrgstatic void declare_input_vs( 748b8e80941Smrg struct si_shader_context *ctx, 749b8e80941Smrg unsigned input_index, 750b8e80941Smrg const struct tgsi_full_declaration *decl, 751b8e80941Smrg LLVMValueRef out[4]) 752b8e80941Smrg{ 753b8e80941Smrg si_llvm_load_input_vs(ctx, input_index, out); 754b8e80941Smrg} 755b8e80941Smrg 756b8e80941Smrgstatic LLVMValueRef get_primitive_id(struct si_shader_context *ctx, 757b8e80941Smrg unsigned swizzle) 758b8e80941Smrg{ 759b8e80941Smrg if (swizzle > 0) 760b8e80941Smrg return ctx->i32_0; 761b8e80941Smrg 762b8e80941Smrg switch (ctx->type) { 763b8e80941Smrg case PIPE_SHADER_VERTEX: 764b8e80941Smrg return LLVMGetParam(ctx->main_fn, 765b8e80941Smrg ctx->param_vs_prim_id); 766b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 767b8e80941Smrg return ctx->abi.tcs_patch_id; 768b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 769b8e80941Smrg return ctx->abi.tes_patch_id; 770b8e80941Smrg case PIPE_SHADER_GEOMETRY: 771b8e80941Smrg return ctx->abi.gs_prim_id; 772848b8605Smrg default: 773b8e80941Smrg assert(0); 774b8e80941Smrg return ctx->i32_0; 775848b8605Smrg } 776848b8605Smrg} 777848b8605Smrg 778b8e80941Smrg/** 779b8e80941Smrg * Return the value of tgsi_ind_register for indexing. 780b8e80941Smrg * This is the indirect index with the constant offset added to it. 781b8e80941Smrg */ 782b8e80941SmrgLLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, 783b8e80941Smrg const struct tgsi_ind_register *ind, 784b8e80941Smrg unsigned addr_mul, 785b8e80941Smrg int rel_index) 786848b8605Smrg{ 787848b8605Smrg LLVMValueRef result; 788848b8605Smrg 789b8e80941Smrg if (ind->File == TGSI_FILE_ADDRESS) { 790b8e80941Smrg result = ctx->addrs[ind->Index][ind->Swizzle]; 791b8e80941Smrg result = LLVMBuildLoad(ctx->ac.builder, result, ""); 792b8e80941Smrg } else { 793b8e80941Smrg struct tgsi_full_src_register src = {}; 794848b8605Smrg 795b8e80941Smrg src.Register.File = ind->File; 796b8e80941Smrg src.Register.Index = ind->Index; 797848b8605Smrg 798b8e80941Smrg /* Set the second index to 0 for constants. */ 799b8e80941Smrg if (ind->File == TGSI_FILE_CONSTANT) 800b8e80941Smrg src.Register.Dimension = 1; 801848b8605Smrg 802b8e80941Smrg result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src, 803b8e80941Smrg TGSI_TYPE_SIGNED, 804b8e80941Smrg ind->Swizzle); 805b8e80941Smrg result = ac_to_integer(&ctx->ac, result); 806b8e80941Smrg } 807848b8605Smrg 808b8e80941Smrg return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0), 809b8e80941Smrg LLVMConstInt(ctx->i32, rel_index, 0)); 810b8e80941Smrg} 811848b8605Smrg 812b8e80941Smrg/** 813b8e80941Smrg * Like si_get_indirect_index, but restricts the return value to a (possibly 814b8e80941Smrg * undefined) value inside [0..num). 815b8e80941Smrg */ 816b8e80941SmrgLLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, 817b8e80941Smrg const struct tgsi_ind_register *ind, 818b8e80941Smrg int rel_index, unsigned num) 819b8e80941Smrg{ 820b8e80941Smrg LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index); 821848b8605Smrg 822b8e80941Smrg return si_llvm_bound_index(ctx, result, num); 823848b8605Smrg} 824848b8605Smrg 825b8e80941Smrgstatic LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, 826b8e80941Smrg LLVMValueRef vertex_dw_stride, 827b8e80941Smrg LLVMValueRef base_addr, 828b8e80941Smrg LLVMValueRef vertex_index, 829b8e80941Smrg LLVMValueRef param_index, 830b8e80941Smrg unsigned input_index, 831b8e80941Smrg ubyte *name, 832b8e80941Smrg ubyte *index, 833b8e80941Smrg bool is_patch) 834848b8605Smrg{ 835b8e80941Smrg if (vertex_dw_stride) { 836b8e80941Smrg base_addr = ac_build_imad(&ctx->ac, vertex_index, 837b8e80941Smrg vertex_dw_stride, base_addr); 838b8e80941Smrg } 839848b8605Smrg 840b8e80941Smrg if (param_index) { 841b8e80941Smrg base_addr = ac_build_imad(&ctx->ac, param_index, 842b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), base_addr); 843b8e80941Smrg } 844848b8605Smrg 845b8e80941Smrg int param = is_patch ? 846b8e80941Smrg si_shader_io_get_unique_index_patch(name[input_index], 847b8e80941Smrg index[input_index]) : 848b8e80941Smrg si_shader_io_get_unique_index(name[input_index], 849b8e80941Smrg index[input_index], false); 850848b8605Smrg 851b8e80941Smrg /* Add the base address of the element. */ 852b8e80941Smrg return LLVMBuildAdd(ctx->ac.builder, base_addr, 853b8e80941Smrg LLVMConstInt(ctx->i32, param * 4, 0), ""); 854b8e80941Smrg} 855848b8605Smrg 856b8e80941Smrg/** 857b8e80941Smrg * Calculate a dword address given an input or output register and a stride. 858b8e80941Smrg */ 859b8e80941Smrgstatic LLVMValueRef get_dw_address(struct si_shader_context *ctx, 860b8e80941Smrg const struct tgsi_full_dst_register *dst, 861b8e80941Smrg const struct tgsi_full_src_register *src, 862b8e80941Smrg LLVMValueRef vertex_dw_stride, 863b8e80941Smrg LLVMValueRef base_addr) 864b8e80941Smrg{ 865b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 866b8e80941Smrg ubyte *name, *index, *array_first; 867b8e80941Smrg int input_index; 868b8e80941Smrg struct tgsi_full_dst_register reg; 869b8e80941Smrg LLVMValueRef vertex_index = NULL; 870b8e80941Smrg LLVMValueRef ind_index = NULL; 871b8e80941Smrg 872b8e80941Smrg /* Set the register description. The address computation is the same 873b8e80941Smrg * for sources and destinations. */ 874b8e80941Smrg if (src) { 875b8e80941Smrg reg.Register.File = src->Register.File; 876b8e80941Smrg reg.Register.Index = src->Register.Index; 877b8e80941Smrg reg.Register.Indirect = src->Register.Indirect; 878b8e80941Smrg reg.Register.Dimension = src->Register.Dimension; 879b8e80941Smrg reg.Indirect = src->Indirect; 880b8e80941Smrg reg.Dimension = src->Dimension; 881b8e80941Smrg reg.DimIndirect = src->DimIndirect; 882b8e80941Smrg } else 883b8e80941Smrg reg = *dst; 884b8e80941Smrg 885b8e80941Smrg /* If the register is 2-dimensional (e.g. an array of vertices 886b8e80941Smrg * in a primitive), calculate the base address of the vertex. */ 887b8e80941Smrg if (reg.Register.Dimension) { 888b8e80941Smrg if (reg.Dimension.Indirect) 889b8e80941Smrg vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 890b8e80941Smrg 1, reg.Dimension.Index); 891b8e80941Smrg else 892b8e80941Smrg vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 893848b8605Smrg } 894848b8605Smrg 895b8e80941Smrg /* Get information about the register. */ 896b8e80941Smrg if (reg.Register.File == TGSI_FILE_INPUT) { 897b8e80941Smrg name = info->input_semantic_name; 898b8e80941Smrg index = info->input_semantic_index; 899b8e80941Smrg array_first = info->input_array_first; 900b8e80941Smrg } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 901b8e80941Smrg name = info->output_semantic_name; 902b8e80941Smrg index = info->output_semantic_index; 903b8e80941Smrg array_first = info->output_array_first; 904848b8605Smrg } else { 905b8e80941Smrg assert(0); 906b8e80941Smrg return NULL; 907848b8605Smrg } 908848b8605Smrg 909b8e80941Smrg if (reg.Register.Indirect) { 910b8e80941Smrg /* Add the relative address of the element. */ 911b8e80941Smrg if (reg.Indirect.ArrayID) 912b8e80941Smrg input_index = array_first[reg.Indirect.ArrayID]; 913b8e80941Smrg else 914b8e80941Smrg input_index = reg.Register.Index; 915848b8605Smrg 916b8e80941Smrg ind_index = si_get_indirect_index(ctx, ®.Indirect, 917b8e80941Smrg 1, reg.Register.Index - input_index); 918b8e80941Smrg } else { 919b8e80941Smrg input_index = reg.Register.Index; 920b8e80941Smrg } 921848b8605Smrg 922b8e80941Smrg return get_dw_address_from_generic_indices(ctx, vertex_dw_stride, 923b8e80941Smrg base_addr, vertex_index, 924b8e80941Smrg ind_index, input_index, 925b8e80941Smrg name, index, 926b8e80941Smrg !reg.Register.Dimension); 927b8e80941Smrg} 928848b8605Smrg 929b8e80941Smrg/* The offchip buffer layout for TCS->TES is 930b8e80941Smrg * 931b8e80941Smrg * - attribute 0 of patch 0 vertex 0 932b8e80941Smrg * - attribute 0 of patch 0 vertex 1 933b8e80941Smrg * - attribute 0 of patch 0 vertex 2 934b8e80941Smrg * ... 935b8e80941Smrg * - attribute 0 of patch 1 vertex 0 936b8e80941Smrg * - attribute 0 of patch 1 vertex 1 937b8e80941Smrg * ... 938b8e80941Smrg * - attribute 1 of patch 0 vertex 0 939b8e80941Smrg * - attribute 1 of patch 0 vertex 1 940b8e80941Smrg * ... 941b8e80941Smrg * - per patch attribute 0 of patch 0 942b8e80941Smrg * - per patch attribute 0 of patch 1 943b8e80941Smrg * ... 944b8e80941Smrg * 945b8e80941Smrg * Note that every attribute has 4 components. 946b8e80941Smrg */ 947b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, 948b8e80941Smrg LLVMValueRef rel_patch_id, 949b8e80941Smrg LLVMValueRef vertex_index, 950b8e80941Smrg LLVMValueRef param_index) 951b8e80941Smrg{ 952b8e80941Smrg LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; 953b8e80941Smrg LLVMValueRef param_stride, constant16; 954b8e80941Smrg 955b8e80941Smrg vertices_per_patch = get_num_tcs_out_vertices(ctx); 956b8e80941Smrg num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6); 957b8e80941Smrg total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, 958b8e80941Smrg num_patches, ""); 959b8e80941Smrg 960b8e80941Smrg constant16 = LLVMConstInt(ctx->i32, 16, 0); 961b8e80941Smrg if (vertex_index) { 962b8e80941Smrg base_addr = ac_build_imad(&ctx->ac, rel_patch_id, 963b8e80941Smrg vertices_per_patch, vertex_index); 964b8e80941Smrg param_stride = total_vertices; 965b8e80941Smrg } else { 966b8e80941Smrg base_addr = rel_patch_id; 967b8e80941Smrg param_stride = num_patches; 968b8e80941Smrg } 969848b8605Smrg 970b8e80941Smrg base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); 971b8e80941Smrg base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); 972b8e80941Smrg 973b8e80941Smrg if (!vertex_index) { 974b8e80941Smrg LLVMValueRef patch_data_offset = 975b8e80941Smrg si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20); 976b8e80941Smrg 977b8e80941Smrg base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 978b8e80941Smrg patch_data_offset, ""); 979b8e80941Smrg } 980b8e80941Smrg return base_addr; 981848b8605Smrg} 982848b8605Smrg 983b8e80941Smrg/* This is a generic helper that can be shared by the NIR and TGSI backends */ 984b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( 985b8e80941Smrg struct si_shader_context *ctx, 986b8e80941Smrg LLVMValueRef vertex_index, 987b8e80941Smrg LLVMValueRef param_index, 988b8e80941Smrg unsigned param_base, 989b8e80941Smrg ubyte *name, 990b8e80941Smrg ubyte *index, 991b8e80941Smrg bool is_patch) 992848b8605Smrg{ 993b8e80941Smrg unsigned param_index_base; 994848b8605Smrg 995b8e80941Smrg param_index_base = is_patch ? 996b8e80941Smrg si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) : 997b8e80941Smrg si_shader_io_get_unique_index(name[param_base], index[param_base], false); 998b8e80941Smrg 999b8e80941Smrg if (param_index) { 1000b8e80941Smrg param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1001b8e80941Smrg LLVMConstInt(ctx->i32, param_index_base, 0), 1002b8e80941Smrg ""); 1003b8e80941Smrg } else { 1004b8e80941Smrg param_index = LLVMConstInt(ctx->i32, param_index_base, 0); 1005b8e80941Smrg } 1006848b8605Smrg 1007b8e80941Smrg return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), 1008b8e80941Smrg vertex_index, param_index); 1009848b8605Smrg} 1010848b8605Smrg 1011b8e80941Smrgstatic LLVMValueRef get_tcs_tes_buffer_address_from_reg( 1012b8e80941Smrg struct si_shader_context *ctx, 1013b8e80941Smrg const struct tgsi_full_dst_register *dst, 1014b8e80941Smrg const struct tgsi_full_src_register *src) 1015848b8605Smrg{ 1016b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 1017b8e80941Smrg ubyte *name, *index, *array_first; 1018b8e80941Smrg struct tgsi_full_src_register reg; 1019b8e80941Smrg LLVMValueRef vertex_index = NULL; 1020b8e80941Smrg LLVMValueRef param_index = NULL; 1021b8e80941Smrg unsigned param_base; 1022848b8605Smrg 1023b8e80941Smrg reg = src ? *src : tgsi_full_src_register_from_dst(dst); 1024b8e80941Smrg 1025b8e80941Smrg if (reg.Register.Dimension) { 1026b8e80941Smrg 1027b8e80941Smrg if (reg.Dimension.Indirect) 1028b8e80941Smrg vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 1029b8e80941Smrg 1, reg.Dimension.Index); 1030b8e80941Smrg else 1031b8e80941Smrg vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 1032b8e80941Smrg } 1033b8e80941Smrg 1034b8e80941Smrg /* Get information about the register. */ 1035b8e80941Smrg if (reg.Register.File == TGSI_FILE_INPUT) { 1036b8e80941Smrg name = info->input_semantic_name; 1037b8e80941Smrg index = info->input_semantic_index; 1038b8e80941Smrg array_first = info->input_array_first; 1039b8e80941Smrg } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 1040b8e80941Smrg name = info->output_semantic_name; 1041b8e80941Smrg index = info->output_semantic_index; 1042b8e80941Smrg array_first = info->output_array_first; 1043b8e80941Smrg } else { 1044b8e80941Smrg assert(0); 1045b8e80941Smrg return NULL; 1046b8e80941Smrg } 1047b8e80941Smrg 1048b8e80941Smrg if (reg.Register.Indirect) { 1049b8e80941Smrg if (reg.Indirect.ArrayID) 1050b8e80941Smrg param_base = array_first[reg.Indirect.ArrayID]; 1051b8e80941Smrg else 1052b8e80941Smrg param_base = reg.Register.Index; 1053b8e80941Smrg 1054b8e80941Smrg param_index = si_get_indirect_index(ctx, ®.Indirect, 1055b8e80941Smrg 1, reg.Register.Index - param_base); 1056848b8605Smrg 1057848b8605Smrg } else { 1058b8e80941Smrg param_base = reg.Register.Index; 1059848b8605Smrg } 1060b8e80941Smrg 1061b8e80941Smrg return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1062b8e80941Smrg param_index, param_base, 1063b8e80941Smrg name, index, !reg.Register.Dimension); 1064848b8605Smrg} 1065848b8605Smrg 1066b8e80941Smrgstatic LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, 1067b8e80941Smrg LLVMTypeRef type, unsigned swizzle, 1068b8e80941Smrg LLVMValueRef buffer, LLVMValueRef offset, 1069b8e80941Smrg LLVMValueRef base, bool can_speculate) 1070848b8605Smrg{ 1071b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1072b8e80941Smrg LLVMValueRef value, value2; 1073b8e80941Smrg LLVMTypeRef vec_type = LLVMVectorType(type, 4); 1074848b8605Smrg 1075b8e80941Smrg if (swizzle == ~0) { 1076b8e80941Smrg value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1077b8e80941Smrg 0, 1, 0, can_speculate, false); 1078848b8605Smrg 1079b8e80941Smrg return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1080b8e80941Smrg } 1081848b8605Smrg 1082b8e80941Smrg if (!llvm_type_is_64bit(ctx, type)) { 1083b8e80941Smrg value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1084b8e80941Smrg 0, 1, 0, can_speculate, false); 1085848b8605Smrg 1086b8e80941Smrg value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1087b8e80941Smrg return LLVMBuildExtractElement(ctx->ac.builder, value, 1088b8e80941Smrg LLVMConstInt(ctx->i32, swizzle, 0), ""); 1089b8e80941Smrg } 1090848b8605Smrg 1091b8e80941Smrg value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1092b8e80941Smrg swizzle * 4, 1, 0, can_speculate, false); 1093b8e80941Smrg 1094b8e80941Smrg value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1095b8e80941Smrg swizzle * 4 + 4, 1, 0, can_speculate, false); 1096b8e80941Smrg 1097b8e80941Smrg return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1098b8e80941Smrg} 1099b8e80941Smrg 1100b8e80941Smrg/** 1101b8e80941Smrg * Load from LDS. 1102b8e80941Smrg * 1103b8e80941Smrg * \param type output value type 1104b8e80941Smrg * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 1105b8e80941Smrg * \param dw_addr address in dwords 1106b8e80941Smrg */ 1107b8e80941Smrgstatic LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, 1108b8e80941Smrg LLVMTypeRef type, unsigned swizzle, 1109b8e80941Smrg LLVMValueRef dw_addr) 1110b8e80941Smrg{ 1111b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1112b8e80941Smrg LLVMValueRef value; 1113b8e80941Smrg 1114b8e80941Smrg if (swizzle == ~0) { 1115b8e80941Smrg LLVMValueRef values[TGSI_NUM_CHANNELS]; 1116b8e80941Smrg 1117b8e80941Smrg for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) 1118b8e80941Smrg values[chan] = lds_load(bld_base, type, chan, dw_addr); 1119b8e80941Smrg 1120b8e80941Smrg return ac_build_gather_values(&ctx->ac, values, 1121b8e80941Smrg TGSI_NUM_CHANNELS); 1122b8e80941Smrg } 1123b8e80941Smrg 1124b8e80941Smrg /* Split 64-bit loads. */ 1125b8e80941Smrg if (llvm_type_is_64bit(ctx, type)) { 1126b8e80941Smrg LLVMValueRef lo, hi; 1127848b8605Smrg 1128b8e80941Smrg lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr); 1129b8e80941Smrg hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr); 1130b8e80941Smrg return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi); 1131848b8605Smrg } 1132b8e80941Smrg 1133b8e80941Smrg dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, 1134b8e80941Smrg LLVMConstInt(ctx->i32, swizzle, 0), ""); 1135b8e80941Smrg 1136b8e80941Smrg value = ac_lds_load(&ctx->ac, dw_addr); 1137b8e80941Smrg 1138b8e80941Smrg return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1139848b8605Smrg} 1140848b8605Smrg 1141b8e80941Smrg/** 1142b8e80941Smrg * Store to LDS. 1143b8e80941Smrg * 1144b8e80941Smrg * \param swizzle offset (typically 0..3) 1145b8e80941Smrg * \param dw_addr address in dwords 1146b8e80941Smrg * \param value value to store 1147b8e80941Smrg */ 1148b8e80941Smrgstatic void lds_store(struct si_shader_context *ctx, 1149b8e80941Smrg unsigned dw_offset_imm, LLVMValueRef dw_addr, 1150b8e80941Smrg LLVMValueRef value) 1151848b8605Smrg{ 1152b8e80941Smrg dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, 1153b8e80941Smrg LLVMConstInt(ctx->i32, dw_offset_imm, 0), ""); 1154848b8605Smrg 1155b8e80941Smrg ac_lds_store(&ctx->ac, dw_addr, value); 1156b8e80941Smrg} 1157848b8605Smrg 1158b8e80941Smrgenum si_tess_ring { 1159b8e80941Smrg TCS_FACTOR_RING, 1160b8e80941Smrg TESS_OFFCHIP_RING_TCS, 1161b8e80941Smrg TESS_OFFCHIP_RING_TES, 1162b8e80941Smrg}; 1163b8e80941Smrg 1164b8e80941Smrgstatic LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, 1165b8e80941Smrg enum si_tess_ring ring) 1166b8e80941Smrg{ 1167b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 1168b8e80941Smrg unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr : 1169b8e80941Smrg ctx->param_tcs_out_lds_layout; 1170b8e80941Smrg LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); 1171b8e80941Smrg 1172b8e80941Smrg /* TCS only receives high 13 bits of the address. */ 1173b8e80941Smrg if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { 1174b8e80941Smrg addr = LLVMBuildAnd(builder, addr, 1175b8e80941Smrg LLVMConstInt(ctx->i32, 0xfff80000, 0), ""); 1176b8e80941Smrg } 1177b8e80941Smrg 1178b8e80941Smrg if (ring == TCS_FACTOR_RING) { 1179b8e80941Smrg unsigned tf_offset = ctx->screen->tess_offchip_ring_size; 1180b8e80941Smrg addr = LLVMBuildAdd(builder, addr, 1181b8e80941Smrg LLVMConstInt(ctx->i32, tf_offset, 0), ""); 1182848b8605Smrg } 1183b8e80941Smrg 1184b8e80941Smrg LLVMValueRef desc[4]; 1185b8e80941Smrg desc[0] = addr; 1186b8e80941Smrg desc[1] = LLVMConstInt(ctx->i32, 1187b8e80941Smrg S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); 1188b8e80941Smrg desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0); 1189b8e80941Smrg desc[3] = LLVMConstInt(ctx->i32, 1190b8e80941Smrg S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 1191b8e80941Smrg S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 1192b8e80941Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 1193b8e80941Smrg S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 1194b8e80941Smrg S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 1195b8e80941Smrg S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0); 1196b8e80941Smrg 1197b8e80941Smrg return ac_build_gather_values(&ctx->ac, desc, 4); 1198848b8605Smrg} 1199848b8605Smrg 1200b8e80941Smrgstatic LLVMValueRef fetch_input_tcs( 1201b8e80941Smrg struct lp_build_tgsi_context *bld_base, 1202b8e80941Smrg const struct tgsi_full_src_register *reg, 1203b8e80941Smrg enum tgsi_opcode_type type, unsigned swizzle_in) 1204b8e80941Smrg{ 1205b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1206b8e80941Smrg LLVMValueRef dw_addr, stride; 1207b8e80941Smrg unsigned swizzle = swizzle_in & 0xffff; 1208b8e80941Smrg stride = get_tcs_in_vertex_dw_stride(ctx); 1209b8e80941Smrg dw_addr = get_tcs_in_current_patch_offset(ctx); 1210b8e80941Smrg dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1211b8e80941Smrg 1212b8e80941Smrg return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1213b8e80941Smrg} 1214848b8605Smrg 1215b8e80941Smrgstatic LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, 1216b8e80941Smrg LLVMTypeRef type, 1217b8e80941Smrg LLVMValueRef vertex_index, 1218b8e80941Smrg LLVMValueRef param_index, 1219b8e80941Smrg unsigned const_index, 1220b8e80941Smrg unsigned location, 1221b8e80941Smrg unsigned driver_location, 1222b8e80941Smrg unsigned component, 1223b8e80941Smrg unsigned num_components, 1224b8e80941Smrg bool is_patch, 1225b8e80941Smrg bool is_compact, 1226b8e80941Smrg bool load_input) 1227b8e80941Smrg{ 1228b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1229b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 1230b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1231b8e80941Smrg LLVMValueRef dw_addr, stride; 1232b8e80941Smrg 1233b8e80941Smrg driver_location = driver_location / 4; 1234b8e80941Smrg 1235b8e80941Smrg if (load_input) { 1236b8e80941Smrg stride = get_tcs_in_vertex_dw_stride(ctx); 1237b8e80941Smrg dw_addr = get_tcs_in_current_patch_offset(ctx); 1238b8e80941Smrg } else { 1239b8e80941Smrg if (is_patch) { 1240b8e80941Smrg stride = NULL; 1241b8e80941Smrg dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1242b8e80941Smrg } else { 1243b8e80941Smrg stride = get_tcs_out_vertex_dw_stride(ctx); 1244b8e80941Smrg dw_addr = get_tcs_out_current_patch_offset(ctx); 1245b8e80941Smrg } 1246b8e80941Smrg } 1247b8e80941Smrg 1248b8e80941Smrg if (param_index) { 1249b8e80941Smrg /* Add the constant index to the indirect index */ 1250b8e80941Smrg param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1251b8e80941Smrg LLVMConstInt(ctx->i32, const_index, 0), ""); 1252b8e80941Smrg } else { 1253b8e80941Smrg param_index = LLVMConstInt(ctx->i32, const_index, 0); 1254b8e80941Smrg } 1255b8e80941Smrg 1256b8e80941Smrg ubyte *names; 1257b8e80941Smrg ubyte *indices; 1258b8e80941Smrg if (load_input) { 1259b8e80941Smrg names = info->input_semantic_name; 1260b8e80941Smrg indices = info->input_semantic_index; 1261b8e80941Smrg } else { 1262b8e80941Smrg names = info->output_semantic_name; 1263b8e80941Smrg indices = info->output_semantic_index; 1264b8e80941Smrg } 1265b8e80941Smrg 1266b8e80941Smrg dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1267b8e80941Smrg vertex_index, param_index, 1268b8e80941Smrg driver_location, 1269b8e80941Smrg names, indices, 1270b8e80941Smrg is_patch); 1271b8e80941Smrg 1272b8e80941Smrg LLVMValueRef value[4]; 1273b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 1274b8e80941Smrg unsigned offset = i; 1275b8e80941Smrg if (llvm_type_is_64bit(ctx, type)) 1276b8e80941Smrg offset *= 2; 1277848b8605Smrg 1278b8e80941Smrg offset += component; 1279b8e80941Smrg value[i + component] = lds_load(bld_base, type, offset, dw_addr); 1280b8e80941Smrg } 1281b8e80941Smrg 1282b8e80941Smrg return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1283848b8605Smrg} 1284848b8605Smrg 1285b8e80941Smrgstatic LLVMValueRef fetch_output_tcs( 1286b8e80941Smrg struct lp_build_tgsi_context *bld_base, 1287b8e80941Smrg const struct tgsi_full_src_register *reg, 1288b8e80941Smrg enum tgsi_opcode_type type, unsigned swizzle_in) 1289848b8605Smrg{ 1290b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1291b8e80941Smrg LLVMValueRef dw_addr, stride; 1292b8e80941Smrg unsigned swizzle = (swizzle_in & 0xffff); 1293b8e80941Smrg 1294b8e80941Smrg if (reg->Register.Dimension) { 1295b8e80941Smrg stride = get_tcs_out_vertex_dw_stride(ctx); 1296b8e80941Smrg dw_addr = get_tcs_out_current_patch_offset(ctx); 1297b8e80941Smrg dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1298b8e80941Smrg } else { 1299b8e80941Smrg dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1300b8e80941Smrg dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); 1301b8e80941Smrg } 1302848b8605Smrg 1303b8e80941Smrg return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1304b8e80941Smrg} 1305848b8605Smrg 1306b8e80941Smrgstatic LLVMValueRef fetch_input_tes( 1307b8e80941Smrg struct lp_build_tgsi_context *bld_base, 1308b8e80941Smrg const struct tgsi_full_src_register *reg, 1309b8e80941Smrg enum tgsi_opcode_type type, unsigned swizzle_in) 1310b8e80941Smrg{ 1311b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1312b8e80941Smrg LLVMValueRef base, addr; 1313b8e80941Smrg unsigned swizzle = (swizzle_in & 0xffff); 1314848b8605Smrg 1315b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1316b8e80941Smrg addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); 1317848b8605Smrg 1318b8e80941Smrg return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, 1319b8e80941Smrg ctx->tess_offchip_ring, base, addr, true); 1320b8e80941Smrg} 1321848b8605Smrg 1322b8e80941SmrgLLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, 1323b8e80941Smrg LLVMTypeRef type, 1324b8e80941Smrg LLVMValueRef vertex_index, 1325b8e80941Smrg LLVMValueRef param_index, 1326b8e80941Smrg unsigned const_index, 1327b8e80941Smrg unsigned location, 1328b8e80941Smrg unsigned driver_location, 1329b8e80941Smrg unsigned component, 1330b8e80941Smrg unsigned num_components, 1331b8e80941Smrg bool is_patch, 1332b8e80941Smrg bool is_compact, 1333b8e80941Smrg bool load_input) 1334b8e80941Smrg{ 1335b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1336b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 1337b8e80941Smrg LLVMValueRef base, addr; 1338848b8605Smrg 1339b8e80941Smrg driver_location = driver_location / 4; 1340848b8605Smrg 1341b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1342848b8605Smrg 1343b8e80941Smrg if (param_index) { 1344b8e80941Smrg /* Add the constant index to the indirect index */ 1345b8e80941Smrg param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1346b8e80941Smrg LLVMConstInt(ctx->i32, const_index, 0), ""); 1347b8e80941Smrg } else { 1348b8e80941Smrg param_index = LLVMConstInt(ctx->i32, const_index, 0); 1349b8e80941Smrg } 1350848b8605Smrg 1351b8e80941Smrg addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1352b8e80941Smrg param_index, driver_location, 1353b8e80941Smrg info->input_semantic_name, 1354b8e80941Smrg info->input_semantic_index, 1355b8e80941Smrg is_patch); 1356b8e80941Smrg 1357b8e80941Smrg /* TODO: This will generate rather ordinary llvm code, although it 1358b8e80941Smrg * should be easy for the optimiser to fix up. In future we might want 1359b8e80941Smrg * to refactor buffer_load(), but for now this maximises code sharing 1360b8e80941Smrg * between the NIR and TGSI backends. 1361b8e80941Smrg */ 1362b8e80941Smrg LLVMValueRef value[4]; 1363b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 1364b8e80941Smrg unsigned offset = i; 1365b8e80941Smrg if (llvm_type_is_64bit(ctx, type)) 1366b8e80941Smrg offset *= 2; 1367b8e80941Smrg 1368b8e80941Smrg offset += component; 1369b8e80941Smrg value[i + component] = buffer_load(&ctx->bld_base, type, offset, 1370b8e80941Smrg ctx->tess_offchip_ring, base, addr, true); 1371b8e80941Smrg } 1372b8e80941Smrg 1373b8e80941Smrg return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1374b8e80941Smrg} 1375848b8605Smrg 1376b8e80941Smrgstatic void store_output_tcs(struct lp_build_tgsi_context *bld_base, 1377b8e80941Smrg const struct tgsi_full_instruction *inst, 1378b8e80941Smrg const struct tgsi_opcode_info *info, 1379b8e80941Smrg unsigned index, 1380b8e80941Smrg LLVMValueRef dst[4]) 1381b8e80941Smrg{ 1382b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1383b8e80941Smrg const struct tgsi_full_dst_register *reg = &inst->Dst[index]; 1384b8e80941Smrg const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; 1385b8e80941Smrg unsigned chan_index; 1386b8e80941Smrg LLVMValueRef dw_addr, stride; 1387b8e80941Smrg LLVMValueRef buffer, base, buf_addr; 1388b8e80941Smrg LLVMValueRef values[4]; 1389b8e80941Smrg bool skip_lds_store; 1390b8e80941Smrg bool is_tess_factor = false, is_tess_inner = false; 1391b8e80941Smrg 1392b8e80941Smrg /* Only handle per-patch and per-vertex outputs here. 1393b8e80941Smrg * Vectors will be lowered to scalars and this function will be called again. 1394b8e80941Smrg */ 1395b8e80941Smrg if (reg->Register.File != TGSI_FILE_OUTPUT || 1396b8e80941Smrg (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { 1397b8e80941Smrg si_llvm_emit_store(bld_base, inst, info, index, dst); 1398b8e80941Smrg return; 1399b8e80941Smrg } 1400848b8605Smrg 1401b8e80941Smrg if (reg->Register.Dimension) { 1402b8e80941Smrg stride = get_tcs_out_vertex_dw_stride(ctx); 1403b8e80941Smrg dw_addr = get_tcs_out_current_patch_offset(ctx); 1404b8e80941Smrg dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); 1405b8e80941Smrg skip_lds_store = !sh_info->reads_pervertex_outputs; 1406b8e80941Smrg } else { 1407b8e80941Smrg dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1408b8e80941Smrg dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); 1409b8e80941Smrg skip_lds_store = !sh_info->reads_perpatch_outputs; 1410b8e80941Smrg 1411b8e80941Smrg if (!reg->Register.Indirect) { 1412b8e80941Smrg int name = sh_info->output_semantic_name[reg->Register.Index]; 1413b8e80941Smrg 1414b8e80941Smrg /* Always write tess factors into LDS for the TCS epilog. */ 1415b8e80941Smrg if (name == TGSI_SEMANTIC_TESSINNER || 1416b8e80941Smrg name == TGSI_SEMANTIC_TESSOUTER) { 1417b8e80941Smrg /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ 1418b8e80941Smrg skip_lds_store = !sh_info->reads_tessfactor_outputs && 1419b8e80941Smrg ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1420b8e80941Smrg is_tess_factor = true; 1421b8e80941Smrg is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1422b8e80941Smrg } 1423848b8605Smrg } 1424b8e80941Smrg } 1425848b8605Smrg 1426b8e80941Smrg buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 1427848b8605Smrg 1428b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1429b8e80941Smrg buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); 1430848b8605Smrg 1431b8e80941Smrg uint32_t writemask = reg->Register.WriteMask; 1432b8e80941Smrg while (writemask) { 1433b8e80941Smrg chan_index = u_bit_scan(&writemask); 1434b8e80941Smrg LLVMValueRef value = dst[chan_index]; 1435848b8605Smrg 1436b8e80941Smrg if (inst->Instruction.Saturate) 1437b8e80941Smrg value = ac_build_clamp(&ctx->ac, value); 1438848b8605Smrg 1439b8e80941Smrg /* Skip LDS stores if there is no LDS read of this output. */ 1440b8e80941Smrg if (!skip_lds_store) 1441b8e80941Smrg lds_store(ctx, chan_index, dw_addr, value); 1442848b8605Smrg 1443b8e80941Smrg value = ac_to_integer(&ctx->ac, value); 1444b8e80941Smrg values[chan_index] = value; 1445848b8605Smrg 1446b8e80941Smrg if (reg->Register.WriteMask != 0xF && !is_tess_factor) { 1447b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1448b8e80941Smrg buf_addr, base, 1449b8e80941Smrg 4 * chan_index, 1, 0, true, false); 1450b8e80941Smrg } 1451848b8605Smrg 1452b8e80941Smrg /* Write tess factors into VGPRs for the epilog. */ 1453b8e80941Smrg if (is_tess_factor && 1454b8e80941Smrg ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1455b8e80941Smrg if (!is_tess_inner) { 1456b8e80941Smrg LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1457b8e80941Smrg ctx->invoc0_tess_factors[chan_index]); 1458b8e80941Smrg } else if (chan_index < 2) { 1459b8e80941Smrg LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1460b8e80941Smrg ctx->invoc0_tess_factors[4 + chan_index]); 1461848b8605Smrg } 1462848b8605Smrg } 1463848b8605Smrg } 1464848b8605Smrg 1465b8e80941Smrg if (reg->Register.WriteMask == 0xF && !is_tess_factor) { 1466b8e80941Smrg LLVMValueRef value = ac_build_gather_values(&ctx->ac, 1467b8e80941Smrg values, 4); 1468b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, 1469b8e80941Smrg base, 0, 1, 0, true, false); 1470b8e80941Smrg } 1471b8e80941Smrg} 1472848b8605Smrg 1473b8e80941Smrgstatic void si_nir_store_output_tcs(struct ac_shader_abi *abi, 1474b8e80941Smrg const struct nir_variable *var, 1475b8e80941Smrg LLVMValueRef vertex_index, 1476b8e80941Smrg LLVMValueRef param_index, 1477b8e80941Smrg unsigned const_index, 1478b8e80941Smrg LLVMValueRef src, 1479b8e80941Smrg unsigned writemask) 1480848b8605Smrg{ 1481b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1482b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 1483b8e80941Smrg const unsigned component = var->data.location_frac; 1484b8e80941Smrg const bool is_patch = var->data.patch; 1485b8e80941Smrg unsigned driver_location = var->data.driver_location; 1486b8e80941Smrg LLVMValueRef dw_addr, stride; 1487b8e80941Smrg LLVMValueRef buffer, base, addr; 1488b8e80941Smrg LLVMValueRef values[4]; 1489b8e80941Smrg bool skip_lds_store; 1490b8e80941Smrg bool is_tess_factor = false, is_tess_inner = false; 1491b8e80941Smrg 1492b8e80941Smrg driver_location = driver_location / 4; 1493848b8605Smrg 1494b8e80941Smrg if (param_index) { 1495b8e80941Smrg /* Add the constant index to the indirect index */ 1496b8e80941Smrg param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1497b8e80941Smrg LLVMConstInt(ctx->i32, const_index, 0), ""); 1498b8e80941Smrg } else { 1499b8e80941Smrg if (const_index != 0) 1500b8e80941Smrg param_index = LLVMConstInt(ctx->i32, const_index, 0); 1501848b8605Smrg } 1502848b8605Smrg 1503b8e80941Smrg if (!is_patch) { 1504b8e80941Smrg stride = get_tcs_out_vertex_dw_stride(ctx); 1505b8e80941Smrg dw_addr = get_tcs_out_current_patch_offset(ctx); 1506b8e80941Smrg dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1507b8e80941Smrg vertex_index, param_index, 1508b8e80941Smrg driver_location, 1509b8e80941Smrg info->output_semantic_name, 1510b8e80941Smrg info->output_semantic_index, 1511b8e80941Smrg is_patch); 1512b8e80941Smrg 1513b8e80941Smrg skip_lds_store = !info->reads_pervertex_outputs; 1514b8e80941Smrg } else { 1515b8e80941Smrg dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1516b8e80941Smrg dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, 1517b8e80941Smrg vertex_index, param_index, 1518b8e80941Smrg driver_location, 1519b8e80941Smrg info->output_semantic_name, 1520b8e80941Smrg info->output_semantic_index, 1521b8e80941Smrg is_patch); 1522b8e80941Smrg 1523b8e80941Smrg skip_lds_store = !info->reads_perpatch_outputs; 1524b8e80941Smrg 1525b8e80941Smrg if (!param_index) { 1526b8e80941Smrg int name = info->output_semantic_name[driver_location]; 1527b8e80941Smrg 1528b8e80941Smrg /* Always write tess factors into LDS for the TCS epilog. */ 1529b8e80941Smrg if (name == TGSI_SEMANTIC_TESSINNER || 1530b8e80941Smrg name == TGSI_SEMANTIC_TESSOUTER) { 1531b8e80941Smrg /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ 1532b8e80941Smrg skip_lds_store = !info->reads_tessfactor_outputs && 1533b8e80941Smrg ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1534b8e80941Smrg is_tess_factor = true; 1535b8e80941Smrg is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1536b8e80941Smrg } 1537b8e80941Smrg } 1538b8e80941Smrg } 1539848b8605Smrg 1540b8e80941Smrg buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 1541b8e80941Smrg 1542b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1543b8e80941Smrg 1544b8e80941Smrg addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1545b8e80941Smrg param_index, driver_location, 1546b8e80941Smrg info->output_semantic_name, 1547b8e80941Smrg info->output_semantic_index, 1548b8e80941Smrg is_patch); 1549b8e80941Smrg 1550b8e80941Smrg for (unsigned chan = 0; chan < 4; chan++) { 1551b8e80941Smrg if (!(writemask & (1 << chan))) 1552848b8605Smrg continue; 1553b8e80941Smrg LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); 1554b8e80941Smrg 1555b8e80941Smrg /* Skip LDS stores if there is no LDS read of this output. */ 1556b8e80941Smrg if (!skip_lds_store) 1557b8e80941Smrg lds_store(ctx, chan, dw_addr, value); 1558b8e80941Smrg 1559b8e80941Smrg value = ac_to_integer(&ctx->ac, value); 1560b8e80941Smrg values[chan] = value; 1561b8e80941Smrg 1562b8e80941Smrg if (writemask != 0xF && !is_tess_factor) { 1563b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1564b8e80941Smrg addr, base, 1565b8e80941Smrg 4 * chan, 1, 0, true, false); 1566b8e80941Smrg } 1567b8e80941Smrg 1568b8e80941Smrg /* Write tess factors into VGPRs for the epilog. */ 1569b8e80941Smrg if (is_tess_factor && 1570b8e80941Smrg ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1571b8e80941Smrg if (!is_tess_inner) { 1572b8e80941Smrg LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1573b8e80941Smrg ctx->invoc0_tess_factors[chan]); 1574b8e80941Smrg } else if (chan < 2) { 1575b8e80941Smrg LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1576b8e80941Smrg ctx->invoc0_tess_factors[4 + chan]); 1577b8e80941Smrg } 1578b8e80941Smrg } 1579b8e80941Smrg } 1580b8e80941Smrg 1581b8e80941Smrg if (writemask == 0xF && !is_tess_factor) { 1582b8e80941Smrg LLVMValueRef value = ac_build_gather_values(&ctx->ac, 1583b8e80941Smrg values, 4); 1584b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, 1585b8e80941Smrg base, 0, 1, 0, true, false); 1586b8e80941Smrg } 1587b8e80941Smrg} 1588b8e80941Smrg 1589b8e80941SmrgLLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, 1590b8e80941Smrg unsigned input_index, 1591b8e80941Smrg unsigned vtx_offset_param, 1592b8e80941Smrg LLVMTypeRef type, 1593b8e80941Smrg unsigned swizzle) 1594b8e80941Smrg{ 1595b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1596b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1597b8e80941Smrg struct si_shader *shader = ctx->shader; 1598b8e80941Smrg LLVMValueRef vtx_offset, soffset; 1599b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 1600b8e80941Smrg unsigned semantic_name = info->input_semantic_name[input_index]; 1601b8e80941Smrg unsigned semantic_index = info->input_semantic_index[input_index]; 1602b8e80941Smrg unsigned param; 1603b8e80941Smrg LLVMValueRef value; 1604b8e80941Smrg 1605b8e80941Smrg param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); 1606b8e80941Smrg 1607b8e80941Smrg /* GFX9 has the ESGS ring in LDS. */ 1608b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 1609b8e80941Smrg unsigned index = vtx_offset_param; 1610b8e80941Smrg 1611b8e80941Smrg switch (index / 2) { 1612b8e80941Smrg case 0: 1613b8e80941Smrg vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 1614b8e80941Smrg index % 2 ? 16 : 0, 16); 1615848b8605Smrg break; 1616b8e80941Smrg case 1: 1617b8e80941Smrg vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 1618b8e80941Smrg index % 2 ? 16 : 0, 16); 1619848b8605Smrg break; 1620b8e80941Smrg case 2: 1621b8e80941Smrg vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset, 1622b8e80941Smrg index % 2 ? 16 : 0, 16); 1623848b8605Smrg break; 1624848b8605Smrg default: 1625b8e80941Smrg assert(0); 1626b8e80941Smrg return NULL; 1627848b8605Smrg } 1628848b8605Smrg 1629b8e80941Smrg vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, 1630b8e80941Smrg LLVMConstInt(ctx->i32, param * 4, 0), ""); 1631b8e80941Smrg return lds_load(bld_base, type, swizzle, vtx_offset); 1632b8e80941Smrg } 1633848b8605Smrg 1634b8e80941Smrg /* GFX6: input load from the ESGS ring in memory. */ 1635b8e80941Smrg if (swizzle == ~0) { 1636b8e80941Smrg LLVMValueRef values[TGSI_NUM_CHANNELS]; 1637b8e80941Smrg unsigned chan; 1638b8e80941Smrg for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1639b8e80941Smrg values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, 1640b8e80941Smrg type, chan); 1641848b8605Smrg } 1642b8e80941Smrg return ac_build_gather_values(&ctx->ac, values, 1643b8e80941Smrg TGSI_NUM_CHANNELS); 1644848b8605Smrg } 1645848b8605Smrg 1646b8e80941Smrg /* Get the vertex offset parameter on GFX6. */ 1647b8e80941Smrg LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param]; 1648b8e80941Smrg 1649b8e80941Smrg vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, 1650b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), ""); 1651b8e80941Smrg 1652b8e80941Smrg soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); 1653b8e80941Smrg 1654b8e80941Smrg value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, 1655b8e80941Smrg vtx_offset, soffset, 0, 1, 0, true, false); 1656b8e80941Smrg if (llvm_type_is_64bit(ctx, type)) { 1657b8e80941Smrg LLVMValueRef value2; 1658b8e80941Smrg soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); 1659b8e80941Smrg 1660b8e80941Smrg value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, 1661b8e80941Smrg ctx->i32_0, vtx_offset, soffset, 1662b8e80941Smrg 0, 1, 0, true, false); 1663b8e80941Smrg return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1664848b8605Smrg } 1665b8e80941Smrg return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1666b8e80941Smrg} 1667848b8605Smrg 1668b8e80941Smrgstatic LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, 1669b8e80941Smrg unsigned location, 1670b8e80941Smrg unsigned driver_location, 1671b8e80941Smrg unsigned component, 1672b8e80941Smrg unsigned num_components, 1673b8e80941Smrg unsigned vertex_index, 1674b8e80941Smrg unsigned const_index, 1675b8e80941Smrg LLVMTypeRef type) 1676b8e80941Smrg{ 1677b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1678848b8605Smrg 1679b8e80941Smrg LLVMValueRef value[4]; 1680b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 1681b8e80941Smrg unsigned offset = i; 1682b8e80941Smrg if (llvm_type_is_64bit(ctx, type)) 1683b8e80941Smrg offset *= 2; 1684848b8605Smrg 1685b8e80941Smrg offset += component; 1686b8e80941Smrg value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4, 1687b8e80941Smrg vertex_index, type, offset); 1688848b8605Smrg } 1689848b8605Smrg 1690b8e80941Smrg return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1691b8e80941Smrg} 1692b8e80941Smrg 1693b8e80941Smrgstatic LLVMValueRef fetch_input_gs( 1694b8e80941Smrg struct lp_build_tgsi_context *bld_base, 1695b8e80941Smrg const struct tgsi_full_src_register *reg, 1696b8e80941Smrg enum tgsi_opcode_type type, 1697b8e80941Smrg unsigned swizzle_in) 1698b8e80941Smrg{ 1699b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 1700b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 1701b8e80941Smrg unsigned swizzle = swizzle_in & 0xffff; 1702b8e80941Smrg 1703b8e80941Smrg unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; 1704b8e80941Smrg if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) 1705b8e80941Smrg return get_primitive_id(ctx, swizzle); 1706b8e80941Smrg 1707b8e80941Smrg if (!reg->Register.Dimension) 1708b8e80941Smrg return NULL; 1709b8e80941Smrg 1710b8e80941Smrg return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index, 1711b8e80941Smrg reg->Dimension.Index, 1712b8e80941Smrg tgsi2llvmtype(bld_base, type), 1713b8e80941Smrg swizzle); 1714b8e80941Smrg} 1715b8e80941Smrg 1716b8e80941Smrgstatic int lookup_interp_param_index(unsigned interpolate, unsigned location) 1717b8e80941Smrg{ 1718b8e80941Smrg switch (interpolate) { 1719b8e80941Smrg case TGSI_INTERPOLATE_CONSTANT: 1720b8e80941Smrg return 0; 1721b8e80941Smrg 1722b8e80941Smrg case TGSI_INTERPOLATE_LINEAR: 1723b8e80941Smrg if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1724b8e80941Smrg return SI_PARAM_LINEAR_SAMPLE; 1725b8e80941Smrg else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1726b8e80941Smrg return SI_PARAM_LINEAR_CENTROID; 1727b8e80941Smrg else 1728b8e80941Smrg return SI_PARAM_LINEAR_CENTER; 1729b8e80941Smrg break; 1730b8e80941Smrg case TGSI_INTERPOLATE_COLOR: 1731b8e80941Smrg case TGSI_INTERPOLATE_PERSPECTIVE: 1732b8e80941Smrg if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1733b8e80941Smrg return SI_PARAM_PERSP_SAMPLE; 1734b8e80941Smrg else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1735b8e80941Smrg return SI_PARAM_PERSP_CENTROID; 1736b8e80941Smrg else 1737b8e80941Smrg return SI_PARAM_PERSP_CENTER; 1738b8e80941Smrg break; 1739b8e80941Smrg default: 1740b8e80941Smrg fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); 1741b8e80941Smrg return -1; 1742b8e80941Smrg } 1743b8e80941Smrg} 1744b8e80941Smrg 1745b8e80941Smrgstatic LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, 1746b8e80941Smrg unsigned attr_index, unsigned chan, 1747b8e80941Smrg LLVMValueRef prim_mask, 1748b8e80941Smrg LLVMValueRef i, LLVMValueRef j) 1749b8e80941Smrg{ 1750b8e80941Smrg if (i || j) { 1751b8e80941Smrg return ac_build_fs_interp(&ctx->ac, 1752b8e80941Smrg LLVMConstInt(ctx->i32, chan, 0), 1753b8e80941Smrg LLVMConstInt(ctx->i32, attr_index, 0), 1754b8e80941Smrg prim_mask, i, j); 1755b8e80941Smrg } 1756b8e80941Smrg return ac_build_fs_interp_mov(&ctx->ac, 1757b8e80941Smrg LLVMConstInt(ctx->i32, 2, 0), /* P0 */ 1758b8e80941Smrg LLVMConstInt(ctx->i32, chan, 0), 1759b8e80941Smrg LLVMConstInt(ctx->i32, attr_index, 0), 1760b8e80941Smrg prim_mask); 1761b8e80941Smrg} 1762b8e80941Smrg 1763b8e80941Smrg/** 1764b8e80941Smrg * Interpolate a fragment shader input. 1765b8e80941Smrg * 1766b8e80941Smrg * @param ctx context 1767b8e80941Smrg * @param input_index index of the input in hardware 1768b8e80941Smrg * @param semantic_name TGSI_SEMANTIC_* 1769b8e80941Smrg * @param semantic_index semantic index 1770b8e80941Smrg * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) 1771b8e80941Smrg * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) 1772b8e80941Smrg * @param interp_param interpolation weights (i,j) 1773b8e80941Smrg * @param prim_mask SI_PARAM_PRIM_MASK 1774b8e80941Smrg * @param face SI_PARAM_FRONT_FACE 1775b8e80941Smrg * @param result the return value (4 components) 1776b8e80941Smrg */ 1777b8e80941Smrgstatic void interp_fs_input(struct si_shader_context *ctx, 1778b8e80941Smrg unsigned input_index, 1779b8e80941Smrg unsigned semantic_name, 1780b8e80941Smrg unsigned semantic_index, 1781b8e80941Smrg unsigned num_interp_inputs, 1782b8e80941Smrg unsigned colors_read_mask, 1783b8e80941Smrg LLVMValueRef interp_param, 1784b8e80941Smrg LLVMValueRef prim_mask, 1785b8e80941Smrg LLVMValueRef face, 1786b8e80941Smrg LLVMValueRef result[4]) 1787b8e80941Smrg{ 1788b8e80941Smrg LLVMValueRef i = NULL, j = NULL; 1789b8e80941Smrg unsigned chan; 1790b8e80941Smrg 1791b8e80941Smrg /* fs.constant returns the param from the middle vertex, so it's not 1792b8e80941Smrg * really useful for flat shading. It's meant to be used for custom 1793b8e80941Smrg * interpolation (but the intrinsic can't fetch from the other two 1794b8e80941Smrg * vertices). 1795b8e80941Smrg * 1796b8e80941Smrg * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state 1797b8e80941Smrg * to do the right thing. The only reason we use fs.constant is that 1798b8e80941Smrg * fs.interp cannot be used on integers, because they can be equal 1799b8e80941Smrg * to NaN. 1800b8e80941Smrg * 1801b8e80941Smrg * When interp is false we will use fs.constant or for newer llvm, 1802b8e80941Smrg * amdgcn.interp.mov. 1803b8e80941Smrg */ 1804b8e80941Smrg bool interp = interp_param != NULL; 1805b8e80941Smrg 1806b8e80941Smrg if (interp) { 1807b8e80941Smrg interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, 1808b8e80941Smrg LLVMVectorType(ctx->f32, 2), ""); 1809b8e80941Smrg 1810b8e80941Smrg i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1811b8e80941Smrg ctx->i32_0, ""); 1812b8e80941Smrg j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1813b8e80941Smrg ctx->i32_1, ""); 1814b8e80941Smrg } 1815b8e80941Smrg 1816b8e80941Smrg if (semantic_name == TGSI_SEMANTIC_COLOR && 1817b8e80941Smrg ctx->shader->key.part.ps.prolog.color_two_side) { 1818b8e80941Smrg LLVMValueRef is_face_positive; 1819b8e80941Smrg 1820b8e80941Smrg /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", 1821b8e80941Smrg * otherwise it's at offset "num_inputs". 1822b8e80941Smrg */ 1823b8e80941Smrg unsigned back_attr_offset = num_interp_inputs; 1824b8e80941Smrg if (semantic_index == 1 && colors_read_mask & 0xf) 1825b8e80941Smrg back_attr_offset += 1; 1826b8e80941Smrg 1827b8e80941Smrg is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 1828b8e80941Smrg face, ctx->i32_0, ""); 1829b8e80941Smrg 1830b8e80941Smrg for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1831b8e80941Smrg LLVMValueRef front, back; 1832b8e80941Smrg 1833b8e80941Smrg front = si_build_fs_interp(ctx, 1834b8e80941Smrg input_index, chan, 1835b8e80941Smrg prim_mask, i, j); 1836b8e80941Smrg back = si_build_fs_interp(ctx, 1837b8e80941Smrg back_attr_offset, chan, 1838b8e80941Smrg prim_mask, i, j); 1839b8e80941Smrg 1840b8e80941Smrg result[chan] = LLVMBuildSelect(ctx->ac.builder, 1841b8e80941Smrg is_face_positive, 1842b8e80941Smrg front, 1843b8e80941Smrg back, 1844b8e80941Smrg ""); 1845b8e80941Smrg } 1846b8e80941Smrg } else if (semantic_name == TGSI_SEMANTIC_FOG) { 1847b8e80941Smrg result[0] = si_build_fs_interp(ctx, input_index, 1848b8e80941Smrg 0, prim_mask, i, j); 1849b8e80941Smrg result[1] = 1850b8e80941Smrg result[2] = LLVMConstReal(ctx->f32, 0.0f); 1851b8e80941Smrg result[3] = LLVMConstReal(ctx->f32, 1.0f); 1852b8e80941Smrg } else { 1853b8e80941Smrg for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1854b8e80941Smrg result[chan] = si_build_fs_interp(ctx, 1855b8e80941Smrg input_index, chan, 1856b8e80941Smrg prim_mask, i, j); 1857b8e80941Smrg } 1858b8e80941Smrg } 1859b8e80941Smrg} 1860b8e80941Smrg 1861b8e80941Smrgvoid si_llvm_load_input_fs( 1862b8e80941Smrg struct si_shader_context *ctx, 1863b8e80941Smrg unsigned input_index, 1864b8e80941Smrg LLVMValueRef out[4]) 1865b8e80941Smrg{ 1866b8e80941Smrg struct si_shader *shader = ctx->shader; 1867b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 1868b8e80941Smrg LLVMValueRef main_fn = ctx->main_fn; 1869b8e80941Smrg LLVMValueRef interp_param = NULL; 1870b8e80941Smrg int interp_param_idx; 1871b8e80941Smrg enum tgsi_semantic semantic_name = info->input_semantic_name[input_index]; 1872b8e80941Smrg unsigned semantic_index = info->input_semantic_index[input_index]; 1873b8e80941Smrg enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index]; 1874b8e80941Smrg enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index]; 1875b8e80941Smrg 1876b8e80941Smrg /* Get colors from input VGPRs (set by the prolog). */ 1877b8e80941Smrg if (semantic_name == TGSI_SEMANTIC_COLOR) { 1878b8e80941Smrg unsigned colors_read = shader->selector->info.colors_read; 1879b8e80941Smrg unsigned mask = colors_read >> (semantic_index * 4); 1880b8e80941Smrg unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + 1881b8e80941Smrg (semantic_index ? util_bitcount(colors_read & 0xf) : 0); 1882b8e80941Smrg LLVMValueRef undef = LLVMGetUndef(ctx->f32); 1883b8e80941Smrg 1884b8e80941Smrg out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; 1885b8e80941Smrg out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; 1886b8e80941Smrg out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; 1887b8e80941Smrg out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; 1888b8e80941Smrg return; 1889b8e80941Smrg } 1890b8e80941Smrg 1891b8e80941Smrg interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc); 1892b8e80941Smrg if (interp_param_idx == -1) 1893b8e80941Smrg return; 1894b8e80941Smrg else if (interp_param_idx) { 1895b8e80941Smrg interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 1896b8e80941Smrg } 1897b8e80941Smrg 1898b8e80941Smrg interp_fs_input(ctx, input_index, semantic_name, 1899b8e80941Smrg semantic_index, 0, /* this param is unused */ 1900b8e80941Smrg shader->selector->info.colors_read, interp_param, 1901b8e80941Smrg ctx->abi.prim_mask, 1902b8e80941Smrg LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), 1903b8e80941Smrg &out[0]); 1904b8e80941Smrg} 1905b8e80941Smrg 1906b8e80941Smrgstatic void declare_input_fs( 1907b8e80941Smrg struct si_shader_context *ctx, 1908b8e80941Smrg unsigned input_index, 1909b8e80941Smrg const struct tgsi_full_declaration *decl, 1910b8e80941Smrg LLVMValueRef out[4]) 1911b8e80941Smrg{ 1912b8e80941Smrg si_llvm_load_input_fs(ctx, input_index, out); 1913b8e80941Smrg} 1914b8e80941Smrg 1915b8e80941SmrgLLVMValueRef si_get_sample_id(struct si_shader_context *ctx) 1916b8e80941Smrg{ 1917b8e80941Smrg return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4); 1918b8e80941Smrg} 1919b8e80941Smrg 1920b8e80941Smrgstatic LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) 1921b8e80941Smrg{ 1922b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1923b8e80941Smrg 1924b8e80941Smrg /* For non-indexed draws, the base vertex set by the driver 1925b8e80941Smrg * (for direct draws) or the CP (for indirect draws) is the 1926b8e80941Smrg * first vertex ID, but GLSL expects 0 to be returned. 1927b8e80941Smrg */ 1928b8e80941Smrg LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, 1929b8e80941Smrg ctx->param_vs_state_bits); 1930b8e80941Smrg LLVMValueRef indexed; 1931b8e80941Smrg 1932b8e80941Smrg indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, ""); 1933b8e80941Smrg indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, ""); 1934b8e80941Smrg 1935b8e80941Smrg return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex, 1936b8e80941Smrg ctx->i32_0, ""); 1937b8e80941Smrg} 1938b8e80941Smrg 1939b8e80941Smrgstatic LLVMValueRef get_block_size(struct ac_shader_abi *abi) 1940b8e80941Smrg{ 1941b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1942b8e80941Smrg 1943b8e80941Smrg LLVMValueRef values[3]; 1944b8e80941Smrg LLVMValueRef result; 1945b8e80941Smrg unsigned i; 1946b8e80941Smrg unsigned *properties = ctx->shader->selector->info.properties; 1947b8e80941Smrg 1948b8e80941Smrg if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { 1949b8e80941Smrg unsigned sizes[3] = { 1950b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], 1951b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], 1952b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] 1953b8e80941Smrg }; 1954b8e80941Smrg 1955b8e80941Smrg for (i = 0; i < 3; ++i) 1956b8e80941Smrg values[i] = LLVMConstInt(ctx->i32, sizes[i], 0); 1957b8e80941Smrg 1958b8e80941Smrg result = ac_build_gather_values(&ctx->ac, values, 3); 1959b8e80941Smrg } else { 1960b8e80941Smrg result = LLVMGetParam(ctx->main_fn, ctx->param_block_size); 1961b8e80941Smrg } 1962b8e80941Smrg 1963b8e80941Smrg return result; 1964b8e80941Smrg} 1965b8e80941Smrg 1966b8e80941Smrg/** 1967b8e80941Smrg * Load a dword from a constant buffer. 1968b8e80941Smrg */ 1969b8e80941Smrgstatic LLVMValueRef buffer_load_const(struct si_shader_context *ctx, 1970b8e80941Smrg LLVMValueRef resource, 1971b8e80941Smrg LLVMValueRef offset) 1972b8e80941Smrg{ 1973b8e80941Smrg return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 1974b8e80941Smrg 0, 0, 0, true, true); 1975b8e80941Smrg} 1976b8e80941Smrg 1977b8e80941Smrgstatic LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) 1978b8e80941Smrg{ 1979b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1980b8e80941Smrg LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 1981b8e80941Smrg LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); 1982b8e80941Smrg LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); 1983b8e80941Smrg 1984b8e80941Smrg /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ 1985b8e80941Smrg LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), ""); 1986b8e80941Smrg LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); 1987b8e80941Smrg 1988b8e80941Smrg LLVMValueRef pos[4] = { 1989b8e80941Smrg buffer_load_const(ctx, resource, offset0), 1990b8e80941Smrg buffer_load_const(ctx, resource, offset1), 1991b8e80941Smrg LLVMConstReal(ctx->f32, 0), 1992b8e80941Smrg LLVMConstReal(ctx->f32, 0) 1993b8e80941Smrg }; 1994b8e80941Smrg 1995b8e80941Smrg return ac_build_gather_values(&ctx->ac, pos, 4); 1996b8e80941Smrg} 1997b8e80941Smrg 1998b8e80941Smrgstatic LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) 1999b8e80941Smrg{ 2000b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2001b8e80941Smrg return ac_to_integer(&ctx->ac, abi->sample_coverage); 2002b8e80941Smrg} 2003b8e80941Smrg 2004b8e80941Smrgstatic LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) 2005b8e80941Smrg{ 2006b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2007b8e80941Smrg LLVMValueRef coord[4] = { 2008b8e80941Smrg LLVMGetParam(ctx->main_fn, ctx->param_tes_u), 2009b8e80941Smrg LLVMGetParam(ctx->main_fn, ctx->param_tes_v), 2010b8e80941Smrg ctx->ac.f32_0, 2011b8e80941Smrg ctx->ac.f32_0 2012b8e80941Smrg }; 2013b8e80941Smrg 2014b8e80941Smrg /* For triangles, the vector should be (u, v, 1-u-v). */ 2015b8e80941Smrg if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == 2016b8e80941Smrg PIPE_PRIM_TRIANGLES) { 2017b8e80941Smrg coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, 2018b8e80941Smrg LLVMBuildFAdd(ctx->ac.builder, 2019b8e80941Smrg coord[0], coord[1], ""), ""); 2020b8e80941Smrg } 2021b8e80941Smrg return ac_build_gather_values(&ctx->ac, coord, 4); 2022b8e80941Smrg} 2023b8e80941Smrg 2024b8e80941Smrgstatic LLVMValueRef load_tess_level(struct si_shader_context *ctx, 2025b8e80941Smrg unsigned semantic_name) 2026b8e80941Smrg{ 2027b8e80941Smrg LLVMValueRef base, addr; 2028b8e80941Smrg 2029b8e80941Smrg int param = si_shader_io_get_unique_index_patch(semantic_name, 0); 2030b8e80941Smrg 2031b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 2032b8e80941Smrg addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, 2033b8e80941Smrg LLVMConstInt(ctx->i32, param, 0)); 2034b8e80941Smrg 2035b8e80941Smrg return buffer_load(&ctx->bld_base, ctx->f32, 2036b8e80941Smrg ~0, ctx->tess_offchip_ring, base, addr, true); 2037b8e80941Smrg 2038b8e80941Smrg} 2039b8e80941Smrg 2040b8e80941Smrgstatic LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, 2041b8e80941Smrg unsigned varying_id) 2042b8e80941Smrg{ 2043b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2044b8e80941Smrg unsigned semantic_name; 2045b8e80941Smrg 2046b8e80941Smrg switch (varying_id) { 2047b8e80941Smrg case VARYING_SLOT_TESS_LEVEL_INNER: 2048b8e80941Smrg semantic_name = TGSI_SEMANTIC_TESSINNER; 2049b8e80941Smrg break; 2050b8e80941Smrg case VARYING_SLOT_TESS_LEVEL_OUTER: 2051b8e80941Smrg semantic_name = TGSI_SEMANTIC_TESSOUTER; 2052b8e80941Smrg break; 2053b8e80941Smrg default: 2054b8e80941Smrg unreachable("unknown tess level"); 2055b8e80941Smrg } 2056b8e80941Smrg 2057b8e80941Smrg return load_tess_level(ctx, semantic_name); 2058b8e80941Smrg 2059b8e80941Smrg} 2060b8e80941Smrg 2061b8e80941Smrgstatic LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) 2062b8e80941Smrg{ 2063b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2064b8e80941Smrg if (ctx->type == PIPE_SHADER_TESS_CTRL) 2065b8e80941Smrg return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6); 2066b8e80941Smrg else if (ctx->type == PIPE_SHADER_TESS_EVAL) 2067b8e80941Smrg return get_num_tcs_out_vertices(ctx); 2068b8e80941Smrg else 2069b8e80941Smrg unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); 2070b8e80941Smrg} 2071b8e80941Smrg 2072b8e80941Smrgvoid si_load_system_value(struct si_shader_context *ctx, 2073b8e80941Smrg unsigned index, 2074b8e80941Smrg const struct tgsi_full_declaration *decl) 2075b8e80941Smrg{ 2076b8e80941Smrg LLVMValueRef value = 0; 2077b8e80941Smrg 2078b8e80941Smrg assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES); 2079b8e80941Smrg 2080b8e80941Smrg switch (decl->Semantic.Name) { 2081b8e80941Smrg case TGSI_SEMANTIC_INSTANCEID: 2082b8e80941Smrg value = ctx->abi.instance_id; 2083b8e80941Smrg break; 2084b8e80941Smrg 2085b8e80941Smrg case TGSI_SEMANTIC_VERTEXID: 2086b8e80941Smrg value = LLVMBuildAdd(ctx->ac.builder, 2087b8e80941Smrg ctx->abi.vertex_id, 2088b8e80941Smrg ctx->abi.base_vertex, ""); 2089b8e80941Smrg break; 2090b8e80941Smrg 2091b8e80941Smrg case TGSI_SEMANTIC_VERTEXID_NOBASE: 2092b8e80941Smrg /* Unused. Clarify the meaning in indexed vs. non-indexed 2093b8e80941Smrg * draws if this is ever used again. */ 2094b8e80941Smrg assert(false); 2095b8e80941Smrg break; 2096b8e80941Smrg 2097b8e80941Smrg case TGSI_SEMANTIC_BASEVERTEX: 2098b8e80941Smrg value = get_base_vertex(&ctx->abi); 2099b8e80941Smrg break; 2100b8e80941Smrg 2101b8e80941Smrg case TGSI_SEMANTIC_BASEINSTANCE: 2102b8e80941Smrg value = ctx->abi.start_instance; 2103b8e80941Smrg break; 2104b8e80941Smrg 2105b8e80941Smrg case TGSI_SEMANTIC_DRAWID: 2106b8e80941Smrg value = ctx->abi.draw_id; 2107b8e80941Smrg break; 2108b8e80941Smrg 2109b8e80941Smrg case TGSI_SEMANTIC_INVOCATIONID: 2110b8e80941Smrg if (ctx->type == PIPE_SHADER_TESS_CTRL) 2111b8e80941Smrg value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 2112b8e80941Smrg else if (ctx->type == PIPE_SHADER_GEOMETRY) 2113b8e80941Smrg value = ctx->abi.gs_invocation_id; 2114b8e80941Smrg else 2115b8e80941Smrg assert(!"INVOCATIONID not implemented"); 2116b8e80941Smrg break; 2117b8e80941Smrg 2118b8e80941Smrg case TGSI_SEMANTIC_POSITION: 2119b8e80941Smrg { 2120b8e80941Smrg LLVMValueRef pos[4] = { 2121b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2122b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2123b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT), 2124b8e80941Smrg ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, 2125b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)), 2126b8e80941Smrg }; 2127b8e80941Smrg value = ac_build_gather_values(&ctx->ac, pos, 4); 2128b8e80941Smrg break; 2129b8e80941Smrg } 2130b8e80941Smrg 2131b8e80941Smrg case TGSI_SEMANTIC_FACE: 2132b8e80941Smrg value = ctx->abi.front_face; 2133b8e80941Smrg break; 2134b8e80941Smrg 2135b8e80941Smrg case TGSI_SEMANTIC_SAMPLEID: 2136b8e80941Smrg value = si_get_sample_id(ctx); 2137b8e80941Smrg break; 2138b8e80941Smrg 2139b8e80941Smrg case TGSI_SEMANTIC_SAMPLEPOS: { 2140b8e80941Smrg LLVMValueRef pos[4] = { 2141b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2142b8e80941Smrg LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2143b8e80941Smrg LLVMConstReal(ctx->f32, 0), 2144b8e80941Smrg LLVMConstReal(ctx->f32, 0) 2145b8e80941Smrg }; 2146b8e80941Smrg pos[0] = ac_build_fract(&ctx->ac, pos[0], 32); 2147b8e80941Smrg pos[1] = ac_build_fract(&ctx->ac, pos[1], 32); 2148b8e80941Smrg value = ac_build_gather_values(&ctx->ac, pos, 4); 2149b8e80941Smrg break; 2150b8e80941Smrg } 2151b8e80941Smrg 2152b8e80941Smrg case TGSI_SEMANTIC_SAMPLEMASK: 2153b8e80941Smrg /* This can only occur with the OpenGL Core profile, which 2154b8e80941Smrg * doesn't support smoothing. 2155b8e80941Smrg */ 2156b8e80941Smrg value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE); 2157b8e80941Smrg break; 2158b8e80941Smrg 2159b8e80941Smrg case TGSI_SEMANTIC_TESSCOORD: 2160b8e80941Smrg value = si_load_tess_coord(&ctx->abi); 2161b8e80941Smrg break; 2162b8e80941Smrg 2163b8e80941Smrg case TGSI_SEMANTIC_VERTICESIN: 2164b8e80941Smrg value = si_load_patch_vertices_in(&ctx->abi); 2165b8e80941Smrg break; 2166b8e80941Smrg 2167b8e80941Smrg case TGSI_SEMANTIC_TESSINNER: 2168b8e80941Smrg case TGSI_SEMANTIC_TESSOUTER: 2169b8e80941Smrg value = load_tess_level(ctx, decl->Semantic.Name); 2170b8e80941Smrg break; 2171b8e80941Smrg 2172b8e80941Smrg case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI: 2173b8e80941Smrg case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI: 2174b8e80941Smrg { 2175b8e80941Smrg LLVMValueRef buf, slot, val[4]; 2176b8e80941Smrg int i, offset; 2177b8e80941Smrg 2178b8e80941Smrg slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); 2179b8e80941Smrg buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2180b8e80941Smrg buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); 2181b8e80941Smrg offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0; 2182b8e80941Smrg 2183b8e80941Smrg for (i = 0; i < 4; i++) 2184b8e80941Smrg val[i] = buffer_load_const(ctx, buf, 2185b8e80941Smrg LLVMConstInt(ctx->i32, (offset + i) * 4, 0)); 2186b8e80941Smrg value = ac_build_gather_values(&ctx->ac, val, 4); 2187b8e80941Smrg break; 2188b8e80941Smrg } 2189b8e80941Smrg 2190b8e80941Smrg case TGSI_SEMANTIC_PRIMID: 2191b8e80941Smrg value = get_primitive_id(ctx, 0); 2192b8e80941Smrg break; 2193b8e80941Smrg 2194b8e80941Smrg case TGSI_SEMANTIC_GRID_SIZE: 2195b8e80941Smrg value = ctx->abi.num_work_groups; 2196b8e80941Smrg break; 2197b8e80941Smrg 2198b8e80941Smrg case TGSI_SEMANTIC_BLOCK_SIZE: 2199b8e80941Smrg value = get_block_size(&ctx->abi); 2200b8e80941Smrg break; 2201b8e80941Smrg 2202b8e80941Smrg case TGSI_SEMANTIC_BLOCK_ID: 2203b8e80941Smrg { 2204b8e80941Smrg LLVMValueRef values[3]; 2205b8e80941Smrg 2206b8e80941Smrg for (int i = 0; i < 3; i++) { 2207b8e80941Smrg values[i] = ctx->i32_0; 2208b8e80941Smrg if (ctx->abi.workgroup_ids[i]) { 2209b8e80941Smrg values[i] = ctx->abi.workgroup_ids[i]; 2210b8e80941Smrg } 2211b8e80941Smrg } 2212b8e80941Smrg value = ac_build_gather_values(&ctx->ac, values, 3); 2213b8e80941Smrg break; 2214b8e80941Smrg } 2215b8e80941Smrg 2216b8e80941Smrg case TGSI_SEMANTIC_THREAD_ID: 2217b8e80941Smrg value = ctx->abi.local_invocation_ids; 2218b8e80941Smrg break; 2219b8e80941Smrg 2220b8e80941Smrg case TGSI_SEMANTIC_HELPER_INVOCATION: 2221b8e80941Smrg value = ac_build_load_helper_invocation(&ctx->ac); 2222b8e80941Smrg break; 2223b8e80941Smrg 2224b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_SIZE: 2225b8e80941Smrg value = LLVMConstInt(ctx->i32, 64, 0); 2226b8e80941Smrg break; 2227b8e80941Smrg 2228b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_INVOCATION: 2229b8e80941Smrg value = ac_get_thread_id(&ctx->ac); 2230b8e80941Smrg break; 2231b8e80941Smrg 2232b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_EQ_MASK: 2233b8e80941Smrg { 2234b8e80941Smrg LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2235b8e80941Smrg id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2236b8e80941Smrg value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, ""); 2237b8e80941Smrg value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2238b8e80941Smrg break; 2239b8e80941Smrg } 2240b8e80941Smrg 2241b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_GE_MASK: 2242b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_GT_MASK: 2243b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_LE_MASK: 2244b8e80941Smrg case TGSI_SEMANTIC_SUBGROUP_LT_MASK: 2245b8e80941Smrg { 2246b8e80941Smrg LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2247b8e80941Smrg if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK || 2248b8e80941Smrg decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) { 2249b8e80941Smrg /* All bits set except LSB */ 2250b8e80941Smrg value = LLVMConstInt(ctx->i64, -2, 0); 2251b8e80941Smrg } else { 2252b8e80941Smrg /* All bits set */ 2253b8e80941Smrg value = LLVMConstInt(ctx->i64, -1, 0); 2254b8e80941Smrg } 2255b8e80941Smrg id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2256b8e80941Smrg value = LLVMBuildShl(ctx->ac.builder, value, id, ""); 2257b8e80941Smrg if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK || 2258b8e80941Smrg decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK) 2259b8e80941Smrg value = LLVMBuildNot(ctx->ac.builder, value, ""); 2260b8e80941Smrg value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2261b8e80941Smrg break; 2262b8e80941Smrg } 2263b8e80941Smrg 2264b8e80941Smrg case TGSI_SEMANTIC_CS_USER_DATA: 2265b8e80941Smrg value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data); 2266b8e80941Smrg break; 2267b8e80941Smrg 2268b8e80941Smrg default: 2269b8e80941Smrg assert(!"unknown system value"); 2270b8e80941Smrg return; 2271b8e80941Smrg } 2272b8e80941Smrg 2273b8e80941Smrg ctx->system_values[index] = value; 2274b8e80941Smrg} 2275b8e80941Smrg 2276b8e80941Smrgvoid si_declare_compute_memory(struct si_shader_context *ctx) 2277b8e80941Smrg{ 2278b8e80941Smrg struct si_shader_selector *sel = ctx->shader->selector; 2279b8e80941Smrg unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; 2280b8e80941Smrg 2281b8e80941Smrg LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS); 2282b8e80941Smrg LLVMValueRef var; 2283b8e80941Smrg 2284b8e80941Smrg assert(!ctx->ac.lds); 2285b8e80941Smrg 2286b8e80941Smrg var = LLVMAddGlobalInAddressSpace(ctx->ac.module, 2287b8e80941Smrg LLVMArrayType(ctx->i8, lds_size), 2288b8e80941Smrg "compute_lds", 2289b8e80941Smrg AC_ADDR_SPACE_LDS); 2290b8e80941Smrg LLVMSetAlignment(var, 4); 2291b8e80941Smrg 2292b8e80941Smrg ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); 2293b8e80941Smrg} 2294b8e80941Smrg 2295b8e80941Smrgvoid si_tgsi_declare_compute_memory(struct si_shader_context *ctx, 2296b8e80941Smrg const struct tgsi_full_declaration *decl) 2297b8e80941Smrg{ 2298b8e80941Smrg assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); 2299b8e80941Smrg assert(decl->Range.First == decl->Range.Last); 2300b8e80941Smrg 2301b8e80941Smrg si_declare_compute_memory(ctx); 2302b8e80941Smrg} 2303b8e80941Smrg 2304b8e80941Smrgstatic LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) 2305b8e80941Smrg{ 2306b8e80941Smrg LLVMValueRef ptr = 2307b8e80941Smrg LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2308b8e80941Smrg struct si_shader_selector *sel = ctx->shader->selector; 2309b8e80941Smrg 2310b8e80941Smrg /* Do the bounds checking with a descriptor, because 2311b8e80941Smrg * doing computation and manual bounds checking of 64-bit 2312b8e80941Smrg * addresses generates horrible VALU code with very high 2313b8e80941Smrg * VGPR usage and very low SIMD occupancy. 2314b8e80941Smrg */ 2315b8e80941Smrg ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); 2316b8e80941Smrg 2317b8e80941Smrg LLVMValueRef desc0, desc1; 2318b8e80941Smrg desc0 = ptr; 2319b8e80941Smrg desc1 = LLVMConstInt(ctx->i32, 2320b8e80941Smrg S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); 2321b8e80941Smrg 2322b8e80941Smrg LLVMValueRef desc_elems[] = { 2323b8e80941Smrg desc0, 2324b8e80941Smrg desc1, 2325b8e80941Smrg LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), 2326b8e80941Smrg LLVMConstInt(ctx->i32, 2327b8e80941Smrg S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 2328b8e80941Smrg S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 2329b8e80941Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 2330b8e80941Smrg S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 2331b8e80941Smrg S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 2332b8e80941Smrg S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) 2333b8e80941Smrg }; 2334b8e80941Smrg 2335b8e80941Smrg return ac_build_gather_values(&ctx->ac, desc_elems, 4); 2336b8e80941Smrg} 2337b8e80941Smrg 2338b8e80941Smrgstatic LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) 2339b8e80941Smrg{ 2340b8e80941Smrg LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, 2341b8e80941Smrg ctx->param_const_and_shader_buffers); 2342b8e80941Smrg 2343b8e80941Smrg return ac_build_load_to_sgpr(&ctx->ac, list_ptr, 2344b8e80941Smrg LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0)); 2345b8e80941Smrg} 2346b8e80941Smrg 2347b8e80941Smrgstatic LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) 2348b8e80941Smrg{ 2349b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2350b8e80941Smrg struct si_shader_selector *sel = ctx->shader->selector; 2351b8e80941Smrg 2352b8e80941Smrg LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2353b8e80941Smrg 2354b8e80941Smrg if (sel->info.const_buffers_declared == 1 && 2355b8e80941Smrg sel->info.shader_buffers_declared == 0) { 2356b8e80941Smrg return load_const_buffer_desc_fast_path(ctx); 2357b8e80941Smrg } 2358b8e80941Smrg 2359b8e80941Smrg index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); 2360b8e80941Smrg index = LLVMBuildAdd(ctx->ac.builder, index, 2361b8e80941Smrg LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2362b8e80941Smrg 2363b8e80941Smrg return ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2364b8e80941Smrg} 2365b8e80941Smrg 2366b8e80941Smrgstatic LLVMValueRef 2367b8e80941Smrgload_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) 2368b8e80941Smrg{ 2369b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2370b8e80941Smrg LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, 2371b8e80941Smrg ctx->param_const_and_shader_buffers); 2372b8e80941Smrg 2373b8e80941Smrg index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); 2374b8e80941Smrg index = LLVMBuildSub(ctx->ac.builder, 2375b8e80941Smrg LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0), 2376b8e80941Smrg index, ""); 2377b8e80941Smrg 2378b8e80941Smrg return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); 2379b8e80941Smrg} 2380b8e80941Smrg 2381b8e80941Smrgstatic LLVMValueRef fetch_constant( 2382b8e80941Smrg struct lp_build_tgsi_context *bld_base, 2383b8e80941Smrg const struct tgsi_full_src_register *reg, 2384b8e80941Smrg enum tgsi_opcode_type type, 2385b8e80941Smrg unsigned swizzle_in) 2386b8e80941Smrg{ 2387b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 2388b8e80941Smrg struct si_shader_selector *sel = ctx->shader->selector; 2389b8e80941Smrg const struct tgsi_ind_register *ireg = ®->Indirect; 2390b8e80941Smrg unsigned buf, idx; 2391b8e80941Smrg unsigned swizzle = swizzle_in & 0xffff; 2392b8e80941Smrg 2393b8e80941Smrg LLVMValueRef addr, bufp; 2394b8e80941Smrg 2395b8e80941Smrg if (swizzle_in == LP_CHAN_ALL) { 2396b8e80941Smrg unsigned chan; 2397b8e80941Smrg LLVMValueRef values[4]; 2398b8e80941Smrg for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) 2399b8e80941Smrg values[chan] = fetch_constant(bld_base, reg, type, chan); 2400b8e80941Smrg 2401b8e80941Smrg return ac_build_gather_values(&ctx->ac, values, 4); 2402b8e80941Smrg } 2403b8e80941Smrg 2404b8e80941Smrg /* Split 64-bit loads. */ 2405b8e80941Smrg if (tgsi_type_is_64bit(type)) { 2406b8e80941Smrg LLVMValueRef lo, hi; 2407b8e80941Smrg 2408b8e80941Smrg lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); 2409b8e80941Smrg hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16)); 2410b8e80941Smrg return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), 2411b8e80941Smrg lo, hi); 2412b8e80941Smrg } 2413b8e80941Smrg 2414b8e80941Smrg idx = reg->Register.Index * 4 + swizzle; 2415b8e80941Smrg if (reg->Register.Indirect) { 2416b8e80941Smrg addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); 2417b8e80941Smrg } else { 2418b8e80941Smrg addr = LLVMConstInt(ctx->i32, idx * 4, 0); 2419b8e80941Smrg } 2420b8e80941Smrg 2421b8e80941Smrg /* Fast path when user data SGPRs point to constant buffer 0 directly. */ 2422b8e80941Smrg if (sel->info.const_buffers_declared == 1 && 2423b8e80941Smrg sel->info.shader_buffers_declared == 0) { 2424b8e80941Smrg LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx); 2425b8e80941Smrg LLVMValueRef result = buffer_load_const(ctx, desc, addr); 2426b8e80941Smrg return bitcast(bld_base, type, result); 2427b8e80941Smrg } 2428b8e80941Smrg 2429b8e80941Smrg assert(reg->Register.Dimension); 2430b8e80941Smrg buf = reg->Dimension.Index; 2431b8e80941Smrg 2432b8e80941Smrg if (reg->Dimension.Indirect) { 2433b8e80941Smrg LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2434b8e80941Smrg LLVMValueRef index; 2435b8e80941Smrg index = si_get_bounded_indirect_index(ctx, ®->DimIndirect, 2436b8e80941Smrg reg->Dimension.Index, 2437b8e80941Smrg ctx->num_const_buffers); 2438b8e80941Smrg index = LLVMBuildAdd(ctx->ac.builder, index, 2439b8e80941Smrg LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2440b8e80941Smrg bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2441b8e80941Smrg } else 2442b8e80941Smrg bufp = load_const_buffer_desc(ctx, buf); 2443b8e80941Smrg 2444b8e80941Smrg return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); 2445b8e80941Smrg} 2446b8e80941Smrg 2447b8e80941Smrg/* Initialize arguments for the shader export intrinsic */ 2448b8e80941Smrgstatic void si_llvm_init_export_args(struct si_shader_context *ctx, 2449b8e80941Smrg LLVMValueRef *values, 2450b8e80941Smrg unsigned target, 2451b8e80941Smrg struct ac_export_args *args) 2452b8e80941Smrg{ 2453b8e80941Smrg LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); 2454b8e80941Smrg unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; 2455b8e80941Smrg unsigned chan; 2456b8e80941Smrg bool is_int8, is_int10; 2457b8e80941Smrg 2458b8e80941Smrg /* Default is 0xf. Adjusted below depending on the format. */ 2459b8e80941Smrg args->enabled_channels = 0xf; /* writemask */ 2460b8e80941Smrg 2461b8e80941Smrg /* Specify whether the EXEC mask represents the valid mask */ 2462b8e80941Smrg args->valid_mask = 0; 2463b8e80941Smrg 2464b8e80941Smrg /* Specify whether this is the last export */ 2465b8e80941Smrg args->done = 0; 2466b8e80941Smrg 2467b8e80941Smrg /* Specify the target we are exporting */ 2468b8e80941Smrg args->target = target; 2469b8e80941Smrg 2470b8e80941Smrg if (ctx->type == PIPE_SHADER_FRAGMENT) { 2471b8e80941Smrg const struct si_shader_key *key = &ctx->shader->key; 2472b8e80941Smrg unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; 2473b8e80941Smrg int cbuf = target - V_008DFC_SQ_EXP_MRT; 2474b8e80941Smrg 2475b8e80941Smrg assert(cbuf >= 0 && cbuf < 8); 2476b8e80941Smrg spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; 2477b8e80941Smrg is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; 2478b8e80941Smrg is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; 2479b8e80941Smrg } 2480b8e80941Smrg 2481b8e80941Smrg args->compr = false; 2482b8e80941Smrg args->out[0] = f32undef; 2483b8e80941Smrg args->out[1] = f32undef; 2484b8e80941Smrg args->out[2] = f32undef; 2485b8e80941Smrg args->out[3] = f32undef; 2486b8e80941Smrg 2487b8e80941Smrg LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; 2488b8e80941Smrg LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], 2489b8e80941Smrg unsigned bits, bool hi) = NULL; 2490b8e80941Smrg 2491b8e80941Smrg switch (spi_shader_col_format) { 2492b8e80941Smrg case V_028714_SPI_SHADER_ZERO: 2493b8e80941Smrg args->enabled_channels = 0; /* writemask */ 2494b8e80941Smrg args->target = V_008DFC_SQ_EXP_NULL; 2495b8e80941Smrg break; 2496b8e80941Smrg 2497b8e80941Smrg case V_028714_SPI_SHADER_32_R: 2498b8e80941Smrg args->enabled_channels = 1; /* writemask */ 2499b8e80941Smrg args->out[0] = values[0]; 2500b8e80941Smrg break; 2501b8e80941Smrg 2502b8e80941Smrg case V_028714_SPI_SHADER_32_GR: 2503b8e80941Smrg args->enabled_channels = 0x3; /* writemask */ 2504b8e80941Smrg args->out[0] = values[0]; 2505b8e80941Smrg args->out[1] = values[1]; 2506b8e80941Smrg break; 2507b8e80941Smrg 2508b8e80941Smrg case V_028714_SPI_SHADER_32_AR: 2509b8e80941Smrg args->enabled_channels = 0x9; /* writemask */ 2510b8e80941Smrg args->out[0] = values[0]; 2511b8e80941Smrg args->out[3] = values[3]; 2512b8e80941Smrg break; 2513b8e80941Smrg 2514b8e80941Smrg case V_028714_SPI_SHADER_FP16_ABGR: 2515b8e80941Smrg packf = ac_build_cvt_pkrtz_f16; 2516b8e80941Smrg break; 2517b8e80941Smrg 2518b8e80941Smrg case V_028714_SPI_SHADER_UNORM16_ABGR: 2519b8e80941Smrg packf = ac_build_cvt_pknorm_u16; 2520b8e80941Smrg break; 2521b8e80941Smrg 2522b8e80941Smrg case V_028714_SPI_SHADER_SNORM16_ABGR: 2523b8e80941Smrg packf = ac_build_cvt_pknorm_i16; 2524b8e80941Smrg break; 2525b8e80941Smrg 2526b8e80941Smrg case V_028714_SPI_SHADER_UINT16_ABGR: 2527b8e80941Smrg packi = ac_build_cvt_pk_u16; 2528b8e80941Smrg break; 2529b8e80941Smrg 2530b8e80941Smrg case V_028714_SPI_SHADER_SINT16_ABGR: 2531b8e80941Smrg packi = ac_build_cvt_pk_i16; 2532b8e80941Smrg break; 2533b8e80941Smrg 2534b8e80941Smrg case V_028714_SPI_SHADER_32_ABGR: 2535b8e80941Smrg memcpy(&args->out[0], values, sizeof(values[0]) * 4); 2536b8e80941Smrg break; 2537b8e80941Smrg } 2538b8e80941Smrg 2539b8e80941Smrg /* Pack f16 or norm_i16/u16. */ 2540b8e80941Smrg if (packf) { 2541b8e80941Smrg for (chan = 0; chan < 2; chan++) { 2542b8e80941Smrg LLVMValueRef pack_args[2] = { 2543b8e80941Smrg values[2 * chan], 2544b8e80941Smrg values[2 * chan + 1] 2545b8e80941Smrg }; 2546b8e80941Smrg LLVMValueRef packed; 2547b8e80941Smrg 2548b8e80941Smrg packed = packf(&ctx->ac, pack_args); 2549b8e80941Smrg args->out[chan] = ac_to_float(&ctx->ac, packed); 2550b8e80941Smrg } 2551b8e80941Smrg args->compr = 1; /* COMPR flag */ 2552b8e80941Smrg } 2553b8e80941Smrg /* Pack i16/u16. */ 2554b8e80941Smrg if (packi) { 2555b8e80941Smrg for (chan = 0; chan < 2; chan++) { 2556b8e80941Smrg LLVMValueRef pack_args[2] = { 2557b8e80941Smrg ac_to_integer(&ctx->ac, values[2 * chan]), 2558b8e80941Smrg ac_to_integer(&ctx->ac, values[2 * chan + 1]) 2559b8e80941Smrg }; 2560b8e80941Smrg LLVMValueRef packed; 2561b8e80941Smrg 2562b8e80941Smrg packed = packi(&ctx->ac, pack_args, 2563b8e80941Smrg is_int8 ? 8 : is_int10 ? 10 : 16, 2564b8e80941Smrg chan == 1); 2565b8e80941Smrg args->out[chan] = ac_to_float(&ctx->ac, packed); 2566b8e80941Smrg } 2567b8e80941Smrg args->compr = 1; /* COMPR flag */ 2568b8e80941Smrg } 2569b8e80941Smrg} 2570b8e80941Smrg 2571b8e80941Smrgstatic void si_alpha_test(struct lp_build_tgsi_context *bld_base, 2572b8e80941Smrg LLVMValueRef alpha) 2573b8e80941Smrg{ 2574b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 2575b8e80941Smrg 2576b8e80941Smrg if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { 2577b8e80941Smrg static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { 2578b8e80941Smrg [PIPE_FUNC_LESS] = LLVMRealOLT, 2579b8e80941Smrg [PIPE_FUNC_EQUAL] = LLVMRealOEQ, 2580b8e80941Smrg [PIPE_FUNC_LEQUAL] = LLVMRealOLE, 2581b8e80941Smrg [PIPE_FUNC_GREATER] = LLVMRealOGT, 2582b8e80941Smrg [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, 2583b8e80941Smrg [PIPE_FUNC_GEQUAL] = LLVMRealOGE, 2584b8e80941Smrg }; 2585b8e80941Smrg LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; 2586b8e80941Smrg assert(cond); 2587b8e80941Smrg 2588b8e80941Smrg LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, 2589b8e80941Smrg SI_PARAM_ALPHA_REF); 2590b8e80941Smrg LLVMValueRef alpha_pass = 2591b8e80941Smrg LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); 2592b8e80941Smrg ac_build_kill_if_false(&ctx->ac, alpha_pass); 2593b8e80941Smrg } else { 2594b8e80941Smrg ac_build_kill_if_false(&ctx->ac, ctx->i1false); 2595b8e80941Smrg } 2596b8e80941Smrg} 2597b8e80941Smrg 2598b8e80941Smrgstatic LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, 2599b8e80941Smrg LLVMValueRef alpha, 2600b8e80941Smrg unsigned samplemask_param) 2601b8e80941Smrg{ 2602b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 2603b8e80941Smrg LLVMValueRef coverage; 2604b8e80941Smrg 2605b8e80941Smrg /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ 2606b8e80941Smrg coverage = LLVMGetParam(ctx->main_fn, 2607b8e80941Smrg samplemask_param); 2608b8e80941Smrg coverage = ac_to_integer(&ctx->ac, coverage); 2609b8e80941Smrg 2610b8e80941Smrg coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", 2611b8e80941Smrg ctx->i32, 2612b8e80941Smrg &coverage, 1, AC_FUNC_ATTR_READNONE); 2613b8e80941Smrg 2614b8e80941Smrg coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, 2615b8e80941Smrg ctx->f32, ""); 2616b8e80941Smrg 2617b8e80941Smrg coverage = LLVMBuildFMul(ctx->ac.builder, coverage, 2618b8e80941Smrg LLVMConstReal(ctx->f32, 2619b8e80941Smrg 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); 2620b8e80941Smrg 2621b8e80941Smrg return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); 2622b8e80941Smrg} 2623b8e80941Smrg 2624b8e80941Smrgstatic void si_llvm_emit_clipvertex(struct si_shader_context *ctx, 2625b8e80941Smrg struct ac_export_args *pos, LLVMValueRef *out_elts) 2626b8e80941Smrg{ 2627b8e80941Smrg unsigned reg_index; 2628b8e80941Smrg unsigned chan; 2629b8e80941Smrg unsigned const_chan; 2630b8e80941Smrg LLVMValueRef base_elt; 2631b8e80941Smrg LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2632b8e80941Smrg LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32, 2633b8e80941Smrg SI_VS_CONST_CLIP_PLANES, 0); 2634b8e80941Smrg LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); 2635b8e80941Smrg 2636b8e80941Smrg for (reg_index = 0; reg_index < 2; reg_index ++) { 2637b8e80941Smrg struct ac_export_args *args = &pos[2 + reg_index]; 2638b8e80941Smrg 2639b8e80941Smrg args->out[0] = 2640b8e80941Smrg args->out[1] = 2641b8e80941Smrg args->out[2] = 2642b8e80941Smrg args->out[3] = LLVMConstReal(ctx->f32, 0.0f); 2643b8e80941Smrg 2644b8e80941Smrg /* Compute dot products of position and user clip plane vectors */ 2645b8e80941Smrg for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 2646b8e80941Smrg for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) { 2647b8e80941Smrg LLVMValueRef addr = 2648b8e80941Smrg LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 + 2649b8e80941Smrg const_chan) * 4, 0); 2650b8e80941Smrg base_elt = buffer_load_const(ctx, const_resource, 2651b8e80941Smrg addr); 2652b8e80941Smrg args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, 2653b8e80941Smrg out_elts[const_chan], args->out[chan]); 2654b8e80941Smrg } 2655b8e80941Smrg } 2656b8e80941Smrg 2657b8e80941Smrg args->enabled_channels = 0xf; 2658b8e80941Smrg args->valid_mask = 0; 2659b8e80941Smrg args->done = 0; 2660b8e80941Smrg args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; 2661b8e80941Smrg args->compr = 0; 2662b8e80941Smrg } 2663b8e80941Smrg} 2664b8e80941Smrg 2665b8e80941Smrgstatic void si_dump_streamout(struct pipe_stream_output_info *so) 2666b8e80941Smrg{ 2667b8e80941Smrg unsigned i; 2668b8e80941Smrg 2669b8e80941Smrg if (so->num_outputs) 2670b8e80941Smrg fprintf(stderr, "STREAMOUT\n"); 2671b8e80941Smrg 2672b8e80941Smrg for (i = 0; i < so->num_outputs; i++) { 2673b8e80941Smrg unsigned mask = ((1 << so->output[i].num_components) - 1) << 2674b8e80941Smrg so->output[i].start_component; 2675b8e80941Smrg fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", 2676b8e80941Smrg i, so->output[i].output_buffer, 2677b8e80941Smrg so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 2678b8e80941Smrg so->output[i].register_index, 2679b8e80941Smrg mask & 1 ? "x" : "", 2680b8e80941Smrg mask & 2 ? "y" : "", 2681b8e80941Smrg mask & 4 ? "z" : "", 2682b8e80941Smrg mask & 8 ? "w" : ""); 2683b8e80941Smrg } 2684b8e80941Smrg} 2685b8e80941Smrg 2686b8e80941Smrgstatic void emit_streamout_output(struct si_shader_context *ctx, 2687b8e80941Smrg LLVMValueRef const *so_buffers, 2688b8e80941Smrg LLVMValueRef const *so_write_offsets, 2689b8e80941Smrg struct pipe_stream_output *stream_out, 2690b8e80941Smrg struct si_shader_output_values *shader_out) 2691b8e80941Smrg{ 2692b8e80941Smrg unsigned buf_idx = stream_out->output_buffer; 2693b8e80941Smrg unsigned start = stream_out->start_component; 2694b8e80941Smrg unsigned num_comps = stream_out->num_components; 2695b8e80941Smrg LLVMValueRef out[4]; 2696b8e80941Smrg 2697b8e80941Smrg assert(num_comps && num_comps <= 4); 2698b8e80941Smrg if (!num_comps || num_comps > 4) 2699b8e80941Smrg return; 2700b8e80941Smrg 2701b8e80941Smrg /* Load the output as int. */ 2702b8e80941Smrg for (int j = 0; j < num_comps; j++) { 2703b8e80941Smrg assert(stream_out->stream == shader_out->vertex_stream[start + j]); 2704b8e80941Smrg 2705b8e80941Smrg out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); 2706b8e80941Smrg } 2707b8e80941Smrg 2708b8e80941Smrg /* Pack the output. */ 2709b8e80941Smrg LLVMValueRef vdata = NULL; 2710b8e80941Smrg 2711b8e80941Smrg switch (num_comps) { 2712b8e80941Smrg case 1: /* as i32 */ 2713b8e80941Smrg vdata = out[0]; 2714b8e80941Smrg break; 2715b8e80941Smrg case 2: /* as v2i32 */ 2716b8e80941Smrg case 3: /* as v4i32 (aligned to 4) */ 2717b8e80941Smrg out[3] = LLVMGetUndef(ctx->i32); 2718b8e80941Smrg /* fall through */ 2719b8e80941Smrg case 4: /* as v4i32 */ 2720b8e80941Smrg vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); 2721b8e80941Smrg break; 2722b8e80941Smrg } 2723b8e80941Smrg 2724b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], 2725b8e80941Smrg vdata, num_comps, 2726b8e80941Smrg so_write_offsets[buf_idx], 2727b8e80941Smrg ctx->i32_0, 2728b8e80941Smrg stream_out->dst_offset * 4, 1, 1, true, false); 2729b8e80941Smrg} 2730b8e80941Smrg 2731b8e80941Smrg/** 2732b8e80941Smrg * Write streamout data to buffers for vertex stream @p stream (different 2733b8e80941Smrg * vertex streams can occur for GS copy shaders). 2734b8e80941Smrg */ 2735b8e80941Smrgstatic void si_llvm_emit_streamout(struct si_shader_context *ctx, 2736b8e80941Smrg struct si_shader_output_values *outputs, 2737b8e80941Smrg unsigned noutput, unsigned stream) 2738b8e80941Smrg{ 2739b8e80941Smrg struct si_shader_selector *sel = ctx->shader->selector; 2740b8e80941Smrg struct pipe_stream_output_info *so = &sel->so; 2741b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 2742b8e80941Smrg int i; 2743b8e80941Smrg struct lp_build_if_state if_ctx; 2744b8e80941Smrg 2745b8e80941Smrg /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ 2746b8e80941Smrg LLVMValueRef so_vtx_count = 2747b8e80941Smrg si_unpack_param(ctx, ctx->param_streamout_config, 16, 7); 2748b8e80941Smrg 2749b8e80941Smrg LLVMValueRef tid = ac_get_thread_id(&ctx->ac); 2750b8e80941Smrg 2751b8e80941Smrg /* can_emit = tid < so_vtx_count; */ 2752b8e80941Smrg LLVMValueRef can_emit = 2753b8e80941Smrg LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); 2754b8e80941Smrg 2755b8e80941Smrg /* Emit the streamout code conditionally. This actually avoids 2756b8e80941Smrg * out-of-bounds buffer access. The hw tells us via the SGPR 2757b8e80941Smrg * (so_vtx_count) which threads are allowed to emit streamout data. */ 2758b8e80941Smrg lp_build_if(&if_ctx, &ctx->gallivm, can_emit); 2759b8e80941Smrg { 2760b8e80941Smrg /* The buffer offset is computed as follows: 2761b8e80941Smrg * ByteOffset = streamout_offset[buffer_id]*4 + 2762b8e80941Smrg * (streamout_write_index + thread_id)*stride[buffer_id] + 2763b8e80941Smrg * attrib_offset 2764b8e80941Smrg */ 2765b8e80941Smrg 2766b8e80941Smrg LLVMValueRef so_write_index = 2767b8e80941Smrg LLVMGetParam(ctx->main_fn, 2768b8e80941Smrg ctx->param_streamout_write_index); 2769b8e80941Smrg 2770b8e80941Smrg /* Compute (streamout_write_index + thread_id). */ 2771b8e80941Smrg so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); 2772b8e80941Smrg 2773b8e80941Smrg /* Load the descriptor and compute the write offset for each 2774b8e80941Smrg * enabled buffer. */ 2775b8e80941Smrg LLVMValueRef so_write_offset[4] = {}; 2776b8e80941Smrg LLVMValueRef so_buffers[4]; 2777b8e80941Smrg LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 2778b8e80941Smrg ctx->param_rw_buffers); 2779b8e80941Smrg 2780b8e80941Smrg for (i = 0; i < 4; i++) { 2781b8e80941Smrg if (!so->stride[i]) 2782b8e80941Smrg continue; 2783b8e80941Smrg 2784b8e80941Smrg LLVMValueRef offset = LLVMConstInt(ctx->i32, 2785b8e80941Smrg SI_VS_STREAMOUT_BUF0 + i, 0); 2786b8e80941Smrg 2787b8e80941Smrg so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 2788b8e80941Smrg 2789b8e80941Smrg LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, 2790b8e80941Smrg ctx->param_streamout_offset[i]); 2791b8e80941Smrg so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); 2792b8e80941Smrg 2793b8e80941Smrg so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, 2794b8e80941Smrg LLVMConstInt(ctx->i32, so->stride[i]*4, 0), 2795b8e80941Smrg so_offset); 2796b8e80941Smrg } 2797b8e80941Smrg 2798b8e80941Smrg /* Write streamout data. */ 2799b8e80941Smrg for (i = 0; i < so->num_outputs; i++) { 2800b8e80941Smrg unsigned reg = so->output[i].register_index; 2801b8e80941Smrg 2802b8e80941Smrg if (reg >= noutput) 2803b8e80941Smrg continue; 2804b8e80941Smrg 2805b8e80941Smrg if (stream != so->output[i].stream) 2806b8e80941Smrg continue; 2807b8e80941Smrg 2808b8e80941Smrg emit_streamout_output(ctx, so_buffers, so_write_offset, 2809b8e80941Smrg &so->output[i], &outputs[reg]); 2810b8e80941Smrg } 2811b8e80941Smrg } 2812b8e80941Smrg lp_build_endif(&if_ctx); 2813b8e80941Smrg} 2814b8e80941Smrg 2815b8e80941Smrgstatic void si_export_param(struct si_shader_context *ctx, unsigned index, 2816b8e80941Smrg LLVMValueRef *values) 2817b8e80941Smrg{ 2818b8e80941Smrg struct ac_export_args args; 2819b8e80941Smrg 2820b8e80941Smrg si_llvm_init_export_args(ctx, values, 2821b8e80941Smrg V_008DFC_SQ_EXP_PARAM + index, &args); 2822b8e80941Smrg ac_build_export(&ctx->ac, &args); 2823b8e80941Smrg} 2824b8e80941Smrg 2825b8e80941Smrgstatic void si_build_param_exports(struct si_shader_context *ctx, 2826b8e80941Smrg struct si_shader_output_values *outputs, 2827b8e80941Smrg unsigned noutput) 2828b8e80941Smrg{ 2829b8e80941Smrg struct si_shader *shader = ctx->shader; 2830b8e80941Smrg unsigned param_count = 0; 2831b8e80941Smrg 2832b8e80941Smrg for (unsigned i = 0; i < noutput; i++) { 2833b8e80941Smrg unsigned semantic_name = outputs[i].semantic_name; 2834b8e80941Smrg unsigned semantic_index = outputs[i].semantic_index; 2835b8e80941Smrg 2836b8e80941Smrg if (outputs[i].vertex_stream[0] != 0 && 2837b8e80941Smrg outputs[i].vertex_stream[1] != 0 && 2838b8e80941Smrg outputs[i].vertex_stream[2] != 0 && 2839b8e80941Smrg outputs[i].vertex_stream[3] != 0) 2840b8e80941Smrg continue; 2841b8e80941Smrg 2842b8e80941Smrg switch (semantic_name) { 2843b8e80941Smrg case TGSI_SEMANTIC_LAYER: 2844b8e80941Smrg case TGSI_SEMANTIC_VIEWPORT_INDEX: 2845b8e80941Smrg case TGSI_SEMANTIC_CLIPDIST: 2846b8e80941Smrg case TGSI_SEMANTIC_COLOR: 2847b8e80941Smrg case TGSI_SEMANTIC_BCOLOR: 2848b8e80941Smrg case TGSI_SEMANTIC_PRIMID: 2849b8e80941Smrg case TGSI_SEMANTIC_FOG: 2850b8e80941Smrg case TGSI_SEMANTIC_TEXCOORD: 2851b8e80941Smrg case TGSI_SEMANTIC_GENERIC: 2852b8e80941Smrg break; 2853b8e80941Smrg default: 2854b8e80941Smrg continue; 2855b8e80941Smrg } 2856b8e80941Smrg 2857b8e80941Smrg if ((semantic_name != TGSI_SEMANTIC_GENERIC || 2858b8e80941Smrg semantic_index < SI_MAX_IO_GENERIC) && 2859b8e80941Smrg shader->key.opt.kill_outputs & 2860b8e80941Smrg (1ull << si_shader_io_get_unique_index(semantic_name, 2861b8e80941Smrg semantic_index, true))) 2862b8e80941Smrg continue; 2863b8e80941Smrg 2864b8e80941Smrg si_export_param(ctx, param_count, outputs[i].values); 2865b8e80941Smrg 2866b8e80941Smrg assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); 2867b8e80941Smrg shader->info.vs_output_param_offset[i] = param_count++; 2868b8e80941Smrg } 2869b8e80941Smrg 2870b8e80941Smrg shader->info.nr_param_exports = param_count; 2871b8e80941Smrg} 2872b8e80941Smrg 2873b8e80941Smrg/* Generate export instructions for hardware VS shader stage */ 2874b8e80941Smrgstatic void si_llvm_export_vs(struct si_shader_context *ctx, 2875b8e80941Smrg struct si_shader_output_values *outputs, 2876b8e80941Smrg unsigned noutput) 2877b8e80941Smrg{ 2878b8e80941Smrg struct si_shader *shader = ctx->shader; 2879b8e80941Smrg struct ac_export_args pos_args[4] = {}; 2880b8e80941Smrg LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; 2881b8e80941Smrg unsigned pos_idx; 2882b8e80941Smrg int i; 2883b8e80941Smrg 2884b8e80941Smrg /* Build position exports. */ 2885b8e80941Smrg for (i = 0; i < noutput; i++) { 2886b8e80941Smrg switch (outputs[i].semantic_name) { 2887b8e80941Smrg case TGSI_SEMANTIC_POSITION: 2888b8e80941Smrg si_llvm_init_export_args(ctx, outputs[i].values, 2889b8e80941Smrg V_008DFC_SQ_EXP_POS, &pos_args[0]); 2890b8e80941Smrg break; 2891b8e80941Smrg case TGSI_SEMANTIC_PSIZE: 2892b8e80941Smrg psize_value = outputs[i].values[0]; 2893b8e80941Smrg break; 2894b8e80941Smrg case TGSI_SEMANTIC_LAYER: 2895b8e80941Smrg layer_value = outputs[i].values[0]; 2896b8e80941Smrg break; 2897b8e80941Smrg case TGSI_SEMANTIC_VIEWPORT_INDEX: 2898b8e80941Smrg viewport_index_value = outputs[i].values[0]; 2899b8e80941Smrg break; 2900b8e80941Smrg case TGSI_SEMANTIC_EDGEFLAG: 2901b8e80941Smrg edgeflag_value = outputs[i].values[0]; 2902b8e80941Smrg break; 2903b8e80941Smrg case TGSI_SEMANTIC_CLIPDIST: 2904b8e80941Smrg if (!shader->key.opt.clip_disable) { 2905b8e80941Smrg unsigned index = 2 + outputs[i].semantic_index; 2906b8e80941Smrg si_llvm_init_export_args(ctx, outputs[i].values, 2907b8e80941Smrg V_008DFC_SQ_EXP_POS + index, 2908b8e80941Smrg &pos_args[index]); 2909b8e80941Smrg } 2910b8e80941Smrg break; 2911b8e80941Smrg case TGSI_SEMANTIC_CLIPVERTEX: 2912b8e80941Smrg if (!shader->key.opt.clip_disable) { 2913b8e80941Smrg si_llvm_emit_clipvertex(ctx, pos_args, 2914b8e80941Smrg outputs[i].values); 2915b8e80941Smrg } 2916b8e80941Smrg break; 2917b8e80941Smrg } 2918b8e80941Smrg } 2919b8e80941Smrg 2920b8e80941Smrg /* We need to add the position output manually if it's missing. */ 2921b8e80941Smrg if (!pos_args[0].out[0]) { 2922b8e80941Smrg pos_args[0].enabled_channels = 0xf; /* writemask */ 2923b8e80941Smrg pos_args[0].valid_mask = 0; /* EXEC mask */ 2924b8e80941Smrg pos_args[0].done = 0; /* last export? */ 2925b8e80941Smrg pos_args[0].target = V_008DFC_SQ_EXP_POS; 2926b8e80941Smrg pos_args[0].compr = 0; /* COMPR flag */ 2927b8e80941Smrg pos_args[0].out[0] = ctx->ac.f32_0; /* X */ 2928b8e80941Smrg pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ 2929b8e80941Smrg pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ 2930b8e80941Smrg pos_args[0].out[3] = ctx->ac.f32_1; /* W */ 2931b8e80941Smrg } 2932b8e80941Smrg 2933b8e80941Smrg /* Write the misc vector (point size, edgeflag, layer, viewport). */ 2934b8e80941Smrg if (shader->selector->info.writes_psize || 2935b8e80941Smrg shader->selector->info.writes_edgeflag || 2936b8e80941Smrg shader->selector->info.writes_viewport_index || 2937b8e80941Smrg shader->selector->info.writes_layer) { 2938b8e80941Smrg pos_args[1].enabled_channels = shader->selector->info.writes_psize | 2939b8e80941Smrg (shader->selector->info.writes_edgeflag << 1) | 2940b8e80941Smrg (shader->selector->info.writes_layer << 2); 2941b8e80941Smrg 2942b8e80941Smrg pos_args[1].valid_mask = 0; /* EXEC mask */ 2943b8e80941Smrg pos_args[1].done = 0; /* last export? */ 2944b8e80941Smrg pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; 2945b8e80941Smrg pos_args[1].compr = 0; /* COMPR flag */ 2946b8e80941Smrg pos_args[1].out[0] = ctx->ac.f32_0; /* X */ 2947b8e80941Smrg pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ 2948b8e80941Smrg pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ 2949b8e80941Smrg pos_args[1].out[3] = ctx->ac.f32_0; /* W */ 2950b8e80941Smrg 2951b8e80941Smrg if (shader->selector->info.writes_psize) 2952b8e80941Smrg pos_args[1].out[0] = psize_value; 2953b8e80941Smrg 2954b8e80941Smrg if (shader->selector->info.writes_edgeflag) { 2955b8e80941Smrg /* The output is a float, but the hw expects an integer 2956b8e80941Smrg * with the first bit containing the edge flag. */ 2957b8e80941Smrg edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, 2958b8e80941Smrg edgeflag_value, 2959b8e80941Smrg ctx->i32, ""); 2960b8e80941Smrg edgeflag_value = ac_build_umin(&ctx->ac, 2961b8e80941Smrg edgeflag_value, 2962b8e80941Smrg ctx->i32_1); 2963b8e80941Smrg 2964b8e80941Smrg /* The LLVM intrinsic expects a float. */ 2965b8e80941Smrg pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); 2966b8e80941Smrg } 2967b8e80941Smrg 2968b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 2969b8e80941Smrg /* GFX9 has the layer in out.z[10:0] and the viewport 2970b8e80941Smrg * index in out.z[19:16]. 2971b8e80941Smrg */ 2972b8e80941Smrg if (shader->selector->info.writes_layer) 2973b8e80941Smrg pos_args[1].out[2] = layer_value; 2974b8e80941Smrg 2975b8e80941Smrg if (shader->selector->info.writes_viewport_index) { 2976b8e80941Smrg LLVMValueRef v = viewport_index_value; 2977b8e80941Smrg 2978b8e80941Smrg v = ac_to_integer(&ctx->ac, v); 2979b8e80941Smrg v = LLVMBuildShl(ctx->ac.builder, v, 2980b8e80941Smrg LLVMConstInt(ctx->i32, 16, 0), ""); 2981b8e80941Smrg v = LLVMBuildOr(ctx->ac.builder, v, 2982b8e80941Smrg ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); 2983b8e80941Smrg pos_args[1].out[2] = ac_to_float(&ctx->ac, v); 2984b8e80941Smrg pos_args[1].enabled_channels |= 1 << 2; 2985b8e80941Smrg } 2986b8e80941Smrg } else { 2987b8e80941Smrg if (shader->selector->info.writes_layer) 2988b8e80941Smrg pos_args[1].out[2] = layer_value; 2989b8e80941Smrg 2990b8e80941Smrg if (shader->selector->info.writes_viewport_index) { 2991b8e80941Smrg pos_args[1].out[3] = viewport_index_value; 2992b8e80941Smrg pos_args[1].enabled_channels |= 1 << 3; 2993b8e80941Smrg } 2994b8e80941Smrg } 2995b8e80941Smrg } 2996b8e80941Smrg 2997b8e80941Smrg for (i = 0; i < 4; i++) 2998b8e80941Smrg if (pos_args[i].out[0]) 2999b8e80941Smrg shader->info.nr_pos_exports++; 3000848b8605Smrg 3001848b8605Smrg pos_idx = 0; 3002848b8605Smrg for (i = 0; i < 4; i++) { 3003b8e80941Smrg if (!pos_args[i].out[0]) 3004b8e80941Smrg continue; 3005b8e80941Smrg 3006b8e80941Smrg /* Specify the target we are exporting */ 3007b8e80941Smrg pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; 3008b8e80941Smrg 3009b8e80941Smrg if (pos_idx == shader->info.nr_pos_exports) 3010b8e80941Smrg /* Specify that this is the last export */ 3011b8e80941Smrg pos_args[i].done = 1; 3012b8e80941Smrg 3013b8e80941Smrg ac_build_export(&ctx->ac, &pos_args[i]); 3014b8e80941Smrg } 3015b8e80941Smrg 3016b8e80941Smrg /* Build parameter exports. */ 3017b8e80941Smrg si_build_param_exports(ctx, outputs, noutput); 3018b8e80941Smrg} 3019b8e80941Smrg 3020b8e80941Smrg/** 3021b8e80941Smrg * Forward all outputs from the vertex shader to the TES. This is only used 3022b8e80941Smrg * for the fixed function TCS. 3023b8e80941Smrg */ 3024b8e80941Smrgstatic void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) 3025b8e80941Smrg{ 3026b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3027b8e80941Smrg LLVMValueRef invocation_id, buffer, buffer_offset; 3028b8e80941Smrg LLVMValueRef lds_vertex_stride, lds_base; 3029b8e80941Smrg uint64_t inputs; 3030b8e80941Smrg 3031b8e80941Smrg invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 3032b8e80941Smrg buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 3033b8e80941Smrg buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3034b8e80941Smrg 3035b8e80941Smrg lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); 3036b8e80941Smrg lds_base = get_tcs_in_current_patch_offset(ctx); 3037b8e80941Smrg lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, 3038b8e80941Smrg lds_base); 3039b8e80941Smrg 3040b8e80941Smrg inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; 3041b8e80941Smrg while (inputs) { 3042b8e80941Smrg unsigned i = u_bit_scan64(&inputs); 3043b8e80941Smrg 3044b8e80941Smrg LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, 3045b8e80941Smrg LLVMConstInt(ctx->i32, 4 * i, 0), 3046b8e80941Smrg ""); 3047b8e80941Smrg 3048b8e80941Smrg LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, 3049b8e80941Smrg get_rel_patch_id(ctx), 3050b8e80941Smrg invocation_id, 3051b8e80941Smrg LLVMConstInt(ctx->i32, i, 0)); 3052b8e80941Smrg 3053b8e80941Smrg LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0, 3054b8e80941Smrg lds_ptr); 3055b8e80941Smrg 3056b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, 3057b8e80941Smrg buffer_offset, 0, 1, 0, true, false); 3058b8e80941Smrg } 3059b8e80941Smrg} 3060b8e80941Smrg 3061b8e80941Smrgstatic void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, 3062b8e80941Smrg LLVMValueRef rel_patch_id, 3063b8e80941Smrg LLVMValueRef invocation_id, 3064b8e80941Smrg LLVMValueRef tcs_out_current_patch_data_offset, 3065b8e80941Smrg LLVMValueRef invoc0_tf_outer[4], 3066b8e80941Smrg LLVMValueRef invoc0_tf_inner[2]) 3067b8e80941Smrg{ 3068b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3069b8e80941Smrg struct si_shader *shader = ctx->shader; 3070b8e80941Smrg unsigned tess_inner_index, tess_outer_index; 3071b8e80941Smrg LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; 3072b8e80941Smrg LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; 3073b8e80941Smrg unsigned stride, outer_comps, inner_comps, i, offset; 3074b8e80941Smrg struct lp_build_if_state if_ctx, inner_if_ctx; 3075b8e80941Smrg 3076b8e80941Smrg /* Add a barrier before loading tess factors from LDS. */ 3077b8e80941Smrg if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) 3078b8e80941Smrg si_llvm_emit_barrier(NULL, bld_base, NULL); 3079b8e80941Smrg 3080b8e80941Smrg /* Do this only for invocation 0, because the tess levels are per-patch, 3081b8e80941Smrg * not per-vertex. 3082b8e80941Smrg * 3083b8e80941Smrg * This can't jump, because invocation 0 executes this. It should 3084b8e80941Smrg * at least mask out the loads and stores for other invocations. 3085b8e80941Smrg */ 3086b8e80941Smrg lp_build_if(&if_ctx, &ctx->gallivm, 3087b8e80941Smrg LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3088b8e80941Smrg invocation_id, ctx->i32_0, "")); 3089b8e80941Smrg 3090b8e80941Smrg /* Determine the layout of one tess factor element in the buffer. */ 3091b8e80941Smrg switch (shader->key.part.tcs.epilog.prim_mode) { 3092b8e80941Smrg case PIPE_PRIM_LINES: 3093b8e80941Smrg stride = 2; /* 2 dwords, 1 vec2 store */ 3094b8e80941Smrg outer_comps = 2; 3095b8e80941Smrg inner_comps = 0; 3096b8e80941Smrg break; 3097b8e80941Smrg case PIPE_PRIM_TRIANGLES: 3098b8e80941Smrg stride = 4; /* 4 dwords, 1 vec4 store */ 3099b8e80941Smrg outer_comps = 3; 3100b8e80941Smrg inner_comps = 1; 3101b8e80941Smrg break; 3102b8e80941Smrg case PIPE_PRIM_QUADS: 3103b8e80941Smrg stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ 3104b8e80941Smrg outer_comps = 4; 3105b8e80941Smrg inner_comps = 2; 3106b8e80941Smrg break; 3107b8e80941Smrg default: 3108b8e80941Smrg assert(0); 3109b8e80941Smrg return; 3110b8e80941Smrg } 3111b8e80941Smrg 3112b8e80941Smrg for (i = 0; i < 4; i++) { 3113b8e80941Smrg inner[i] = LLVMGetUndef(ctx->i32); 3114b8e80941Smrg outer[i] = LLVMGetUndef(ctx->i32); 3115b8e80941Smrg } 3116b8e80941Smrg 3117b8e80941Smrg if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { 3118b8e80941Smrg /* Tess factors are in VGPRs. */ 3119b8e80941Smrg for (i = 0; i < outer_comps; i++) 3120b8e80941Smrg outer[i] = out[i] = invoc0_tf_outer[i]; 3121b8e80941Smrg for (i = 0; i < inner_comps; i++) 3122b8e80941Smrg inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; 3123b8e80941Smrg } else { 3124b8e80941Smrg /* Load tess_inner and tess_outer from LDS. 3125b8e80941Smrg * Any invocation can write them, so we can't get them from a temporary. 3126b8e80941Smrg */ 3127b8e80941Smrg tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); 3128b8e80941Smrg tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); 3129b8e80941Smrg 3130b8e80941Smrg lds_base = tcs_out_current_patch_data_offset; 3131b8e80941Smrg lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, 3132b8e80941Smrg LLVMConstInt(ctx->i32, 3133b8e80941Smrg tess_inner_index * 4, 0), ""); 3134b8e80941Smrg lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, 3135b8e80941Smrg LLVMConstInt(ctx->i32, 3136b8e80941Smrg tess_outer_index * 4, 0), ""); 3137b8e80941Smrg 3138b8e80941Smrg for (i = 0; i < outer_comps; i++) { 3139b8e80941Smrg outer[i] = out[i] = 3140b8e80941Smrg lds_load(bld_base, ctx->ac.i32, i, lds_outer); 3141b8e80941Smrg } 3142b8e80941Smrg for (i = 0; i < inner_comps; i++) { 3143b8e80941Smrg inner[i] = out[outer_comps+i] = 3144b8e80941Smrg lds_load(bld_base, ctx->ac.i32, i, lds_inner); 3145b8e80941Smrg } 3146b8e80941Smrg } 3147b8e80941Smrg 3148b8e80941Smrg if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { 3149b8e80941Smrg /* For isolines, the hardware expects tess factors in the 3150b8e80941Smrg * reverse order from what GLSL / TGSI specify. 3151b8e80941Smrg */ 3152b8e80941Smrg LLVMValueRef tmp = out[0]; 3153b8e80941Smrg out[0] = out[1]; 3154b8e80941Smrg out[1] = tmp; 3155b8e80941Smrg } 3156b8e80941Smrg 3157b8e80941Smrg /* Convert the outputs to vectors for stores. */ 3158b8e80941Smrg vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); 3159b8e80941Smrg vec1 = NULL; 3160b8e80941Smrg 3161b8e80941Smrg if (stride > 4) 3162b8e80941Smrg vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); 3163b8e80941Smrg 3164b8e80941Smrg /* Get the buffer. */ 3165b8e80941Smrg buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); 3166b8e80941Smrg 3167b8e80941Smrg /* Get the offset. */ 3168b8e80941Smrg tf_base = LLVMGetParam(ctx->main_fn, 3169b8e80941Smrg ctx->param_tcs_factor_offset); 3170b8e80941Smrg byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, 3171b8e80941Smrg LLVMConstInt(ctx->i32, 4 * stride, 0), ""); 3172b8e80941Smrg 3173b8e80941Smrg lp_build_if(&inner_if_ctx, &ctx->gallivm, 3174b8e80941Smrg LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3175b8e80941Smrg rel_patch_id, ctx->i32_0, "")); 3176b8e80941Smrg 3177b8e80941Smrg /* Store the dynamic HS control word. */ 3178b8e80941Smrg offset = 0; 3179b8e80941Smrg if (ctx->screen->info.chip_class <= VI) { 3180b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, 3181b8e80941Smrg LLVMConstInt(ctx->i32, 0x80000000, 0), 3182b8e80941Smrg 1, ctx->i32_0, tf_base, 3183b8e80941Smrg offset, 1, 0, true, false); 3184b8e80941Smrg offset += 4; 3185b8e80941Smrg } 3186b8e80941Smrg 3187b8e80941Smrg lp_build_endif(&inner_if_ctx); 3188b8e80941Smrg 3189b8e80941Smrg /* Store the tessellation factors. */ 3190b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, 3191b8e80941Smrg MIN2(stride, 4), byteoffset, tf_base, 3192b8e80941Smrg offset, 1, 0, true, false); 3193b8e80941Smrg offset += 16; 3194b8e80941Smrg if (vec1) 3195b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, 3196b8e80941Smrg stride - 4, byteoffset, tf_base, 3197b8e80941Smrg offset, 1, 0, true, false); 3198b8e80941Smrg 3199b8e80941Smrg /* Store the tess factors into the offchip buffer if TES reads them. */ 3200b8e80941Smrg if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { 3201b8e80941Smrg LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; 3202b8e80941Smrg LLVMValueRef tf_inner_offset; 3203b8e80941Smrg unsigned param_outer, param_inner; 3204b8e80941Smrg 3205b8e80941Smrg buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); 3206b8e80941Smrg base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3207b8e80941Smrg 3208b8e80941Smrg param_outer = si_shader_io_get_unique_index_patch( 3209b8e80941Smrg TGSI_SEMANTIC_TESSOUTER, 0); 3210b8e80941Smrg tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3211b8e80941Smrg LLVMConstInt(ctx->i32, param_outer, 0)); 3212b8e80941Smrg 3213b8e80941Smrg outer_vec = ac_build_gather_values(&ctx->ac, outer, 3214b8e80941Smrg util_next_power_of_two(outer_comps)); 3215b8e80941Smrg 3216b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, 3217b8e80941Smrg outer_comps, tf_outer_offset, 3218b8e80941Smrg base, 0, 1, 0, true, false); 3219b8e80941Smrg if (inner_comps) { 3220b8e80941Smrg param_inner = si_shader_io_get_unique_index_patch( 3221b8e80941Smrg TGSI_SEMANTIC_TESSINNER, 0); 3222b8e80941Smrg tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3223b8e80941Smrg LLVMConstInt(ctx->i32, param_inner, 0)); 3224b8e80941Smrg 3225b8e80941Smrg inner_vec = inner_comps == 1 ? inner[0] : 3226b8e80941Smrg ac_build_gather_values(&ctx->ac, inner, inner_comps); 3227b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, 3228b8e80941Smrg inner_comps, tf_inner_offset, 3229b8e80941Smrg base, 0, 1, 0, true, false); 3230b8e80941Smrg } 3231b8e80941Smrg } 3232b8e80941Smrg 3233b8e80941Smrg lp_build_endif(&if_ctx); 3234b8e80941Smrg} 3235b8e80941Smrg 3236b8e80941Smrgstatic LLVMValueRef 3237b8e80941Smrgsi_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, 3238b8e80941Smrg unsigned param, unsigned return_index) 3239b8e80941Smrg{ 3240b8e80941Smrg return LLVMBuildInsertValue(ctx->ac.builder, ret, 3241b8e80941Smrg LLVMGetParam(ctx->main_fn, param), 3242b8e80941Smrg return_index, ""); 3243b8e80941Smrg} 3244b8e80941Smrg 3245b8e80941Smrgstatic LLVMValueRef 3246b8e80941Smrgsi_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, 3247b8e80941Smrg unsigned param, unsigned return_index) 3248b8e80941Smrg{ 3249b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 3250b8e80941Smrg LLVMValueRef p = LLVMGetParam(ctx->main_fn, param); 3251b8e80941Smrg 3252b8e80941Smrg return LLVMBuildInsertValue(builder, ret, 3253b8e80941Smrg ac_to_float(&ctx->ac, p), 3254b8e80941Smrg return_index, ""); 3255b8e80941Smrg} 3256b8e80941Smrg 3257b8e80941Smrgstatic LLVMValueRef 3258b8e80941Smrgsi_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, 3259b8e80941Smrg unsigned param, unsigned return_index) 3260b8e80941Smrg{ 3261b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 3262b8e80941Smrg LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param); 3263b8e80941Smrg ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, ""); 3264b8e80941Smrg return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); 3265b8e80941Smrg} 3266b8e80941Smrg 3267b8e80941Smrg/* This only writes the tessellation factor levels. */ 3268b8e80941Smrgstatic void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, 3269b8e80941Smrg unsigned max_outputs, 3270b8e80941Smrg LLVMValueRef *addrs) 3271b8e80941Smrg{ 3272b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3273b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 3274b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 3275b8e80941Smrg LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; 3276b8e80941Smrg 3277b8e80941Smrg si_copy_tcs_inputs(bld_base); 3278b8e80941Smrg 3279b8e80941Smrg rel_patch_id = get_rel_patch_id(ctx); 3280b8e80941Smrg invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 3281b8e80941Smrg tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); 3282b8e80941Smrg 3283b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 3284b8e80941Smrg LLVMBasicBlockRef blocks[2] = { 3285b8e80941Smrg LLVMGetInsertBlock(builder), 3286b8e80941Smrg ctx->merged_wrap_if_state.entry_block 3287b8e80941Smrg }; 3288b8e80941Smrg LLVMValueRef values[2]; 3289b8e80941Smrg 3290b8e80941Smrg lp_build_endif(&ctx->merged_wrap_if_state); 3291b8e80941Smrg 3292b8e80941Smrg values[0] = rel_patch_id; 3293b8e80941Smrg values[1] = LLVMGetUndef(ctx->i32); 3294b8e80941Smrg rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3295b8e80941Smrg 3296b8e80941Smrg values[0] = tf_lds_offset; 3297b8e80941Smrg values[1] = LLVMGetUndef(ctx->i32); 3298b8e80941Smrg tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3299b8e80941Smrg 3300b8e80941Smrg values[0] = invocation_id; 3301b8e80941Smrg values[1] = ctx->i32_1; /* cause the epilog to skip threads */ 3302b8e80941Smrg invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3303b8e80941Smrg } 3304b8e80941Smrg 3305b8e80941Smrg /* Return epilog parameters from this function. */ 3306b8e80941Smrg LLVMValueRef ret = ctx->return_value; 3307b8e80941Smrg unsigned vgpr; 3308b8e80941Smrg 3309b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 3310b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3311b8e80941Smrg 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); 3312b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, 3313b8e80941Smrg 8 + GFX9_SGPR_TCS_OUT_LAYOUT); 3314b8e80941Smrg /* Tess offchip and tess factor offsets are at the beginning. */ 3315b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); 3316b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); 3317b8e80941Smrg vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; 3318b8e80941Smrg } else { 3319b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3320b8e80941Smrg GFX6_SGPR_TCS_OFFCHIP_LAYOUT); 3321b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, 3322b8e80941Smrg GFX6_SGPR_TCS_OUT_LAYOUT); 3323b8e80941Smrg /* Tess offchip and tess factor offsets are after user SGPRs. */ 3324b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 3325b8e80941Smrg GFX6_TCS_NUM_USER_SGPR); 3326b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 3327b8e80941Smrg GFX6_TCS_NUM_USER_SGPR + 1); 3328b8e80941Smrg vgpr = GFX6_TCS_NUM_USER_SGPR + 2; 3329b8e80941Smrg } 3330b8e80941Smrg 3331b8e80941Smrg /* VGPRs */ 3332b8e80941Smrg rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); 3333b8e80941Smrg invocation_id = ac_to_float(&ctx->ac, invocation_id); 3334b8e80941Smrg tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); 3335b8e80941Smrg 3336b8e80941Smrg /* Leave a hole corresponding to the two input VGPRs. This ensures that 3337b8e80941Smrg * the invocation_id output does not alias the tcs_rel_ids input, 3338b8e80941Smrg * which saves a V_MOV on gfx9. 3339b8e80941Smrg */ 3340b8e80941Smrg vgpr += 2; 3341b8e80941Smrg 3342b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); 3343b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); 3344b8e80941Smrg 3345b8e80941Smrg if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 3346b8e80941Smrg vgpr++; /* skip the tess factor LDS offset */ 3347b8e80941Smrg for (unsigned i = 0; i < 6; i++) { 3348b8e80941Smrg LLVMValueRef value = 3349b8e80941Smrg LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); 3350b8e80941Smrg value = ac_to_float(&ctx->ac, value); 3351b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); 3352b8e80941Smrg } 3353b8e80941Smrg } else { 3354b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); 3355b8e80941Smrg } 3356b8e80941Smrg ctx->return_value = ret; 3357b8e80941Smrg} 3358b8e80941Smrg 3359b8e80941Smrg/* Pass TCS inputs from LS to TCS on GFX9. */ 3360b8e80941Smrgstatic void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) 3361b8e80941Smrg{ 3362b8e80941Smrg LLVMValueRef ret = ctx->return_value; 3363b8e80941Smrg 3364b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 0, 0); 3365b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 1, 1); 3366b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); 3367b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); 3368b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); 3369b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); 3370b8e80941Smrg 3371b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers, 3372b8e80941Smrg 8 + SI_SGPR_RW_BUFFERS); 3373b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 3374b8e80941Smrg ctx->param_bindless_samplers_and_images, 3375b8e80941Smrg 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); 3376b8e80941Smrg 3377b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits, 3378b8e80941Smrg 8 + SI_SGPR_VS_STATE_BITS); 3379b8e80941Smrg 3380b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3381b8e80941Smrg 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); 3382b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets, 3383b8e80941Smrg 8 + GFX9_SGPR_TCS_OUT_OFFSETS); 3384b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, 3385b8e80941Smrg 8 + GFX9_SGPR_TCS_OUT_LAYOUT); 3386b8e80941Smrg 3387b8e80941Smrg unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; 3388b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 3389b8e80941Smrg ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id), 3390b8e80941Smrg vgpr++, ""); 3391b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 3392b8e80941Smrg ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids), 3393b8e80941Smrg vgpr++, ""); 3394b8e80941Smrg ctx->return_value = ret; 3395b8e80941Smrg} 3396b8e80941Smrg 3397b8e80941Smrg/* Pass GS inputs from ES to GS on GFX9. */ 3398b8e80941Smrgstatic void si_set_es_return_value_for_gs(struct si_shader_context *ctx) 3399b8e80941Smrg{ 3400b8e80941Smrg LLVMValueRef ret = ctx->return_value; 3401b8e80941Smrg 3402b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 0, 0); 3403b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 1, 1); 3404b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); 3405b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); 3406b8e80941Smrg ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); 3407b8e80941Smrg 3408b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers, 3409b8e80941Smrg 8 + SI_SGPR_RW_BUFFERS); 3410b8e80941Smrg ret = si_insert_input_ptr(ctx, ret, 3411b8e80941Smrg ctx->param_bindless_samplers_and_images, 3412b8e80941Smrg 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); 3413b8e80941Smrg 3414b8e80941Smrg unsigned vgpr; 3415b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) 3416b8e80941Smrg vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; 3417b8e80941Smrg else 3418b8e80941Smrg vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; 3419b8e80941Smrg 3420b8e80941Smrg for (unsigned i = 0; i < 5; i++) { 3421b8e80941Smrg unsigned param = ctx->param_gs_vtx01_offset + i; 3422b8e80941Smrg ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); 3423b8e80941Smrg } 3424b8e80941Smrg ctx->return_value = ret; 3425b8e80941Smrg} 3426b8e80941Smrg 3427b8e80941Smrgstatic void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, 3428b8e80941Smrg unsigned max_outputs, 3429b8e80941Smrg LLVMValueRef *addrs) 3430b8e80941Smrg{ 3431b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3432b8e80941Smrg struct si_shader *shader = ctx->shader; 3433b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 3434b8e80941Smrg unsigned i, chan; 3435b8e80941Smrg LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, 3436b8e80941Smrg ctx->param_rel_auto_id); 3437b8e80941Smrg LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); 3438b8e80941Smrg LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, 3439b8e80941Smrg vertex_dw_stride, ""); 3440b8e80941Smrg 3441b8e80941Smrg /* Write outputs to LDS. The next shader (TCS aka HS) will read 3442b8e80941Smrg * its inputs from it. */ 3443b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 3444b8e80941Smrg unsigned name = info->output_semantic_name[i]; 3445b8e80941Smrg unsigned index = info->output_semantic_index[i]; 3446b8e80941Smrg 3447b8e80941Smrg /* The ARB_shader_viewport_layer_array spec contains the 3448b8e80941Smrg * following issue: 3449b8e80941Smrg * 3450b8e80941Smrg * 2) What happens if gl_ViewportIndex or gl_Layer is 3451b8e80941Smrg * written in the vertex shader and a geometry shader is 3452b8e80941Smrg * present? 3453b8e80941Smrg * 3454b8e80941Smrg * RESOLVED: The value written by the last vertex processing 3455b8e80941Smrg * stage is used. If the last vertex processing stage 3456b8e80941Smrg * (vertex, tessellation evaluation or geometry) does not 3457b8e80941Smrg * statically assign to gl_ViewportIndex or gl_Layer, index 3458b8e80941Smrg * or layer zero is assumed. 3459b8e80941Smrg * 3460b8e80941Smrg * So writes to those outputs in VS-as-LS are simply ignored. 3461b8e80941Smrg */ 3462b8e80941Smrg if (name == TGSI_SEMANTIC_LAYER || 3463b8e80941Smrg name == TGSI_SEMANTIC_VIEWPORT_INDEX) 3464b8e80941Smrg continue; 3465b8e80941Smrg 3466b8e80941Smrg int param = si_shader_io_get_unique_index(name, index, false); 3467b8e80941Smrg LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, 3468b8e80941Smrg LLVMConstInt(ctx->i32, param * 4, 0), ""); 3469b8e80941Smrg 3470b8e80941Smrg for (chan = 0; chan < 4; chan++) { 3471b8e80941Smrg if (!(info->output_usagemask[i] & (1 << chan))) 3472b8e80941Smrg continue; 3473b8e80941Smrg 3474b8e80941Smrg lds_store(ctx, chan, dw_addr, 3475b8e80941Smrg LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); 3476b8e80941Smrg } 3477b8e80941Smrg } 3478b8e80941Smrg 3479b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) 3480b8e80941Smrg si_set_ls_return_value_for_tcs(ctx); 3481b8e80941Smrg} 3482b8e80941Smrg 3483b8e80941Smrgstatic void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, 3484b8e80941Smrg unsigned max_outputs, 3485b8e80941Smrg LLVMValueRef *addrs) 3486b8e80941Smrg{ 3487b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3488b8e80941Smrg struct si_shader *es = ctx->shader; 3489b8e80941Smrg struct tgsi_shader_info *info = &es->selector->info; 3490b8e80941Smrg LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 3491b8e80941Smrg ctx->param_es2gs_offset); 3492b8e80941Smrg LLVMValueRef lds_base = NULL; 3493b8e80941Smrg unsigned chan; 3494b8e80941Smrg int i; 3495b8e80941Smrg 3496b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { 3497b8e80941Smrg unsigned itemsize_dw = es->selector->esgs_itemsize / 4; 3498b8e80941Smrg LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); 3499b8e80941Smrg LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); 3500b8e80941Smrg vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, 3501b8e80941Smrg LLVMBuildMul(ctx->ac.builder, wave_idx, 3502b8e80941Smrg LLVMConstInt(ctx->i32, 64, false), ""), ""); 3503b8e80941Smrg lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, 3504b8e80941Smrg LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); 3505b8e80941Smrg } 3506b8e80941Smrg 3507b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 3508b8e80941Smrg int param; 3509b8e80941Smrg 3510b8e80941Smrg if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || 3511b8e80941Smrg info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) 3512b8e80941Smrg continue; 3513b8e80941Smrg 3514b8e80941Smrg param = si_shader_io_get_unique_index(info->output_semantic_name[i], 3515b8e80941Smrg info->output_semantic_index[i], false); 3516b8e80941Smrg 3517b8e80941Smrg for (chan = 0; chan < 4; chan++) { 3518b8e80941Smrg if (!(info->output_usagemask[i] & (1 << chan))) 3519b8e80941Smrg continue; 3520b8e80941Smrg 3521b8e80941Smrg LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 3522b8e80941Smrg out_val = ac_to_integer(&ctx->ac, out_val); 3523b8e80941Smrg 3524b8e80941Smrg /* GFX9 has the ESGS ring in LDS. */ 3525b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 3526b8e80941Smrg lds_store(ctx, param * 4 + chan, lds_base, out_val); 3527b8e80941Smrg continue; 3528b8e80941Smrg } 3529b8e80941Smrg 3530b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, 3531b8e80941Smrg ctx->esgs_ring, 3532b8e80941Smrg out_val, 1, NULL, soffset, 3533b8e80941Smrg (4 * param + chan) * 4, 3534b8e80941Smrg 1, 1, true, true); 3535b8e80941Smrg } 3536b8e80941Smrg } 3537b8e80941Smrg 3538b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) 3539b8e80941Smrg si_set_es_return_value_for_gs(ctx); 3540b8e80941Smrg} 3541b8e80941Smrg 3542b8e80941Smrgstatic LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) 3543b8e80941Smrg{ 3544b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) 3545b8e80941Smrg return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8); 3546b8e80941Smrg else 3547b8e80941Smrg return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); 3548b8e80941Smrg} 3549b8e80941Smrg 3550b8e80941Smrgstatic void emit_gs_epilogue(struct si_shader_context *ctx) 3551b8e80941Smrg{ 3552b8e80941Smrg ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, 3553b8e80941Smrg si_get_gs_wave_id(ctx)); 3554b8e80941Smrg 3555b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) 3556b8e80941Smrg lp_build_endif(&ctx->merged_wrap_if_state); 3557b8e80941Smrg} 3558b8e80941Smrg 3559b8e80941Smrgstatic void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, 3560b8e80941Smrg unsigned max_outputs, 3561b8e80941Smrg LLVMValueRef *addrs) 3562b8e80941Smrg{ 3563b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3564b8e80941Smrg struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info; 3565b8e80941Smrg 3566b8e80941Smrg assert(info->num_outputs <= max_outputs); 3567b8e80941Smrg 3568b8e80941Smrg emit_gs_epilogue(ctx); 3569b8e80941Smrg} 3570b8e80941Smrg 3571b8e80941Smrgstatic void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) 3572b8e80941Smrg{ 3573b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3574b8e80941Smrg emit_gs_epilogue(ctx); 3575b8e80941Smrg} 3576b8e80941Smrg 3577b8e80941Smrgstatic void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, 3578b8e80941Smrg unsigned max_outputs, 3579b8e80941Smrg LLVMValueRef *addrs) 3580b8e80941Smrg{ 3581b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3582b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 3583b8e80941Smrg struct si_shader_output_values *outputs = NULL; 3584b8e80941Smrg int i,j; 3585b8e80941Smrg 3586b8e80941Smrg assert(!ctx->shader->is_gs_copy_shader); 3587b8e80941Smrg assert(info->num_outputs <= max_outputs); 3588b8e80941Smrg 3589b8e80941Smrg outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); 3590b8e80941Smrg 3591b8e80941Smrg /* Vertex color clamping. 3592b8e80941Smrg * 3593b8e80941Smrg * This uses a state constant loaded in a user data SGPR and 3594b8e80941Smrg * an IF statement is added that clamps all colors if the constant 3595b8e80941Smrg * is true. 3596b8e80941Smrg */ 3597b8e80941Smrg struct lp_build_if_state if_ctx; 3598b8e80941Smrg LLVMValueRef cond = NULL; 3599b8e80941Smrg LLVMValueRef addr, val; 3600b8e80941Smrg 3601b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 3602b8e80941Smrg if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 3603b8e80941Smrg info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 3604b8e80941Smrg continue; 3605b8e80941Smrg 3606b8e80941Smrg /* We've found a color. */ 3607b8e80941Smrg if (!cond) { 3608b8e80941Smrg /* The state is in the first bit of the user SGPR. */ 3609b8e80941Smrg cond = LLVMGetParam(ctx->main_fn, 3610b8e80941Smrg ctx->param_vs_state_bits); 3611b8e80941Smrg cond = LLVMBuildTrunc(ctx->ac.builder, cond, 3612b8e80941Smrg ctx->i1, ""); 3613b8e80941Smrg lp_build_if(&if_ctx, &ctx->gallivm, cond); 3614b8e80941Smrg } 3615b8e80941Smrg 3616b8e80941Smrg for (j = 0; j < 4; j++) { 3617b8e80941Smrg addr = addrs[4 * i + j]; 3618b8e80941Smrg val = LLVMBuildLoad(ctx->ac.builder, addr, ""); 3619b8e80941Smrg val = ac_build_clamp(&ctx->ac, val); 3620b8e80941Smrg LLVMBuildStore(ctx->ac.builder, val, addr); 3621b8e80941Smrg } 3622b8e80941Smrg } 3623b8e80941Smrg 3624b8e80941Smrg if (cond) 3625b8e80941Smrg lp_build_endif(&if_ctx); 3626b8e80941Smrg 3627b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 3628b8e80941Smrg outputs[i].semantic_name = info->output_semantic_name[i]; 3629b8e80941Smrg outputs[i].semantic_index = info->output_semantic_index[i]; 3630b8e80941Smrg 3631b8e80941Smrg for (j = 0; j < 4; j++) { 3632b8e80941Smrg outputs[i].values[j] = 3633b8e80941Smrg LLVMBuildLoad(ctx->ac.builder, 3634b8e80941Smrg addrs[4 * i + j], 3635b8e80941Smrg ""); 3636b8e80941Smrg outputs[i].vertex_stream[j] = 3637b8e80941Smrg (info->output_streams[i] >> (2 * j)) & 3; 3638b8e80941Smrg } 3639b8e80941Smrg } 3640b8e80941Smrg 3641b8e80941Smrg if (ctx->shader->selector->so.num_outputs) 3642b8e80941Smrg si_llvm_emit_streamout(ctx, outputs, i, 0); 3643b8e80941Smrg 3644b8e80941Smrg /* Export PrimitiveID. */ 3645b8e80941Smrg if (ctx->shader->key.mono.u.vs_export_prim_id) { 3646b8e80941Smrg outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; 3647b8e80941Smrg outputs[i].semantic_index = 0; 3648b8e80941Smrg outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0)); 3649b8e80941Smrg for (j = 1; j < 4; j++) 3650b8e80941Smrg outputs[i].values[j] = LLVMConstReal(ctx->f32, 0); 3651b8e80941Smrg 3652b8e80941Smrg memset(outputs[i].vertex_stream, 0, 3653b8e80941Smrg sizeof(outputs[i].vertex_stream)); 3654b8e80941Smrg i++; 3655b8e80941Smrg } 3656b8e80941Smrg 3657b8e80941Smrg si_llvm_export_vs(ctx, outputs, i); 3658b8e80941Smrg FREE(outputs); 3659b8e80941Smrg} 3660b8e80941Smrg 3661b8e80941Smrgstatic void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) 3662b8e80941Smrg{ 3663b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3664b8e80941Smrg 3665b8e80941Smrg ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS, 3666b8e80941Smrg &ctx->outputs[0][0]); 3667b8e80941Smrg} 3668b8e80941Smrg 3669b8e80941Smrgstruct si_ps_exports { 3670b8e80941Smrg unsigned num; 3671b8e80941Smrg struct ac_export_args args[10]; 3672b8e80941Smrg}; 3673b8e80941Smrg 3674b8e80941Smrgstatic void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, 3675b8e80941Smrg LLVMValueRef depth, LLVMValueRef stencil, 3676b8e80941Smrg LLVMValueRef samplemask, struct si_ps_exports *exp) 3677b8e80941Smrg{ 3678b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3679b8e80941Smrg struct ac_export_args args; 3680b8e80941Smrg 3681b8e80941Smrg ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); 3682b8e80941Smrg 3683b8e80941Smrg memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3684b8e80941Smrg} 3685b8e80941Smrg 3686b8e80941Smrgstatic void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, 3687b8e80941Smrg LLVMValueRef *color, unsigned index, 3688b8e80941Smrg unsigned samplemask_param, 3689b8e80941Smrg bool is_last, struct si_ps_exports *exp) 3690b8e80941Smrg{ 3691b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3692b8e80941Smrg int i; 3693b8e80941Smrg 3694b8e80941Smrg /* Clamp color */ 3695b8e80941Smrg if (ctx->shader->key.part.ps.epilog.clamp_color) 3696b8e80941Smrg for (i = 0; i < 4; i++) 3697b8e80941Smrg color[i] = ac_build_clamp(&ctx->ac, color[i]); 3698b8e80941Smrg 3699b8e80941Smrg /* Alpha to one */ 3700b8e80941Smrg if (ctx->shader->key.part.ps.epilog.alpha_to_one) 3701b8e80941Smrg color[3] = ctx->ac.f32_1; 3702b8e80941Smrg 3703b8e80941Smrg /* Alpha test */ 3704b8e80941Smrg if (index == 0 && 3705b8e80941Smrg ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) 3706b8e80941Smrg si_alpha_test(bld_base, color[3]); 3707b8e80941Smrg 3708b8e80941Smrg /* Line & polygon smoothing */ 3709b8e80941Smrg if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) 3710b8e80941Smrg color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], 3711b8e80941Smrg samplemask_param); 3712b8e80941Smrg 3713b8e80941Smrg /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 3714b8e80941Smrg if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { 3715b8e80941Smrg struct ac_export_args args[8]; 3716b8e80941Smrg int c, last = -1; 3717b8e80941Smrg 3718b8e80941Smrg /* Get the export arguments, also find out what the last one is. */ 3719b8e80941Smrg for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3720b8e80941Smrg si_llvm_init_export_args(ctx, color, 3721b8e80941Smrg V_008DFC_SQ_EXP_MRT + c, &args[c]); 3722b8e80941Smrg if (args[c].enabled_channels) 3723b8e80941Smrg last = c; 3724b8e80941Smrg } 3725b8e80941Smrg 3726b8e80941Smrg /* Emit all exports. */ 3727b8e80941Smrg for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3728b8e80941Smrg if (is_last && last == c) { 3729b8e80941Smrg args[c].valid_mask = 1; /* whether the EXEC mask is valid */ 3730b8e80941Smrg args[c].done = 1; /* DONE bit */ 3731b8e80941Smrg } else if (!args[c].enabled_channels) 3732b8e80941Smrg continue; /* unnecessary NULL export */ 3733b8e80941Smrg 3734b8e80941Smrg memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); 3735b8e80941Smrg } 3736b8e80941Smrg } else { 3737b8e80941Smrg struct ac_export_args args; 3738b8e80941Smrg 3739b8e80941Smrg /* Export */ 3740b8e80941Smrg si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, 3741b8e80941Smrg &args); 3742b8e80941Smrg if (is_last) { 3743b8e80941Smrg args.valid_mask = 1; /* whether the EXEC mask is valid */ 3744b8e80941Smrg args.done = 1; /* DONE bit */ 3745b8e80941Smrg } else if (!args.enabled_channels) 3746b8e80941Smrg return; /* unnecessary NULL export */ 3747b8e80941Smrg 3748b8e80941Smrg memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3749b8e80941Smrg } 3750b8e80941Smrg} 3751b8e80941Smrg 3752b8e80941Smrgstatic void si_emit_ps_exports(struct si_shader_context *ctx, 3753b8e80941Smrg struct si_ps_exports *exp) 3754b8e80941Smrg{ 3755b8e80941Smrg for (unsigned i = 0; i < exp->num; i++) 3756b8e80941Smrg ac_build_export(&ctx->ac, &exp->args[i]); 3757b8e80941Smrg} 3758b8e80941Smrg 3759b8e80941Smrg/** 3760b8e80941Smrg * Return PS outputs in this order: 3761b8e80941Smrg * 3762b8e80941Smrg * v[0:3] = color0.xyzw 3763b8e80941Smrg * v[4:7] = color1.xyzw 3764b8e80941Smrg * ... 3765b8e80941Smrg * vN+0 = Depth 3766b8e80941Smrg * vN+1 = Stencil 3767b8e80941Smrg * vN+2 = SampleMask 3768b8e80941Smrg * vN+3 = SampleMaskIn (used for OpenGL smoothing) 3769b8e80941Smrg * 3770b8e80941Smrg * The alpha-ref SGPR is returned via its original location. 3771b8e80941Smrg */ 3772b8e80941Smrgstatic void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, 3773b8e80941Smrg unsigned max_outputs, 3774b8e80941Smrg LLVMValueRef *addrs) 3775b8e80941Smrg{ 3776b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3777b8e80941Smrg struct si_shader *shader = ctx->shader; 3778b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 3779b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 3780b8e80941Smrg unsigned i, j, first_vgpr, vgpr; 3781b8e80941Smrg 3782b8e80941Smrg LLVMValueRef color[8][4] = {}; 3783b8e80941Smrg LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 3784b8e80941Smrg LLVMValueRef ret; 3785b8e80941Smrg 3786b8e80941Smrg if (ctx->postponed_kill) 3787b8e80941Smrg ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); 3788b8e80941Smrg 3789b8e80941Smrg /* Read the output values. */ 3790b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 3791b8e80941Smrg unsigned semantic_name = info->output_semantic_name[i]; 3792b8e80941Smrg unsigned semantic_index = info->output_semantic_index[i]; 3793b8e80941Smrg 3794b8e80941Smrg switch (semantic_name) { 3795b8e80941Smrg case TGSI_SEMANTIC_COLOR: 3796b8e80941Smrg assert(semantic_index < 8); 3797b8e80941Smrg for (j = 0; j < 4; j++) { 3798b8e80941Smrg LLVMValueRef ptr = addrs[4 * i + j]; 3799b8e80941Smrg LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); 3800b8e80941Smrg color[semantic_index][j] = result; 3801b8e80941Smrg } 3802b8e80941Smrg break; 3803b8e80941Smrg case TGSI_SEMANTIC_POSITION: 3804b8e80941Smrg depth = LLVMBuildLoad(builder, 3805b8e80941Smrg addrs[4 * i + 2], ""); 3806b8e80941Smrg break; 3807b8e80941Smrg case TGSI_SEMANTIC_STENCIL: 3808b8e80941Smrg stencil = LLVMBuildLoad(builder, 3809b8e80941Smrg addrs[4 * i + 1], ""); 3810b8e80941Smrg break; 3811b8e80941Smrg case TGSI_SEMANTIC_SAMPLEMASK: 3812b8e80941Smrg samplemask = LLVMBuildLoad(builder, 3813b8e80941Smrg addrs[4 * i + 0], ""); 3814b8e80941Smrg break; 3815b8e80941Smrg default: 3816b8e80941Smrg fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", 3817b8e80941Smrg semantic_name); 3818b8e80941Smrg } 3819b8e80941Smrg } 3820b8e80941Smrg 3821b8e80941Smrg /* Fill the return structure. */ 3822b8e80941Smrg ret = ctx->return_value; 3823b8e80941Smrg 3824b8e80941Smrg /* Set SGPRs. */ 3825b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, 3826b8e80941Smrg ac_to_integer(&ctx->ac, 3827b8e80941Smrg LLVMGetParam(ctx->main_fn, 3828b8e80941Smrg SI_PARAM_ALPHA_REF)), 3829b8e80941Smrg SI_SGPR_ALPHA_REF, ""); 3830b8e80941Smrg 3831b8e80941Smrg /* Set VGPRs */ 3832b8e80941Smrg first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; 3833b8e80941Smrg for (i = 0; i < ARRAY_SIZE(color); i++) { 3834b8e80941Smrg if (!color[i][0]) 3835b8e80941Smrg continue; 3836b8e80941Smrg 3837b8e80941Smrg for (j = 0; j < 4; j++) 3838b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); 3839b8e80941Smrg } 3840b8e80941Smrg if (depth) 3841b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); 3842b8e80941Smrg if (stencil) 3843b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); 3844b8e80941Smrg if (samplemask) 3845b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); 3846b8e80941Smrg 3847b8e80941Smrg /* Add the input sample mask for smoothing at the end. */ 3848b8e80941Smrg if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) 3849b8e80941Smrg vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; 3850b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, 3851b8e80941Smrg LLVMGetParam(ctx->main_fn, 3852b8e80941Smrg SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); 3853b8e80941Smrg 3854b8e80941Smrg ctx->return_value = ret; 3855b8e80941Smrg} 3856b8e80941Smrg 3857b8e80941Smrgstatic void membar_emit( 3858b8e80941Smrg const struct lp_build_tgsi_action *action, 3859b8e80941Smrg struct lp_build_tgsi_context *bld_base, 3860b8e80941Smrg struct lp_build_emit_data *emit_data) 3861b8e80941Smrg{ 3862b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3863b8e80941Smrg LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); 3864b8e80941Smrg unsigned flags = LLVMConstIntGetZExtValue(src0); 3865b8e80941Smrg unsigned waitcnt = NOOP_WAITCNT; 3866b8e80941Smrg 3867b8e80941Smrg if (flags & TGSI_MEMBAR_THREAD_GROUP) 3868b8e80941Smrg waitcnt &= VM_CNT & LGKM_CNT; 3869b8e80941Smrg 3870b8e80941Smrg if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | 3871b8e80941Smrg TGSI_MEMBAR_SHADER_BUFFER | 3872b8e80941Smrg TGSI_MEMBAR_SHADER_IMAGE)) 3873b8e80941Smrg waitcnt &= VM_CNT; 3874b8e80941Smrg 3875b8e80941Smrg if (flags & TGSI_MEMBAR_SHARED) 3876b8e80941Smrg waitcnt &= LGKM_CNT; 3877b8e80941Smrg 3878b8e80941Smrg if (waitcnt != NOOP_WAITCNT) 3879b8e80941Smrg ac_build_waitcnt(&ctx->ac, waitcnt); 3880b8e80941Smrg} 3881b8e80941Smrg 3882b8e80941Smrgstatic void clock_emit( 3883b8e80941Smrg const struct lp_build_tgsi_action *action, 3884b8e80941Smrg struct lp_build_tgsi_context *bld_base, 3885b8e80941Smrg struct lp_build_emit_data *emit_data) 3886b8e80941Smrg{ 3887b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3888b8e80941Smrg LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac); 3889b8e80941Smrg 3890b8e80941Smrg emit_data->output[0] = 3891b8e80941Smrg LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, ""); 3892b8e80941Smrg emit_data->output[1] = 3893b8e80941Smrg LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, ""); 3894b8e80941Smrg} 3895b8e80941Smrg 3896b8e80941Smrgstatic void si_llvm_emit_ddxy( 3897b8e80941Smrg const struct lp_build_tgsi_action *action, 3898b8e80941Smrg struct lp_build_tgsi_context *bld_base, 3899b8e80941Smrg struct lp_build_emit_data *emit_data) 3900b8e80941Smrg{ 3901b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3902b8e80941Smrg unsigned opcode = emit_data->info->opcode; 3903b8e80941Smrg LLVMValueRef val; 3904b8e80941Smrg int idx; 3905b8e80941Smrg unsigned mask; 3906b8e80941Smrg 3907b8e80941Smrg if (opcode == TGSI_OPCODE_DDX_FINE) 3908b8e80941Smrg mask = AC_TID_MASK_LEFT; 3909b8e80941Smrg else if (opcode == TGSI_OPCODE_DDY_FINE) 3910b8e80941Smrg mask = AC_TID_MASK_TOP; 3911b8e80941Smrg else 3912b8e80941Smrg mask = AC_TID_MASK_TOP_LEFT; 3913b8e80941Smrg 3914b8e80941Smrg /* for DDX we want to next X pixel, DDY next Y pixel. */ 3915b8e80941Smrg idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; 3916b8e80941Smrg 3917b8e80941Smrg val = ac_to_integer(&ctx->ac, emit_data->args[0]); 3918b8e80941Smrg val = ac_build_ddxy(&ctx->ac, mask, idx, val); 3919b8e80941Smrg emit_data->output[emit_data->chan] = val; 3920b8e80941Smrg} 3921b8e80941Smrg 3922b8e80941Smrgstatic void build_interp_intrinsic(const struct lp_build_tgsi_action *action, 3923b8e80941Smrg struct lp_build_tgsi_context *bld_base, 3924b8e80941Smrg struct lp_build_emit_data *emit_data) 3925b8e80941Smrg{ 3926b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 3927b8e80941Smrg struct si_shader *shader = ctx->shader; 3928b8e80941Smrg const struct tgsi_shader_info *info = &shader->selector->info; 3929b8e80941Smrg LLVMValueRef interp_param; 3930b8e80941Smrg const struct tgsi_full_instruction *inst = emit_data->inst; 3931b8e80941Smrg const struct tgsi_full_src_register *input = &inst->Src[0]; 3932b8e80941Smrg int input_base, input_array_size; 3933b8e80941Smrg int chan; 3934b8e80941Smrg int i; 3935b8e80941Smrg LLVMValueRef prim_mask = ctx->abi.prim_mask; 3936b8e80941Smrg LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL; 3937b8e80941Smrg int interp_param_idx; 3938b8e80941Smrg unsigned interp; 3939b8e80941Smrg unsigned location; 3940b8e80941Smrg 3941b8e80941Smrg if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { 3942b8e80941Smrg /* offset is in second src, first two channels */ 3943b8e80941Smrg offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 3944b8e80941Smrg TGSI_CHAN_X); 3945b8e80941Smrg offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 3946b8e80941Smrg TGSI_CHAN_Y); 3947b8e80941Smrg } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 3948b8e80941Smrg LLVMValueRef sample_position; 3949b8e80941Smrg LLVMValueRef sample_id; 3950b8e80941Smrg LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f); 3951b8e80941Smrg 3952b8e80941Smrg /* fetch sample ID, then fetch its sample position, 3953b8e80941Smrg * and place into first two channels. 3954b8e80941Smrg */ 3955b8e80941Smrg sample_id = lp_build_emit_fetch(bld_base, 3956b8e80941Smrg emit_data->inst, 1, TGSI_CHAN_X); 3957b8e80941Smrg sample_id = ac_to_integer(&ctx->ac, sample_id); 3958b8e80941Smrg 3959b8e80941Smrg /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading 3960b8e80941Smrg * Language 4.50 spec says about interpolateAtSample: 3961b8e80941Smrg * 3962b8e80941Smrg * "Returns the value of the input interpolant variable at 3963b8e80941Smrg * the location of sample number sample. If multisample 3964b8e80941Smrg * buffers are not available, the input variable will be 3965b8e80941Smrg * evaluated at the center of the pixel. If sample sample 3966b8e80941Smrg * does not exist, the position used to interpolate the 3967b8e80941Smrg * input variable is undefined." 3968b8e80941Smrg * 3969b8e80941Smrg * This means that sample_id values outside of the valid are 3970b8e80941Smrg * in fact valid input, and the usual mechanism for loading the 3971b8e80941Smrg * sample position doesn't work. 3972b8e80941Smrg */ 3973b8e80941Smrg if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) { 3974b8e80941Smrg LLVMValueRef center[4] = { 3975b8e80941Smrg LLVMConstReal(ctx->f32, 0.5), 3976b8e80941Smrg LLVMConstReal(ctx->f32, 0.5), 3977b8e80941Smrg ctx->ac.f32_0, 3978b8e80941Smrg ctx->ac.f32_0, 3979b8e80941Smrg }; 3980b8e80941Smrg 3981b8e80941Smrg sample_position = ac_build_gather_values(&ctx->ac, center, 4); 3982b8e80941Smrg } else { 3983b8e80941Smrg sample_position = load_sample_position(&ctx->abi, sample_id); 3984b8e80941Smrg } 3985b8e80941Smrg 3986b8e80941Smrg offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position, 3987b8e80941Smrg ctx->i32_0, ""); 3988b8e80941Smrg 3989b8e80941Smrg offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, ""); 3990b8e80941Smrg offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position, 3991b8e80941Smrg ctx->i32_1, ""); 3992b8e80941Smrg offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, ""); 3993b8e80941Smrg } 3994b8e80941Smrg 3995b8e80941Smrg assert(input->Register.File == TGSI_FILE_INPUT); 3996b8e80941Smrg 3997b8e80941Smrg if (input->Register.Indirect) { 3998b8e80941Smrg unsigned array_id = input->Indirect.ArrayID; 3999b8e80941Smrg 4000b8e80941Smrg if (array_id) { 4001b8e80941Smrg input_base = info->input_array_first[array_id]; 4002b8e80941Smrg input_array_size = info->input_array_last[array_id] - input_base + 1; 4003b8e80941Smrg } else { 4004b8e80941Smrg input_base = inst->Src[0].Register.Index; 4005b8e80941Smrg input_array_size = info->num_inputs - input_base; 4006b8e80941Smrg } 4007b8e80941Smrg 4008b8e80941Smrg array_idx = si_get_indirect_index(ctx, &input->Indirect, 4009b8e80941Smrg 1, input->Register.Index - input_base); 4010b8e80941Smrg } else { 4011b8e80941Smrg input_base = inst->Src[0].Register.Index; 4012b8e80941Smrg input_array_size = 1; 4013b8e80941Smrg array_idx = ctx->i32_0; 4014b8e80941Smrg } 4015b8e80941Smrg 4016b8e80941Smrg interp = shader->selector->info.input_interpolate[input_base]; 4017b8e80941Smrg 4018b8e80941Smrg if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4019b8e80941Smrg inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) 4020b8e80941Smrg location = TGSI_INTERPOLATE_LOC_CENTER; 4021b8e80941Smrg else 4022b8e80941Smrg location = TGSI_INTERPOLATE_LOC_CENTROID; 4023b8e80941Smrg 4024b8e80941Smrg interp_param_idx = lookup_interp_param_index(interp, location); 4025b8e80941Smrg if (interp_param_idx == -1) 4026b8e80941Smrg return; 4027b8e80941Smrg else if (interp_param_idx) 4028b8e80941Smrg interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 4029b8e80941Smrg else 4030b8e80941Smrg interp_param = NULL; 4031b8e80941Smrg 4032b8e80941Smrg if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4033b8e80941Smrg inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 4034b8e80941Smrg LLVMValueRef ij_out[2]; 4035b8e80941Smrg LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); 4036b8e80941Smrg 4037b8e80941Smrg /* 4038b8e80941Smrg * take the I then J parameters, and the DDX/Y for it, and 4039b8e80941Smrg * calculate the IJ inputs for the interpolator. 4040b8e80941Smrg * temp1 = ddx * offset/sample.x + I; 4041b8e80941Smrg * interp_param.I = ddy * offset/sample.y + temp1; 4042b8e80941Smrg * temp1 = ddx * offset/sample.x + J; 4043b8e80941Smrg * interp_param.J = ddy * offset/sample.y + temp1; 4044b8e80941Smrg */ 4045b8e80941Smrg for (i = 0; i < 2; i++) { 4046b8e80941Smrg LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0); 4047b8e80941Smrg LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0); 4048b8e80941Smrg LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, 4049b8e80941Smrg ddxy_out, ix_ll, ""); 4050b8e80941Smrg LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, 4051b8e80941Smrg ddxy_out, iy_ll, ""); 4052b8e80941Smrg LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, 4053b8e80941Smrg interp_param, ix_ll, ""); 4054b8e80941Smrg LLVMValueRef temp; 4055b8e80941Smrg 4056b8e80941Smrg interp_el = ac_to_float(&ctx->ac, interp_el); 4057b8e80941Smrg 4058b8e80941Smrg temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el); 4059b8e80941Smrg ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp); 4060b8e80941Smrg } 4061b8e80941Smrg interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); 4062b8e80941Smrg } 4063b8e80941Smrg 4064b8e80941Smrg if (interp_param) 4065b8e80941Smrg interp_param = ac_to_float(&ctx->ac, interp_param); 4066b8e80941Smrg 4067b8e80941Smrg for (chan = 0; chan < 4; chan++) { 4068b8e80941Smrg LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size)); 4069b8e80941Smrg unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); 4070b8e80941Smrg 4071b8e80941Smrg for (unsigned idx = 0; idx < input_array_size; ++idx) { 4072b8e80941Smrg LLVMValueRef v, i = NULL, j = NULL; 4073b8e80941Smrg 4074b8e80941Smrg if (interp_param) { 4075b8e80941Smrg i = LLVMBuildExtractElement( 4076b8e80941Smrg ctx->ac.builder, interp_param, ctx->i32_0, ""); 4077b8e80941Smrg j = LLVMBuildExtractElement( 4078b8e80941Smrg ctx->ac.builder, interp_param, ctx->i32_1, ""); 4079b8e80941Smrg } 4080b8e80941Smrg v = si_build_fs_interp(ctx, input_base + idx, schan, 4081b8e80941Smrg prim_mask, i, j); 4082b8e80941Smrg 4083b8e80941Smrg gather = LLVMBuildInsertElement(ctx->ac.builder, 4084b8e80941Smrg gather, v, LLVMConstInt(ctx->i32, idx, false), ""); 4085b8e80941Smrg } 4086b8e80941Smrg 4087b8e80941Smrg emit_data->output[chan] = LLVMBuildExtractElement( 4088b8e80941Smrg ctx->ac.builder, gather, array_idx, ""); 4089b8e80941Smrg } 4090b8e80941Smrg} 4091b8e80941Smrg 4092b8e80941Smrgstatic void vote_all_emit( 4093b8e80941Smrg const struct lp_build_tgsi_action *action, 4094b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4095b8e80941Smrg struct lp_build_emit_data *emit_data) 4096b8e80941Smrg{ 4097b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4098b8e80941Smrg 4099b8e80941Smrg LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]); 4100b8e80941Smrg emit_data->output[emit_data->chan] = 4101b8e80941Smrg LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4102b8e80941Smrg} 4103b8e80941Smrg 4104b8e80941Smrgstatic void vote_any_emit( 4105b8e80941Smrg const struct lp_build_tgsi_action *action, 4106b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4107b8e80941Smrg struct lp_build_emit_data *emit_data) 4108b8e80941Smrg{ 4109b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4110b8e80941Smrg 4111b8e80941Smrg LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]); 4112b8e80941Smrg emit_data->output[emit_data->chan] = 4113b8e80941Smrg LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4114b8e80941Smrg} 4115b8e80941Smrg 4116b8e80941Smrgstatic void vote_eq_emit( 4117b8e80941Smrg const struct lp_build_tgsi_action *action, 4118b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4119b8e80941Smrg struct lp_build_emit_data *emit_data) 4120b8e80941Smrg{ 4121b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4122b8e80941Smrg 4123b8e80941Smrg LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]); 4124b8e80941Smrg emit_data->output[emit_data->chan] = 4125b8e80941Smrg LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4126b8e80941Smrg} 4127b8e80941Smrg 4128b8e80941Smrgstatic void ballot_emit( 4129b8e80941Smrg const struct lp_build_tgsi_action *action, 4130b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4131b8e80941Smrg struct lp_build_emit_data *emit_data) 4132b8e80941Smrg{ 4133b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4134b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 4135b8e80941Smrg LLVMValueRef tmp; 4136b8e80941Smrg 4137b8e80941Smrg tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); 4138b8e80941Smrg tmp = ac_build_ballot(&ctx->ac, tmp); 4139b8e80941Smrg tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, ""); 4140b8e80941Smrg 4141b8e80941Smrg emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, ""); 4142b8e80941Smrg emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, ""); 4143b8e80941Smrg} 4144b8e80941Smrg 4145b8e80941Smrgstatic void read_lane_emit( 4146b8e80941Smrg const struct lp_build_tgsi_action *action, 4147b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4148b8e80941Smrg struct lp_build_emit_data *emit_data) 4149b8e80941Smrg{ 4150b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4151b8e80941Smrg 4152b8e80941Smrg if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) { 4153b8e80941Smrg emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, 4154b8e80941Smrg 0, emit_data->src_chan); 4155b8e80941Smrg 4156b8e80941Smrg /* Always read the source invocation (= lane) from the X channel. */ 4157b8e80941Smrg emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, 4158b8e80941Smrg 1, TGSI_CHAN_X); 4159b8e80941Smrg emit_data->arg_count = 2; 4160b8e80941Smrg } 4161b8e80941Smrg 4162b8e80941Smrg /* We currently have no other way to prevent LLVM from lifting the icmp 4163b8e80941Smrg * calls to a dominating basic block. 4164b8e80941Smrg */ 4165b8e80941Smrg ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]); 4166b8e80941Smrg 4167b8e80941Smrg for (unsigned i = 0; i < emit_data->arg_count; ++i) 4168b8e80941Smrg emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]); 4169b8e80941Smrg 4170b8e80941Smrg emit_data->output[emit_data->chan] = 4171b8e80941Smrg ac_build_intrinsic(&ctx->ac, action->intr_name, 4172b8e80941Smrg ctx->i32, emit_data->args, emit_data->arg_count, 4173b8e80941Smrg AC_FUNC_ATTR_READNONE | 4174b8e80941Smrg AC_FUNC_ATTR_CONVERGENT); 4175b8e80941Smrg} 4176b8e80941Smrg 4177b8e80941Smrgstatic unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, 4178b8e80941Smrg struct lp_build_emit_data *emit_data) 4179b8e80941Smrg{ 4180b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4181b8e80941Smrg struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; 4182b8e80941Smrg LLVMValueRef imm; 4183b8e80941Smrg unsigned stream; 4184b8e80941Smrg 4185b8e80941Smrg assert(src0.File == TGSI_FILE_IMMEDIATE); 4186b8e80941Smrg 4187b8e80941Smrg imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; 4188b8e80941Smrg stream = LLVMConstIntGetZExtValue(imm) & 0x3; 4189b8e80941Smrg return stream; 4190b8e80941Smrg} 4191b8e80941Smrg 4192b8e80941Smrg/* Emit one vertex from the geometry shader */ 4193b8e80941Smrgstatic void si_llvm_emit_vertex(struct ac_shader_abi *abi, 4194b8e80941Smrg unsigned stream, 4195b8e80941Smrg LLVMValueRef *addrs) 4196b8e80941Smrg{ 4197b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4198b8e80941Smrg struct tgsi_shader_info *info = &ctx->shader->selector->info; 4199b8e80941Smrg struct si_shader *shader = ctx->shader; 4200b8e80941Smrg struct lp_build_if_state if_state; 4201b8e80941Smrg LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 4202b8e80941Smrg ctx->param_gs2vs_offset); 4203b8e80941Smrg LLVMValueRef gs_next_vertex; 4204b8e80941Smrg LLVMValueRef can_emit; 4205b8e80941Smrg unsigned chan, offset; 4206b8e80941Smrg int i; 4207b8e80941Smrg 4208b8e80941Smrg /* Write vertex attribute values to GSVS ring */ 4209b8e80941Smrg gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, 4210b8e80941Smrg ctx->gs_next_vertex[stream], 4211b8e80941Smrg ""); 4212b8e80941Smrg 4213b8e80941Smrg /* If this thread has already emitted the declared maximum number of 4214b8e80941Smrg * vertices, skip the write: excessive vertex emissions are not 4215b8e80941Smrg * supposed to have any effect. 4216b8e80941Smrg * 4217b8e80941Smrg * If the shader has no writes to memory, kill it instead. This skips 4218b8e80941Smrg * further memory loads and may allow LLVM to skip to the end 4219b8e80941Smrg * altogether. 4220b8e80941Smrg */ 4221b8e80941Smrg can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, 4222b8e80941Smrg LLVMConstInt(ctx->i32, 4223b8e80941Smrg shader->selector->gs_max_out_vertices, 0), ""); 4224b8e80941Smrg 4225b8e80941Smrg bool use_kill = !info->writes_memory; 4226b8e80941Smrg if (use_kill) { 4227b8e80941Smrg ac_build_kill_if_false(&ctx->ac, can_emit); 4228b8e80941Smrg } else { 4229b8e80941Smrg lp_build_if(&if_state, &ctx->gallivm, can_emit); 4230b8e80941Smrg } 4231b8e80941Smrg 4232b8e80941Smrg offset = 0; 4233b8e80941Smrg for (i = 0; i < info->num_outputs; i++) { 4234b8e80941Smrg for (chan = 0; chan < 4; chan++) { 4235b8e80941Smrg if (!(info->output_usagemask[i] & (1 << chan)) || 4236b8e80941Smrg ((info->output_streams[i] >> (2 * chan)) & 3) != stream) 4237b8e80941Smrg continue; 4238b8e80941Smrg 4239b8e80941Smrg LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 4240b8e80941Smrg LLVMValueRef voffset = 4241b8e80941Smrg LLVMConstInt(ctx->i32, offset * 4242b8e80941Smrg shader->selector->gs_max_out_vertices, 0); 4243b8e80941Smrg offset++; 4244b8e80941Smrg 4245b8e80941Smrg voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); 4246b8e80941Smrg voffset = LLVMBuildMul(ctx->ac.builder, voffset, 4247b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), ""); 4248b8e80941Smrg 4249b8e80941Smrg out_val = ac_to_integer(&ctx->ac, out_val); 4250b8e80941Smrg 4251b8e80941Smrg ac_build_buffer_store_dword(&ctx->ac, 4252b8e80941Smrg ctx->gsvs_ring[stream], 4253b8e80941Smrg out_val, 1, 4254b8e80941Smrg voffset, soffset, 0, 4255b8e80941Smrg 1, 1, true, true); 4256b8e80941Smrg } 4257b8e80941Smrg } 4258b8e80941Smrg 4259b8e80941Smrg gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); 4260b8e80941Smrg LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); 4261b8e80941Smrg 4262b8e80941Smrg /* Signal vertex emission if vertex data was written. */ 4263b8e80941Smrg if (offset) { 4264b8e80941Smrg ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), 4265b8e80941Smrg si_get_gs_wave_id(ctx)); 4266b8e80941Smrg } 4267b8e80941Smrg 4268b8e80941Smrg if (!use_kill) 4269b8e80941Smrg lp_build_endif(&if_state); 4270b8e80941Smrg} 4271b8e80941Smrg 4272b8e80941Smrg/* Emit one vertex from the geometry shader */ 4273b8e80941Smrgstatic void si_tgsi_emit_vertex( 4274b8e80941Smrg const struct lp_build_tgsi_action *action, 4275b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4276b8e80941Smrg struct lp_build_emit_data *emit_data) 4277b8e80941Smrg{ 4278b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4279b8e80941Smrg unsigned stream = si_llvm_get_stream(bld_base, emit_data); 4280b8e80941Smrg 4281b8e80941Smrg si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]); 4282b8e80941Smrg} 4283b8e80941Smrg 4284b8e80941Smrg/* Cut one primitive from the geometry shader */ 4285b8e80941Smrgstatic void si_llvm_emit_primitive(struct ac_shader_abi *abi, 4286b8e80941Smrg unsigned stream) 4287b8e80941Smrg{ 4288b8e80941Smrg struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4289b8e80941Smrg 4290b8e80941Smrg /* Signal primitive cut */ 4291b8e80941Smrg ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), 4292b8e80941Smrg si_get_gs_wave_id(ctx)); 4293b8e80941Smrg} 4294b8e80941Smrg 4295b8e80941Smrg/* Cut one primitive from the geometry shader */ 4296b8e80941Smrgstatic void si_tgsi_emit_primitive( 4297b8e80941Smrg const struct lp_build_tgsi_action *action, 4298b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4299b8e80941Smrg struct lp_build_emit_data *emit_data) 4300b8e80941Smrg{ 4301b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4302b8e80941Smrg 4303b8e80941Smrg si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data)); 4304b8e80941Smrg} 4305b8e80941Smrg 4306b8e80941Smrgstatic void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 4307b8e80941Smrg struct lp_build_tgsi_context *bld_base, 4308b8e80941Smrg struct lp_build_emit_data *emit_data) 4309b8e80941Smrg{ 4310b8e80941Smrg struct si_shader_context *ctx = si_shader_context(bld_base); 4311b8e80941Smrg 4312b8e80941Smrg /* SI only (thanks to a hw bug workaround): 4313b8e80941Smrg * The real barrier instruction isn’t needed, because an entire patch 4314b8e80941Smrg * always fits into a single wave. 4315b8e80941Smrg */ 4316b8e80941Smrg if (ctx->screen->info.chip_class == SI && 4317b8e80941Smrg ctx->type == PIPE_SHADER_TESS_CTRL) { 4318b8e80941Smrg ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT); 4319b8e80941Smrg return; 4320b8e80941Smrg } 4321b8e80941Smrg 4322b8e80941Smrg ac_build_s_barrier(&ctx->ac); 4323b8e80941Smrg} 4324b8e80941Smrg 4325b8e80941Smrgstatic void si_create_function(struct si_shader_context *ctx, 4326b8e80941Smrg const char *name, 4327b8e80941Smrg LLVMTypeRef *returns, unsigned num_returns, 4328b8e80941Smrg struct si_function_info *fninfo, 4329b8e80941Smrg unsigned max_workgroup_size) 4330b8e80941Smrg{ 4331b8e80941Smrg int i; 4332b8e80941Smrg 4333b8e80941Smrg si_llvm_create_func(ctx, name, returns, num_returns, 4334b8e80941Smrg fninfo->types, fninfo->num_params); 4335b8e80941Smrg ctx->return_value = LLVMGetUndef(ctx->return_type); 4336b8e80941Smrg 4337b8e80941Smrg for (i = 0; i < fninfo->num_sgpr_params; ++i) { 4338b8e80941Smrg LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); 4339b8e80941Smrg 4340b8e80941Smrg /* The combination of: 4341b8e80941Smrg * - noalias 4342b8e80941Smrg * - dereferenceable 4343b8e80941Smrg * - invariant.load 4344b8e80941Smrg * allows the optimization passes to move loads and reduces 4345b8e80941Smrg * SGPR spilling significantly. 4346b8e80941Smrg */ 4347b8e80941Smrg ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, 4348b8e80941Smrg AC_FUNC_ATTR_INREG); 4349b8e80941Smrg 4350b8e80941Smrg if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 4351b8e80941Smrg ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, 4352b8e80941Smrg AC_FUNC_ATTR_NOALIAS); 4353b8e80941Smrg ac_add_attr_dereferenceable(P, UINT64_MAX); 4354b8e80941Smrg } 4355b8e80941Smrg } 4356b8e80941Smrg 4357b8e80941Smrg for (i = 0; i < fninfo->num_params; ++i) { 4358b8e80941Smrg if (fninfo->assign[i]) 4359b8e80941Smrg *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i); 4360b8e80941Smrg } 4361b8e80941Smrg 4362b8e80941Smrg if (ctx->screen->info.address32_hi) { 4363b8e80941Smrg ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4364b8e80941Smrg "amdgpu-32bit-address-high-bits", 4365b8e80941Smrg ctx->screen->info.address32_hi); 4366b8e80941Smrg } 4367b8e80941Smrg 4368b8e80941Smrg if (max_workgroup_size) { 4369b8e80941Smrg ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4370b8e80941Smrg "amdgpu-max-work-group-size", 4371b8e80941Smrg max_workgroup_size); 4372b8e80941Smrg } 4373b8e80941Smrg LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4374b8e80941Smrg "no-signed-zeros-fp-math", 4375b8e80941Smrg "true"); 4376b8e80941Smrg 4377b8e80941Smrg if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) { 4378b8e80941Smrg /* These were copied from some LLVM test. */ 4379b8e80941Smrg LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4380b8e80941Smrg "less-precise-fpmad", 4381b8e80941Smrg "true"); 4382b8e80941Smrg LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4383b8e80941Smrg "no-infs-fp-math", 4384b8e80941Smrg "true"); 4385b8e80941Smrg LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4386b8e80941Smrg "no-nans-fp-math", 4387b8e80941Smrg "true"); 4388b8e80941Smrg LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4389b8e80941Smrg "unsafe-fp-math", 4390b8e80941Smrg "true"); 4391b8e80941Smrg } 4392b8e80941Smrg} 4393b8e80941Smrg 4394b8e80941Smrgstatic void declare_streamout_params(struct si_shader_context *ctx, 4395b8e80941Smrg struct pipe_stream_output_info *so, 4396b8e80941Smrg struct si_function_info *fninfo) 4397b8e80941Smrg{ 4398b8e80941Smrg int i; 4399b8e80941Smrg 4400b8e80941Smrg /* Streamout SGPRs. */ 4401b8e80941Smrg if (so->num_outputs) { 4402b8e80941Smrg if (ctx->type != PIPE_SHADER_TESS_EVAL) 4403b8e80941Smrg ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4404b8e80941Smrg else 4405b8e80941Smrg ctx->param_streamout_config = fninfo->num_params - 1; 4406b8e80941Smrg 4407b8e80941Smrg ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4408b8e80941Smrg } 4409b8e80941Smrg /* A streamout buffer offset is loaded if the stride is non-zero. */ 4410b8e80941Smrg for (i = 0; i < 4; i++) { 4411b8e80941Smrg if (!so->stride[i]) 4412b8e80941Smrg continue; 4413b8e80941Smrg 4414b8e80941Smrg ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4415b8e80941Smrg } 4416b8e80941Smrg} 4417b8e80941Smrg 4418b8e80941Smrgstatic unsigned si_get_max_workgroup_size(const struct si_shader *shader) 4419b8e80941Smrg{ 4420b8e80941Smrg switch (shader->selector->type) { 4421b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 4422b8e80941Smrg /* Return this so that LLVM doesn't remove s_barrier 4423b8e80941Smrg * instructions on chips where we use s_barrier. */ 4424b8e80941Smrg return shader->selector->screen->info.chip_class >= CIK ? 128 : 64; 4425b8e80941Smrg 4426b8e80941Smrg case PIPE_SHADER_GEOMETRY: 4427b8e80941Smrg return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64; 4428b8e80941Smrg 4429b8e80941Smrg case PIPE_SHADER_COMPUTE: 4430b8e80941Smrg break; /* see below */ 4431b8e80941Smrg 4432b8e80941Smrg default: 4433b8e80941Smrg return 0; 4434b8e80941Smrg } 4435b8e80941Smrg 4436b8e80941Smrg const unsigned *properties = shader->selector->info.properties; 4437b8e80941Smrg unsigned max_work_group_size = 4438b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * 4439b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * 4440b8e80941Smrg properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; 4441b8e80941Smrg 4442b8e80941Smrg if (!max_work_group_size) { 4443b8e80941Smrg /* This is a variable group size compute shader, 4444b8e80941Smrg * compile it for the maximum possible group size. 4445b8e80941Smrg */ 4446b8e80941Smrg max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; 4447b8e80941Smrg } 4448b8e80941Smrg return max_work_group_size; 4449b8e80941Smrg} 4450b8e80941Smrg 4451b8e80941Smrgstatic void declare_const_and_shader_buffers(struct si_shader_context *ctx, 4452b8e80941Smrg struct si_function_info *fninfo, 4453b8e80941Smrg bool assign_params) 4454b8e80941Smrg{ 4455b8e80941Smrg LLVMTypeRef const_shader_buf_type; 4456b8e80941Smrg 4457b8e80941Smrg if (ctx->shader->selector->info.const_buffers_declared == 1 && 4458b8e80941Smrg ctx->shader->selector->info.shader_buffers_declared == 0) 4459b8e80941Smrg const_shader_buf_type = ctx->f32; 4460b8e80941Smrg else 4461b8e80941Smrg const_shader_buf_type = ctx->v4i32; 4462b8e80941Smrg 4463b8e80941Smrg unsigned const_and_shader_buffers = 4464b8e80941Smrg add_arg(fninfo, ARG_SGPR, 4465b8e80941Smrg ac_array_in_const32_addr_space(const_shader_buf_type)); 4466b8e80941Smrg 4467b8e80941Smrg if (assign_params) 4468b8e80941Smrg ctx->param_const_and_shader_buffers = const_and_shader_buffers; 4469b8e80941Smrg} 4470b8e80941Smrg 4471b8e80941Smrgstatic void declare_samplers_and_images(struct si_shader_context *ctx, 4472b8e80941Smrg struct si_function_info *fninfo, 4473b8e80941Smrg bool assign_params) 4474b8e80941Smrg{ 4475b8e80941Smrg unsigned samplers_and_images = 4476b8e80941Smrg add_arg(fninfo, ARG_SGPR, 4477b8e80941Smrg ac_array_in_const32_addr_space(ctx->v8i32)); 4478b8e80941Smrg 4479b8e80941Smrg if (assign_params) 4480b8e80941Smrg ctx->param_samplers_and_images = samplers_and_images; 4481b8e80941Smrg} 4482b8e80941Smrg 4483b8e80941Smrgstatic void declare_per_stage_desc_pointers(struct si_shader_context *ctx, 4484b8e80941Smrg struct si_function_info *fninfo, 4485b8e80941Smrg bool assign_params) 4486b8e80941Smrg{ 4487b8e80941Smrg declare_const_and_shader_buffers(ctx, fninfo, assign_params); 4488b8e80941Smrg declare_samplers_and_images(ctx, fninfo, assign_params); 4489b8e80941Smrg} 4490b8e80941Smrg 4491b8e80941Smrgstatic void declare_global_desc_pointers(struct si_shader_context *ctx, 4492b8e80941Smrg struct si_function_info *fninfo) 4493b8e80941Smrg{ 4494b8e80941Smrg ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, 4495b8e80941Smrg ac_array_in_const32_addr_space(ctx->v4i32)); 4496b8e80941Smrg ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR, 4497b8e80941Smrg ac_array_in_const32_addr_space(ctx->v8i32)); 4498b8e80941Smrg} 4499b8e80941Smrg 4500b8e80941Smrgstatic void declare_vs_specific_input_sgprs(struct si_shader_context *ctx, 4501b8e80941Smrg struct si_function_info *fninfo) 4502b8e80941Smrg{ 4503b8e80941Smrg ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); 4504b8e80941Smrg add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); 4505b8e80941Smrg add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance); 4506b8e80941Smrg add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id); 4507b8e80941Smrg} 4508b8e80941Smrg 4509b8e80941Smrgstatic void declare_vs_input_vgprs(struct si_shader_context *ctx, 4510b8e80941Smrg struct si_function_info *fninfo, 4511b8e80941Smrg unsigned *num_prolog_vgprs) 4512b8e80941Smrg{ 4513b8e80941Smrg struct si_shader *shader = ctx->shader; 4514b8e80941Smrg 4515b8e80941Smrg add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); 4516b8e80941Smrg if (shader->key.as_ls) { 4517b8e80941Smrg ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4518b8e80941Smrg add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4519b8e80941Smrg } else { 4520b8e80941Smrg add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4521b8e80941Smrg ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4522b8e80941Smrg } 4523b8e80941Smrg add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ 4524b8e80941Smrg 4525b8e80941Smrg if (!shader->is_gs_copy_shader) { 4526b8e80941Smrg /* Vertex load indices. */ 4527b8e80941Smrg ctx->param_vertex_index0 = fninfo->num_params; 4528b8e80941Smrg for (unsigned i = 0; i < shader->selector->info.num_inputs; i++) 4529b8e80941Smrg add_arg(fninfo, ARG_VGPR, ctx->i32); 4530b8e80941Smrg *num_prolog_vgprs += shader->selector->info.num_inputs; 4531b8e80941Smrg } 4532b8e80941Smrg} 4533b8e80941Smrg 4534b8e80941Smrgstatic void declare_vs_blit_inputs(struct si_shader_context *ctx, 4535b8e80941Smrg struct si_function_info *fninfo, 4536b8e80941Smrg unsigned vs_blit_property) 4537b8e80941Smrg{ 4538b8e80941Smrg ctx->param_vs_blit_inputs = fninfo->num_params; 4539b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ 4540b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ 4541b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */ 4542b8e80941Smrg 4543b8e80941Smrg if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 4544b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */ 4545b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */ 4546b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */ 4547b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */ 4548b8e80941Smrg } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { 4549b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ 4550b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ 4551b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ 4552b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ 4553b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ 4554b8e80941Smrg add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ 4555b8e80941Smrg } 4556b8e80941Smrg} 4557b8e80941Smrg 4558b8e80941Smrgstatic void declare_tes_input_vgprs(struct si_shader_context *ctx, 4559b8e80941Smrg struct si_function_info *fninfo) 4560b8e80941Smrg{ 4561b8e80941Smrg ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); 4562b8e80941Smrg ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32); 4563b8e80941Smrg ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4564b8e80941Smrg add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id); 4565b8e80941Smrg} 4566b8e80941Smrg 4567b8e80941Smrgenum { 4568b8e80941Smrg /* Convenient merged shader definitions. */ 4569b8e80941Smrg SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, 4570b8e80941Smrg SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, 4571b8e80941Smrg}; 4572b8e80941Smrg 4573b8e80941Smrgstatic void create_function(struct si_shader_context *ctx) 4574b8e80941Smrg{ 4575b8e80941Smrg struct si_shader *shader = ctx->shader; 4576b8e80941Smrg struct si_function_info fninfo; 4577b8e80941Smrg LLVMTypeRef returns[16+32*4]; 4578b8e80941Smrg unsigned i, num_return_sgprs; 4579b8e80941Smrg unsigned num_returns = 0; 4580b8e80941Smrg unsigned num_prolog_vgprs = 0; 4581b8e80941Smrg unsigned type = ctx->type; 4582b8e80941Smrg unsigned vs_blit_property = 4583b8e80941Smrg shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; 4584b8e80941Smrg 4585b8e80941Smrg si_init_function_info(&fninfo); 4586b8e80941Smrg 4587b8e80941Smrg /* Set MERGED shaders. */ 4588b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 4589b8e80941Smrg if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) 4590b8e80941Smrg type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ 4591b8e80941Smrg else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY) 4592b8e80941Smrg type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; 4593b8e80941Smrg } 4594b8e80941Smrg 4595b8e80941Smrg LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3); 4596b8e80941Smrg 4597b8e80941Smrg switch (type) { 4598b8e80941Smrg case PIPE_SHADER_VERTEX: 4599b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4600b8e80941Smrg 4601b8e80941Smrg if (vs_blit_property) { 4602b8e80941Smrg declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property); 4603b8e80941Smrg 4604b8e80941Smrg /* VGPRs */ 4605b8e80941Smrg declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4606b8e80941Smrg break; 4607b8e80941Smrg } 4608b8e80941Smrg 4609b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4610b8e80941Smrg declare_vs_specific_input_sgprs(ctx, &fninfo); 4611b8e80941Smrg ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4612b8e80941Smrg ac_array_in_const32_addr_space(ctx->v4i32)); 4613b8e80941Smrg 4614b8e80941Smrg if (shader->key.as_es) { 4615b8e80941Smrg ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4616b8e80941Smrg } else if (shader->key.as_ls) { 4617b8e80941Smrg /* no extra parameters */ 4618b8e80941Smrg } else { 4619b8e80941Smrg if (shader->is_gs_copy_shader) { 4620b8e80941Smrg fninfo.num_params = ctx->param_vs_state_bits + 1; 4621b8e80941Smrg fninfo.num_sgpr_params = fninfo.num_params; 4622b8e80941Smrg } 4623b8e80941Smrg 4624b8e80941Smrg /* The locations of the other parameters are assigned dynamically. */ 4625b8e80941Smrg declare_streamout_params(ctx, &shader->selector->so, 4626b8e80941Smrg &fninfo); 4627b8e80941Smrg } 4628b8e80941Smrg 4629b8e80941Smrg /* VGPRs */ 4630b8e80941Smrg declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4631b8e80941Smrg break; 4632b8e80941Smrg 4633b8e80941Smrg case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */ 4634b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4635b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4636b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4637b8e80941Smrg ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4638b8e80941Smrg ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4639b8e80941Smrg ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4640b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4641b8e80941Smrg ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4642b8e80941Smrg 4643b8e80941Smrg /* VGPRs */ 4644b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4645b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4646b8e80941Smrg 4647b8e80941Smrg /* param_tcs_offchip_offset and param_tcs_factor_offset are 4648b8e80941Smrg * placed after the user SGPRs. 4649b8e80941Smrg */ 4650b8e80941Smrg for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) 4651b8e80941Smrg returns[num_returns++] = ctx->i32; /* SGPRs */ 4652b8e80941Smrg for (i = 0; i < 11; i++) 4653b8e80941Smrg returns[num_returns++] = ctx->f32; /* VGPRs */ 4654b8e80941Smrg break; 4655b8e80941Smrg 4656b8e80941Smrg case SI_SHADER_MERGED_VERTEX_TESSCTRL: 4657b8e80941Smrg /* Merged stages have 8 system SGPRs at the beginning. */ 4658b8e80941Smrg /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ 4659b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, 4660b8e80941Smrg ctx->type == PIPE_SHADER_TESS_CTRL); 4661b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4662b8e80941Smrg ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4663b8e80941Smrg ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4664b8e80941Smrg ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4665b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4666b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4667b8e80941Smrg 4668b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4669b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, 4670b8e80941Smrg ctx->type == PIPE_SHADER_VERTEX); 4671b8e80941Smrg declare_vs_specific_input_sgprs(ctx, &fninfo); 4672b8e80941Smrg 4673b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4674b8e80941Smrg ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4675b8e80941Smrg ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4676b8e80941Smrg ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4677b8e80941Smrg ac_array_in_const32_addr_space(ctx->v4i32)); 4678b8e80941Smrg 4679b8e80941Smrg /* VGPRs (first TCS, then VS) */ 4680b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4681b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4682b8e80941Smrg 4683b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) { 4684b8e80941Smrg declare_vs_input_vgprs(ctx, &fninfo, 4685b8e80941Smrg &num_prolog_vgprs); 4686b8e80941Smrg 4687b8e80941Smrg /* LS return values are inputs to the TCS main shader part. */ 4688b8e80941Smrg for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) 4689b8e80941Smrg returns[num_returns++] = ctx->i32; /* SGPRs */ 4690b8e80941Smrg for (i = 0; i < 2; i++) 4691b8e80941Smrg returns[num_returns++] = ctx->f32; /* VGPRs */ 4692b8e80941Smrg } else { 4693b8e80941Smrg /* TCS return values are inputs to the TCS epilog. 4694b8e80941Smrg * 4695b8e80941Smrg * param_tcs_offchip_offset, param_tcs_factor_offset, 4696b8e80941Smrg * param_tcs_offchip_layout, and param_rw_buffers 4697b8e80941Smrg * should be passed to the epilog. 4698b8e80941Smrg */ 4699b8e80941Smrg for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) 4700b8e80941Smrg returns[num_returns++] = ctx->i32; /* SGPRs */ 4701b8e80941Smrg for (i = 0; i < 11; i++) 4702b8e80941Smrg returns[num_returns++] = ctx->f32; /* VGPRs */ 4703b8e80941Smrg } 4704b8e80941Smrg break; 4705b8e80941Smrg 4706b8e80941Smrg case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: 4707b8e80941Smrg /* Merged stages have 8 system SGPRs at the beginning. */ 4708b8e80941Smrg /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ 4709b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, 4710b8e80941Smrg ctx->type == PIPE_SHADER_GEOMETRY); 4711b8e80941Smrg ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4712b8e80941Smrg ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4713b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4714b8e80941Smrg ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4715b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ 4716b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ 4717b8e80941Smrg 4718b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4719b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, 4720b8e80941Smrg (ctx->type == PIPE_SHADER_VERTEX || 4721b8e80941Smrg ctx->type == PIPE_SHADER_TESS_EVAL)); 4722b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) { 4723b8e80941Smrg declare_vs_specific_input_sgprs(ctx, &fninfo); 4724b8e80941Smrg } else { 4725b8e80941Smrg ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4726b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4727b8e80941Smrg ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4728b8e80941Smrg /* Declare as many input SGPRs as the VS has. */ 4729b8e80941Smrg } 4730b8e80941Smrg 4731b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) { 4732b8e80941Smrg ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, 4733b8e80941Smrg ac_array_in_const32_addr_space(ctx->v4i32)); 4734b8e80941Smrg } 4735b8e80941Smrg 4736b8e80941Smrg /* VGPRs (first GS, then VS/TES) */ 4737b8e80941Smrg ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4738b8e80941Smrg ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4739b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4740b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4741b8e80941Smrg ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4742b8e80941Smrg 4743b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) { 4744b8e80941Smrg declare_vs_input_vgprs(ctx, &fninfo, 4745b8e80941Smrg &num_prolog_vgprs); 4746b8e80941Smrg } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { 4747b8e80941Smrg declare_tes_input_vgprs(ctx, &fninfo); 4748b8e80941Smrg } 4749b8e80941Smrg 4750b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX || 4751b8e80941Smrg ctx->type == PIPE_SHADER_TESS_EVAL) { 4752b8e80941Smrg unsigned num_user_sgprs; 4753b8e80941Smrg 4754b8e80941Smrg if (ctx->type == PIPE_SHADER_VERTEX) 4755b8e80941Smrg num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR; 4756b8e80941Smrg else 4757b8e80941Smrg num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; 4758b8e80941Smrg 4759b8e80941Smrg /* ES return values are inputs to GS. */ 4760b8e80941Smrg for (i = 0; i < 8 + num_user_sgprs; i++) 4761b8e80941Smrg returns[num_returns++] = ctx->i32; /* SGPRs */ 4762b8e80941Smrg for (i = 0; i < 5; i++) 4763b8e80941Smrg returns[num_returns++] = ctx->f32; /* VGPRs */ 4764b8e80941Smrg } 4765b8e80941Smrg break; 4766b8e80941Smrg 4767b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 4768b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4769b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4770b8e80941Smrg ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4771b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4772b8e80941Smrg ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4773b8e80941Smrg 4774b8e80941Smrg if (shader->key.as_es) { 4775b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4776b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 4777b8e80941Smrg ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4778b8e80941Smrg } else { 4779b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 4780b8e80941Smrg declare_streamout_params(ctx, &shader->selector->so, 4781b8e80941Smrg &fninfo); 4782b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4783b8e80941Smrg } 4784b8e80941Smrg 4785b8e80941Smrg /* VGPRs */ 4786b8e80941Smrg declare_tes_input_vgprs(ctx, &fninfo); 4787b8e80941Smrg break; 4788b8e80941Smrg 4789b8e80941Smrg case PIPE_SHADER_GEOMETRY: 4790b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4791b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4792b8e80941Smrg ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4793b8e80941Smrg ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4794b8e80941Smrg 4795b8e80941Smrg /* VGPRs */ 4796b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]); 4797b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]); 4798b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4799b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]); 4800b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]); 4801b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]); 4802b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]); 4803b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4804b8e80941Smrg break; 4805b8e80941Smrg 4806b8e80941Smrg case PIPE_SHADER_FRAGMENT: 4807b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4808b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4809b8e80941Smrg add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 4810b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32, 4811b8e80941Smrg &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK); 4812b8e80941Smrg 4813b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE); 4814b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER); 4815b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID); 4816b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL); 4817b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE); 4818b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER); 4819b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID); 4820b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX); 4821b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4822b8e80941Smrg &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT); 4823b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4824b8e80941Smrg &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT); 4825b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4826b8e80941Smrg &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT); 4827b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4828b8e80941Smrg &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT); 4829b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4830b8e80941Smrg &ctx->abi.front_face, SI_PARAM_FRONT_FACE); 4831b8e80941Smrg shader->info.face_vgpr_index = 20; 4832b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4833b8e80941Smrg &ctx->abi.ancillary, SI_PARAM_ANCILLARY); 4834b8e80941Smrg shader->info.ancillary_vgpr_index = 21; 4835b8e80941Smrg add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4836b8e80941Smrg &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); 4837b8e80941Smrg add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT); 4838b8e80941Smrg 4839b8e80941Smrg /* Color inputs from the prolog. */ 4840b8e80941Smrg if (shader->selector->info.colors_read) { 4841b8e80941Smrg unsigned num_color_elements = 4842b8e80941Smrg util_bitcount(shader->selector->info.colors_read); 4843b8e80941Smrg 4844b8e80941Smrg assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types)); 4845b8e80941Smrg for (i = 0; i < num_color_elements; i++) 4846b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->f32); 4847b8e80941Smrg 4848b8e80941Smrg num_prolog_vgprs += num_color_elements; 4849b8e80941Smrg } 4850b8e80941Smrg 4851b8e80941Smrg /* Outputs for the epilog. */ 4852b8e80941Smrg num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 4853b8e80941Smrg num_returns = 4854b8e80941Smrg num_return_sgprs + 4855b8e80941Smrg util_bitcount(shader->selector->info.colors_written) * 4 + 4856b8e80941Smrg shader->selector->info.writes_z + 4857b8e80941Smrg shader->selector->info.writes_stencil + 4858b8e80941Smrg shader->selector->info.writes_samplemask + 4859b8e80941Smrg 1 /* SampleMaskIn */; 4860b8e80941Smrg 4861b8e80941Smrg num_returns = MAX2(num_returns, 4862b8e80941Smrg num_return_sgprs + 4863b8e80941Smrg PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 4864b8e80941Smrg 4865b8e80941Smrg for (i = 0; i < num_return_sgprs; i++) 4866b8e80941Smrg returns[i] = ctx->i32; 4867b8e80941Smrg for (; i < num_returns; i++) 4868b8e80941Smrg returns[i] = ctx->f32; 4869b8e80941Smrg break; 4870b8e80941Smrg 4871b8e80941Smrg case PIPE_SHADER_COMPUTE: 4872b8e80941Smrg declare_global_desc_pointers(ctx, &fninfo); 4873b8e80941Smrg declare_per_stage_desc_pointers(ctx, &fninfo, true); 4874b8e80941Smrg if (shader->selector->info.uses_grid_size) 4875b8e80941Smrg add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups); 4876b8e80941Smrg if (shader->selector->info.uses_block_size && 4877b8e80941Smrg shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) 4878b8e80941Smrg ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); 4879b8e80941Smrg 4880b8e80941Smrg unsigned cs_user_data_dwords = 4881b8e80941Smrg shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS]; 4882b8e80941Smrg if (cs_user_data_dwords) { 4883b8e80941Smrg ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR, 4884b8e80941Smrg LLVMVectorType(ctx->i32, cs_user_data_dwords)); 4885b8e80941Smrg } 4886b8e80941Smrg 4887b8e80941Smrg for (i = 0; i < 3; i++) { 4888b8e80941Smrg ctx->abi.workgroup_ids[i] = NULL; 4889b8e80941Smrg if (shader->selector->info.uses_block_id[i]) 4890b8e80941Smrg add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]); 4891b8e80941Smrg } 4892b8e80941Smrg 4893b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids); 4894b8e80941Smrg break; 4895b8e80941Smrg default: 4896b8e80941Smrg assert(0 && "unimplemented shader"); 4897b8e80941Smrg return; 4898b8e80941Smrg } 4899b8e80941Smrg 4900b8e80941Smrg si_create_function(ctx, "main", returns, num_returns, &fninfo, 4901b8e80941Smrg si_get_max_workgroup_size(shader)); 4902b8e80941Smrg 4903b8e80941Smrg /* Reserve register locations for VGPR inputs the PS prolog may need. */ 4904b8e80941Smrg if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { 4905b8e80941Smrg ac_llvm_add_target_dep_function_attr(ctx->main_fn, 4906b8e80941Smrg "InitialPSInputAddr", 4907b8e80941Smrg S_0286D0_PERSP_SAMPLE_ENA(1) | 4908b8e80941Smrg S_0286D0_PERSP_CENTER_ENA(1) | 4909b8e80941Smrg S_0286D0_PERSP_CENTROID_ENA(1) | 4910b8e80941Smrg S_0286D0_LINEAR_SAMPLE_ENA(1) | 4911b8e80941Smrg S_0286D0_LINEAR_CENTER_ENA(1) | 4912b8e80941Smrg S_0286D0_LINEAR_CENTROID_ENA(1) | 4913b8e80941Smrg S_0286D0_FRONT_FACE_ENA(1) | 4914b8e80941Smrg S_0286D0_ANCILLARY_ENA(1) | 4915b8e80941Smrg S_0286D0_POS_FIXED_PT_ENA(1)); 4916b8e80941Smrg } 4917b8e80941Smrg 4918b8e80941Smrg shader->info.num_input_sgprs = 0; 4919b8e80941Smrg shader->info.num_input_vgprs = 0; 4920b8e80941Smrg 4921b8e80941Smrg for (i = 0; i < fninfo.num_sgpr_params; ++i) 4922b8e80941Smrg shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; 4923b8e80941Smrg 4924b8e80941Smrg for (; i < fninfo.num_params; ++i) 4925b8e80941Smrg shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; 4926b8e80941Smrg 4927b8e80941Smrg assert(shader->info.num_input_vgprs >= num_prolog_vgprs); 4928b8e80941Smrg shader->info.num_input_vgprs -= num_prolog_vgprs; 4929b8e80941Smrg 4930b8e80941Smrg if (shader->key.as_ls || 4931b8e80941Smrg ctx->type == PIPE_SHADER_TESS_CTRL || 4932b8e80941Smrg /* GFX9 has the ESGS ring buffer in LDS. */ 4933b8e80941Smrg type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY) 4934b8e80941Smrg ac_declare_lds_as_pointer(&ctx->ac); 4935b8e80941Smrg} 4936b8e80941Smrg 4937b8e80941Smrg/** 4938b8e80941Smrg * Load ESGS and GSVS ring buffer resource descriptors and save the variables 4939b8e80941Smrg * for later use. 4940b8e80941Smrg */ 4941b8e80941Smrgstatic void preload_ring_buffers(struct si_shader_context *ctx) 4942b8e80941Smrg{ 4943b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 4944b8e80941Smrg 4945b8e80941Smrg LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 4946b8e80941Smrg ctx->param_rw_buffers); 4947b8e80941Smrg 4948b8e80941Smrg if (ctx->screen->info.chip_class <= VI && 4949b8e80941Smrg (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) { 4950b8e80941Smrg unsigned ring = 4951b8e80941Smrg ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS 4952b8e80941Smrg : SI_ES_RING_ESGS; 4953b8e80941Smrg LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); 4954b8e80941Smrg 4955b8e80941Smrg ctx->esgs_ring = 4956b8e80941Smrg ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4957b8e80941Smrg } 4958b8e80941Smrg 4959b8e80941Smrg if (ctx->shader->is_gs_copy_shader) { 4960b8e80941Smrg LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 4961b8e80941Smrg 4962b8e80941Smrg ctx->gsvs_ring[0] = 4963b8e80941Smrg ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4964b8e80941Smrg } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 4965b8e80941Smrg const struct si_shader_selector *sel = ctx->shader->selector; 4966b8e80941Smrg LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 4967b8e80941Smrg LLVMValueRef base_ring; 4968b8e80941Smrg 4969b8e80941Smrg base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 4970b8e80941Smrg 4971b8e80941Smrg /* The conceptual layout of the GSVS ring is 4972b8e80941Smrg * v0c0 .. vLv0 v0c1 .. vLc1 .. 4973b8e80941Smrg * but the real memory layout is swizzled across 4974b8e80941Smrg * threads: 4975b8e80941Smrg * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL 4976b8e80941Smrg * t16v0c0 .. 4977b8e80941Smrg * Override the buffer descriptor accordingly. 4978b8e80941Smrg */ 4979b8e80941Smrg LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); 4980b8e80941Smrg uint64_t stream_offset = 0; 4981b8e80941Smrg 4982b8e80941Smrg for (unsigned stream = 0; stream < 4; ++stream) { 4983b8e80941Smrg unsigned num_components; 4984b8e80941Smrg unsigned stride; 4985b8e80941Smrg unsigned num_records; 4986b8e80941Smrg LLVMValueRef ring, tmp; 4987b8e80941Smrg 4988b8e80941Smrg num_components = sel->info.num_stream_output_components[stream]; 4989b8e80941Smrg if (!num_components) 4990b8e80941Smrg continue; 4991b8e80941Smrg 4992b8e80941Smrg stride = 4 * num_components * sel->gs_max_out_vertices; 4993b8e80941Smrg 4994b8e80941Smrg /* Limit on the stride field for <= CIK. */ 4995b8e80941Smrg assert(stride < (1 << 14)); 4996b8e80941Smrg 4997b8e80941Smrg num_records = 64; 4998b8e80941Smrg 4999b8e80941Smrg ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); 5000b8e80941Smrg tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); 5001b8e80941Smrg tmp = LLVMBuildAdd(builder, tmp, 5002b8e80941Smrg LLVMConstInt(ctx->i64, 5003b8e80941Smrg stream_offset, 0), ""); 5004b8e80941Smrg stream_offset += stride * 64; 5005b8e80941Smrg 5006b8e80941Smrg ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); 5007b8e80941Smrg ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); 5008b8e80941Smrg tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); 5009b8e80941Smrg tmp = LLVMBuildOr(builder, tmp, 5010b8e80941Smrg LLVMConstInt(ctx->i32, 5011b8e80941Smrg S_008F04_STRIDE(stride) | 5012b8e80941Smrg S_008F04_SWIZZLE_ENABLE(1), 0), ""); 5013b8e80941Smrg ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); 5014b8e80941Smrg ring = LLVMBuildInsertElement(builder, ring, 5015b8e80941Smrg LLVMConstInt(ctx->i32, num_records, 0), 5016b8e80941Smrg LLVMConstInt(ctx->i32, 2, 0), ""); 5017b8e80941Smrg ring = LLVMBuildInsertElement(builder, ring, 5018b8e80941Smrg LLVMConstInt(ctx->i32, 5019b8e80941Smrg S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 5020b8e80941Smrg S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5021b8e80941Smrg S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 5022b8e80941Smrg S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 5023b8e80941Smrg S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5024b8e80941Smrg S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | 5025b8e80941Smrg S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ 5026b8e80941Smrg S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ 5027b8e80941Smrg S_008F0C_ADD_TID_ENABLE(1), 5028b8e80941Smrg 0), 5029b8e80941Smrg LLVMConstInt(ctx->i32, 3, 0), ""); 5030b8e80941Smrg 5031b8e80941Smrg ctx->gsvs_ring[stream] = ring; 5032b8e80941Smrg } 5033b8e80941Smrg } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { 5034b8e80941Smrg ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); 5035b8e80941Smrg } 5036b8e80941Smrg} 5037b8e80941Smrg 5038b8e80941Smrgstatic void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, 5039b8e80941Smrg LLVMValueRef param_rw_buffers, 5040b8e80941Smrg unsigned param_pos_fixed_pt) 5041b8e80941Smrg{ 5042b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 5043b8e80941Smrg LLVMValueRef slot, desc, offset, row, bit, address[2]; 5044b8e80941Smrg 5045b8e80941Smrg /* Use the fixed-point gl_FragCoord input. 5046b8e80941Smrg * Since the stipple pattern is 32x32 and it repeats, just get 5 bits 5047b8e80941Smrg * per coordinate to get the repeating effect. 5048b8e80941Smrg */ 5049b8e80941Smrg address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); 5050b8e80941Smrg address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); 5051b8e80941Smrg 5052b8e80941Smrg /* Load the buffer descriptor. */ 5053b8e80941Smrg slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0); 5054b8e80941Smrg desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); 5055b8e80941Smrg 5056b8e80941Smrg /* The stipple pattern is 32x32, each row has 32 bits. */ 5057b8e80941Smrg offset = LLVMBuildMul(builder, address[1], 5058b8e80941Smrg LLVMConstInt(ctx->i32, 4, 0), ""); 5059b8e80941Smrg row = buffer_load_const(ctx, desc, offset); 5060b8e80941Smrg row = ac_to_integer(&ctx->ac, row); 5061b8e80941Smrg bit = LLVMBuildLShr(builder, row, address[0], ""); 5062b8e80941Smrg bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); 5063b8e80941Smrg ac_build_kill_if_false(&ctx->ac, bit); 5064b8e80941Smrg} 5065b8e80941Smrg 5066b8e80941Smrgvoid si_shader_binary_read_config(struct ac_shader_binary *binary, 5067b8e80941Smrg struct si_shader_config *conf, 5068b8e80941Smrg unsigned symbol_offset) 5069b8e80941Smrg{ 5070b8e80941Smrg unsigned i; 5071b8e80941Smrg const unsigned char *config = 5072b8e80941Smrg ac_shader_binary_config_start(binary, symbol_offset); 5073b8e80941Smrg bool really_needs_scratch = false; 5074b8e80941Smrg 5075b8e80941Smrg /* LLVM adds SGPR spills to the scratch size. 5076b8e80941Smrg * Find out if we really need the scratch buffer. 5077b8e80941Smrg */ 5078b8e80941Smrg for (i = 0; i < binary->reloc_count; i++) { 5079b8e80941Smrg const struct ac_shader_reloc *reloc = &binary->relocs[i]; 5080b8e80941Smrg 5081b8e80941Smrg if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || 5082b8e80941Smrg !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5083b8e80941Smrg really_needs_scratch = true; 5084b8e80941Smrg break; 5085b8e80941Smrg } 5086b8e80941Smrg } 5087b8e80941Smrg 5088b8e80941Smrg /* XXX: We may be able to emit some of these values directly rather than 5089b8e80941Smrg * extracting fields to be emitted later. 5090b8e80941Smrg */ 5091b8e80941Smrg 5092b8e80941Smrg for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 5093b8e80941Smrg unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); 5094b8e80941Smrg unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 5095b8e80941Smrg switch (reg) { 5096b8e80941Smrg case R_00B028_SPI_SHADER_PGM_RSRC1_PS: 5097b8e80941Smrg case R_00B128_SPI_SHADER_PGM_RSRC1_VS: 5098b8e80941Smrg case R_00B228_SPI_SHADER_PGM_RSRC1_GS: 5099b8e80941Smrg case R_00B428_SPI_SHADER_PGM_RSRC1_HS: 5100b8e80941Smrg case R_00B848_COMPUTE_PGM_RSRC1: 5101b8e80941Smrg conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); 5102b8e80941Smrg conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); 5103b8e80941Smrg conf->float_mode = G_00B028_FLOAT_MODE(value); 5104b8e80941Smrg conf->rsrc1 = value; 5105b8e80941Smrg break; 5106b8e80941Smrg case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: 5107b8e80941Smrg conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); 5108b8e80941Smrg break; 5109b8e80941Smrg case R_00B84C_COMPUTE_PGM_RSRC2: 5110b8e80941Smrg conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); 5111b8e80941Smrg conf->rsrc2 = value; 5112b8e80941Smrg break; 5113b8e80941Smrg case R_0286CC_SPI_PS_INPUT_ENA: 5114b8e80941Smrg conf->spi_ps_input_ena = value; 5115b8e80941Smrg break; 5116b8e80941Smrg case R_0286D0_SPI_PS_INPUT_ADDR: 5117b8e80941Smrg conf->spi_ps_input_addr = value; 5118b8e80941Smrg break; 5119b8e80941Smrg case R_0286E8_SPI_TMPRING_SIZE: 5120b8e80941Smrg case R_00B860_COMPUTE_TMPRING_SIZE: 5121b8e80941Smrg /* WAVESIZE is in units of 256 dwords. */ 5122b8e80941Smrg if (really_needs_scratch) 5123b8e80941Smrg conf->scratch_bytes_per_wave = 5124b8e80941Smrg G_00B860_WAVESIZE(value) * 256 * 4; 5125b8e80941Smrg break; 5126b8e80941Smrg case 0x4: /* SPILLED_SGPRS */ 5127b8e80941Smrg conf->spilled_sgprs = value; 5128b8e80941Smrg break; 5129b8e80941Smrg case 0x8: /* SPILLED_VGPRS */ 5130b8e80941Smrg conf->spilled_vgprs = value; 5131b8e80941Smrg break; 5132b8e80941Smrg default: 5133b8e80941Smrg { 5134b8e80941Smrg static bool printed; 5135b8e80941Smrg 5136b8e80941Smrg if (!printed) { 5137b8e80941Smrg fprintf(stderr, "Warning: LLVM emitted unknown " 5138b8e80941Smrg "config register: 0x%x\n", reg); 5139b8e80941Smrg printed = true; 5140b8e80941Smrg } 5141b8e80941Smrg } 5142b8e80941Smrg break; 5143b8e80941Smrg } 5144b8e80941Smrg } 5145b8e80941Smrg 5146b8e80941Smrg if (!conf->spi_ps_input_addr) 5147b8e80941Smrg conf->spi_ps_input_addr = conf->spi_ps_input_ena; 5148b8e80941Smrg} 5149b8e80941Smrg 5150b8e80941Smrgvoid si_shader_apply_scratch_relocs(struct si_shader *shader, 5151b8e80941Smrg uint64_t scratch_va) 5152b8e80941Smrg{ 5153b8e80941Smrg unsigned i; 5154b8e80941Smrg uint32_t scratch_rsrc_dword0 = scratch_va; 5155b8e80941Smrg uint32_t scratch_rsrc_dword1 = 5156b8e80941Smrg S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); 5157b8e80941Smrg 5158b8e80941Smrg /* Enable scratch coalescing. */ 5159b8e80941Smrg scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); 5160b8e80941Smrg 5161b8e80941Smrg for (i = 0 ; i < shader->binary.reloc_count; i++) { 5162b8e80941Smrg const struct ac_shader_reloc *reloc = 5163b8e80941Smrg &shader->binary.relocs[i]; 5164b8e80941Smrg if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { 5165b8e80941Smrg util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5166b8e80941Smrg &scratch_rsrc_dword0, 4); 5167b8e80941Smrg } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5168b8e80941Smrg util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5169b8e80941Smrg &scratch_rsrc_dword1, 4); 5170b8e80941Smrg } 5171b8e80941Smrg } 5172b8e80941Smrg} 5173b8e80941Smrg 5174b8e80941Smrg/* For the UMR disassembler. */ 5175b8e80941Smrg#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ 5176b8e80941Smrg#define DEBUGGER_NUM_MARKERS 5 5177b8e80941Smrg 5178b8e80941Smrgstatic unsigned si_get_shader_binary_size(const struct si_shader *shader) 5179b8e80941Smrg{ 5180b8e80941Smrg unsigned size = shader->binary.code_size; 5181b8e80941Smrg 5182b8e80941Smrg if (shader->prolog) 5183b8e80941Smrg size += shader->prolog->binary.code_size; 5184b8e80941Smrg if (shader->previous_stage) 5185b8e80941Smrg size += shader->previous_stage->binary.code_size; 5186b8e80941Smrg if (shader->prolog2) 5187b8e80941Smrg size += shader->prolog2->binary.code_size; 5188b8e80941Smrg if (shader->epilog) 5189b8e80941Smrg size += shader->epilog->binary.code_size; 5190b8e80941Smrg return size + DEBUGGER_NUM_MARKERS * 4; 5191b8e80941Smrg} 5192b8e80941Smrg 5193b8e80941Smrgint si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) 5194b8e80941Smrg{ 5195b8e80941Smrg const struct ac_shader_binary *prolog = 5196b8e80941Smrg shader->prolog ? &shader->prolog->binary : NULL; 5197b8e80941Smrg const struct ac_shader_binary *previous_stage = 5198b8e80941Smrg shader->previous_stage ? &shader->previous_stage->binary : NULL; 5199b8e80941Smrg const struct ac_shader_binary *prolog2 = 5200b8e80941Smrg shader->prolog2 ? &shader->prolog2->binary : NULL; 5201b8e80941Smrg const struct ac_shader_binary *epilog = 5202b8e80941Smrg shader->epilog ? &shader->epilog->binary : NULL; 5203b8e80941Smrg const struct ac_shader_binary *mainb = &shader->binary; 5204b8e80941Smrg unsigned bo_size = si_get_shader_binary_size(shader) + 5205b8e80941Smrg (!epilog ? mainb->rodata_size : 0); 5206b8e80941Smrg unsigned char *ptr; 5207b8e80941Smrg 5208b8e80941Smrg assert(!prolog || !prolog->rodata_size); 5209b8e80941Smrg assert(!previous_stage || !previous_stage->rodata_size); 5210b8e80941Smrg assert(!prolog2 || !prolog2->rodata_size); 5211b8e80941Smrg assert((!prolog && !previous_stage && !prolog2 && !epilog) || 5212b8e80941Smrg !mainb->rodata_size); 5213b8e80941Smrg assert(!epilog || !epilog->rodata_size); 5214b8e80941Smrg 5215b8e80941Smrg si_resource_reference(&shader->bo, NULL); 5216b8e80941Smrg shader->bo = si_aligned_buffer_create(&sscreen->b, 5217b8e80941Smrg sscreen->cpdma_prefetch_writes_memory ? 5218b8e80941Smrg 0 : SI_RESOURCE_FLAG_READ_ONLY, 5219b8e80941Smrg PIPE_USAGE_IMMUTABLE, 5220b8e80941Smrg align(bo_size, SI_CPDMA_ALIGNMENT), 5221b8e80941Smrg 256); 5222b8e80941Smrg if (!shader->bo) 5223b8e80941Smrg return -ENOMEM; 5224b8e80941Smrg 5225b8e80941Smrg /* Upload. */ 5226b8e80941Smrg ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, 5227b8e80941Smrg PIPE_TRANSFER_READ_WRITE | 5228b8e80941Smrg PIPE_TRANSFER_UNSYNCHRONIZED | 5229b8e80941Smrg RADEON_TRANSFER_TEMPORARY); 5230b8e80941Smrg 5231b8e80941Smrg /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are 5232b8e80941Smrg * endian-independent. */ 5233b8e80941Smrg if (prolog) { 5234b8e80941Smrg memcpy(ptr, prolog->code, prolog->code_size); 5235b8e80941Smrg ptr += prolog->code_size; 5236b8e80941Smrg } 5237b8e80941Smrg if (previous_stage) { 5238b8e80941Smrg memcpy(ptr, previous_stage->code, previous_stage->code_size); 5239b8e80941Smrg ptr += previous_stage->code_size; 5240b8e80941Smrg } 5241b8e80941Smrg if (prolog2) { 5242b8e80941Smrg memcpy(ptr, prolog2->code, prolog2->code_size); 5243b8e80941Smrg ptr += prolog2->code_size; 5244b8e80941Smrg } 5245b8e80941Smrg 5246b8e80941Smrg memcpy(ptr, mainb->code, mainb->code_size); 5247b8e80941Smrg ptr += mainb->code_size; 5248b8e80941Smrg 5249b8e80941Smrg if (epilog) { 5250b8e80941Smrg memcpy(ptr, epilog->code, epilog->code_size); 5251b8e80941Smrg ptr += epilog->code_size; 5252b8e80941Smrg } else if (mainb->rodata_size > 0) { 5253b8e80941Smrg memcpy(ptr, mainb->rodata, mainb->rodata_size); 5254b8e80941Smrg ptr += mainb->rodata_size; 5255b8e80941Smrg } 5256b8e80941Smrg 5257b8e80941Smrg /* Add end-of-code markers for the UMR disassembler. */ 5258b8e80941Smrg uint32_t *ptr32 = (uint32_t*)ptr; 5259b8e80941Smrg for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) 5260b8e80941Smrg ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; 5261b8e80941Smrg 5262b8e80941Smrg sscreen->ws->buffer_unmap(shader->bo->buf); 5263b8e80941Smrg return 0; 5264b8e80941Smrg} 5265b8e80941Smrg 5266b8e80941Smrgstatic void si_shader_dump_disassembly(const struct ac_shader_binary *binary, 5267b8e80941Smrg struct pipe_debug_callback *debug, 5268b8e80941Smrg const char *name, FILE *file) 5269b8e80941Smrg{ 5270b8e80941Smrg char *line, *p; 5271b8e80941Smrg unsigned i, count; 5272b8e80941Smrg 5273b8e80941Smrg if (binary->disasm_string) { 5274b8e80941Smrg fprintf(file, "Shader %s disassembly:\n", name); 5275b8e80941Smrg fprintf(file, "%s", binary->disasm_string); 5276b8e80941Smrg 5277b8e80941Smrg if (debug && debug->debug_message) { 5278b8e80941Smrg /* Very long debug messages are cut off, so send the 5279b8e80941Smrg * disassembly one line at a time. This causes more 5280b8e80941Smrg * overhead, but on the plus side it simplifies 5281b8e80941Smrg * parsing of resulting logs. 5282b8e80941Smrg */ 5283b8e80941Smrg pipe_debug_message(debug, SHADER_INFO, 5284b8e80941Smrg "Shader Disassembly Begin"); 5285b8e80941Smrg 5286b8e80941Smrg line = binary->disasm_string; 5287b8e80941Smrg while (*line) { 5288b8e80941Smrg p = util_strchrnul(line, '\n'); 5289b8e80941Smrg count = p - line; 5290b8e80941Smrg 5291b8e80941Smrg if (count) { 5292b8e80941Smrg pipe_debug_message(debug, SHADER_INFO, 5293b8e80941Smrg "%.*s", count, line); 5294b8e80941Smrg } 5295b8e80941Smrg 5296b8e80941Smrg if (!*p) 5297b8e80941Smrg break; 5298b8e80941Smrg line = p + 1; 5299b8e80941Smrg } 5300b8e80941Smrg 5301b8e80941Smrg pipe_debug_message(debug, SHADER_INFO, 5302b8e80941Smrg "Shader Disassembly End"); 5303b8e80941Smrg } 5304b8e80941Smrg } else { 5305b8e80941Smrg fprintf(file, "Shader %s binary:\n", name); 5306b8e80941Smrg for (i = 0; i < binary->code_size; i += 4) { 5307b8e80941Smrg fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, 5308b8e80941Smrg binary->code[i + 3], binary->code[i + 2], 5309b8e80941Smrg binary->code[i + 1], binary->code[i]); 5310b8e80941Smrg } 5311b8e80941Smrg } 5312b8e80941Smrg} 5313b8e80941Smrg 5314b8e80941Smrgstatic void si_calculate_max_simd_waves(struct si_shader *shader) 5315b8e80941Smrg{ 5316b8e80941Smrg struct si_screen *sscreen = shader->selector->screen; 5317b8e80941Smrg struct si_shader_config *conf = &shader->config; 5318b8e80941Smrg unsigned num_inputs = shader->selector->info.num_inputs; 5319b8e80941Smrg unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256; 5320b8e80941Smrg unsigned lds_per_wave = 0; 5321b8e80941Smrg unsigned max_simd_waves; 5322b8e80941Smrg 5323b8e80941Smrg max_simd_waves = ac_get_max_simd_waves(sscreen->info.family); 5324b8e80941Smrg 5325b8e80941Smrg /* Compute LDS usage for PS. */ 5326b8e80941Smrg switch (shader->selector->type) { 5327b8e80941Smrg case PIPE_SHADER_FRAGMENT: 5328b8e80941Smrg /* The minimum usage per wave is (num_inputs * 48). The maximum 5329b8e80941Smrg * usage is (num_inputs * 48 * 16). 5330b8e80941Smrg * We can get anything in between and it varies between waves. 5331b8e80941Smrg * 5332b8e80941Smrg * The 48 bytes per input for a single primitive is equal to 5333b8e80941Smrg * 4 bytes/component * 4 components/input * 3 points. 5334b8e80941Smrg * 5335b8e80941Smrg * Other stages don't know the size at compile time or don't 5336b8e80941Smrg * allocate LDS per wave, but instead they do it per thread group. 5337b8e80941Smrg */ 5338b8e80941Smrg lds_per_wave = conf->lds_size * lds_increment + 5339b8e80941Smrg align(num_inputs * 48, lds_increment); 5340b8e80941Smrg break; 5341b8e80941Smrg case PIPE_SHADER_COMPUTE: 5342b8e80941Smrg if (shader->selector) { 5343b8e80941Smrg unsigned max_workgroup_size = 5344b8e80941Smrg si_get_max_workgroup_size(shader); 5345b8e80941Smrg lds_per_wave = (conf->lds_size * lds_increment) / 5346b8e80941Smrg DIV_ROUND_UP(max_workgroup_size, 64); 5347b8e80941Smrg } 5348b8e80941Smrg break; 5349b8e80941Smrg } 5350b8e80941Smrg 5351b8e80941Smrg /* Compute the per-SIMD wave counts. */ 5352b8e80941Smrg if (conf->num_sgprs) { 5353b8e80941Smrg max_simd_waves = 5354b8e80941Smrg MIN2(max_simd_waves, 5355b8e80941Smrg ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs); 5356b8e80941Smrg } 5357b8e80941Smrg 5358b8e80941Smrg if (conf->num_vgprs) 5359b8e80941Smrg max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); 5360b8e80941Smrg 5361b8e80941Smrg /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above 5362b8e80941Smrg * 16KB makes some SIMDs unoccupied). */ 5363b8e80941Smrg if (lds_per_wave) 5364b8e80941Smrg max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); 5365b8e80941Smrg 5366b8e80941Smrg conf->max_simd_waves = max_simd_waves; 5367b8e80941Smrg} 5368b8e80941Smrg 5369b8e80941Smrgvoid si_shader_dump_stats_for_shader_db(const struct si_shader *shader, 5370b8e80941Smrg struct pipe_debug_callback *debug) 5371b8e80941Smrg{ 5372b8e80941Smrg const struct si_shader_config *conf = &shader->config; 5373b8e80941Smrg 5374b8e80941Smrg pipe_debug_message(debug, SHADER_INFO, 5375b8e80941Smrg "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " 5376b8e80941Smrg "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " 5377b8e80941Smrg "Spilled VGPRs: %d PrivMem VGPRs: %d", 5378b8e80941Smrg conf->num_sgprs, conf->num_vgprs, 5379b8e80941Smrg si_get_shader_binary_size(shader), 5380b8e80941Smrg conf->lds_size, conf->scratch_bytes_per_wave, 5381b8e80941Smrg conf->max_simd_waves, conf->spilled_sgprs, 5382b8e80941Smrg conf->spilled_vgprs, conf->private_mem_vgprs); 5383b8e80941Smrg} 5384b8e80941Smrg 5385b8e80941Smrgstatic void si_shader_dump_stats(struct si_screen *sscreen, 5386b8e80941Smrg const struct si_shader *shader, 5387b8e80941Smrg unsigned processor, 5388b8e80941Smrg FILE *file, 5389b8e80941Smrg bool check_debug_option) 5390b8e80941Smrg{ 5391b8e80941Smrg const struct si_shader_config *conf = &shader->config; 5392b8e80941Smrg 5393b8e80941Smrg if (!check_debug_option || 5394b8e80941Smrg si_can_dump_shader(sscreen, processor)) { 5395b8e80941Smrg if (processor == PIPE_SHADER_FRAGMENT) { 5396b8e80941Smrg fprintf(file, "*** SHADER CONFIG ***\n" 5397b8e80941Smrg "SPI_PS_INPUT_ADDR = 0x%04x\n" 5398b8e80941Smrg "SPI_PS_INPUT_ENA = 0x%04x\n", 5399b8e80941Smrg conf->spi_ps_input_addr, conf->spi_ps_input_ena); 5400b8e80941Smrg } 5401b8e80941Smrg 5402b8e80941Smrg fprintf(file, "*** SHADER STATS ***\n" 5403b8e80941Smrg "SGPRS: %d\n" 5404b8e80941Smrg "VGPRS: %d\n" 5405b8e80941Smrg "Spilled SGPRs: %d\n" 5406b8e80941Smrg "Spilled VGPRs: %d\n" 5407b8e80941Smrg "Private memory VGPRs: %d\n" 5408b8e80941Smrg "Code Size: %d bytes\n" 5409b8e80941Smrg "LDS: %d blocks\n" 5410b8e80941Smrg "Scratch: %d bytes per wave\n" 5411b8e80941Smrg "Max Waves: %d\n" 5412b8e80941Smrg "********************\n\n\n", 5413b8e80941Smrg conf->num_sgprs, conf->num_vgprs, 5414b8e80941Smrg conf->spilled_sgprs, conf->spilled_vgprs, 5415b8e80941Smrg conf->private_mem_vgprs, 5416b8e80941Smrg si_get_shader_binary_size(shader), 5417b8e80941Smrg conf->lds_size, conf->scratch_bytes_per_wave, 5418b8e80941Smrg conf->max_simd_waves); 5419b8e80941Smrg } 5420b8e80941Smrg} 5421b8e80941Smrg 5422b8e80941Smrgconst char *si_get_shader_name(const struct si_shader *shader, unsigned processor) 5423b8e80941Smrg{ 5424b8e80941Smrg switch (processor) { 5425b8e80941Smrg case PIPE_SHADER_VERTEX: 5426b8e80941Smrg if (shader->key.as_es) 5427b8e80941Smrg return "Vertex Shader as ES"; 5428b8e80941Smrg else if (shader->key.as_ls) 5429b8e80941Smrg return "Vertex Shader as LS"; 5430b8e80941Smrg else 5431b8e80941Smrg return "Vertex Shader as VS"; 5432b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 5433b8e80941Smrg return "Tessellation Control Shader"; 5434b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 5435b8e80941Smrg if (shader->key.as_es) 5436b8e80941Smrg return "Tessellation Evaluation Shader as ES"; 5437b8e80941Smrg else 5438b8e80941Smrg return "Tessellation Evaluation Shader as VS"; 5439b8e80941Smrg case PIPE_SHADER_GEOMETRY: 5440b8e80941Smrg if (shader->is_gs_copy_shader) 5441b8e80941Smrg return "GS Copy Shader as VS"; 5442b8e80941Smrg else 5443b8e80941Smrg return "Geometry Shader"; 5444b8e80941Smrg case PIPE_SHADER_FRAGMENT: 5445b8e80941Smrg return "Pixel Shader"; 5446b8e80941Smrg case PIPE_SHADER_COMPUTE: 5447b8e80941Smrg return "Compute Shader"; 5448b8e80941Smrg default: 5449b8e80941Smrg return "Unknown Shader"; 5450b8e80941Smrg } 5451b8e80941Smrg} 5452b8e80941Smrg 5453b8e80941Smrgvoid si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, 5454b8e80941Smrg struct pipe_debug_callback *debug, unsigned processor, 5455b8e80941Smrg FILE *file, bool check_debug_option) 5456b8e80941Smrg{ 5457b8e80941Smrg if (!check_debug_option || 5458b8e80941Smrg si_can_dump_shader(sscreen, processor)) 5459b8e80941Smrg si_dump_shader_key(processor, shader, file); 5460b8e80941Smrg 5461b8e80941Smrg if (!check_debug_option && shader->binary.llvm_ir_string) { 5462b8e80941Smrg if (shader->previous_stage && 5463b8e80941Smrg shader->previous_stage->binary.llvm_ir_string) { 5464b8e80941Smrg fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", 5465b8e80941Smrg si_get_shader_name(shader, processor)); 5466b8e80941Smrg fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); 5467b8e80941Smrg } 5468b8e80941Smrg 5469b8e80941Smrg fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", 5470b8e80941Smrg si_get_shader_name(shader, processor)); 5471b8e80941Smrg fprintf(file, "%s\n", shader->binary.llvm_ir_string); 5472b8e80941Smrg } 5473b8e80941Smrg 5474b8e80941Smrg if (!check_debug_option || 5475b8e80941Smrg (si_can_dump_shader(sscreen, processor) && 5476b8e80941Smrg !(sscreen->debug_flags & DBG(NO_ASM)))) { 5477b8e80941Smrg fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); 5478b8e80941Smrg 5479b8e80941Smrg if (shader->prolog) 5480b8e80941Smrg si_shader_dump_disassembly(&shader->prolog->binary, 5481b8e80941Smrg debug, "prolog", file); 5482b8e80941Smrg if (shader->previous_stage) 5483b8e80941Smrg si_shader_dump_disassembly(&shader->previous_stage->binary, 5484b8e80941Smrg debug, "previous stage", file); 5485b8e80941Smrg if (shader->prolog2) 5486b8e80941Smrg si_shader_dump_disassembly(&shader->prolog2->binary, 5487b8e80941Smrg debug, "prolog2", file); 5488b8e80941Smrg 5489b8e80941Smrg si_shader_dump_disassembly(&shader->binary, debug, "main", file); 5490b8e80941Smrg 5491b8e80941Smrg if (shader->epilog) 5492b8e80941Smrg si_shader_dump_disassembly(&shader->epilog->binary, 5493b8e80941Smrg debug, "epilog", file); 5494b8e80941Smrg fprintf(file, "\n"); 5495b8e80941Smrg } 5496b8e80941Smrg 5497b8e80941Smrg si_shader_dump_stats(sscreen, shader, processor, file, 5498b8e80941Smrg check_debug_option); 5499b8e80941Smrg} 5500b8e80941Smrg 5501b8e80941Smrgstatic int si_compile_llvm(struct si_screen *sscreen, 5502b8e80941Smrg struct ac_shader_binary *binary, 5503b8e80941Smrg struct si_shader_config *conf, 5504b8e80941Smrg struct ac_llvm_compiler *compiler, 5505b8e80941Smrg LLVMModuleRef mod, 5506b8e80941Smrg struct pipe_debug_callback *debug, 5507b8e80941Smrg unsigned processor, 5508b8e80941Smrg const char *name, 5509b8e80941Smrg bool less_optimized) 5510b8e80941Smrg{ 5511b8e80941Smrg int r = 0; 5512b8e80941Smrg unsigned count = p_atomic_inc_return(&sscreen->num_compilations); 5513b8e80941Smrg 5514b8e80941Smrg if (si_can_dump_shader(sscreen, processor)) { 5515b8e80941Smrg fprintf(stderr, "radeonsi: Compiling shader %d\n", count); 5516b8e80941Smrg 5517b8e80941Smrg if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { 5518b8e80941Smrg fprintf(stderr, "%s LLVM IR:\n\n", name); 5519b8e80941Smrg ac_dump_module(mod); 5520b8e80941Smrg fprintf(stderr, "\n"); 5521b8e80941Smrg } 5522b8e80941Smrg } 5523b8e80941Smrg 5524b8e80941Smrg if (sscreen->record_llvm_ir) { 5525b8e80941Smrg char *ir = LLVMPrintModuleToString(mod); 5526b8e80941Smrg binary->llvm_ir_string = strdup(ir); 5527b8e80941Smrg LLVMDisposeMessage(ir); 5528b8e80941Smrg } 5529b8e80941Smrg 5530b8e80941Smrg if (!si_replace_shader(count, binary)) { 5531b8e80941Smrg r = si_llvm_compile(mod, binary, compiler, debug, 5532b8e80941Smrg less_optimized); 5533b8e80941Smrg if (r) 5534b8e80941Smrg return r; 5535b8e80941Smrg } 5536b8e80941Smrg 5537b8e80941Smrg si_shader_binary_read_config(binary, conf, 0); 5538b8e80941Smrg 5539b8e80941Smrg /* Enable 64-bit and 16-bit denormals, because there is no performance 5540b8e80941Smrg * cost. 5541b8e80941Smrg * 5542b8e80941Smrg * If denormals are enabled, all floating-point output modifiers are 5543b8e80941Smrg * ignored. 5544b8e80941Smrg * 5545b8e80941Smrg * Don't enable denormals for 32-bit floats, because: 5546b8e80941Smrg * - Floating-point output modifiers would be ignored by the hw. 5547b8e80941Smrg * - Some opcodes don't support denormals, such as v_mad_f32. We would 5548b8e80941Smrg * have to stop using those. 5549b8e80941Smrg * - SI & CI would be very slow. 5550b8e80941Smrg */ 5551b8e80941Smrg conf->float_mode |= V_00B028_FP_64_DENORMS; 5552b8e80941Smrg 5553b8e80941Smrg FREE(binary->config); 5554b8e80941Smrg FREE(binary->global_symbol_offsets); 5555b8e80941Smrg binary->config = NULL; 5556b8e80941Smrg binary->global_symbol_offsets = NULL; 5557b8e80941Smrg 5558b8e80941Smrg /* Some shaders can't have rodata because their binaries can be 5559b8e80941Smrg * concatenated. 5560b8e80941Smrg */ 5561b8e80941Smrg if (binary->rodata_size && 5562b8e80941Smrg (processor == PIPE_SHADER_VERTEX || 5563b8e80941Smrg processor == PIPE_SHADER_TESS_CTRL || 5564b8e80941Smrg processor == PIPE_SHADER_TESS_EVAL || 5565b8e80941Smrg processor == PIPE_SHADER_FRAGMENT)) { 5566b8e80941Smrg fprintf(stderr, "radeonsi: The shader can't have rodata."); 5567b8e80941Smrg return -EINVAL; 5568b8e80941Smrg } 5569b8e80941Smrg 5570b8e80941Smrg return r; 5571b8e80941Smrg} 5572b8e80941Smrg 5573b8e80941Smrgstatic void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) 5574b8e80941Smrg{ 5575b8e80941Smrg if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 5576b8e80941Smrg LLVMBuildRetVoid(ctx->ac.builder); 5577b8e80941Smrg else 5578b8e80941Smrg LLVMBuildRet(ctx->ac.builder, ret); 5579b8e80941Smrg} 5580b8e80941Smrg 5581b8e80941Smrg/* Generate code for the hardware VS shader stage to go with a geometry shader */ 5582b8e80941Smrgstruct si_shader * 5583b8e80941Smrgsi_generate_gs_copy_shader(struct si_screen *sscreen, 5584b8e80941Smrg struct ac_llvm_compiler *compiler, 5585b8e80941Smrg struct si_shader_selector *gs_selector, 5586b8e80941Smrg struct pipe_debug_callback *debug) 5587b8e80941Smrg{ 5588b8e80941Smrg struct si_shader_context ctx; 5589b8e80941Smrg struct si_shader *shader; 5590b8e80941Smrg LLVMBuilderRef builder; 5591b8e80941Smrg struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; 5592b8e80941Smrg struct tgsi_shader_info *gsinfo = &gs_selector->info; 5593b8e80941Smrg int i, r; 5594b8e80941Smrg 5595b8e80941Smrg 5596b8e80941Smrg shader = CALLOC_STRUCT(si_shader); 5597b8e80941Smrg if (!shader) 5598b8e80941Smrg return NULL; 5599b8e80941Smrg 5600b8e80941Smrg /* We can leave the fence as permanently signaled because the GS copy 5601b8e80941Smrg * shader only becomes visible globally after it has been compiled. */ 5602b8e80941Smrg util_queue_fence_init(&shader->ready); 5603b8e80941Smrg 5604b8e80941Smrg shader->selector = gs_selector; 5605b8e80941Smrg shader->is_gs_copy_shader = true; 5606b8e80941Smrg 5607b8e80941Smrg si_init_shader_ctx(&ctx, sscreen, compiler); 5608b8e80941Smrg ctx.shader = shader; 5609b8e80941Smrg ctx.type = PIPE_SHADER_VERTEX; 5610b8e80941Smrg 5611b8e80941Smrg builder = ctx.ac.builder; 5612b8e80941Smrg 5613b8e80941Smrg create_function(&ctx); 5614b8e80941Smrg preload_ring_buffers(&ctx); 5615b8e80941Smrg 5616b8e80941Smrg LLVMValueRef voffset = 5617b8e80941Smrg LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, 5618b8e80941Smrg LLVMConstInt(ctx.i32, 4, 0), ""); 5619b8e80941Smrg 5620b8e80941Smrg /* Fetch the vertex stream ID.*/ 5621b8e80941Smrg LLVMValueRef stream_id; 5622b8e80941Smrg 5623b8e80941Smrg if (gs_selector->so.num_outputs) 5624b8e80941Smrg stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2); 5625b8e80941Smrg else 5626b8e80941Smrg stream_id = ctx.i32_0; 5627b8e80941Smrg 5628b8e80941Smrg /* Fill in output information. */ 5629b8e80941Smrg for (i = 0; i < gsinfo->num_outputs; ++i) { 5630b8e80941Smrg outputs[i].semantic_name = gsinfo->output_semantic_name[i]; 5631b8e80941Smrg outputs[i].semantic_index = gsinfo->output_semantic_index[i]; 5632b8e80941Smrg 5633b8e80941Smrg for (int chan = 0; chan < 4; chan++) { 5634b8e80941Smrg outputs[i].vertex_stream[chan] = 5635b8e80941Smrg (gsinfo->output_streams[i] >> (2 * chan)) & 3; 5636b8e80941Smrg } 5637b8e80941Smrg } 5638b8e80941Smrg 5639b8e80941Smrg LLVMBasicBlockRef end_bb; 5640b8e80941Smrg LLVMValueRef switch_inst; 5641b8e80941Smrg 5642b8e80941Smrg end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); 5643b8e80941Smrg switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); 5644b8e80941Smrg 5645b8e80941Smrg for (int stream = 0; stream < 4; stream++) { 5646b8e80941Smrg LLVMBasicBlockRef bb; 5647b8e80941Smrg unsigned offset; 5648b8e80941Smrg 5649b8e80941Smrg if (!gsinfo->num_stream_output_components[stream]) 5650b8e80941Smrg continue; 5651b8e80941Smrg 5652b8e80941Smrg if (stream > 0 && !gs_selector->so.num_outputs) 5653848b8605Smrg continue; 5654848b8605Smrg 5655b8e80941Smrg bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); 5656b8e80941Smrg LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); 5657b8e80941Smrg LLVMPositionBuilderAtEnd(builder, bb); 5658b8e80941Smrg 5659b8e80941Smrg /* Fetch vertex data from GSVS ring */ 5660b8e80941Smrg offset = 0; 5661b8e80941Smrg for (i = 0; i < gsinfo->num_outputs; ++i) { 5662b8e80941Smrg for (unsigned chan = 0; chan < 4; chan++) { 5663b8e80941Smrg if (!(gsinfo->output_usagemask[i] & (1 << chan)) || 5664b8e80941Smrg outputs[i].vertex_stream[chan] != stream) { 5665b8e80941Smrg outputs[i].values[chan] = LLVMGetUndef(ctx.f32); 5666b8e80941Smrg continue; 5667b8e80941Smrg } 5668b8e80941Smrg 5669b8e80941Smrg LLVMValueRef soffset = LLVMConstInt(ctx.i32, 5670b8e80941Smrg offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); 5671b8e80941Smrg offset++; 5672b8e80941Smrg 5673b8e80941Smrg outputs[i].values[chan] = 5674b8e80941Smrg ac_build_buffer_load(&ctx.ac, 5675b8e80941Smrg ctx.gsvs_ring[0], 1, 5676b8e80941Smrg ctx.i32_0, voffset, 5677b8e80941Smrg soffset, 0, 1, 1, 5678b8e80941Smrg true, false); 5679b8e80941Smrg } 5680b8e80941Smrg } 5681b8e80941Smrg 5682b8e80941Smrg /* Streamout and exports. */ 5683b8e80941Smrg if (gs_selector->so.num_outputs) { 5684b8e80941Smrg si_llvm_emit_streamout(&ctx, outputs, 5685b8e80941Smrg gsinfo->num_outputs, 5686b8e80941Smrg stream); 5687b8e80941Smrg } 5688b8e80941Smrg 5689b8e80941Smrg if (stream == 0) { 5690b8e80941Smrg /* Vertex color clamping. 5691b8e80941Smrg * 5692b8e80941Smrg * This uses a state constant loaded in a user data SGPR and 5693b8e80941Smrg * an IF statement is added that clamps all colors if the constant 5694b8e80941Smrg * is true. 5695b8e80941Smrg */ 5696b8e80941Smrg struct lp_build_if_state if_ctx; 5697b8e80941Smrg LLVMValueRef v[2], cond = NULL; 5698b8e80941Smrg LLVMBasicBlockRef blocks[2]; 5699b8e80941Smrg 5700b8e80941Smrg for (unsigned i = 0; i < gsinfo->num_outputs; i++) { 5701b8e80941Smrg if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 5702b8e80941Smrg gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 5703b8e80941Smrg continue; 5704b8e80941Smrg 5705b8e80941Smrg /* We've found a color. */ 5706b8e80941Smrg if (!cond) { 5707b8e80941Smrg /* The state is in the first bit of the user SGPR. */ 5708b8e80941Smrg cond = LLVMGetParam(ctx.main_fn, 5709b8e80941Smrg ctx.param_vs_state_bits); 5710b8e80941Smrg cond = LLVMBuildTrunc(ctx.ac.builder, cond, 5711b8e80941Smrg ctx.i1, ""); 5712b8e80941Smrg lp_build_if(&if_ctx, &ctx.gallivm, cond); 5713b8e80941Smrg /* Remember blocks for Phi. */ 5714b8e80941Smrg blocks[0] = if_ctx.true_block; 5715b8e80941Smrg blocks[1] = if_ctx.entry_block; 5716b8e80941Smrg } 5717b8e80941Smrg 5718b8e80941Smrg for (unsigned j = 0; j < 4; j++) { 5719b8e80941Smrg /* Insert clamp into the true block. */ 5720b8e80941Smrg v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]); 5721b8e80941Smrg v[1] = outputs[i].values[j]; 5722b8e80941Smrg 5723b8e80941Smrg /* Insert Phi into the endif block. */ 5724b8e80941Smrg LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block); 5725b8e80941Smrg outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks); 5726b8e80941Smrg LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block); 5727b8e80941Smrg } 5728b8e80941Smrg } 5729b8e80941Smrg if (cond) 5730b8e80941Smrg lp_build_endif(&if_ctx); 5731b8e80941Smrg 5732b8e80941Smrg si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); 5733b8e80941Smrg } 5734b8e80941Smrg 5735b8e80941Smrg LLVMBuildBr(builder, end_bb); 5736b8e80941Smrg } 5737b8e80941Smrg 5738b8e80941Smrg LLVMPositionBuilderAtEnd(builder, end_bb); 5739b8e80941Smrg 5740b8e80941Smrg LLVMBuildRetVoid(ctx.ac.builder); 5741b8e80941Smrg 5742b8e80941Smrg ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ 5743b8e80941Smrg si_llvm_optimize_module(&ctx); 5744b8e80941Smrg 5745b8e80941Smrg r = si_compile_llvm(sscreen, &ctx.shader->binary, 5746b8e80941Smrg &ctx.shader->config, ctx.compiler, 5747b8e80941Smrg ctx.ac.module, 5748b8e80941Smrg debug, PIPE_SHADER_GEOMETRY, 5749b8e80941Smrg "GS Copy Shader", false); 5750b8e80941Smrg if (!r) { 5751b8e80941Smrg if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) 5752b8e80941Smrg fprintf(stderr, "GS Copy Shader:\n"); 5753b8e80941Smrg si_shader_dump(sscreen, ctx.shader, debug, 5754b8e80941Smrg PIPE_SHADER_GEOMETRY, stderr, true); 5755b8e80941Smrg r = si_shader_binary_upload(sscreen, ctx.shader); 5756b8e80941Smrg } 5757b8e80941Smrg 5758b8e80941Smrg si_llvm_dispose(&ctx); 5759b8e80941Smrg 5760b8e80941Smrg if (r != 0) { 5761b8e80941Smrg FREE(shader); 5762b8e80941Smrg shader = NULL; 5763b8e80941Smrg } else { 5764b8e80941Smrg si_fix_resource_usage(sscreen, shader); 5765b8e80941Smrg } 5766b8e80941Smrg return shader; 5767b8e80941Smrg} 5768b8e80941Smrg 5769b8e80941Smrgstatic void si_dump_shader_key_vs(const struct si_shader_key *key, 5770b8e80941Smrg const struct si_vs_prolog_bits *prolog, 5771b8e80941Smrg const char *prefix, FILE *f) 5772b8e80941Smrg{ 5773b8e80941Smrg fprintf(f, " %s.instance_divisor_is_one = %u\n", 5774b8e80941Smrg prefix, prolog->instance_divisor_is_one); 5775b8e80941Smrg fprintf(f, " %s.instance_divisor_is_fetched = %u\n", 5776b8e80941Smrg prefix, prolog->instance_divisor_is_fetched); 5777b8e80941Smrg fprintf(f, " %s.ls_vgpr_fix = %u\n", 5778b8e80941Smrg prefix, prolog->ls_vgpr_fix); 5779b8e80941Smrg 5780b8e80941Smrg fprintf(f, " mono.vs.fix_fetch = {"); 5781b8e80941Smrg for (int i = 0; i < SI_MAX_ATTRIBS; i++) 5782b8e80941Smrg fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); 5783b8e80941Smrg fprintf(f, "}\n"); 5784b8e80941Smrg} 5785b8e80941Smrg 5786b8e80941Smrgstatic void si_dump_shader_key(unsigned processor, const struct si_shader *shader, 5787b8e80941Smrg FILE *f) 5788b8e80941Smrg{ 5789b8e80941Smrg const struct si_shader_key *key = &shader->key; 5790b8e80941Smrg 5791b8e80941Smrg fprintf(f, "SHADER KEY\n"); 5792b8e80941Smrg 5793b8e80941Smrg switch (processor) { 5794b8e80941Smrg case PIPE_SHADER_VERTEX: 5795b8e80941Smrg si_dump_shader_key_vs(key, &key->part.vs.prolog, 5796b8e80941Smrg "part.vs.prolog", f); 5797b8e80941Smrg fprintf(f, " as_es = %u\n", key->as_es); 5798b8e80941Smrg fprintf(f, " as_ls = %u\n", key->as_ls); 5799b8e80941Smrg fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5800b8e80941Smrg key->mono.u.vs_export_prim_id); 5801b8e80941Smrg break; 5802b8e80941Smrg 5803b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 5804b8e80941Smrg if (shader->selector->screen->info.chip_class >= GFX9) { 5805b8e80941Smrg si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, 5806b8e80941Smrg "part.tcs.ls_prolog", f); 5807b8e80941Smrg } 5808b8e80941Smrg fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); 5809b8e80941Smrg fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); 5810b8e80941Smrg break; 5811b8e80941Smrg 5812b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 5813b8e80941Smrg fprintf(f, " as_es = %u\n", key->as_es); 5814b8e80941Smrg fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5815b8e80941Smrg key->mono.u.vs_export_prim_id); 5816b8e80941Smrg break; 5817b8e80941Smrg 5818b8e80941Smrg case PIPE_SHADER_GEOMETRY: 5819b8e80941Smrg if (shader->is_gs_copy_shader) 5820b8e80941Smrg break; 5821b8e80941Smrg 5822b8e80941Smrg if (shader->selector->screen->info.chip_class >= GFX9 && 5823b8e80941Smrg key->part.gs.es->type == PIPE_SHADER_VERTEX) { 5824b8e80941Smrg si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, 5825b8e80941Smrg "part.gs.vs_prolog", f); 5826b8e80941Smrg } 5827b8e80941Smrg fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); 5828b8e80941Smrg break; 5829b8e80941Smrg 5830b8e80941Smrg case PIPE_SHADER_COMPUTE: 5831b8e80941Smrg break; 5832848b8605Smrg 5833b8e80941Smrg case PIPE_SHADER_FRAGMENT: 5834b8e80941Smrg fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); 5835b8e80941Smrg fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); 5836b8e80941Smrg fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); 5837b8e80941Smrg fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); 5838b8e80941Smrg fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); 5839b8e80941Smrg fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); 5840b8e80941Smrg fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); 5841b8e80941Smrg fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); 5842b8e80941Smrg fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); 5843b8e80941Smrg fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); 5844b8e80941Smrg fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); 5845b8e80941Smrg fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); 5846b8e80941Smrg fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); 5847b8e80941Smrg fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); 5848b8e80941Smrg fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); 5849b8e80941Smrg fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); 5850b8e80941Smrg fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); 5851b8e80941Smrg break; 5852848b8605Smrg 5853b8e80941Smrg default: 5854b8e80941Smrg assert(0); 5855b8e80941Smrg } 5856b8e80941Smrg 5857b8e80941Smrg if ((processor == PIPE_SHADER_GEOMETRY || 5858b8e80941Smrg processor == PIPE_SHADER_TESS_EVAL || 5859b8e80941Smrg processor == PIPE_SHADER_VERTEX) && 5860b8e80941Smrg !key->as_es && !key->as_ls) { 5861b8e80941Smrg fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); 5862b8e80941Smrg fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); 5863848b8605Smrg } 5864848b8605Smrg} 5865848b8605Smrg 5866b8e80941Smrgstatic void si_init_shader_ctx(struct si_shader_context *ctx, 5867b8e80941Smrg struct si_screen *sscreen, 5868b8e80941Smrg struct ac_llvm_compiler *compiler) 5869848b8605Smrg{ 5870b8e80941Smrg struct lp_build_tgsi_context *bld_base; 5871848b8605Smrg 5872b8e80941Smrg si_llvm_context_init(ctx, sscreen, compiler); 5873848b8605Smrg 5874b8e80941Smrg bld_base = &ctx->bld_base; 5875b8e80941Smrg bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; 5876848b8605Smrg 5877b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic; 5878b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic; 5879b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic; 5880848b8605Smrg 5881b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; 5882848b8605Smrg 5883b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit; 5884848b8605Smrg 5885b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; 5886b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; 5887b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; 5888b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; 5889b8e80941Smrg 5890b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; 5891b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; 5892b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; 5893b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit; 5894b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane"; 5895b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit; 5896b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane"; 5897b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit; 5898b8e80941Smrg 5899b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex; 5900b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive; 5901b8e80941Smrg bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; 5902b8e80941Smrg} 5903848b8605Smrg 5904b8e80941Smrgstatic void si_optimize_vs_outputs(struct si_shader_context *ctx) 5905b8e80941Smrg{ 5906b8e80941Smrg struct si_shader *shader = ctx->shader; 5907b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 5908848b8605Smrg 5909b8e80941Smrg if ((ctx->type != PIPE_SHADER_VERTEX && 5910b8e80941Smrg ctx->type != PIPE_SHADER_TESS_EVAL) || 5911b8e80941Smrg shader->key.as_ls || 5912b8e80941Smrg shader->key.as_es) 5913b8e80941Smrg return; 5914848b8605Smrg 5915b8e80941Smrg ac_optimize_vs_outputs(&ctx->ac, 5916b8e80941Smrg ctx->main_fn, 5917b8e80941Smrg shader->info.vs_output_param_offset, 5918b8e80941Smrg info->num_outputs, 5919b8e80941Smrg &shader->info.nr_param_exports); 5920848b8605Smrg} 5921848b8605Smrg 5922b8e80941Smrgstatic void si_init_exec_from_input(struct si_shader_context *ctx, 5923b8e80941Smrg unsigned param, unsigned bitoffset) 5924848b8605Smrg{ 5925b8e80941Smrg LLVMValueRef args[] = { 5926b8e80941Smrg LLVMGetParam(ctx->main_fn, param), 5927b8e80941Smrg LLVMConstInt(ctx->i32, bitoffset, 0), 5928b8e80941Smrg }; 5929b8e80941Smrg ac_build_intrinsic(&ctx->ac, 5930b8e80941Smrg "llvm.amdgcn.init.exec.from.input", 5931b8e80941Smrg ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); 5932b8e80941Smrg} 5933848b8605Smrg 5934b8e80941Smrgstatic bool si_vs_needs_prolog(const struct si_shader_selector *sel, 5935b8e80941Smrg const struct si_vs_prolog_bits *key) 5936b8e80941Smrg{ 5937b8e80941Smrg /* VGPR initialization fixup for Vega10 and Raven is always done in the 5938b8e80941Smrg * VS prolog. */ 5939b8e80941Smrg return sel->vs_needs_prolog || key->ls_vgpr_fix; 5940848b8605Smrg} 5941848b8605Smrg 5942b8e80941Smrgstatic bool si_compile_tgsi_main(struct si_shader_context *ctx) 5943848b8605Smrg{ 5944b8e80941Smrg struct si_shader *shader = ctx->shader; 5945b8e80941Smrg struct si_shader_selector *sel = shader->selector; 5946b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5947b8e80941Smrg 5948b8e80941Smrg // TODO clean all this up! 5949b8e80941Smrg switch (ctx->type) { 5950b8e80941Smrg case PIPE_SHADER_VERTEX: 5951b8e80941Smrg ctx->load_input = declare_input_vs; 5952b8e80941Smrg if (shader->key.as_ls) 5953b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; 5954b8e80941Smrg else if (shader->key.as_es) 5955b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 5956b8e80941Smrg else 5957b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 5958b8e80941Smrg bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5959b8e80941Smrg ctx->abi.load_base_vertex = get_base_vertex; 5960b8e80941Smrg break; 5961b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 5962b8e80941Smrg bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; 5963b8e80941Smrg ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; 5964b8e80941Smrg bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; 5965b8e80941Smrg bld_base->emit_store = store_output_tcs; 5966b8e80941Smrg ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; 5967b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; 5968b8e80941Smrg ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 5969b8e80941Smrg bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5970b8e80941Smrg break; 5971b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 5972b8e80941Smrg bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; 5973b8e80941Smrg ctx->abi.load_tess_varyings = si_nir_load_input_tes; 5974b8e80941Smrg ctx->abi.load_tess_coord = si_load_tess_coord; 5975b8e80941Smrg ctx->abi.load_tess_level = si_load_tess_level; 5976b8e80941Smrg ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 5977b8e80941Smrg if (shader->key.as_es) 5978b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 5979b8e80941Smrg else 5980b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 5981b8e80941Smrg bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5982b8e80941Smrg break; 5983b8e80941Smrg case PIPE_SHADER_GEOMETRY: 5984b8e80941Smrg bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; 5985b8e80941Smrg ctx->abi.load_inputs = si_nir_load_input_gs; 5986b8e80941Smrg ctx->abi.emit_vertex = si_llvm_emit_vertex; 5987b8e80941Smrg ctx->abi.emit_primitive = si_llvm_emit_primitive; 5988b8e80941Smrg ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; 5989b8e80941Smrg bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue; 5990b8e80941Smrg break; 5991b8e80941Smrg case PIPE_SHADER_FRAGMENT: 5992b8e80941Smrg ctx->load_input = declare_input_fs; 5993b8e80941Smrg ctx->abi.emit_outputs = si_llvm_return_fs_outputs; 5994b8e80941Smrg bld_base->emit_epilogue = si_tgsi_emit_epilogue; 5995b8e80941Smrg ctx->abi.lookup_interp_param = si_nir_lookup_interp_param; 5996b8e80941Smrg ctx->abi.load_sample_position = load_sample_position; 5997b8e80941Smrg ctx->abi.load_sample_mask_in = load_sample_mask_in; 5998b8e80941Smrg ctx->abi.emit_kill = si_llvm_emit_kill; 5999b8e80941Smrg break; 6000b8e80941Smrg case PIPE_SHADER_COMPUTE: 6001b8e80941Smrg ctx->abi.load_local_group_size = get_block_size; 6002b8e80941Smrg break; 6003b8e80941Smrg default: 6004b8e80941Smrg assert(!"Unsupported shader type"); 6005b8e80941Smrg return false; 6006b8e80941Smrg } 6007848b8605Smrg 6008b8e80941Smrg ctx->abi.load_ubo = load_ubo; 6009b8e80941Smrg ctx->abi.load_ssbo = load_ssbo; 6010848b8605Smrg 6011b8e80941Smrg create_function(ctx); 6012b8e80941Smrg preload_ring_buffers(ctx); 6013848b8605Smrg 6014b8e80941Smrg /* For GFX9 merged shaders: 6015b8e80941Smrg * - Set EXEC for the first shader. If the prolog is present, set 6016b8e80941Smrg * EXEC there instead. 6017b8e80941Smrg * - Add a barrier before the second shader. 6018b8e80941Smrg * - In the second shader, reset EXEC to ~0 and wrap the main part in 6019b8e80941Smrg * an if-statement. This is required for correctness in geometry 6020b8e80941Smrg * shaders, to ensure that empty GS waves do not send GS_EMIT and 6021b8e80941Smrg * GS_CUT messages. 6022b8e80941Smrg * 6023b8e80941Smrg * For monolithic merged shaders, the first shader is wrapped in an 6024b8e80941Smrg * if-block together with its prolog in si_build_wrapper_function. 6025b8e80941Smrg */ 6026b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 6027b8e80941Smrg if (!shader->is_monolithic && 6028b8e80941Smrg sel->info.num_instructions > 1 && /* not empty shader */ 6029b8e80941Smrg (shader->key.as_es || shader->key.as_ls) && 6030b8e80941Smrg (ctx->type == PIPE_SHADER_TESS_EVAL || 6031b8e80941Smrg (ctx->type == PIPE_SHADER_VERTEX && 6032b8e80941Smrg !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { 6033b8e80941Smrg si_init_exec_from_input(ctx, 6034b8e80941Smrg ctx->param_merged_wave_info, 0); 6035b8e80941Smrg } else if (ctx->type == PIPE_SHADER_TESS_CTRL || 6036b8e80941Smrg ctx->type == PIPE_SHADER_GEOMETRY) { 6037b8e80941Smrg if (!shader->is_monolithic) 6038b8e80941Smrg ac_init_exec_full_mask(&ctx->ac); 6039b8e80941Smrg 6040b8e80941Smrg LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); 6041b8e80941Smrg LLVMValueRef ena = 6042b8e80941Smrg LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, 6043b8e80941Smrg ac_get_thread_id(&ctx->ac), num_threads, ""); 6044b8e80941Smrg lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena); 6045b8e80941Smrg 6046b8e80941Smrg /* The barrier must execute for all shaders in a 6047b8e80941Smrg * threadgroup. 6048b8e80941Smrg * 6049b8e80941Smrg * Execute the barrier inside the conditional block, 6050b8e80941Smrg * so that empty waves can jump directly to s_endpgm, 6051b8e80941Smrg * which will also signal the barrier. 6052b8e80941Smrg * 6053b8e80941Smrg * If the shader is TCS and the TCS epilog is present 6054b8e80941Smrg * and contains a barrier, it will wait there and then 6055b8e80941Smrg * reach s_endpgm. 6056b8e80941Smrg */ 6057b8e80941Smrg si_llvm_emit_barrier(NULL, bld_base, NULL); 6058b8e80941Smrg } 6059b8e80941Smrg } 6060848b8605Smrg 6061b8e80941Smrg if (ctx->type == PIPE_SHADER_TESS_CTRL && 6062b8e80941Smrg sel->tcs_info.tessfactors_are_def_in_all_invocs) { 6063b8e80941Smrg for (unsigned i = 0; i < 6; i++) { 6064b8e80941Smrg ctx->invoc0_tess_factors[i] = 6065b8e80941Smrg ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); 6066b8e80941Smrg } 6067b8e80941Smrg } 6068b8e80941Smrg 6069b8e80941Smrg if (ctx->type == PIPE_SHADER_GEOMETRY) { 6070b8e80941Smrg int i; 6071b8e80941Smrg for (i = 0; i < 4; i++) { 6072b8e80941Smrg ctx->gs_next_vertex[i] = 6073b8e80941Smrg ac_build_alloca(&ctx->ac, ctx->i32, ""); 6074b8e80941Smrg } 6075b8e80941Smrg } 6076848b8605Smrg 6077b8e80941Smrg if (sel->force_correct_derivs_after_kill) { 6078b8e80941Smrg ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); 6079b8e80941Smrg /* true = don't kill. */ 6080b8e80941Smrg LLVMBuildStore(ctx->ac.builder, ctx->i1true, 6081b8e80941Smrg ctx->postponed_kill); 6082b8e80941Smrg } 6083848b8605Smrg 6084b8e80941Smrg if (sel->tokens) { 6085b8e80941Smrg if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { 6086b8e80941Smrg fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); 6087b8e80941Smrg return false; 6088b8e80941Smrg } 6089b8e80941Smrg } else { 6090b8e80941Smrg if (!si_nir_build_llvm(ctx, sel->nir)) { 6091b8e80941Smrg fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); 6092b8e80941Smrg return false; 6093848b8605Smrg } 6094848b8605Smrg } 6095848b8605Smrg 6096b8e80941Smrg si_llvm_build_ret(ctx, ctx->return_value); 6097b8e80941Smrg return true; 6098848b8605Smrg} 6099848b8605Smrg 6100b8e80941Smrg/** 6101b8e80941Smrg * Compute the VS prolog key, which contains all the information needed to 6102b8e80941Smrg * build the VS prolog function, and set shader->info bits where needed. 6103b8e80941Smrg * 6104b8e80941Smrg * \param info Shader info of the vertex shader. 6105b8e80941Smrg * \param num_input_sgprs Number of input SGPRs for the vertex shader. 6106b8e80941Smrg * \param prolog_key Key of the VS prolog 6107b8e80941Smrg * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. 6108b8e80941Smrg * \param key Output shader part key. 6109b8e80941Smrg */ 6110b8e80941Smrgstatic void si_get_vs_prolog_key(const struct tgsi_shader_info *info, 6111b8e80941Smrg unsigned num_input_sgprs, 6112b8e80941Smrg const struct si_vs_prolog_bits *prolog_key, 6113b8e80941Smrg struct si_shader *shader_out, 6114b8e80941Smrg union si_shader_part_key *key) 6115848b8605Smrg{ 6116b8e80941Smrg memset(key, 0, sizeof(*key)); 6117b8e80941Smrg key->vs_prolog.states = *prolog_key; 6118b8e80941Smrg key->vs_prolog.num_input_sgprs = num_input_sgprs; 6119b8e80941Smrg key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; 6120b8e80941Smrg key->vs_prolog.as_ls = shader_out->key.as_ls; 6121b8e80941Smrg key->vs_prolog.as_es = shader_out->key.as_es; 6122b8e80941Smrg 6123b8e80941Smrg if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { 6124b8e80941Smrg key->vs_prolog.as_ls = 1; 6125b8e80941Smrg key->vs_prolog.num_merged_next_stage_vgprs = 2; 6126b8e80941Smrg } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { 6127b8e80941Smrg key->vs_prolog.as_es = 1; 6128b8e80941Smrg key->vs_prolog.num_merged_next_stage_vgprs = 5; 6129b8e80941Smrg } 6130848b8605Smrg 6131b8e80941Smrg /* Enable loading the InstanceID VGPR. */ 6132b8e80941Smrg uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); 6133848b8605Smrg 6134b8e80941Smrg if ((key->vs_prolog.states.instance_divisor_is_one | 6135b8e80941Smrg key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) 6136b8e80941Smrg shader_out->info.uses_instanceid = true; 6137b8e80941Smrg} 6138848b8605Smrg 6139b8e80941Smrg/** 6140b8e80941Smrg * Compute the PS prolog key, which contains all the information needed to 6141b8e80941Smrg * build the PS prolog function, and set related bits in shader->config. 6142b8e80941Smrg */ 6143b8e80941Smrgstatic void si_get_ps_prolog_key(struct si_shader *shader, 6144b8e80941Smrg union si_shader_part_key *key, 6145b8e80941Smrg bool separate_prolog) 6146b8e80941Smrg{ 6147b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 6148b8e80941Smrg 6149b8e80941Smrg memset(key, 0, sizeof(*key)); 6150b8e80941Smrg key->ps_prolog.states = shader->key.part.ps.prolog; 6151b8e80941Smrg key->ps_prolog.colors_read = info->colors_read; 6152b8e80941Smrg key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; 6153b8e80941Smrg key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; 6154b8e80941Smrg key->ps_prolog.wqm = info->uses_derivatives && 6155b8e80941Smrg (key->ps_prolog.colors_read || 6156b8e80941Smrg key->ps_prolog.states.force_persp_sample_interp || 6157b8e80941Smrg key->ps_prolog.states.force_linear_sample_interp || 6158b8e80941Smrg key->ps_prolog.states.force_persp_center_interp || 6159b8e80941Smrg key->ps_prolog.states.force_linear_center_interp || 6160b8e80941Smrg key->ps_prolog.states.bc_optimize_for_persp || 6161b8e80941Smrg key->ps_prolog.states.bc_optimize_for_linear); 6162b8e80941Smrg key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; 6163b8e80941Smrg 6164b8e80941Smrg if (info->colors_read) { 6165b8e80941Smrg unsigned *color = shader->selector->color_attr_index; 6166b8e80941Smrg 6167b8e80941Smrg if (shader->key.part.ps.prolog.color_two_side) { 6168b8e80941Smrg /* BCOLORs are stored after the last input. */ 6169b8e80941Smrg key->ps_prolog.num_interp_inputs = info->num_inputs; 6170b8e80941Smrg key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; 6171b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); 6172b8e80941Smrg } 6173848b8605Smrg 6174b8e80941Smrg for (unsigned i = 0; i < 2; i++) { 6175b8e80941Smrg unsigned interp = info->input_interpolate[color[i]]; 6176b8e80941Smrg unsigned location = info->input_interpolate_loc[color[i]]; 6177848b8605Smrg 6178b8e80941Smrg if (!(info->colors_read & (0xf << i*4))) 6179848b8605Smrg continue; 6180848b8605Smrg 6181b8e80941Smrg key->ps_prolog.color_attr_index[i] = color[i]; 6182848b8605Smrg 6183b8e80941Smrg if (shader->key.part.ps.prolog.flatshade_colors && 6184b8e80941Smrg interp == TGSI_INTERPOLATE_COLOR) 6185b8e80941Smrg interp = TGSI_INTERPOLATE_CONSTANT; 6186b8e80941Smrg 6187b8e80941Smrg switch (interp) { 6188b8e80941Smrg case TGSI_INTERPOLATE_CONSTANT: 6189b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = -1; 6190b8e80941Smrg break; 6191b8e80941Smrg case TGSI_INTERPOLATE_PERSPECTIVE: 6192b8e80941Smrg case TGSI_INTERPOLATE_COLOR: 6193b8e80941Smrg /* Force the interpolation location for colors here. */ 6194b8e80941Smrg if (shader->key.part.ps.prolog.force_persp_sample_interp) 6195b8e80941Smrg location = TGSI_INTERPOLATE_LOC_SAMPLE; 6196b8e80941Smrg if (shader->key.part.ps.prolog.force_persp_center_interp) 6197b8e80941Smrg location = TGSI_INTERPOLATE_LOC_CENTER; 6198b8e80941Smrg 6199b8e80941Smrg switch (location) { 6200b8e80941Smrg case TGSI_INTERPOLATE_LOC_SAMPLE: 6201b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 0; 6202b8e80941Smrg shader->config.spi_ps_input_ena |= 6203b8e80941Smrg S_0286CC_PERSP_SAMPLE_ENA(1); 6204b8e80941Smrg break; 6205b8e80941Smrg case TGSI_INTERPOLATE_LOC_CENTER: 6206b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 2; 6207b8e80941Smrg shader->config.spi_ps_input_ena |= 6208b8e80941Smrg S_0286CC_PERSP_CENTER_ENA(1); 6209b8e80941Smrg break; 6210b8e80941Smrg case TGSI_INTERPOLATE_LOC_CENTROID: 6211b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 4; 6212b8e80941Smrg shader->config.spi_ps_input_ena |= 6213b8e80941Smrg S_0286CC_PERSP_CENTROID_ENA(1); 6214b8e80941Smrg break; 6215b8e80941Smrg default: 6216b8e80941Smrg assert(0); 6217b8e80941Smrg } 6218b8e80941Smrg break; 6219b8e80941Smrg case TGSI_INTERPOLATE_LINEAR: 6220b8e80941Smrg /* Force the interpolation location for colors here. */ 6221b8e80941Smrg if (shader->key.part.ps.prolog.force_linear_sample_interp) 6222b8e80941Smrg location = TGSI_INTERPOLATE_LOC_SAMPLE; 6223b8e80941Smrg if (shader->key.part.ps.prolog.force_linear_center_interp) 6224b8e80941Smrg location = TGSI_INTERPOLATE_LOC_CENTER; 6225b8e80941Smrg 6226b8e80941Smrg /* The VGPR assignment for non-monolithic shaders 6227b8e80941Smrg * works because InitialPSInputAddr is set on the 6228b8e80941Smrg * main shader and PERSP_PULL_MODEL is never used. 6229b8e80941Smrg */ 6230b8e80941Smrg switch (location) { 6231b8e80941Smrg case TGSI_INTERPOLATE_LOC_SAMPLE: 6232b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 6233b8e80941Smrg separate_prolog ? 6 : 9; 6234b8e80941Smrg shader->config.spi_ps_input_ena |= 6235b8e80941Smrg S_0286CC_LINEAR_SAMPLE_ENA(1); 6236b8e80941Smrg break; 6237b8e80941Smrg case TGSI_INTERPOLATE_LOC_CENTER: 6238b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 6239b8e80941Smrg separate_prolog ? 8 : 11; 6240b8e80941Smrg shader->config.spi_ps_input_ena |= 6241b8e80941Smrg S_0286CC_LINEAR_CENTER_ENA(1); 6242b8e80941Smrg break; 6243b8e80941Smrg case TGSI_INTERPOLATE_LOC_CENTROID: 6244b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i] = 6245b8e80941Smrg separate_prolog ? 10 : 13; 6246b8e80941Smrg shader->config.spi_ps_input_ena |= 6247b8e80941Smrg S_0286CC_LINEAR_CENTROID_ENA(1); 6248b8e80941Smrg break; 6249b8e80941Smrg default: 6250b8e80941Smrg assert(0); 6251848b8605Smrg } 6252b8e80941Smrg break; 6253b8e80941Smrg default: 6254b8e80941Smrg assert(0); 6255848b8605Smrg } 6256848b8605Smrg } 6257848b8605Smrg } 6258b8e80941Smrg} 6259848b8605Smrg 6260b8e80941Smrg/** 6261b8e80941Smrg * Check whether a PS prolog is required based on the key. 6262b8e80941Smrg */ 6263b8e80941Smrgstatic bool si_need_ps_prolog(const union si_shader_part_key *key) 6264b8e80941Smrg{ 6265b8e80941Smrg return key->ps_prolog.colors_read || 6266b8e80941Smrg key->ps_prolog.states.force_persp_sample_interp || 6267b8e80941Smrg key->ps_prolog.states.force_linear_sample_interp || 6268b8e80941Smrg key->ps_prolog.states.force_persp_center_interp || 6269b8e80941Smrg key->ps_prolog.states.force_linear_center_interp || 6270b8e80941Smrg key->ps_prolog.states.bc_optimize_for_persp || 6271b8e80941Smrg key->ps_prolog.states.bc_optimize_for_linear || 6272b8e80941Smrg key->ps_prolog.states.poly_stipple || 6273b8e80941Smrg key->ps_prolog.states.samplemask_log_ps_iter; 6274b8e80941Smrg} 6275848b8605Smrg 6276b8e80941Smrg/** 6277b8e80941Smrg * Compute the PS epilog key, which contains all the information needed to 6278b8e80941Smrg * build the PS epilog function. 6279b8e80941Smrg */ 6280b8e80941Smrgstatic void si_get_ps_epilog_key(struct si_shader *shader, 6281b8e80941Smrg union si_shader_part_key *key) 6282b8e80941Smrg{ 6283b8e80941Smrg struct tgsi_shader_info *info = &shader->selector->info; 6284b8e80941Smrg memset(key, 0, sizeof(*key)); 6285b8e80941Smrg key->ps_epilog.colors_written = info->colors_written; 6286b8e80941Smrg key->ps_epilog.writes_z = info->writes_z; 6287b8e80941Smrg key->ps_epilog.writes_stencil = info->writes_stencil; 6288b8e80941Smrg key->ps_epilog.writes_samplemask = info->writes_samplemask; 6289b8e80941Smrg key->ps_epilog.states = shader->key.part.ps.epilog; 6290b8e80941Smrg} 6291848b8605Smrg 6292b8e80941Smrg/** 6293b8e80941Smrg * Build the GS prolog function. Rotate the input vertices for triangle strips 6294b8e80941Smrg * with adjacency. 6295b8e80941Smrg */ 6296b8e80941Smrgstatic void si_build_gs_prolog_function(struct si_shader_context *ctx, 6297b8e80941Smrg union si_shader_part_key *key) 6298b8e80941Smrg{ 6299b8e80941Smrg unsigned num_sgprs, num_vgprs; 6300b8e80941Smrg struct si_function_info fninfo; 6301b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 6302b8e80941Smrg LLVMTypeRef returns[48]; 6303b8e80941Smrg LLVMValueRef func, ret; 6304848b8605Smrg 6305b8e80941Smrg si_init_function_info(&fninfo); 6306848b8605Smrg 6307b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 6308b8e80941Smrg if (key->gs_prolog.states.gfx9_prev_is_vs) 6309b8e80941Smrg num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; 6310848b8605Smrg else 6311b8e80941Smrg num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; 6312b8e80941Smrg num_vgprs = 5; /* ES inputs are not needed by GS */ 6313b8e80941Smrg } else { 6314b8e80941Smrg num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; 6315b8e80941Smrg num_vgprs = 8; 6316b8e80941Smrg } 6317848b8605Smrg 6318b8e80941Smrg for (unsigned i = 0; i < num_sgprs; ++i) { 6319b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 6320b8e80941Smrg returns[i] = ctx->i32; 6321848b8605Smrg } 6322848b8605Smrg 6323b8e80941Smrg for (unsigned i = 0; i < num_vgprs; ++i) { 6324b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); 6325b8e80941Smrg returns[num_sgprs + i] = ctx->f32; 6326b8e80941Smrg } 6327848b8605Smrg 6328b8e80941Smrg /* Create the function. */ 6329b8e80941Smrg si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 6330b8e80941Smrg &fninfo, 0); 6331b8e80941Smrg func = ctx->main_fn; 6332848b8605Smrg 6333b8e80941Smrg /* Set the full EXEC mask for the prolog, because we are only fiddling 6334b8e80941Smrg * with registers here. The main shader part will set the correct EXEC 6335b8e80941Smrg * mask. 6336b8e80941Smrg */ 6337b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) 6338b8e80941Smrg ac_init_exec_full_mask(&ctx->ac); 6339848b8605Smrg 6340b8e80941Smrg /* Copy inputs to outputs. This should be no-op, as the registers match, 6341b8e80941Smrg * but it will prevent the compiler from overwriting them unintentionally. 6342b8e80941Smrg */ 6343b8e80941Smrg ret = ctx->return_value; 6344b8e80941Smrg for (unsigned i = 0; i < num_sgprs; i++) { 6345b8e80941Smrg LLVMValueRef p = LLVMGetParam(func, i); 6346b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, p, i, ""); 6347b8e80941Smrg } 6348b8e80941Smrg for (unsigned i = 0; i < num_vgprs; i++) { 6349b8e80941Smrg LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); 6350b8e80941Smrg p = ac_to_float(&ctx->ac, p); 6351b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); 6352848b8605Smrg } 6353848b8605Smrg 6354b8e80941Smrg if (key->gs_prolog.states.tri_strip_adj_fix) { 6355b8e80941Smrg /* Remap the input vertices for every other primitive. */ 6356b8e80941Smrg const unsigned gfx6_vtx_params[6] = { 6357b8e80941Smrg num_sgprs, 6358b8e80941Smrg num_sgprs + 1, 6359b8e80941Smrg num_sgprs + 3, 6360b8e80941Smrg num_sgprs + 4, 6361b8e80941Smrg num_sgprs + 5, 6362b8e80941Smrg num_sgprs + 6 6363b8e80941Smrg }; 6364b8e80941Smrg const unsigned gfx9_vtx_params[3] = { 6365b8e80941Smrg num_sgprs, 6366b8e80941Smrg num_sgprs + 1, 6367b8e80941Smrg num_sgprs + 4, 6368b8e80941Smrg }; 6369b8e80941Smrg LLVMValueRef vtx_in[6], vtx_out[6]; 6370b8e80941Smrg LLVMValueRef prim_id, rotate; 6371848b8605Smrg 6372b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 6373b8e80941Smrg for (unsigned i = 0; i < 3; i++) { 6374b8e80941Smrg vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); 6375b8e80941Smrg vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); 6376b8e80941Smrg } 6377b8e80941Smrg } else { 6378b8e80941Smrg for (unsigned i = 0; i < 6; i++) 6379b8e80941Smrg vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]); 6380b8e80941Smrg } 6381848b8605Smrg 6382b8e80941Smrg prim_id = LLVMGetParam(func, num_sgprs + 2); 6383b8e80941Smrg rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); 6384848b8605Smrg 6385b8e80941Smrg for (unsigned i = 0; i < 6; ++i) { 6386b8e80941Smrg LLVMValueRef base, rotated; 6387b8e80941Smrg base = vtx_in[i]; 6388b8e80941Smrg rotated = vtx_in[(i + 4) % 6]; 6389b8e80941Smrg vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); 6390b8e80941Smrg } 6391848b8605Smrg 6392b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 6393b8e80941Smrg for (unsigned i = 0; i < 3; i++) { 6394b8e80941Smrg LLVMValueRef hi, out; 6395848b8605Smrg 6396b8e80941Smrg hi = LLVMBuildShl(builder, vtx_out[i*2+1], 6397b8e80941Smrg LLVMConstInt(ctx->i32, 16, 0), ""); 6398b8e80941Smrg out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); 6399b8e80941Smrg out = ac_to_float(&ctx->ac, out); 6400b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, out, 6401b8e80941Smrg gfx9_vtx_params[i], ""); 6402b8e80941Smrg } 6403b8e80941Smrg } else { 6404b8e80941Smrg for (unsigned i = 0; i < 6; i++) { 6405b8e80941Smrg LLVMValueRef out; 6406848b8605Smrg 6407b8e80941Smrg out = ac_to_float(&ctx->ac, vtx_out[i]); 6408b8e80941Smrg ret = LLVMBuildInsertValue(builder, ret, out, 6409b8e80941Smrg gfx6_vtx_params[i], ""); 6410b8e80941Smrg } 6411b8e80941Smrg } 6412848b8605Smrg } 6413848b8605Smrg 6414b8e80941Smrg LLVMBuildRet(builder, ret); 6415b8e80941Smrg} 6416b8e80941Smrg 6417b8e80941Smrg/** 6418b8e80941Smrg * Given a list of shader part functions, build a wrapper function that 6419b8e80941Smrg * runs them in sequence to form a monolithic shader. 6420b8e80941Smrg */ 6421b8e80941Smrgstatic void si_build_wrapper_function(struct si_shader_context *ctx, 6422b8e80941Smrg LLVMValueRef *parts, 6423b8e80941Smrg unsigned num_parts, 6424b8e80941Smrg unsigned main_part, 6425b8e80941Smrg unsigned next_shader_first_part) 6426b8e80941Smrg{ 6427b8e80941Smrg LLVMBuilderRef builder = ctx->ac.builder; 6428b8e80941Smrg /* PS epilog has one arg per color component; gfx9 merged shader 6429b8e80941Smrg * prologs need to forward 32 user SGPRs. 6430b8e80941Smrg */ 6431b8e80941Smrg struct si_function_info fninfo; 6432b8e80941Smrg LLVMValueRef initial[64], out[64]; 6433b8e80941Smrg LLVMTypeRef function_type; 6434b8e80941Smrg unsigned num_first_params; 6435b8e80941Smrg unsigned num_out, initial_num_out; 6436b8e80941Smrg MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */ 6437b8e80941Smrg MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */ 6438b8e80941Smrg unsigned num_sgprs, num_vgprs; 6439b8e80941Smrg unsigned gprs; 6440b8e80941Smrg struct lp_build_if_state if_state; 6441b8e80941Smrg 6442b8e80941Smrg si_init_function_info(&fninfo); 6443b8e80941Smrg 6444b8e80941Smrg for (unsigned i = 0; i < num_parts; ++i) { 6445b8e80941Smrg ac_add_function_attr(ctx->ac.context, parts[i], -1, 6446b8e80941Smrg AC_FUNC_ATTR_ALWAYSINLINE); 6447b8e80941Smrg LLVMSetLinkage(parts[i], LLVMPrivateLinkage); 6448848b8605Smrg } 6449848b8605Smrg 6450b8e80941Smrg /* The parameters of the wrapper function correspond to those of the 6451b8e80941Smrg * first part in terms of SGPRs and VGPRs, but we use the types of the 6452b8e80941Smrg * main part to get the right types. This is relevant for the 6453b8e80941Smrg * dereferenceable attribute on descriptor table pointers. 6454b8e80941Smrg */ 6455b8e80941Smrg num_sgprs = 0; 6456b8e80941Smrg num_vgprs = 0; 6457848b8605Smrg 6458b8e80941Smrg function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); 6459b8e80941Smrg num_first_params = LLVMCountParamTypes(function_type); 6460848b8605Smrg 6461b8e80941Smrg for (unsigned i = 0; i < num_first_params; ++i) { 6462b8e80941Smrg LLVMValueRef param = LLVMGetParam(parts[0], i); 6463848b8605Smrg 6464b8e80941Smrg if (ac_is_sgpr_param(param)) { 6465b8e80941Smrg assert(num_vgprs == 0); 6466b8e80941Smrg num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6467848b8605Smrg } else { 6468b8e80941Smrg num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6469848b8605Smrg } 6470848b8605Smrg } 6471848b8605Smrg 6472b8e80941Smrg gprs = 0; 6473b8e80941Smrg while (gprs < num_sgprs + num_vgprs) { 6474b8e80941Smrg LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params); 6475b8e80941Smrg LLVMTypeRef type = LLVMTypeOf(param); 6476b8e80941Smrg unsigned size = ac_get_type_size(type) / 4; 6477848b8605Smrg 6478b8e80941Smrg add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type); 6479848b8605Smrg 6480b8e80941Smrg assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); 6481b8e80941Smrg assert(gprs + size <= num_sgprs + num_vgprs && 6482b8e80941Smrg (gprs >= num_sgprs || gprs + size <= num_sgprs)); 6483848b8605Smrg 6484b8e80941Smrg gprs += size; 6485848b8605Smrg } 6486848b8605Smrg 6487b8e80941Smrg si_create_function(ctx, "wrapper", NULL, 0, &fninfo, 6488b8e80941Smrg si_get_max_workgroup_size(ctx->shader)); 6489848b8605Smrg 6490b8e80941Smrg if (is_merged_shader(ctx)) 6491b8e80941Smrg ac_init_exec_full_mask(&ctx->ac); 6492848b8605Smrg 6493b8e80941Smrg /* Record the arguments of the function as if they were an output of 6494b8e80941Smrg * a previous part. 6495b8e80941Smrg */ 6496b8e80941Smrg num_out = 0; 6497b8e80941Smrg num_out_sgpr = 0; 6498b8e80941Smrg 6499b8e80941Smrg for (unsigned i = 0; i < fninfo.num_params; ++i) { 6500b8e80941Smrg LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); 6501b8e80941Smrg LLVMTypeRef param_type = LLVMTypeOf(param); 6502b8e80941Smrg LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32; 6503b8e80941Smrg unsigned size = ac_get_type_size(param_type) / 4; 6504b8e80941Smrg 6505b8e80941Smrg if (size == 1) { 6506b8e80941Smrg if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6507b8e80941Smrg param = LLVMBuildPtrToInt(builder, param, ctx->i32, ""); 6508b8e80941Smrg param_type = ctx->i32; 6509b8e80941Smrg } 6510848b8605Smrg 6511b8e80941Smrg if (param_type != out_type) 6512b8e80941Smrg param = LLVMBuildBitCast(builder, param, out_type, ""); 6513b8e80941Smrg out[num_out++] = param; 6514b8e80941Smrg } else { 6515b8e80941Smrg LLVMTypeRef vector_type = LLVMVectorType(out_type, size); 6516848b8605Smrg 6517b8e80941Smrg if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6518b8e80941Smrg param = LLVMBuildPtrToInt(builder, param, ctx->i64, ""); 6519b8e80941Smrg param_type = ctx->i64; 6520848b8605Smrg } 6521b8e80941Smrg 6522b8e80941Smrg if (param_type != vector_type) 6523b8e80941Smrg param = LLVMBuildBitCast(builder, param, vector_type, ""); 6524b8e80941Smrg 6525b8e80941Smrg for (unsigned j = 0; j < size; ++j) 6526b8e80941Smrg out[num_out++] = LLVMBuildExtractElement( 6527b8e80941Smrg builder, param, LLVMConstInt(ctx->i32, j, 0), ""); 6528848b8605Smrg } 6529848b8605Smrg 6530b8e80941Smrg if (i < fninfo.num_sgpr_params) 6531b8e80941Smrg num_out_sgpr = num_out; 6532b8e80941Smrg } 6533b8e80941Smrg 6534b8e80941Smrg memcpy(initial, out, sizeof(out)); 6535b8e80941Smrg initial_num_out = num_out; 6536b8e80941Smrg initial_num_out_sgpr = num_out_sgpr; 6537b8e80941Smrg 6538b8e80941Smrg /* Now chain the parts. */ 6539b8e80941Smrg for (unsigned part = 0; part < num_parts; ++part) { 6540b8e80941Smrg LLVMValueRef in[48]; 6541b8e80941Smrg LLVMValueRef ret; 6542b8e80941Smrg LLVMTypeRef ret_type; 6543b8e80941Smrg unsigned out_idx = 0; 6544b8e80941Smrg unsigned num_params = LLVMCountParams(parts[part]); 6545b8e80941Smrg 6546b8e80941Smrg /* Merged shaders are executed conditionally depending 6547b8e80941Smrg * on the number of enabled threads passed in the input SGPRs. */ 6548b8e80941Smrg if (is_merged_shader(ctx) && part == 0) { 6549b8e80941Smrg LLVMValueRef ena, count = initial[3]; 6550b8e80941Smrg 6551b8e80941Smrg count = LLVMBuildAnd(builder, count, 6552b8e80941Smrg LLVMConstInt(ctx->i32, 0x7f, 0), ""); 6553b8e80941Smrg ena = LLVMBuildICmp(builder, LLVMIntULT, 6554b8e80941Smrg ac_get_thread_id(&ctx->ac), count, ""); 6555b8e80941Smrg lp_build_if(&if_state, &ctx->gallivm, ena); 6556b8e80941Smrg } 6557848b8605Smrg 6558b8e80941Smrg /* Derive arguments for the next part from outputs of the 6559b8e80941Smrg * previous one. 6560b8e80941Smrg */ 6561b8e80941Smrg for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { 6562b8e80941Smrg LLVMValueRef param; 6563b8e80941Smrg LLVMTypeRef param_type; 6564b8e80941Smrg bool is_sgpr; 6565b8e80941Smrg unsigned param_size; 6566b8e80941Smrg LLVMValueRef arg = NULL; 6567b8e80941Smrg 6568b8e80941Smrg param = LLVMGetParam(parts[part], param_idx); 6569b8e80941Smrg param_type = LLVMTypeOf(param); 6570b8e80941Smrg param_size = ac_get_type_size(param_type) / 4; 6571b8e80941Smrg is_sgpr = ac_is_sgpr_param(param); 6572b8e80941Smrg 6573b8e80941Smrg if (is_sgpr) { 6574b8e80941Smrg ac_add_function_attr(ctx->ac.context, parts[part], 6575b8e80941Smrg param_idx + 1, AC_FUNC_ATTR_INREG); 6576b8e80941Smrg } else if (out_idx < num_out_sgpr) { 6577b8e80941Smrg /* Skip returned SGPRs the current part doesn't 6578b8e80941Smrg * declare on the input. */ 6579b8e80941Smrg out_idx = num_out_sgpr; 6580b8e80941Smrg } 6581848b8605Smrg 6582b8e80941Smrg assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); 6583848b8605Smrg 6584b8e80941Smrg if (param_size == 1) 6585b8e80941Smrg arg = out[out_idx]; 6586b8e80941Smrg else 6587b8e80941Smrg arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); 6588b8e80941Smrg 6589b8e80941Smrg if (LLVMTypeOf(arg) != param_type) { 6590b8e80941Smrg if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6591b8e80941Smrg if (LLVMGetPointerAddressSpace(param_type) == 6592b8e80941Smrg AC_ADDR_SPACE_CONST_32BIT) { 6593b8e80941Smrg arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); 6594b8e80941Smrg arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 6595b8e80941Smrg } else { 6596b8e80941Smrg arg = LLVMBuildBitCast(builder, arg, ctx->i64, ""); 6597b8e80941Smrg arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 6598b8e80941Smrg } 6599b8e80941Smrg } else { 6600b8e80941Smrg arg = LLVMBuildBitCast(builder, arg, param_type, ""); 6601b8e80941Smrg } 6602848b8605Smrg } 6603848b8605Smrg 6604b8e80941Smrg in[param_idx] = arg; 6605b8e80941Smrg out_idx += param_size; 6606848b8605Smrg } 6607848b8605Smrg 6608b8e80941Smrg ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); 6609848b8605Smrg 6610b8e80941Smrg if (is_merged_shader(ctx) && 6611b8e80941Smrg part + 1 == next_shader_first_part) { 6612b8e80941Smrg lp_build_endif(&if_state); 6613848b8605Smrg 6614b8e80941Smrg /* The second half of the merged shader should use 6615b8e80941Smrg * the inputs from the toplevel (wrapper) function, 6616b8e80941Smrg * not the return value from the last call. 6617b8e80941Smrg * 6618b8e80941Smrg * That's because the last call was executed condi- 6619b8e80941Smrg * tionally, so we can't consume it in the main 6620b8e80941Smrg * block. 6621b8e80941Smrg */ 6622b8e80941Smrg memcpy(out, initial, sizeof(initial)); 6623b8e80941Smrg num_out = initial_num_out; 6624b8e80941Smrg num_out_sgpr = initial_num_out_sgpr; 6625b8e80941Smrg continue; 6626b8e80941Smrg } 6627848b8605Smrg 6628b8e80941Smrg /* Extract the returned GPRs. */ 6629b8e80941Smrg ret_type = LLVMTypeOf(ret); 6630b8e80941Smrg num_out = 0; 6631b8e80941Smrg num_out_sgpr = 0; 6632848b8605Smrg 6633b8e80941Smrg if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { 6634b8e80941Smrg assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); 6635848b8605Smrg 6636b8e80941Smrg unsigned ret_size = LLVMCountStructElementTypes(ret_type); 6637848b8605Smrg 6638b8e80941Smrg for (unsigned i = 0; i < ret_size; ++i) { 6639b8e80941Smrg LLVMValueRef val = 6640b8e80941Smrg LLVMBuildExtractValue(builder, ret, i, ""); 6641848b8605Smrg 6642b8e80941Smrg assert(num_out < ARRAY_SIZE(out)); 6643b8e80941Smrg out[num_out++] = val; 6644848b8605Smrg 6645b8e80941Smrg if (LLVMTypeOf(val) == ctx->i32) { 6646b8e80941Smrg assert(num_out_sgpr + 1 == num_out); 6647b8e80941Smrg num_out_sgpr = num_out; 6648b8e80941Smrg } 6649b8e80941Smrg } 6650848b8605Smrg } 6651b8e80941Smrg } 6652848b8605Smrg 6653b8e80941Smrg LLVMBuildRetVoid(builder); 6654b8e80941Smrg} 6655848b8605Smrg 6656b8e80941Smrgstatic bool si_should_optimize_less(struct ac_llvm_compiler *compiler, 6657b8e80941Smrg struct si_shader_selector *sel) 6658b8e80941Smrg{ 6659b8e80941Smrg if (!compiler->low_opt_passes) 6660b8e80941Smrg return false; 6661b8e80941Smrg 6662b8e80941Smrg /* Assume a slow CPU. */ 6663b8e80941Smrg assert(!sel->screen->info.has_dedicated_vram && 6664b8e80941Smrg sel->screen->info.chip_class <= VI); 6665b8e80941Smrg 6666b8e80941Smrg /* For a crazy dEQP test containing 2597 memory opcodes, mostly 6667b8e80941Smrg * buffer stores. */ 6668b8e80941Smrg return sel->type == PIPE_SHADER_COMPUTE && 6669b8e80941Smrg sel->info.num_memory_instructions > 1000; 6670848b8605Smrg} 6671848b8605Smrg 6672b8e80941Smrgint si_compile_tgsi_shader(struct si_screen *sscreen, 6673b8e80941Smrg struct ac_llvm_compiler *compiler, 6674b8e80941Smrg struct si_shader *shader, 6675b8e80941Smrg struct pipe_debug_callback *debug) 6676848b8605Smrg{ 6677b8e80941Smrg struct si_shader_selector *sel = shader->selector; 6678b8e80941Smrg struct si_shader_context ctx; 6679b8e80941Smrg int r = -1; 6680b8e80941Smrg 6681b8e80941Smrg /* Dump TGSI code before doing TGSI->LLVM conversion in case the 6682b8e80941Smrg * conversion fails. */ 6683b8e80941Smrg if (si_can_dump_shader(sscreen, sel->info.processor) && 6684b8e80941Smrg !(sscreen->debug_flags & DBG(NO_TGSI))) { 6685b8e80941Smrg if (sel->tokens) 6686b8e80941Smrg tgsi_dump(sel->tokens, 0); 6687b8e80941Smrg else 6688b8e80941Smrg nir_print_shader(sel->nir, stderr); 6689b8e80941Smrg si_dump_streamout(&sel->so); 6690848b8605Smrg } 6691848b8605Smrg 6692b8e80941Smrg si_init_shader_ctx(&ctx, sscreen, compiler); 6693b8e80941Smrg si_llvm_context_set_tgsi(&ctx, shader); 6694848b8605Smrg 6695b8e80941Smrg memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, 6696b8e80941Smrg sizeof(shader->info.vs_output_param_offset)); 6697848b8605Smrg 6698b8e80941Smrg shader->info.uses_instanceid = sel->info.uses_instanceid; 6699848b8605Smrg 6700b8e80941Smrg if (!si_compile_tgsi_main(&ctx)) { 6701b8e80941Smrg si_llvm_dispose(&ctx); 6702b8e80941Smrg return -1; 6703b8e80941Smrg } 6704848b8605Smrg 6705b8e80941Smrg if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { 6706b8e80941Smrg LLVMValueRef parts[2]; 6707b8e80941Smrg bool need_prolog = sel->vs_needs_prolog; 6708848b8605Smrg 6709b8e80941Smrg parts[1] = ctx.main_fn; 6710848b8605Smrg 6711b8e80941Smrg if (need_prolog) { 6712b8e80941Smrg union si_shader_part_key prolog_key; 6713b8e80941Smrg si_get_vs_prolog_key(&sel->info, 6714b8e80941Smrg shader->info.num_input_sgprs, 6715b8e80941Smrg &shader->key.part.vs.prolog, 6716b8e80941Smrg shader, &prolog_key); 6717b8e80941Smrg si_build_vs_prolog_function(&ctx, &prolog_key); 6718b8e80941Smrg parts[0] = ctx.main_fn; 6719b8e80941Smrg } 6720848b8605Smrg 6721b8e80941Smrg si_build_wrapper_function(&ctx, parts + !need_prolog, 6722b8e80941Smrg 1 + need_prolog, need_prolog, 0); 6723b8e80941Smrg } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { 6724b8e80941Smrg if (sscreen->info.chip_class >= GFX9) { 6725b8e80941Smrg struct si_shader_selector *ls = shader->key.part.tcs.ls; 6726b8e80941Smrg LLVMValueRef parts[4]; 6727b8e80941Smrg bool vs_needs_prolog = 6728b8e80941Smrg si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); 6729b8e80941Smrg 6730b8e80941Smrg /* TCS main part */ 6731b8e80941Smrg parts[2] = ctx.main_fn; 6732b8e80941Smrg 6733b8e80941Smrg /* TCS epilog */ 6734b8e80941Smrg union si_shader_part_key tcs_epilog_key; 6735b8e80941Smrg memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); 6736b8e80941Smrg tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6737b8e80941Smrg si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); 6738b8e80941Smrg parts[3] = ctx.main_fn; 6739b8e80941Smrg 6740b8e80941Smrg /* VS as LS main part */ 6741b8e80941Smrg struct si_shader shader_ls = {}; 6742b8e80941Smrg shader_ls.selector = ls; 6743b8e80941Smrg shader_ls.key.as_ls = 1; 6744b8e80941Smrg shader_ls.key.mono = shader->key.mono; 6745b8e80941Smrg shader_ls.key.opt = shader->key.opt; 6746b8e80941Smrg shader_ls.is_monolithic = true; 6747b8e80941Smrg si_llvm_context_set_tgsi(&ctx, &shader_ls); 6748b8e80941Smrg 6749b8e80941Smrg if (!si_compile_tgsi_main(&ctx)) { 6750b8e80941Smrg si_llvm_dispose(&ctx); 6751b8e80941Smrg return -1; 6752b8e80941Smrg } 6753b8e80941Smrg shader->info.uses_instanceid |= ls->info.uses_instanceid; 6754b8e80941Smrg parts[1] = ctx.main_fn; 6755b8e80941Smrg 6756b8e80941Smrg /* LS prolog */ 6757b8e80941Smrg if (vs_needs_prolog) { 6758b8e80941Smrg union si_shader_part_key vs_prolog_key; 6759b8e80941Smrg si_get_vs_prolog_key(&ls->info, 6760b8e80941Smrg shader_ls.info.num_input_sgprs, 6761b8e80941Smrg &shader->key.part.tcs.ls_prolog, 6762b8e80941Smrg shader, &vs_prolog_key); 6763b8e80941Smrg vs_prolog_key.vs_prolog.is_monolithic = true; 6764b8e80941Smrg si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6765b8e80941Smrg parts[0] = ctx.main_fn; 6766b8e80941Smrg } 6767848b8605Smrg 6768b8e80941Smrg /* Reset the shader context. */ 6769b8e80941Smrg ctx.shader = shader; 6770b8e80941Smrg ctx.type = PIPE_SHADER_TESS_CTRL; 6771848b8605Smrg 6772b8e80941Smrg si_build_wrapper_function(&ctx, 6773b8e80941Smrg parts + !vs_needs_prolog, 6774b8e80941Smrg 4 - !vs_needs_prolog, vs_needs_prolog, 6775b8e80941Smrg vs_needs_prolog ? 2 : 1); 6776b8e80941Smrg } else { 6777b8e80941Smrg LLVMValueRef parts[2]; 6778b8e80941Smrg union si_shader_part_key epilog_key; 6779848b8605Smrg 6780b8e80941Smrg parts[0] = ctx.main_fn; 6781848b8605Smrg 6782b8e80941Smrg memset(&epilog_key, 0, sizeof(epilog_key)); 6783b8e80941Smrg epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6784b8e80941Smrg si_build_tcs_epilog_function(&ctx, &epilog_key); 6785b8e80941Smrg parts[1] = ctx.main_fn; 6786848b8605Smrg 6787b8e80941Smrg si_build_wrapper_function(&ctx, parts, 2, 0, 0); 6788848b8605Smrg } 6789b8e80941Smrg } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { 6790b8e80941Smrg if (ctx.screen->info.chip_class >= GFX9) { 6791b8e80941Smrg struct si_shader_selector *es = shader->key.part.gs.es; 6792b8e80941Smrg LLVMValueRef es_prolog = NULL; 6793b8e80941Smrg LLVMValueRef es_main = NULL; 6794b8e80941Smrg LLVMValueRef gs_prolog = NULL; 6795b8e80941Smrg LLVMValueRef gs_main = ctx.main_fn; 6796b8e80941Smrg 6797b8e80941Smrg /* GS prolog */ 6798b8e80941Smrg union si_shader_part_key gs_prolog_key; 6799b8e80941Smrg memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); 6800b8e80941Smrg gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6801b8e80941Smrg gs_prolog_key.gs_prolog.is_monolithic = true; 6802b8e80941Smrg si_build_gs_prolog_function(&ctx, &gs_prolog_key); 6803b8e80941Smrg gs_prolog = ctx.main_fn; 6804b8e80941Smrg 6805b8e80941Smrg /* ES main part */ 6806b8e80941Smrg struct si_shader shader_es = {}; 6807b8e80941Smrg shader_es.selector = es; 6808b8e80941Smrg shader_es.key.as_es = 1; 6809b8e80941Smrg shader_es.key.mono = shader->key.mono; 6810b8e80941Smrg shader_es.key.opt = shader->key.opt; 6811b8e80941Smrg shader_es.is_monolithic = true; 6812b8e80941Smrg si_llvm_context_set_tgsi(&ctx, &shader_es); 6813b8e80941Smrg 6814b8e80941Smrg if (!si_compile_tgsi_main(&ctx)) { 6815b8e80941Smrg si_llvm_dispose(&ctx); 6816b8e80941Smrg return -1; 6817b8e80941Smrg } 6818b8e80941Smrg shader->info.uses_instanceid |= es->info.uses_instanceid; 6819b8e80941Smrg es_main = ctx.main_fn; 6820b8e80941Smrg 6821b8e80941Smrg /* ES prolog */ 6822b8e80941Smrg if (es->vs_needs_prolog) { 6823b8e80941Smrg union si_shader_part_key vs_prolog_key; 6824b8e80941Smrg si_get_vs_prolog_key(&es->info, 6825b8e80941Smrg shader_es.info.num_input_sgprs, 6826b8e80941Smrg &shader->key.part.gs.vs_prolog, 6827b8e80941Smrg shader, &vs_prolog_key); 6828b8e80941Smrg vs_prolog_key.vs_prolog.is_monolithic = true; 6829b8e80941Smrg si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6830b8e80941Smrg es_prolog = ctx.main_fn; 6831b8e80941Smrg } 6832848b8605Smrg 6833b8e80941Smrg /* Reset the shader context. */ 6834b8e80941Smrg ctx.shader = shader; 6835b8e80941Smrg ctx.type = PIPE_SHADER_GEOMETRY; 6836848b8605Smrg 6837b8e80941Smrg /* Prepare the array of shader parts. */ 6838b8e80941Smrg LLVMValueRef parts[4]; 6839b8e80941Smrg unsigned num_parts = 0, main_part, next_first_part; 6840848b8605Smrg 6841b8e80941Smrg if (es_prolog) 6842b8e80941Smrg parts[num_parts++] = es_prolog; 6843848b8605Smrg 6844b8e80941Smrg parts[main_part = num_parts++] = es_main; 6845b8e80941Smrg parts[next_first_part = num_parts++] = gs_prolog; 6846b8e80941Smrg parts[num_parts++] = gs_main; 6847848b8605Smrg 6848b8e80941Smrg si_build_wrapper_function(&ctx, parts, num_parts, 6849b8e80941Smrg main_part, next_first_part); 6850b8e80941Smrg } else { 6851b8e80941Smrg LLVMValueRef parts[2]; 6852b8e80941Smrg union si_shader_part_key prolog_key; 6853848b8605Smrg 6854b8e80941Smrg parts[1] = ctx.main_fn; 6855b8e80941Smrg 6856b8e80941Smrg memset(&prolog_key, 0, sizeof(prolog_key)); 6857b8e80941Smrg prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6858b8e80941Smrg si_build_gs_prolog_function(&ctx, &prolog_key); 6859b8e80941Smrg parts[0] = ctx.main_fn; 6860b8e80941Smrg 6861b8e80941Smrg si_build_wrapper_function(&ctx, parts, 2, 1, 0); 6862b8e80941Smrg } 6863b8e80941Smrg } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { 6864b8e80941Smrg LLVMValueRef parts[3]; 6865b8e80941Smrg union si_shader_part_key prolog_key; 6866b8e80941Smrg union si_shader_part_key epilog_key; 6867b8e80941Smrg bool need_prolog; 6868b8e80941Smrg 6869b8e80941Smrg si_get_ps_prolog_key(shader, &prolog_key, false); 6870b8e80941Smrg need_prolog = si_need_ps_prolog(&prolog_key); 6871b8e80941Smrg 6872b8e80941Smrg parts[need_prolog ? 1 : 0] = ctx.main_fn; 6873b8e80941Smrg 6874b8e80941Smrg if (need_prolog) { 6875b8e80941Smrg si_build_ps_prolog_function(&ctx, &prolog_key); 6876b8e80941Smrg parts[0] = ctx.main_fn; 6877b8e80941Smrg } 6878848b8605Smrg 6879b8e80941Smrg si_get_ps_epilog_key(shader, &epilog_key); 6880b8e80941Smrg si_build_ps_epilog_function(&ctx, &epilog_key); 6881b8e80941Smrg parts[need_prolog ? 2 : 1] = ctx.main_fn; 6882848b8605Smrg 6883b8e80941Smrg si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, 6884b8e80941Smrg need_prolog ? 1 : 0, 0); 6885b8e80941Smrg } 6886848b8605Smrg 6887b8e80941Smrg si_llvm_optimize_module(&ctx); 6888848b8605Smrg 6889b8e80941Smrg /* Post-optimization transformations and analysis. */ 6890b8e80941Smrg si_optimize_vs_outputs(&ctx); 6891848b8605Smrg 6892b8e80941Smrg if ((debug && debug->debug_message) || 6893b8e80941Smrg si_can_dump_shader(sscreen, ctx.type)) { 6894b8e80941Smrg ctx.shader->config.private_mem_vgprs = 6895b8e80941Smrg ac_count_scratch_private_memory(ctx.main_fn); 6896848b8605Smrg } 6897848b8605Smrg 6898b8e80941Smrg /* Make sure the input is a pointer and not integer followed by inttoptr. */ 6899b8e80941Smrg assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == 6900b8e80941Smrg LLVMPointerTypeKind); 6901848b8605Smrg 6902b8e80941Smrg /* Compile to bytecode. */ 6903b8e80941Smrg r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, 6904b8e80941Smrg ctx.ac.module, debug, ctx.type, 6905b8e80941Smrg si_get_shader_name(shader, ctx.type), 6906b8e80941Smrg si_should_optimize_less(compiler, shader->selector)); 6907b8e80941Smrg si_llvm_dispose(&ctx); 6908b8e80941Smrg if (r) { 6909b8e80941Smrg fprintf(stderr, "LLVM failed to compile shader\n"); 6910b8e80941Smrg return r; 6911b8e80941Smrg } 6912b8e80941Smrg 6913b8e80941Smrg /* Validate SGPR and VGPR usage for compute to detect compiler bugs. 6914b8e80941Smrg * LLVM 3.9svn has this bug. 6915848b8605Smrg */ 6916b8e80941Smrg if (sel->type == PIPE_SHADER_COMPUTE) { 6917b8e80941Smrg unsigned wave_size = 64; 6918b8e80941Smrg unsigned max_vgprs = 256; 6919b8e80941Smrg unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512; 6920b8e80941Smrg unsigned max_sgprs_per_wave = 128; 6921b8e80941Smrg unsigned max_block_threads = si_get_max_workgroup_size(shader); 6922b8e80941Smrg unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); 6923b8e80941Smrg unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); 6924b8e80941Smrg 6925b8e80941Smrg max_vgprs = max_vgprs / min_waves_per_simd; 6926b8e80941Smrg max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave); 6927b8e80941Smrg 6928b8e80941Smrg if (shader->config.num_sgprs > max_sgprs || 6929b8e80941Smrg shader->config.num_vgprs > max_vgprs) { 6930b8e80941Smrg fprintf(stderr, "LLVM failed to compile a shader correctly: " 6931b8e80941Smrg "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", 6932b8e80941Smrg shader->config.num_sgprs, shader->config.num_vgprs, 6933b8e80941Smrg max_sgprs, max_vgprs); 6934b8e80941Smrg 6935b8e80941Smrg /* Just terminate the process, because dependent 6936b8e80941Smrg * shaders can hang due to bad input data, but use 6937b8e80941Smrg * the env var to allow shader-db to work. 6938b8e80941Smrg */ 6939b8e80941Smrg if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) 6940b8e80941Smrg abort(); 6941b8e80941Smrg } 6942b8e80941Smrg } 6943848b8605Smrg 6944b8e80941Smrg /* Add the scratch offset to input SGPRs. */ 6945b8e80941Smrg if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx)) 6946b8e80941Smrg shader->info.num_input_sgprs += 1; /* scratch byte offset */ 6947b8e80941Smrg 6948b8e80941Smrg /* Calculate the number of fragment input VGPRs. */ 6949b8e80941Smrg if (ctx.type == PIPE_SHADER_FRAGMENT) { 6950b8e80941Smrg shader->info.num_input_vgprs = 0; 6951b8e80941Smrg shader->info.face_vgpr_index = -1; 6952b8e80941Smrg shader->info.ancillary_vgpr_index = -1; 6953b8e80941Smrg 6954b8e80941Smrg if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6955b8e80941Smrg shader->info.num_input_vgprs += 2; 6956b8e80941Smrg if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) 6957b8e80941Smrg shader->info.num_input_vgprs += 2; 6958b8e80941Smrg if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6959b8e80941Smrg shader->info.num_input_vgprs += 2; 6960b8e80941Smrg if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) 6961b8e80941Smrg shader->info.num_input_vgprs += 3; 6962b8e80941Smrg if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6963b8e80941Smrg shader->info.num_input_vgprs += 2; 6964b8e80941Smrg if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) 6965b8e80941Smrg shader->info.num_input_vgprs += 2; 6966b8e80941Smrg if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6967b8e80941Smrg shader->info.num_input_vgprs += 2; 6968b8e80941Smrg if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) 6969b8e80941Smrg shader->info.num_input_vgprs += 1; 6970b8e80941Smrg if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6971b8e80941Smrg shader->info.num_input_vgprs += 1; 6972b8e80941Smrg if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6973b8e80941Smrg shader->info.num_input_vgprs += 1; 6974b8e80941Smrg if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6975b8e80941Smrg shader->info.num_input_vgprs += 1; 6976b8e80941Smrg if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6977b8e80941Smrg shader->info.num_input_vgprs += 1; 6978b8e80941Smrg if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { 6979b8e80941Smrg shader->info.face_vgpr_index = shader->info.num_input_vgprs; 6980b8e80941Smrg shader->info.num_input_vgprs += 1; 6981b8e80941Smrg } 6982b8e80941Smrg if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) { 6983b8e80941Smrg shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs; 6984b8e80941Smrg shader->info.num_input_vgprs += 1; 6985b8e80941Smrg } 6986b8e80941Smrg if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) 6987b8e80941Smrg shader->info.num_input_vgprs += 1; 6988b8e80941Smrg if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) 6989b8e80941Smrg shader->info.num_input_vgprs += 1; 6990b8e80941Smrg } 6991b8e80941Smrg 6992b8e80941Smrg si_calculate_max_simd_waves(shader); 6993b8e80941Smrg si_shader_dump_stats_for_shader_db(shader, debug); 6994b8e80941Smrg return 0; 6995b8e80941Smrg} 6996848b8605Smrg 6997b8e80941Smrg/** 6998b8e80941Smrg * Create, compile and return a shader part (prolog or epilog). 6999b8e80941Smrg * 7000b8e80941Smrg * \param sscreen screen 7001b8e80941Smrg * \param list list of shader parts of the same category 7002b8e80941Smrg * \param type shader type 7003b8e80941Smrg * \param key shader part key 7004b8e80941Smrg * \param prolog whether the part being requested is a prolog 7005b8e80941Smrg * \param tm LLVM target machine 7006b8e80941Smrg * \param debug debug callback 7007b8e80941Smrg * \param build the callback responsible for building the main function 7008b8e80941Smrg * \return non-NULL on success 7009b8e80941Smrg */ 7010b8e80941Smrgstatic struct si_shader_part * 7011b8e80941Smrgsi_get_shader_part(struct si_screen *sscreen, 7012b8e80941Smrg struct si_shader_part **list, 7013b8e80941Smrg enum pipe_shader_type type, 7014b8e80941Smrg bool prolog, 7015b8e80941Smrg union si_shader_part_key *key, 7016b8e80941Smrg struct ac_llvm_compiler *compiler, 7017b8e80941Smrg struct pipe_debug_callback *debug, 7018b8e80941Smrg void (*build)(struct si_shader_context *, 7019b8e80941Smrg union si_shader_part_key *), 7020b8e80941Smrg const char *name) 7021b8e80941Smrg{ 7022b8e80941Smrg struct si_shader_part *result; 7023848b8605Smrg 7024b8e80941Smrg mtx_lock(&sscreen->shader_parts_mutex); 7025848b8605Smrg 7026b8e80941Smrg /* Find existing. */ 7027b8e80941Smrg for (result = *list; result; result = result->next) { 7028b8e80941Smrg if (memcmp(&result->key, key, sizeof(*key)) == 0) { 7029b8e80941Smrg mtx_unlock(&sscreen->shader_parts_mutex); 7030b8e80941Smrg return result; 7031848b8605Smrg } 7032848b8605Smrg } 7033848b8605Smrg 7034b8e80941Smrg /* Compile a new one. */ 7035b8e80941Smrg result = CALLOC_STRUCT(si_shader_part); 7036b8e80941Smrg result->key = *key; 7037848b8605Smrg 7038b8e80941Smrg struct si_shader shader = {}; 7039b8e80941Smrg struct si_shader_context ctx; 7040848b8605Smrg 7041b8e80941Smrg si_init_shader_ctx(&ctx, sscreen, compiler); 7042b8e80941Smrg ctx.shader = &shader; 7043b8e80941Smrg ctx.type = type; 7044848b8605Smrg 7045b8e80941Smrg switch (type) { 7046b8e80941Smrg case PIPE_SHADER_VERTEX: 7047b8e80941Smrg shader.key.as_ls = key->vs_prolog.as_ls; 7048b8e80941Smrg shader.key.as_es = key->vs_prolog.as_es; 7049b8e80941Smrg break; 7050b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 7051b8e80941Smrg assert(!prolog); 7052b8e80941Smrg shader.key.part.tcs.epilog = key->tcs_epilog.states; 7053b8e80941Smrg break; 7054b8e80941Smrg case PIPE_SHADER_GEOMETRY: 7055b8e80941Smrg assert(prolog); 7056b8e80941Smrg break; 7057b8e80941Smrg case PIPE_SHADER_FRAGMENT: 7058b8e80941Smrg if (prolog) 7059b8e80941Smrg shader.key.part.ps.prolog = key->ps_prolog.states; 7060b8e80941Smrg else 7061b8e80941Smrg shader.key.part.ps.epilog = key->ps_epilog.states; 7062b8e80941Smrg break; 7063b8e80941Smrg default: 7064b8e80941Smrg unreachable("bad shader part"); 7065b8e80941Smrg } 7066848b8605Smrg 7067b8e80941Smrg build(&ctx, key); 7068848b8605Smrg 7069b8e80941Smrg /* Compile. */ 7070b8e80941Smrg si_llvm_optimize_module(&ctx); 7071b8e80941Smrg 7072b8e80941Smrg if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler, 7073b8e80941Smrg ctx.ac.module, debug, ctx.type, name, false)) { 7074b8e80941Smrg FREE(result); 7075b8e80941Smrg result = NULL; 7076b8e80941Smrg goto out; 7077b8e80941Smrg } 7078848b8605Smrg 7079b8e80941Smrg result->next = *list; 7080b8e80941Smrg *list = result; 7081848b8605Smrg 7082b8e80941Smrgout: 7083b8e80941Smrg si_llvm_dispose(&ctx); 7084b8e80941Smrg mtx_unlock(&sscreen->shader_parts_mutex); 7085b8e80941Smrg return result; 7086848b8605Smrg} 7087848b8605Smrg 7088b8e80941Smrgstatic LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) 7089848b8605Smrg{ 7090b8e80941Smrg LLVMValueRef ptr[2], list; 7091b8e80941Smrg bool merged_shader = is_merged_shader(ctx); 7092b8e80941Smrg 7093b8e80941Smrg ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); 7094b8e80941Smrg list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], 7095b8e80941Smrg ac_array_in_const32_addr_space(ctx->v4i32), ""); 7096b8e80941Smrg return list; 7097848b8605Smrg} 7098848b8605Smrg 7099b8e80941Smrg/** 7100b8e80941Smrg * Build the vertex shader prolog function. 7101b8e80941Smrg * 7102b8e80941Smrg * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). 7103b8e80941Smrg * All inputs are returned unmodified. The vertex load indices are 7104b8e80941Smrg * stored after them, which will be used by the API VS for fetching inputs. 7105b8e80941Smrg * 7106b8e80941Smrg * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: 7107b8e80941Smrg * input_v0, 7108b8e80941Smrg * input_v1, 7109b8e80941Smrg * input_v2, 7110b8e80941Smrg * input_v3, 7111b8e80941Smrg * (VertexID + BaseVertex), 7112b8e80941Smrg * (InstanceID + StartInstance), 7113b8e80941Smrg * (InstanceID / 2 + StartInstance) 7114b8e80941Smrg */ 7115b8e80941Smrgstatic void si_build_vs_prolog_function(struct si_shader_context *ctx, 7116b8e80941Smrg union si_shader_part_key *key) 7117848b8605Smrg{ 7118b8e80941Smrg struct si_function_info fninfo; 7119b8e80941Smrg LLVMTypeRef *returns; 7120b8e80941Smrg LLVMValueRef ret, func; 7121b8e80941Smrg int num_returns, i; 7122b8e80941Smrg unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; 7123b8e80941Smrg unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; 7124b8e80941Smrg LLVMValueRef input_vgprs[9]; 7125b8e80941Smrg unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + 7126b8e80941Smrg num_input_vgprs; 7127b8e80941Smrg unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; 7128b8e80941Smrg 7129b8e80941Smrg si_init_function_info(&fninfo); 7130b8e80941Smrg 7131b8e80941Smrg /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ 7132b8e80941Smrg returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) * 7133b8e80941Smrg sizeof(LLVMTypeRef)); 7134b8e80941Smrg num_returns = 0; 7135b8e80941Smrg 7136b8e80941Smrg /* Declare input and output SGPRs. */ 7137b8e80941Smrg for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7138b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7139b8e80941Smrg returns[num_returns++] = ctx->i32; 7140b8e80941Smrg } 7141848b8605Smrg 7142b8e80941Smrg /* Preloaded VGPRs (outputs must be floats) */ 7143b8e80941Smrg for (i = 0; i < num_input_vgprs; i++) { 7144b8e80941Smrg add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); 7145b8e80941Smrg returns[num_returns++] = ctx->f32; 7146b8e80941Smrg } 7147848b8605Smrg 7148b8e80941Smrg /* Vertex load indices. */ 7149b8e80941Smrg for (i = 0; i <= key->vs_prolog.last_input; i++) 7150b8e80941Smrg returns[num_returns++] = ctx->f32; 7151848b8605Smrg 7152b8e80941Smrg /* Create the function. */ 7153b8e80941Smrg si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0); 7154b8e80941Smrg func = ctx->main_fn; 7155848b8605Smrg 7156b8e80941Smrg if (key->vs_prolog.num_merged_next_stage_vgprs) { 7157b8e80941Smrg if (!key->vs_prolog.is_monolithic) 7158b8e80941Smrg si_init_exec_from_input(ctx, 3, 0); 7159848b8605Smrg 7160b8e80941Smrg if (key->vs_prolog.as_ls && 7161b8e80941Smrg ctx->screen->has_ls_vgpr_init_bug) { 7162b8e80941Smrg /* If there are no HS threads, SPI loads the LS VGPRs 7163b8e80941Smrg * starting at VGPR 0. Shift them back to where they 7164b8e80941Smrg * belong. 7165b8e80941Smrg */ 7166b8e80941Smrg LLVMValueRef has_hs_threads = 7167b8e80941Smrg LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 7168b8e80941Smrg si_unpack_param(ctx, 3, 8, 8), 7169b8e80941Smrg ctx->i32_0, ""); 7170b8e80941Smrg 7171b8e80941Smrg for (i = 4; i > 0; --i) { 7172b8e80941Smrg input_vgprs[i + 1] = 7173b8e80941Smrg LLVMBuildSelect(ctx->ac.builder, has_hs_threads, 7174b8e80941Smrg input_vgprs[i + 1], 7175b8e80941Smrg input_vgprs[i - 1], ""); 7176848b8605Smrg } 7177848b8605Smrg } 7178b8e80941Smrg } 7179848b8605Smrg 7180b8e80941Smrg ctx->abi.vertex_id = input_vgprs[first_vs_vgpr]; 7181b8e80941Smrg ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)]; 7182848b8605Smrg 7183b8e80941Smrg /* Copy inputs to outputs. This should be no-op, as the registers match, 7184b8e80941Smrg * but it will prevent the compiler from overwriting them unintentionally. 7185b8e80941Smrg */ 7186b8e80941Smrg ret = ctx->return_value; 7187b8e80941Smrg for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7188b8e80941Smrg LLVMValueRef p = LLVMGetParam(func, i); 7189b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7190b8e80941Smrg } 7191b8e80941Smrg for (i = 0; i < num_input_vgprs; i++) { 7192b8e80941Smrg LLVMValueRef p = input_vgprs[i]; 7193b8e80941Smrg p = ac_to_float(&ctx->ac, p); 7194b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, 7195b8e80941Smrg key->vs_prolog.num_input_sgprs + i, ""); 7196b8e80941Smrg } 7197848b8605Smrg 7198b8e80941Smrg /* Compute vertex load indices from instance divisors. */ 7199b8e80941Smrg LLVMValueRef instance_divisor_constbuf = NULL; 7200848b8605Smrg 7201b8e80941Smrg if (key->vs_prolog.states.instance_divisor_is_fetched) { 7202b8e80941Smrg LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7203b8e80941Smrg LLVMValueRef buf_index = 7204b8e80941Smrg LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); 7205b8e80941Smrg instance_divisor_constbuf = 7206b8e80941Smrg ac_build_load_to_sgpr(&ctx->ac, list, buf_index); 7207848b8605Smrg } 7208848b8605Smrg 7209b8e80941Smrg for (i = 0; i <= key->vs_prolog.last_input; i++) { 7210b8e80941Smrg bool divisor_is_one = 7211b8e80941Smrg key->vs_prolog.states.instance_divisor_is_one & (1u << i); 7212b8e80941Smrg bool divisor_is_fetched = 7213b8e80941Smrg key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); 7214b8e80941Smrg LLVMValueRef index = NULL; 7215b8e80941Smrg 7216b8e80941Smrg if (divisor_is_one) { 7217b8e80941Smrg index = ctx->abi.instance_id; 7218b8e80941Smrg } else if (divisor_is_fetched) { 7219b8e80941Smrg LLVMValueRef udiv_factors[4]; 7220b8e80941Smrg 7221b8e80941Smrg for (unsigned j = 0; j < 4; j++) { 7222b8e80941Smrg udiv_factors[j] = 7223b8e80941Smrg buffer_load_const(ctx, instance_divisor_constbuf, 7224b8e80941Smrg LLVMConstInt(ctx->i32, i*16 + j*4, 0)); 7225b8e80941Smrg udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); 7226b8e80941Smrg } 7227b8e80941Smrg /* The faster NUW version doesn't work when InstanceID == UINT_MAX. 7228b8e80941Smrg * Such InstanceID might not be achievable in a reasonable time though. 7229b8e80941Smrg */ 7230b8e80941Smrg index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, 7231b8e80941Smrg udiv_factors[0], udiv_factors[1], 7232b8e80941Smrg udiv_factors[2], udiv_factors[3]); 7233b8e80941Smrg } 7234848b8605Smrg 7235b8e80941Smrg if (divisor_is_one || divisor_is_fetched) { 7236b8e80941Smrg /* Add StartInstance. */ 7237b8e80941Smrg index = LLVMBuildAdd(ctx->ac.builder, index, 7238b8e80941Smrg LLVMGetParam(ctx->main_fn, user_sgpr_base + 7239b8e80941Smrg SI_SGPR_START_INSTANCE), ""); 7240b8e80941Smrg } else { 7241b8e80941Smrg /* VertexID + BaseVertex */ 7242b8e80941Smrg index = LLVMBuildAdd(ctx->ac.builder, 7243b8e80941Smrg ctx->abi.vertex_id, 7244b8e80941Smrg LLVMGetParam(func, user_sgpr_base + 7245b8e80941Smrg SI_SGPR_BASE_VERTEX), ""); 7246848b8605Smrg } 7247b8e80941Smrg 7248b8e80941Smrg index = ac_to_float(&ctx->ac, index); 7249b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, 7250b8e80941Smrg fninfo.num_params + i, ""); 7251848b8605Smrg } 7252848b8605Smrg 7253b8e80941Smrg si_llvm_build_ret(ctx, ret); 7254b8e80941Smrg} 7255b8e80941Smrg 7256b8e80941Smrgstatic bool si_get_vs_prolog(struct si_screen *sscreen, 7257b8e80941Smrg struct ac_llvm_compiler *compiler, 7258b8e80941Smrg struct si_shader *shader, 7259b8e80941Smrg struct pipe_debug_callback *debug, 7260b8e80941Smrg struct si_shader *main_part, 7261b8e80941Smrg const struct si_vs_prolog_bits *key) 7262b8e80941Smrg{ 7263b8e80941Smrg struct si_shader_selector *vs = main_part->selector; 7264b8e80941Smrg 7265b8e80941Smrg if (!si_vs_needs_prolog(vs, key)) 7266b8e80941Smrg return true; 7267b8e80941Smrg 7268b8e80941Smrg /* Get the prolog. */ 7269b8e80941Smrg union si_shader_part_key prolog_key; 7270b8e80941Smrg si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, 7271b8e80941Smrg key, shader, &prolog_key); 7272b8e80941Smrg 7273b8e80941Smrg shader->prolog = 7274b8e80941Smrg si_get_shader_part(sscreen, &sscreen->vs_prologs, 7275b8e80941Smrg PIPE_SHADER_VERTEX, true, &prolog_key, compiler, 7276b8e80941Smrg debug, si_build_vs_prolog_function, 7277b8e80941Smrg "Vertex Shader Prolog"); 7278b8e80941Smrg return shader->prolog != NULL; 7279848b8605Smrg} 7280848b8605Smrg 7281b8e80941Smrg/** 7282b8e80941Smrg * Select and compile (or reuse) vertex shader parts (prolog & epilog). 7283b8e80941Smrg */ 7284b8e80941Smrgstatic bool si_shader_select_vs_parts(struct si_screen *sscreen, 7285b8e80941Smrg struct ac_llvm_compiler *compiler, 7286b8e80941Smrg struct si_shader *shader, 7287b8e80941Smrg struct pipe_debug_callback *debug) 7288848b8605Smrg{ 7289b8e80941Smrg return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, 7290b8e80941Smrg &shader->key.part.vs.prolog); 7291b8e80941Smrg} 7292848b8605Smrg 7293b8e80941Smrg/** 7294b8e80941Smrg * Compile the TCS epilog function. This writes tesselation factors to memory 7295b8e80941Smrg * based on the output primitive type of the tesselator (determined by TES). 7296b8e80941Smrg */ 7297b8e80941Smrgstatic void si_build_tcs_epilog_function(struct si_shader_context *ctx, 7298b8e80941Smrg union si_shader_part_key *key) 7299b8e80941Smrg{ 7300b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7301b8e80941Smrg struct si_function_info fninfo; 7302b8e80941Smrg LLVMValueRef func; 7303b8e80941Smrg 7304b8e80941Smrg si_init_function_info(&fninfo); 7305b8e80941Smrg 7306b8e80941Smrg if (ctx->screen->info.chip_class >= GFX9) { 7307b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7308b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7309b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7310b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */ 7311b8e80941Smrg ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7312b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7313b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7314b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7315b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7316b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7317b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7318b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7319b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7320b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7321b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7322b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7323b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7324b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7325b8e80941Smrg ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7326b8e80941Smrg } else { 7327b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7328b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7329b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7330b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7331b8e80941Smrg ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7332b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7333b8e80941Smrg ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7334b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7335b8e80941Smrg ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7336b8e80941Smrg ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7337b8e80941Smrg } 7338848b8605Smrg 7339b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7340b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7341b8e80941Smrg unsigned tess_factors_idx = 7342b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */ 7343b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ 7344b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ 7345b8e80941Smrg 7346b8e80941Smrg for (unsigned i = 0; i < 6; i++) 7347b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ 7348b8e80941Smrg 7349b8e80941Smrg /* Create the function. */ 7350b8e80941Smrg si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, 7351b8e80941Smrg ctx->screen->info.chip_class >= CIK ? 128 : 64); 7352b8e80941Smrg ac_declare_lds_as_pointer(&ctx->ac); 7353b8e80941Smrg func = ctx->main_fn; 7354b8e80941Smrg 7355b8e80941Smrg LLVMValueRef invoc0_tess_factors[6]; 7356b8e80941Smrg for (unsigned i = 0; i < 6; i++) 7357b8e80941Smrg invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); 7358b8e80941Smrg 7359b8e80941Smrg si_write_tess_factors(bld_base, 7360b8e80941Smrg LLVMGetParam(func, tess_factors_idx), 7361b8e80941Smrg LLVMGetParam(func, tess_factors_idx + 1), 7362b8e80941Smrg LLVMGetParam(func, tess_factors_idx + 2), 7363b8e80941Smrg invoc0_tess_factors, invoc0_tess_factors + 4); 7364b8e80941Smrg 7365b8e80941Smrg LLVMBuildRetVoid(ctx->ac.builder); 7366b8e80941Smrg} 7367848b8605Smrg 7368b8e80941Smrg/** 7369b8e80941Smrg * Select and compile (or reuse) TCS parts (epilog). 7370b8e80941Smrg */ 7371b8e80941Smrgstatic bool si_shader_select_tcs_parts(struct si_screen *sscreen, 7372b8e80941Smrg struct ac_llvm_compiler *compiler, 7373b8e80941Smrg struct si_shader *shader, 7374b8e80941Smrg struct pipe_debug_callback *debug) 7375b8e80941Smrg{ 7376b8e80941Smrg if (sscreen->info.chip_class >= GFX9) { 7377b8e80941Smrg struct si_shader *ls_main_part = 7378b8e80941Smrg shader->key.part.tcs.ls->main_shader_part_ls; 7379848b8605Smrg 7380b8e80941Smrg if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, 7381b8e80941Smrg &shader->key.part.tcs.ls_prolog)) 7382b8e80941Smrg return false; 7383848b8605Smrg 7384b8e80941Smrg shader->previous_stage = ls_main_part; 7385848b8605Smrg } 7386b8e80941Smrg 7387b8e80941Smrg /* Get the epilog. */ 7388b8e80941Smrg union si_shader_part_key epilog_key; 7389b8e80941Smrg memset(&epilog_key, 0, sizeof(epilog_key)); 7390b8e80941Smrg epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 7391b8e80941Smrg 7392b8e80941Smrg shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, 7393b8e80941Smrg PIPE_SHADER_TESS_CTRL, false, 7394b8e80941Smrg &epilog_key, compiler, debug, 7395b8e80941Smrg si_build_tcs_epilog_function, 7396b8e80941Smrg "Tessellation Control Shader Epilog"); 7397b8e80941Smrg return shader->epilog != NULL; 7398848b8605Smrg} 7399848b8605Smrg 7400b8e80941Smrg/** 7401b8e80941Smrg * Select and compile (or reuse) GS parts (prolog). 7402b8e80941Smrg */ 7403b8e80941Smrgstatic bool si_shader_select_gs_parts(struct si_screen *sscreen, 7404b8e80941Smrg struct ac_llvm_compiler *compiler, 7405b8e80941Smrg struct si_shader *shader, 7406b8e80941Smrg struct pipe_debug_callback *debug) 7407848b8605Smrg{ 7408b8e80941Smrg if (sscreen->info.chip_class >= GFX9) { 7409b8e80941Smrg struct si_shader *es_main_part = 7410b8e80941Smrg shader->key.part.gs.es->main_shader_part_es; 7411848b8605Smrg 7412b8e80941Smrg if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX && 7413b8e80941Smrg !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, 7414b8e80941Smrg &shader->key.part.gs.vs_prolog)) 7415b8e80941Smrg return false; 7416848b8605Smrg 7417b8e80941Smrg shader->previous_stage = es_main_part; 7418b8e80941Smrg } 7419848b8605Smrg 7420b8e80941Smrg if (!shader->key.part.gs.prolog.tri_strip_adj_fix) 7421b8e80941Smrg return true; 7422848b8605Smrg 7423b8e80941Smrg union si_shader_part_key prolog_key; 7424b8e80941Smrg memset(&prolog_key, 0, sizeof(prolog_key)); 7425b8e80941Smrg prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 7426848b8605Smrg 7427b8e80941Smrg shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, 7428b8e80941Smrg PIPE_SHADER_GEOMETRY, true, 7429b8e80941Smrg &prolog_key, compiler, debug, 7430b8e80941Smrg si_build_gs_prolog_function, 7431b8e80941Smrg "Geometry Shader Prolog"); 7432b8e80941Smrg return shader->prolog2 != NULL; 7433b8e80941Smrg} 7434848b8605Smrg 7435b8e80941Smrg/** 7436b8e80941Smrg * Build the pixel shader prolog function. This handles: 7437b8e80941Smrg * - two-side color selection and interpolation 7438b8e80941Smrg * - overriding interpolation parameters for the API PS 7439b8e80941Smrg * - polygon stippling 7440b8e80941Smrg * 7441b8e80941Smrg * All preloaded SGPRs and VGPRs are passed through unmodified unless they are 7442b8e80941Smrg * overriden by other states. (e.g. per-sample interpolation) 7443b8e80941Smrg * Interpolated colors are stored after the preloaded VGPRs. 7444b8e80941Smrg */ 7445b8e80941Smrgstatic void si_build_ps_prolog_function(struct si_shader_context *ctx, 7446b8e80941Smrg union si_shader_part_key *key) 7447b8e80941Smrg{ 7448b8e80941Smrg struct si_function_info fninfo; 7449b8e80941Smrg LLVMValueRef ret, func; 7450b8e80941Smrg int num_returns, i, num_color_channels; 7451848b8605Smrg 7452b8e80941Smrg assert(si_need_ps_prolog(key)); 7453848b8605Smrg 7454b8e80941Smrg si_init_function_info(&fninfo); 7455848b8605Smrg 7456b8e80941Smrg /* Declare inputs. */ 7457b8e80941Smrg for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) 7458b8e80941Smrg add_arg(&fninfo, ARG_SGPR, ctx->i32); 7459848b8605Smrg 7460b8e80941Smrg for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) 7461b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->f32); 7462848b8605Smrg 7463b8e80941Smrg /* Declare outputs (same as inputs + add colors if needed) */ 7464b8e80941Smrg num_returns = fninfo.num_params; 7465b8e80941Smrg num_color_channels = util_bitcount(key->ps_prolog.colors_read); 7466b8e80941Smrg for (i = 0; i < num_color_channels; i++) 7467b8e80941Smrg fninfo.types[num_returns++] = ctx->f32; 7468848b8605Smrg 7469b8e80941Smrg /* Create the function. */ 7470b8e80941Smrg si_create_function(ctx, "ps_prolog", fninfo.types, num_returns, 7471b8e80941Smrg &fninfo, 0); 7472b8e80941Smrg func = ctx->main_fn; 7473848b8605Smrg 7474b8e80941Smrg /* Copy inputs to outputs. This should be no-op, as the registers match, 7475b8e80941Smrg * but it will prevent the compiler from overwriting them unintentionally. 7476b8e80941Smrg */ 7477b8e80941Smrg ret = ctx->return_value; 7478b8e80941Smrg for (i = 0; i < fninfo.num_params; i++) { 7479b8e80941Smrg LLVMValueRef p = LLVMGetParam(func, i); 7480b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7481848b8605Smrg } 7482848b8605Smrg 7483b8e80941Smrg /* Polygon stippling. */ 7484b8e80941Smrg if (key->ps_prolog.states.poly_stipple) { 7485b8e80941Smrg /* POS_FIXED_PT is always last. */ 7486b8e80941Smrg unsigned pos = key->ps_prolog.num_input_sgprs + 7487b8e80941Smrg key->ps_prolog.num_input_vgprs - 1; 7488b8e80941Smrg LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7489848b8605Smrg 7490b8e80941Smrg si_llvm_emit_polygon_stipple(ctx, list, pos); 7491848b8605Smrg } 7492848b8605Smrg 7493b8e80941Smrg if (key->ps_prolog.states.bc_optimize_for_persp || 7494b8e80941Smrg key->ps_prolog.states.bc_optimize_for_linear) { 7495b8e80941Smrg unsigned i, base = key->ps_prolog.num_input_sgprs; 7496b8e80941Smrg LLVMValueRef center[2], centroid[2], tmp, bc_optimize; 7497b8e80941Smrg 7498b8e80941Smrg /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; 7499b8e80941Smrg * The hw doesn't compute CENTROID if the whole wave only 7500b8e80941Smrg * contains fully-covered quads. 7501b8e80941Smrg * 7502b8e80941Smrg * PRIM_MASK is after user SGPRs. 7503b8e80941Smrg */ 7504b8e80941Smrg bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7505b8e80941Smrg bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, 7506b8e80941Smrg LLVMConstInt(ctx->i32, 31, 0), ""); 7507b8e80941Smrg bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, 7508b8e80941Smrg ctx->i1, ""); 7509b8e80941Smrg 7510b8e80941Smrg if (key->ps_prolog.states.bc_optimize_for_persp) { 7511b8e80941Smrg /* Read PERSP_CENTER. */ 7512b8e80941Smrg for (i = 0; i < 2; i++) 7513b8e80941Smrg center[i] = LLVMGetParam(func, base + 2 + i); 7514b8e80941Smrg /* Read PERSP_CENTROID. */ 7515b8e80941Smrg for (i = 0; i < 2; i++) 7516b8e80941Smrg centroid[i] = LLVMGetParam(func, base + 4 + i); 7517b8e80941Smrg /* Select PERSP_CENTROID. */ 7518b8e80941Smrg for (i = 0; i < 2; i++) { 7519b8e80941Smrg tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7520b8e80941Smrg center[i], centroid[i], ""); 7521b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7522b8e80941Smrg tmp, base + 4 + i, ""); 7523b8e80941Smrg } 7524b8e80941Smrg } 7525b8e80941Smrg if (key->ps_prolog.states.bc_optimize_for_linear) { 7526b8e80941Smrg /* Read LINEAR_CENTER. */ 7527b8e80941Smrg for (i = 0; i < 2; i++) 7528b8e80941Smrg center[i] = LLVMGetParam(func, base + 8 + i); 7529b8e80941Smrg /* Read LINEAR_CENTROID. */ 7530b8e80941Smrg for (i = 0; i < 2; i++) 7531b8e80941Smrg centroid[i] = LLVMGetParam(func, base + 10 + i); 7532b8e80941Smrg /* Select LINEAR_CENTROID. */ 7533b8e80941Smrg for (i = 0; i < 2; i++) { 7534b8e80941Smrg tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7535b8e80941Smrg center[i], centroid[i], ""); 7536b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7537b8e80941Smrg tmp, base + 10 + i, ""); 7538b8e80941Smrg } 7539848b8605Smrg } 7540848b8605Smrg } 7541848b8605Smrg 7542b8e80941Smrg /* Force per-sample interpolation. */ 7543b8e80941Smrg if (key->ps_prolog.states.force_persp_sample_interp) { 7544b8e80941Smrg unsigned i, base = key->ps_prolog.num_input_sgprs; 7545b8e80941Smrg LLVMValueRef persp_sample[2]; 7546b8e80941Smrg 7547b8e80941Smrg /* Read PERSP_SAMPLE. */ 7548b8e80941Smrg for (i = 0; i < 2; i++) 7549b8e80941Smrg persp_sample[i] = LLVMGetParam(func, base + i); 7550b8e80941Smrg /* Overwrite PERSP_CENTER. */ 7551b8e80941Smrg for (i = 0; i < 2; i++) 7552b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7553b8e80941Smrg persp_sample[i], base + 2 + i, ""); 7554b8e80941Smrg /* Overwrite PERSP_CENTROID. */ 7555b8e80941Smrg for (i = 0; i < 2; i++) 7556b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7557b8e80941Smrg persp_sample[i], base + 4 + i, ""); 7558b8e80941Smrg } 7559b8e80941Smrg if (key->ps_prolog.states.force_linear_sample_interp) { 7560b8e80941Smrg unsigned i, base = key->ps_prolog.num_input_sgprs; 7561b8e80941Smrg LLVMValueRef linear_sample[2]; 7562b8e80941Smrg 7563b8e80941Smrg /* Read LINEAR_SAMPLE. */ 7564b8e80941Smrg for (i = 0; i < 2; i++) 7565b8e80941Smrg linear_sample[i] = LLVMGetParam(func, base + 6 + i); 7566b8e80941Smrg /* Overwrite LINEAR_CENTER. */ 7567b8e80941Smrg for (i = 0; i < 2; i++) 7568b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7569b8e80941Smrg linear_sample[i], base + 8 + i, ""); 7570b8e80941Smrg /* Overwrite LINEAR_CENTROID. */ 7571b8e80941Smrg for (i = 0; i < 2; i++) 7572b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7573b8e80941Smrg linear_sample[i], base + 10 + i, ""); 7574848b8605Smrg } 7575848b8605Smrg 7576b8e80941Smrg /* Force center interpolation. */ 7577b8e80941Smrg if (key->ps_prolog.states.force_persp_center_interp) { 7578b8e80941Smrg unsigned i, base = key->ps_prolog.num_input_sgprs; 7579b8e80941Smrg LLVMValueRef persp_center[2]; 7580b8e80941Smrg 7581b8e80941Smrg /* Read PERSP_CENTER. */ 7582b8e80941Smrg for (i = 0; i < 2; i++) 7583b8e80941Smrg persp_center[i] = LLVMGetParam(func, base + 2 + i); 7584b8e80941Smrg /* Overwrite PERSP_SAMPLE. */ 7585b8e80941Smrg for (i = 0; i < 2; i++) 7586b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7587b8e80941Smrg persp_center[i], base + i, ""); 7588b8e80941Smrg /* Overwrite PERSP_CENTROID. */ 7589b8e80941Smrg for (i = 0; i < 2; i++) 7590b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7591b8e80941Smrg persp_center[i], base + 4 + i, ""); 7592b8e80941Smrg } 7593b8e80941Smrg if (key->ps_prolog.states.force_linear_center_interp) { 7594b8e80941Smrg unsigned i, base = key->ps_prolog.num_input_sgprs; 7595b8e80941Smrg LLVMValueRef linear_center[2]; 7596b8e80941Smrg 7597b8e80941Smrg /* Read LINEAR_CENTER. */ 7598b8e80941Smrg for (i = 0; i < 2; i++) 7599b8e80941Smrg linear_center[i] = LLVMGetParam(func, base + 8 + i); 7600b8e80941Smrg /* Overwrite LINEAR_SAMPLE. */ 7601b8e80941Smrg for (i = 0; i < 2; i++) 7602b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7603b8e80941Smrg linear_center[i], base + 6 + i, ""); 7604b8e80941Smrg /* Overwrite LINEAR_CENTROID. */ 7605b8e80941Smrg for (i = 0; i < 2; i++) 7606b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7607b8e80941Smrg linear_center[i], base + 10 + i, ""); 7608848b8605Smrg } 7609848b8605Smrg 7610b8e80941Smrg /* Interpolate colors. */ 7611b8e80941Smrg unsigned color_out_idx = 0; 7612b8e80941Smrg for (i = 0; i < 2; i++) { 7613b8e80941Smrg unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; 7614b8e80941Smrg unsigned face_vgpr = key->ps_prolog.num_input_sgprs + 7615b8e80941Smrg key->ps_prolog.face_vgpr_index; 7616b8e80941Smrg LLVMValueRef interp[2], color[4]; 7617b8e80941Smrg LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; 7618848b8605Smrg 7619b8e80941Smrg if (!writemask) 7620b8e80941Smrg continue; 7621848b8605Smrg 7622b8e80941Smrg /* If the interpolation qualifier is not CONSTANT (-1). */ 7623b8e80941Smrg if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { 7624b8e80941Smrg unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + 7625b8e80941Smrg key->ps_prolog.color_interp_vgpr_index[i]; 7626b8e80941Smrg 7627b8e80941Smrg /* Get the (i,j) updated by bc_optimize handling. */ 7628b8e80941Smrg interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7629b8e80941Smrg interp_vgpr, ""); 7630b8e80941Smrg interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7631b8e80941Smrg interp_vgpr + 1, ""); 7632b8e80941Smrg interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); 7633b8e80941Smrg } 7634848b8605Smrg 7635b8e80941Smrg /* Use the absolute location of the input. */ 7636b8e80941Smrg prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7637848b8605Smrg 7638b8e80941Smrg if (key->ps_prolog.states.color_two_side) { 7639b8e80941Smrg face = LLVMGetParam(func, face_vgpr); 7640b8e80941Smrg face = ac_to_integer(&ctx->ac, face); 7641b8e80941Smrg } 7642848b8605Smrg 7643b8e80941Smrg interp_fs_input(ctx, 7644b8e80941Smrg key->ps_prolog.color_attr_index[i], 7645b8e80941Smrg TGSI_SEMANTIC_COLOR, i, 7646b8e80941Smrg key->ps_prolog.num_interp_inputs, 7647b8e80941Smrg key->ps_prolog.colors_read, interp_ij, 7648b8e80941Smrg prim_mask, face, color); 7649b8e80941Smrg 7650b8e80941Smrg while (writemask) { 7651b8e80941Smrg unsigned chan = u_bit_scan(&writemask); 7652b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], 7653b8e80941Smrg fninfo.num_params + color_out_idx++, ""); 7654b8e80941Smrg } 7655b8e80941Smrg } 7656b8e80941Smrg 7657b8e80941Smrg /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec 7658b8e80941Smrg * says: 7659b8e80941Smrg * 7660b8e80941Smrg * "When per-sample shading is active due to the use of a fragment 7661b8e80941Smrg * input qualified by sample or due to the use of the gl_SampleID 7662b8e80941Smrg * or gl_SamplePosition variables, only the bit for the current 7663b8e80941Smrg * sample is set in gl_SampleMaskIn. When state specifies multiple 7664b8e80941Smrg * fragment shader invocations for a given fragment, the sample 7665b8e80941Smrg * mask for any single fragment shader invocation may specify a 7666b8e80941Smrg * subset of the covered samples for the fragment. In this case, 7667b8e80941Smrg * the bit corresponding to each covered sample will be set in 7668b8e80941Smrg * exactly one fragment shader invocation." 7669b8e80941Smrg * 7670b8e80941Smrg * The samplemask loaded by hardware is always the coverage of the 7671b8e80941Smrg * entire pixel/fragment, so mask bits out based on the sample ID. 7672b8e80941Smrg */ 7673b8e80941Smrg if (key->ps_prolog.states.samplemask_log_ps_iter) { 7674b8e80941Smrg /* The bit pattern matches that used by fixed function fragment 7675b8e80941Smrg * processing. */ 7676b8e80941Smrg static const uint16_t ps_iter_masks[] = { 7677b8e80941Smrg 0xffff, /* not used */ 7678b8e80941Smrg 0x5555, 7679b8e80941Smrg 0x1111, 7680b8e80941Smrg 0x0101, 7681b8e80941Smrg 0x0001, 7682b8e80941Smrg }; 7683b8e80941Smrg assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); 7684b8e80941Smrg 7685b8e80941Smrg uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; 7686b8e80941Smrg unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs + 7687b8e80941Smrg key->ps_prolog.ancillary_vgpr_index; 7688b8e80941Smrg LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4); 7689b8e80941Smrg LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1); 7690b8e80941Smrg 7691b8e80941Smrg samplemask = ac_to_integer(&ctx->ac, samplemask); 7692b8e80941Smrg samplemask = LLVMBuildAnd( 7693b8e80941Smrg ctx->ac.builder, 7694b8e80941Smrg samplemask, 7695b8e80941Smrg LLVMBuildShl(ctx->ac.builder, 7696b8e80941Smrg LLVMConstInt(ctx->i32, ps_iter_mask, false), 7697b8e80941Smrg sampleid, ""), 7698b8e80941Smrg ""); 7699b8e80941Smrg samplemask = ac_to_float(&ctx->ac, samplemask); 7700b8e80941Smrg 7701b8e80941Smrg ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, 7702b8e80941Smrg ancillary_vgpr + 1, ""); 7703b8e80941Smrg } 7704848b8605Smrg 7705b8e80941Smrg /* Tell LLVM to insert WQM instruction sequence when needed. */ 7706b8e80941Smrg if (key->ps_prolog.wqm) { 7707b8e80941Smrg LLVMAddTargetDependentFunctionAttr(func, 7708b8e80941Smrg "amdgpu-ps-wqm-outputs", ""); 7709b8e80941Smrg } 7710848b8605Smrg 7711b8e80941Smrg si_llvm_build_ret(ctx, ret); 7712b8e80941Smrg} 7713848b8605Smrg 7714b8e80941Smrg/** 7715b8e80941Smrg * Build the pixel shader epilog function. This handles everything that must be 7716b8e80941Smrg * emulated for pixel shader exports. (alpha-test, format conversions, etc) 7717b8e80941Smrg */ 7718b8e80941Smrgstatic void si_build_ps_epilog_function(struct si_shader_context *ctx, 7719b8e80941Smrg union si_shader_part_key *key) 7720b8e80941Smrg{ 7721b8e80941Smrg struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7722b8e80941Smrg struct si_function_info fninfo; 7723b8e80941Smrg LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 7724b8e80941Smrg int i; 7725b8e80941Smrg struct si_ps_exports exp = {}; 7726b8e80941Smrg 7727b8e80941Smrg si_init_function_info(&fninfo); 7728b8e80941Smrg 7729b8e80941Smrg /* Declare input SGPRs. */ 7730b8e80941Smrg ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7731b8e80941Smrg ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7732b8e80941Smrg ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7733b8e80941Smrg ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); 7734b8e80941Smrg add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 7735b8e80941Smrg 7736b8e80941Smrg /* Declare input VGPRs. */ 7737b8e80941Smrg unsigned required_num_params = 7738b8e80941Smrg fninfo.num_sgpr_params + 7739b8e80941Smrg util_bitcount(key->ps_epilog.colors_written) * 4 + 7740b8e80941Smrg key->ps_epilog.writes_z + 7741b8e80941Smrg key->ps_epilog.writes_stencil + 7742b8e80941Smrg key->ps_epilog.writes_samplemask; 7743b8e80941Smrg 7744b8e80941Smrg required_num_params = MAX2(required_num_params, 7745b8e80941Smrg fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 7746b8e80941Smrg 7747b8e80941Smrg while (fninfo.num_params < required_num_params) 7748b8e80941Smrg add_arg(&fninfo, ARG_VGPR, ctx->f32); 7749b8e80941Smrg 7750b8e80941Smrg /* Create the function. */ 7751b8e80941Smrg si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0); 7752b8e80941Smrg /* Disable elimination of unused inputs. */ 7753b8e80941Smrg ac_llvm_add_target_dep_function_attr(ctx->main_fn, 7754b8e80941Smrg "InitialPSInputAddr", 0xffffff); 7755b8e80941Smrg 7756b8e80941Smrg /* Process colors. */ 7757b8e80941Smrg unsigned vgpr = fninfo.num_sgpr_params; 7758b8e80941Smrg unsigned colors_written = key->ps_epilog.colors_written; 7759b8e80941Smrg int last_color_export = -1; 7760b8e80941Smrg 7761b8e80941Smrg /* Find the last color export. */ 7762b8e80941Smrg if (!key->ps_epilog.writes_z && 7763b8e80941Smrg !key->ps_epilog.writes_stencil && 7764b8e80941Smrg !key->ps_epilog.writes_samplemask) { 7765b8e80941Smrg unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; 7766b8e80941Smrg 7767b8e80941Smrg /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 7768b8e80941Smrg if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { 7769b8e80941Smrg /* Just set this if any of the colorbuffers are enabled. */ 7770b8e80941Smrg if (spi_format & 7771b8e80941Smrg ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) 7772b8e80941Smrg last_color_export = 0; 7773b8e80941Smrg } else { 7774b8e80941Smrg for (i = 0; i < 8; i++) 7775b8e80941Smrg if (colors_written & (1 << i) && 7776b8e80941Smrg (spi_format >> (i * 4)) & 0xf) 7777b8e80941Smrg last_color_export = i; 7778848b8605Smrg } 7779848b8605Smrg } 7780848b8605Smrg 7781b8e80941Smrg while (colors_written) { 7782b8e80941Smrg LLVMValueRef color[4]; 7783b8e80941Smrg int mrt = u_bit_scan(&colors_written); 7784848b8605Smrg 7785b8e80941Smrg for (i = 0; i < 4; i++) 7786b8e80941Smrg color[i] = LLVMGetParam(ctx->main_fn, vgpr++); 7787848b8605Smrg 7788b8e80941Smrg si_export_mrt_color(bld_base, color, mrt, 7789b8e80941Smrg fninfo.num_params - 1, 7790b8e80941Smrg mrt == last_color_export, &exp); 7791b8e80941Smrg } 7792848b8605Smrg 7793b8e80941Smrg /* Process depth, stencil, samplemask. */ 7794b8e80941Smrg if (key->ps_epilog.writes_z) 7795b8e80941Smrg depth = LLVMGetParam(ctx->main_fn, vgpr++); 7796b8e80941Smrg if (key->ps_epilog.writes_stencil) 7797b8e80941Smrg stencil = LLVMGetParam(ctx->main_fn, vgpr++); 7798b8e80941Smrg if (key->ps_epilog.writes_samplemask) 7799b8e80941Smrg samplemask = LLVMGetParam(ctx->main_fn, vgpr++); 7800848b8605Smrg 7801b8e80941Smrg if (depth || stencil || samplemask) 7802b8e80941Smrg si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); 7803b8e80941Smrg else if (last_color_export == -1) 7804b8e80941Smrg ac_build_export_null(&ctx->ac); 7805848b8605Smrg 7806b8e80941Smrg if (exp.num) 7807b8e80941Smrg si_emit_ps_exports(ctx, &exp); 7808b8e80941Smrg 7809b8e80941Smrg /* Compile. */ 7810b8e80941Smrg LLVMBuildRetVoid(ctx->ac.builder); 7811848b8605Smrg} 7812848b8605Smrg 7813b8e80941Smrg/** 7814b8e80941Smrg * Select and compile (or reuse) pixel shader parts (prolog & epilog). 7815b8e80941Smrg */ 7816b8e80941Smrgstatic bool si_shader_select_ps_parts(struct si_screen *sscreen, 7817b8e80941Smrg struct ac_llvm_compiler *compiler, 7818b8e80941Smrg struct si_shader *shader, 7819b8e80941Smrg struct pipe_debug_callback *debug) 7820848b8605Smrg{ 7821b8e80941Smrg union si_shader_part_key prolog_key; 7822b8e80941Smrg union si_shader_part_key epilog_key; 7823b8e80941Smrg 7824b8e80941Smrg /* Get the prolog. */ 7825b8e80941Smrg si_get_ps_prolog_key(shader, &prolog_key, true); 7826b8e80941Smrg 7827b8e80941Smrg /* The prolog is a no-op if these aren't set. */ 7828b8e80941Smrg if (si_need_ps_prolog(&prolog_key)) { 7829b8e80941Smrg shader->prolog = 7830b8e80941Smrg si_get_shader_part(sscreen, &sscreen->ps_prologs, 7831b8e80941Smrg PIPE_SHADER_FRAGMENT, true, 7832b8e80941Smrg &prolog_key, compiler, debug, 7833b8e80941Smrg si_build_ps_prolog_function, 7834b8e80941Smrg "Fragment Shader Prolog"); 7835b8e80941Smrg if (!shader->prolog) 7836b8e80941Smrg return false; 7837848b8605Smrg } 7838848b8605Smrg 7839b8e80941Smrg /* Get the epilog. */ 7840b8e80941Smrg si_get_ps_epilog_key(shader, &epilog_key); 7841b8e80941Smrg 7842b8e80941Smrg shader->epilog = 7843b8e80941Smrg si_get_shader_part(sscreen, &sscreen->ps_epilogs, 7844b8e80941Smrg PIPE_SHADER_FRAGMENT, false, 7845b8e80941Smrg &epilog_key, compiler, debug, 7846b8e80941Smrg si_build_ps_epilog_function, 7847b8e80941Smrg "Fragment Shader Epilog"); 7848b8e80941Smrg if (!shader->epilog) 7849b8e80941Smrg return false; 7850b8e80941Smrg 7851b8e80941Smrg /* Enable POS_FIXED_PT if polygon stippling is enabled. */ 7852b8e80941Smrg if (shader->key.part.ps.prolog.poly_stipple) { 7853b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 7854b8e80941Smrg assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 7855b8e80941Smrg } 7856848b8605Smrg 7857b8e80941Smrg /* Set up the enable bits for per-sample shading if needed. */ 7858b8e80941Smrg if (shader->key.part.ps.prolog.force_persp_sample_interp && 7859b8e80941Smrg (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 7860b8e80941Smrg G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7861b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 7862b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7863b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 7864b8e80941Smrg } 7865b8e80941Smrg if (shader->key.part.ps.prolog.force_linear_sample_interp && 7866b8e80941Smrg (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 7867b8e80941Smrg G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7868b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 7869b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7870b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 7871b8e80941Smrg } 7872b8e80941Smrg if (shader->key.part.ps.prolog.force_persp_center_interp && 7873b8e80941Smrg (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7874b8e80941Smrg G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7875b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 7876b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7877b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7878b8e80941Smrg } 7879b8e80941Smrg if (shader->key.part.ps.prolog.force_linear_center_interp && 7880b8e80941Smrg (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7881b8e80941Smrg G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7882b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 7883b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7884b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7885b8e80941Smrg } 7886848b8605Smrg 7887b8e80941Smrg /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ 7888b8e80941Smrg if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 7889b8e80941Smrg !(shader->config.spi_ps_input_ena & 0xf)) { 7890b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7891b8e80941Smrg assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 7892b8e80941Smrg } 7893848b8605Smrg 7894b8e80941Smrg /* At least one pair of interpolation weights must be enabled. */ 7895b8e80941Smrg if (!(shader->config.spi_ps_input_ena & 0x7f)) { 7896b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7897b8e80941Smrg assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 7898b8e80941Smrg } 7899848b8605Smrg 7900b8e80941Smrg /* Samplemask fixup requires the sample ID. */ 7901b8e80941Smrg if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { 7902b8e80941Smrg shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); 7903b8e80941Smrg assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); 7904b8e80941Smrg } 7905848b8605Smrg 7906b8e80941Smrg /* The sample mask input is always enabled, because the API shader always 7907b8e80941Smrg * passes it through to the epilog. Disable it here if it's unused. 7908b8e80941Smrg */ 7909b8e80941Smrg if (!shader->key.part.ps.epilog.poly_line_smoothing && 7910b8e80941Smrg !shader->selector->info.reads_samplemask) 7911b8e80941Smrg shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 7912848b8605Smrg 7913b8e80941Smrg return true; 7914b8e80941Smrg} 7915848b8605Smrg 7916b8e80941Smrgvoid si_multiwave_lds_size_workaround(struct si_screen *sscreen, 7917b8e80941Smrg unsigned *lds_size) 7918b8e80941Smrg{ 7919b8e80941Smrg /* If tessellation is all offchip and on-chip GS isn't used, this 7920b8e80941Smrg * workaround is not needed. 7921b8e80941Smrg */ 7922b8e80941Smrg return; 7923848b8605Smrg 7924b8e80941Smrg /* SPI barrier management bug: 7925b8e80941Smrg * Make sure we have at least 4k of LDS in use to avoid the bug. 7926b8e80941Smrg * It applies to workgroup sizes of more than one wavefront. 7927b8e80941Smrg */ 7928b8e80941Smrg if (sscreen->info.family == CHIP_BONAIRE || 7929b8e80941Smrg sscreen->info.family == CHIP_KABINI || 7930b8e80941Smrg sscreen->info.family == CHIP_MULLINS) 7931b8e80941Smrg *lds_size = MAX2(*lds_size, 8); 7932b8e80941Smrg} 7933848b8605Smrg 7934b8e80941Smrgstatic void si_fix_resource_usage(struct si_screen *sscreen, 7935b8e80941Smrg struct si_shader *shader) 7936b8e80941Smrg{ 7937b8e80941Smrg unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 7938848b8605Smrg 7939b8e80941Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 7940848b8605Smrg 7941b8e80941Smrg if (shader->selector->type == PIPE_SHADER_COMPUTE && 7942b8e80941Smrg si_get_max_workgroup_size(shader) > 64) { 7943b8e80941Smrg si_multiwave_lds_size_workaround(sscreen, 7944b8e80941Smrg &shader->config.lds_size); 7945848b8605Smrg } 7946b8e80941Smrg} 7947848b8605Smrg 7948b8e80941Smrgint si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 7949b8e80941Smrg struct si_shader *shader, 7950b8e80941Smrg struct pipe_debug_callback *debug) 7951b8e80941Smrg{ 7952b8e80941Smrg struct si_shader_selector *sel = shader->selector; 7953b8e80941Smrg struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); 7954b8e80941Smrg int r; 7955848b8605Smrg 7956b8e80941Smrg /* LS, ES, VS are compiled on demand if the main part hasn't been 7957b8e80941Smrg * compiled for that stage. 7958b8e80941Smrg * 7959b8e80941Smrg * Vertex shaders are compiled on demand when a vertex fetch 7960b8e80941Smrg * workaround must be applied. 7961b8e80941Smrg */ 7962b8e80941Smrg if (shader->is_monolithic) { 7963b8e80941Smrg /* Monolithic shader (compiled as a whole, has many variants, 7964b8e80941Smrg * may take a long time to compile). 7965b8e80941Smrg */ 7966b8e80941Smrg r = si_compile_tgsi_shader(sscreen, compiler, shader, debug); 7967b8e80941Smrg if (r) 7968b8e80941Smrg return r; 7969b8e80941Smrg } else { 7970b8e80941Smrg /* The shader consists of several parts: 7971b8e80941Smrg * 7972b8e80941Smrg * - the middle part is the user shader, it has 1 variant only 7973b8e80941Smrg * and it was compiled during the creation of the shader 7974b8e80941Smrg * selector 7975b8e80941Smrg * - the prolog part is inserted at the beginning 7976b8e80941Smrg * - the epilog part is inserted at the end 7977b8e80941Smrg * 7978b8e80941Smrg * The prolog and epilog have many (but simple) variants. 7979b8e80941Smrg * 7980b8e80941Smrg * Starting with gfx9, geometry and tessellation control 7981b8e80941Smrg * shaders also contain the prolog and user shader parts of 7982b8e80941Smrg * the previous shader stage. 7983b8e80941Smrg */ 7984848b8605Smrg 7985b8e80941Smrg if (!mainp) 7986b8e80941Smrg return -1; 7987b8e80941Smrg 7988b8e80941Smrg /* Copy the compiled TGSI shader data over. */ 7989b8e80941Smrg shader->is_binary_shared = true; 7990b8e80941Smrg shader->binary = mainp->binary; 7991b8e80941Smrg shader->config = mainp->config; 7992b8e80941Smrg shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 7993b8e80941Smrg shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 7994b8e80941Smrg shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 7995b8e80941Smrg shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; 7996b8e80941Smrg memcpy(shader->info.vs_output_param_offset, 7997b8e80941Smrg mainp->info.vs_output_param_offset, 7998b8e80941Smrg sizeof(mainp->info.vs_output_param_offset)); 7999b8e80941Smrg shader->info.uses_instanceid = mainp->info.uses_instanceid; 8000b8e80941Smrg shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 8001b8e80941Smrg shader->info.nr_param_exports = mainp->info.nr_param_exports; 8002b8e80941Smrg 8003b8e80941Smrg /* Select prologs and/or epilogs. */ 8004b8e80941Smrg switch (sel->type) { 8005b8e80941Smrg case PIPE_SHADER_VERTEX: 8006b8e80941Smrg if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) 8007b8e80941Smrg return -1; 8008b8e80941Smrg break; 8009b8e80941Smrg case PIPE_SHADER_TESS_CTRL: 8010b8e80941Smrg if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) 8011b8e80941Smrg return -1; 8012b8e80941Smrg break; 8013b8e80941Smrg case PIPE_SHADER_TESS_EVAL: 8014b8e80941Smrg break; 8015b8e80941Smrg case PIPE_SHADER_GEOMETRY: 8016b8e80941Smrg if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) 8017b8e80941Smrg return -1; 8018b8e80941Smrg break; 8019b8e80941Smrg case PIPE_SHADER_FRAGMENT: 8020b8e80941Smrg if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) 8021b8e80941Smrg return -1; 8022848b8605Smrg 8023b8e80941Smrg /* Make sure we have at least as many VGPRs as there 8024b8e80941Smrg * are allocated inputs. 8025b8e80941Smrg */ 8026b8e80941Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8027b8e80941Smrg shader->info.num_input_vgprs); 8028b8e80941Smrg break; 8029b8e80941Smrg } 8030848b8605Smrg 8031b8e80941Smrg /* Update SGPR and VGPR counts. */ 8032b8e80941Smrg if (shader->prolog) { 8033b8e80941Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8034b8e80941Smrg shader->prolog->config.num_sgprs); 8035b8e80941Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8036b8e80941Smrg shader->prolog->config.num_vgprs); 8037b8e80941Smrg } 8038b8e80941Smrg if (shader->previous_stage) { 8039b8e80941Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8040b8e80941Smrg shader->previous_stage->config.num_sgprs); 8041b8e80941Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8042b8e80941Smrg shader->previous_stage->config.num_vgprs); 8043b8e80941Smrg shader->config.spilled_sgprs = 8044b8e80941Smrg MAX2(shader->config.spilled_sgprs, 8045b8e80941Smrg shader->previous_stage->config.spilled_sgprs); 8046b8e80941Smrg shader->config.spilled_vgprs = 8047b8e80941Smrg MAX2(shader->config.spilled_vgprs, 8048b8e80941Smrg shader->previous_stage->config.spilled_vgprs); 8049b8e80941Smrg shader->config.private_mem_vgprs = 8050b8e80941Smrg MAX2(shader->config.private_mem_vgprs, 8051b8e80941Smrg shader->previous_stage->config.private_mem_vgprs); 8052b8e80941Smrg shader->config.scratch_bytes_per_wave = 8053b8e80941Smrg MAX2(shader->config.scratch_bytes_per_wave, 8054b8e80941Smrg shader->previous_stage->config.scratch_bytes_per_wave); 8055b8e80941Smrg shader->info.uses_instanceid |= 8056b8e80941Smrg shader->previous_stage->info.uses_instanceid; 8057848b8605Smrg } 8058b8e80941Smrg if (shader->prolog2) { 8059b8e80941Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8060b8e80941Smrg shader->prolog2->config.num_sgprs); 8061b8e80941Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8062b8e80941Smrg shader->prolog2->config.num_vgprs); 8063b8e80941Smrg } 8064b8e80941Smrg if (shader->epilog) { 8065b8e80941Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8066b8e80941Smrg shader->epilog->config.num_sgprs); 8067b8e80941Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8068b8e80941Smrg shader->epilog->config.num_vgprs); 8069b8e80941Smrg } 8070b8e80941Smrg si_calculate_max_simd_waves(shader); 8071848b8605Smrg } 8072848b8605Smrg 8073b8e80941Smrg si_fix_resource_usage(sscreen, shader); 8074b8e80941Smrg si_shader_dump(sscreen, shader, debug, sel->info.processor, 8075b8e80941Smrg stderr, true); 8076848b8605Smrg 8077b8e80941Smrg /* Upload. */ 8078b8e80941Smrg r = si_shader_binary_upload(sscreen, shader); 8079b8e80941Smrg if (r) { 8080b8e80941Smrg fprintf(stderr, "LLVM failed to upload shader\n"); 8081b8e80941Smrg return r; 8082b8e80941Smrg } 8083848b8605Smrg 8084b8e80941Smrg return 0; 8085848b8605Smrg} 8086848b8605Smrg 8087b8e80941Smrgvoid si_shader_destroy(struct si_shader *shader) 8088848b8605Smrg{ 8089b8e80941Smrg if (shader->scratch_bo) 8090b8e80941Smrg si_resource_reference(&shader->scratch_bo, NULL); 8091b8e80941Smrg 8092b8e80941Smrg si_resource_reference(&shader->bo, NULL); 8093b8e80941Smrg 8094b8e80941Smrg if (!shader->is_binary_shared) 8095b8e80941Smrg ac_shader_binary_clean(&shader->binary); 8096848b8605Smrg 8097b8e80941Smrg free(shader->shader_log); 8098848b8605Smrg} 8099