/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/* The compiler middle-end architecture: Explaining (non-)monolithic shaders
 * -------------------------------------------------------------------------
 *
 * Typically, there is a one-to-one correspondence between API and HW shaders,
 * that is, for every API shader, there is exactly one shader binary in
 * the driver.
 *
 * The problem with that is that we also have to emulate some API states
 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
 * to deal with it are:
 * - each shader has multiple variants for each combination of emulated states,
 *   and the variants are compiled on demand, possibly relying on a shader
 *   cache for good performance
 * - patch shaders at the binary level
 *
 * This driver uses something completely different. The emulated states are
 * usually implemented at the beginning or end of shaders. Therefore, we can
 * split the shader into 3 parts:
 * - prolog part (shader code dependent on states)
 * - main part (the API shader)
 * - epilog part (shader code dependent on states)
 *
 * Each part is compiled as a separate shader and the final binaries are
 * concatenated. This type of shader is called non-monolithic, because it
 * consists of multiple independent binaries. Creating a new shader variant
 * is therefore only a concatenation of shader parts (binaries) and doesn't
 * involve any compilation. The main shader parts are the only parts that are
 * compiled when applications create shader objects. The prolog and epilog
 * parts are compiled on the first use and saved, so that their binaries can
 * be reused by many other shaders.
 *
 * One of the roles of the prolog part is to compute vertex buffer addresses
 * for vertex shaders. A few of the roles of the epilog part are color buffer
 * format conversions in pixel shaders that we have to do manually, and writing
 * tessellation factors in tessellation control shaders. The prolog and epilog
 * have many other important responsibilities in various shader stages.
 * They don't just "emulate legacy stuff".
 *
 * Monolithic shaders are shaders where the parts are combined before LLVM
 * compilation, and the whole thing is compiled and optimized as one unit with
 * one binary on the output. The result is the same as the non-monolithic
 * shader, but the final code can be better, because LLVM can optimize across
 * all shader parts. Monolithic shaders aren't usually used except for these
 * special cases:
 *
 * 1) Some rarely-used states require modification of the main shader part
 *    itself, and in such cases, only the monolithic shader variant is
 *    compiled, and that's always done on the first use.
 *
 * 2) When we do cross-stage optimizations for separate shader objects and
 *    e.g. eliminate unused shader varyings, the resulting optimized shader
 *    variants are always compiled as monolithic shaders, and always
 *    asynchronously (i.e. not stalling ongoing rendering). We call them
 *    "optimized monolithic" shaders. The important property here is that
 *    the non-monolithic unoptimized shader variant is always available for use
 *    when the asynchronous compilation of the optimized shader is not done
 *    yet.
 *
 * Starting with GFX9 chips, some shader stages are merged, and the number of
 * shader parts per shader increased. The complete new list of shader parts is:
 * - 1st shader: prolog part
 * - 1st shader: main part
 * - 2nd shader: prolog part
 * - 2nd shader: main part
 * - 2nd shader: epilog part
 */

/* How linking shader inputs and outputs between vertex, tessellation, and
 * geometry shaders works.
 *
 * Inputs and outputs between shaders are stored in a buffer. This buffer
 * lives in LDS (typical case for tessellation), but it can also live
 * in memory (ESGS). Each input or output has a fixed location within a vertex.
 * The highest used input or output determines the stride between vertices.
 *
 * Since GS and tessellation are only possible in the OpenGL core profile,
 * only these semantics are valid for per-vertex data:
 *
 *   Name             Location
 *
 *   POSITION         0
 *   PSIZE            1
 *   CLIPDIST0..1     2..3
 *   CULLDIST0..1     (not implemented)
 *   GENERIC0..31     4..35
 *
 * For example, a shader only writing GENERIC0 has an output stride of 5.
 *
 * Only these semantics are valid for per-patch data:
 *
 *   Name             Location
 *
 *   TESSOUTER        0
 *   TESSINNER        1
 *   PATCH0..29       2..31
 *
 * That's how independent shaders agree on input and output locations.
 * The si_shader_io_get_unique_index function assigns the locations.
12301e04c3fSmrg * 12401e04c3fSmrg * For tessellation, other required information for calculating the input and 12501e04c3fSmrg * output addresses like the vertex stride, the patch stride, and the offsets 12601e04c3fSmrg * where per-vertex and per-patch data start, is passed to the shader via 12701e04c3fSmrg * user data SGPRs. The offsets and strides are calculated at draw time and 12801e04c3fSmrg * aren't available at compile time. 129af69d88dSmrg */ 130af69d88dSmrg 131af69d88dSmrg#ifndef SI_SHADER_H 132af69d88dSmrg#define SI_SHADER_H 133af69d88dSmrg 13401e04c3fSmrg#include "ac_binary.h" 13501e04c3fSmrg#include "ac_llvm_build.h" 13601e04c3fSmrg#include "ac_llvm_util.h" 1377ec681f3Smrg#include "util/simple_mtx.h" 1387ec681f3Smrg#include "util/u_inlines.h" 1397ec681f3Smrg#include "util/u_live_shader_cache.h" 1407ec681f3Smrg#include "util/u_queue.h" 1417ec681f3Smrg#include "si_pm4.h" 14201e04c3fSmrg 14301e04c3fSmrg#include <stdio.h> 14401e04c3fSmrg 1457ec681f3Smrg#ifdef __cplusplus 1467ec681f3Smrgextern "C" { 1477ec681f3Smrg#endif 1487ec681f3Smrg 1497ec681f3Smrg// Use LDS symbols when supported by LLVM. Can be disabled for testing the old 1507ec681f3Smrg// path on newer LLVM for now. Should be removed in the long term. 1517ec681f3Smrg#define USE_LDS_SYMBOLS (true) 1527ec681f3Smrg 15301e04c3fSmrgstruct nir_shader; 15401e04c3fSmrgstruct si_shader; 15501e04c3fSmrgstruct si_context; 15601e04c3fSmrg 1577ec681f3Smrg#define SI_MAX_ATTRIBS 16 1587ec681f3Smrg#define SI_MAX_VS_OUTPUTS 40 15901e04c3fSmrg 1607ec681f3Smrg#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) 1617ec681f3Smrg 1627ec681f3Smrg#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) 1637ec681f3Smrg#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) 1647ec681f3Smrg#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 1657ec681f3Smrg/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. 
*/ 1667ec681f3Smrg#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 16701e04c3fSmrg 16801e04c3fSmrg/* SGPR user data indices */ 1697ec681f3Smrgenum 1707ec681f3Smrg{ 1717ec681f3Smrg SI_SGPR_INTERNAL_BINDINGS, 1727ec681f3Smrg SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, 1737ec681f3Smrg SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ 1747ec681f3Smrg SI_SGPR_SAMPLERS_AND_IMAGES, 1757ec681f3Smrg SI_NUM_RESOURCE_SGPRS, 1767ec681f3Smrg 1777ec681f3Smrg /* API VS, TES without GS, GS copy shader */ 1787ec681f3Smrg SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, 1797ec681f3Smrg SI_NUM_VS_STATE_RESOURCE_SGPRS, 1807ec681f3Smrg 1817ec681f3Smrg /* all VS variants */ 1827ec681f3Smrg SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, 1837ec681f3Smrg SI_SGPR_DRAWID, 1847ec681f3Smrg SI_SGPR_START_INSTANCE, 1857ec681f3Smrg SI_VS_NUM_USER_SGPR, 1867ec681f3Smrg 1877ec681f3Smrg SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, 1887ec681f3Smrg 1897ec681f3Smrg /* TES */ 1907ec681f3Smrg SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, 1917ec681f3Smrg SI_SGPR_TES_OFFCHIP_ADDR, 1927ec681f3Smrg SI_TES_NUM_USER_SGPR, 1937ec681f3Smrg 1947ec681f3Smrg /* GFX6-8: TCS only */ 1957ec681f3Smrg GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, 1967ec681f3Smrg GFX6_SGPR_TCS_OUT_OFFSETS, 1977ec681f3Smrg GFX6_SGPR_TCS_OUT_LAYOUT, 1987ec681f3Smrg GFX6_SGPR_TCS_IN_LAYOUT, 1997ec681f3Smrg GFX6_TCS_NUM_USER_SGPR, 2007ec681f3Smrg 2017ec681f3Smrg /* GFX9: Merged shaders. */ 2027ec681f3Smrg /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ 2037ec681f3Smrg /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ 2047ec681f3Smrg GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, 2057ec681f3Smrg 2067ec681f3Smrg /* GFX9: Merged LS-HS (VS-TCS) only. 
*/ 2077ec681f3Smrg GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, 2087ec681f3Smrg GFX9_SGPR_TCS_OUT_OFFSETS, 2097ec681f3Smrg GFX9_SGPR_TCS_OUT_LAYOUT, 2107ec681f3Smrg GFX9_TCS_NUM_USER_SGPR, 2117ec681f3Smrg 2127ec681f3Smrg /* GS limits */ 2137ec681f3Smrg GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, 2147ec681f3Smrg GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, 2157ec681f3Smrg GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, 2167ec681f3Smrg SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, 2177ec681f3Smrg 2187ec681f3Smrg /* PS only */ 2197ec681f3Smrg SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, 2207ec681f3Smrg SI_PS_NUM_USER_SGPR, 2217ec681f3Smrg 2227ec681f3Smrg /* The value has to be 12, because the hw requires that descriptors 2237ec681f3Smrg * are aligned to 4 SGPRs. 2247ec681f3Smrg */ 2257ec681f3Smrg SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, 22601e04c3fSmrg}; 227af69d88dSmrg 228af69d88dSmrg/* LLVM function parameter indices */ 2297ec681f3Smrgenum 2307ec681f3Smrg{ 2317ec681f3Smrg SI_NUM_RESOURCE_PARAMS = 4, 2327ec681f3Smrg 2337ec681f3Smrg /* PS only parameters */ 2347ec681f3Smrg SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, 2357ec681f3Smrg SI_PARAM_PRIM_MASK, 2367ec681f3Smrg SI_PARAM_PERSP_SAMPLE, 2377ec681f3Smrg SI_PARAM_PERSP_CENTER, 2387ec681f3Smrg SI_PARAM_PERSP_CENTROID, 2397ec681f3Smrg SI_PARAM_PERSP_PULL_MODEL, 2407ec681f3Smrg SI_PARAM_LINEAR_SAMPLE, 2417ec681f3Smrg SI_PARAM_LINEAR_CENTER, 2427ec681f3Smrg SI_PARAM_LINEAR_CENTROID, 2437ec681f3Smrg SI_PARAM_LINE_STIPPLE_TEX, 2447ec681f3Smrg SI_PARAM_POS_X_FLOAT, 2457ec681f3Smrg SI_PARAM_POS_Y_FLOAT, 2467ec681f3Smrg SI_PARAM_POS_Z_FLOAT, 2477ec681f3Smrg SI_PARAM_POS_W_FLOAT, 2487ec681f3Smrg SI_PARAM_FRONT_FACE, 2497ec681f3Smrg SI_PARAM_ANCILLARY, 2507ec681f3Smrg SI_PARAM_SAMPLE_COVERAGE, 2517ec681f3Smrg SI_PARAM_POS_FIXED_PT, 2527ec681f3Smrg 2537ec681f3Smrg SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ 25401e04c3fSmrg}; 25501e04c3fSmrg 25601e04c3fSmrg/* Fields of 
driver-defined VS state SGPR. */ 2577ec681f3Smrg#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0) 2587ec681f3Smrg#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE 2597ec681f3Smrg#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1) 2607ec681f3Smrg#define C_VS_STATE_INDEXED 0xFFFFFFFD 2617ec681f3Smrg#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2) 2627ec681f3Smrg#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 2637ec681f3Smrg#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4) 2647ec681f3Smrg#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF 2657ec681f3Smrg#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6) 2667ec681f3Smrg#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF 2677ec681f3Smrg#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7) 2687ec681f3Smrg#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F 2697ec681f3Smrg#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11) 2707ec681f3Smrg#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF 2717ec681f3Smrg#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24) 2727ec681f3Smrg#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF 2737ec681f3Smrg 2747ec681f3Smrgenum 2757ec681f3Smrg{ 2767ec681f3Smrg /* These represent the number of SGPRs the shader uses. */ 2777ec681f3Smrg SI_VS_BLIT_SGPRS_POS = 3, 2787ec681f3Smrg SI_VS_BLIT_SGPRS_POS_COLOR = 7, 2797ec681f3Smrg SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, 28001e04c3fSmrg}; 28101e04c3fSmrg 2827ec681f3Smrg#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ 2837ec681f3Smrg#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ 2847ec681f3Smrg#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ 2857ec681f3Smrg#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ 28601e04c3fSmrg 2877ec681f3Smrg/** 2887ec681f3Smrg * For VS shader keys, describe any fixups required for vertex fetch. 
2897ec681f3Smrg * 2907ec681f3Smrg * \ref log_size, \ref format, and the number of channels are interpreted as 2917ec681f3Smrg * by \ref ac_build_opencoded_load_format. 2927ec681f3Smrg * 2937ec681f3Smrg * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an 2947ec681f3Smrg * impossible format and indicates that no fixup is needed (just use 2957ec681f3Smrg * buffer_load_format_xyzw). 2967ec681f3Smrg */ 2977ec681f3Smrgunion si_vs_fix_fetch { 2987ec681f3Smrg struct { 2997ec681f3Smrg uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */ 3007ec681f3Smrg uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ 3017ec681f3Smrg uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ 3027ec681f3Smrg uint8_t reverse : 1; /* reverse XYZ channels */ 3037ec681f3Smrg } u; 3047ec681f3Smrg uint8_t bits; 30501e04c3fSmrg}; 30601e04c3fSmrg 30701e04c3fSmrgstruct si_shader; 30801e04c3fSmrg 30901e04c3fSmrg/* State of the context creating the shader object. */ 31001e04c3fSmrgstruct si_compiler_ctx_state { 3117ec681f3Smrg /* Should only be used by si_init_shader_selector_async and 3127ec681f3Smrg * si_build_shader_variant if thread_index == -1 (non-threaded). */ 3137ec681f3Smrg struct ac_llvm_compiler *compiler; 31401e04c3fSmrg 3157ec681f3Smrg /* Used if thread_index == -1 or if debug.async is true. */ 3167ec681f3Smrg struct pipe_debug_callback debug; 31701e04c3fSmrg 3187ec681f3Smrg /* Used for creating the log string for gallium/ddebug. 
*/ 3197ec681f3Smrg bool is_debug_context; 3207ec681f3Smrg}; 3217ec681f3Smrg 3227ec681f3Smrgenum si_color_output_type { 3237ec681f3Smrg SI_TYPE_ANY32, 3247ec681f3Smrg SI_TYPE_FLOAT16, 3257ec681f3Smrg SI_TYPE_INT16, 3267ec681f3Smrg SI_TYPE_UINT16, 3277ec681f3Smrg}; 3287ec681f3Smrg 3297ec681f3Smrgunion si_input_info { 3307ec681f3Smrg struct { 3317ec681f3Smrg ubyte semantic; 3327ec681f3Smrg ubyte interpolate; 3337ec681f3Smrg ubyte fp16_lo_hi_valid; 3347ec681f3Smrg ubyte usage_mask; 3357ec681f3Smrg }; 3367ec681f3Smrg uint32_t _unused; /* this just forces 4-byte alignment */ 3377ec681f3Smrg}; 3387ec681f3Smrg 3397ec681f3Smrgstruct si_shader_info { 3407ec681f3Smrg shader_info base; 3417ec681f3Smrg 3427ec681f3Smrg gl_shader_stage stage; 3437ec681f3Smrg 3447ec681f3Smrg ubyte num_inputs; 3457ec681f3Smrg ubyte num_outputs; 3467ec681f3Smrg union si_input_info input[PIPE_MAX_SHADER_INPUTS]; 3477ec681f3Smrg ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; 3487ec681f3Smrg ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; 3497ec681f3Smrg ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; 3507ec681f3Smrg ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; 3517ec681f3Smrg ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */ 3527ec681f3Smrg 3537ec681f3Smrg ubyte color_interpolate[2]; 3547ec681f3Smrg ubyte color_interpolate_loc[2]; 3557ec681f3Smrg 3567ec681f3Smrg int constbuf0_num_slots; 3577ec681f3Smrg ubyte num_stream_output_components[4]; 3587ec681f3Smrg 3597ec681f3Smrg uint num_memory_stores; 3607ec681f3Smrg 3617ec681f3Smrg ubyte colors_read; /**< which color components are read by the FS */ 3627ec681f3Smrg ubyte colors_written; 3637ec681f3Smrg uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */ 3647ec681f3Smrg bool color0_writes_all_cbufs; /**< gl_FragColor */ 3657ec681f3Smrg bool reads_samplemask; /**< does fragment shader read sample mask? 
*/ 3667ec681f3Smrg bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ 3677ec681f3Smrg bool writes_z; /**< does fragment shader write Z value? */ 3687ec681f3Smrg bool writes_stencil; /**< does fragment shader write stencil value? */ 3697ec681f3Smrg bool writes_samplemask; /**< does fragment shader write sample mask? */ 3707ec681f3Smrg bool writes_edgeflag; /**< vertex shader outputs edgeflag */ 3717ec681f3Smrg bool uses_interp_color; 3727ec681f3Smrg bool uses_persp_center_color; 3737ec681f3Smrg bool uses_persp_centroid_color; 3747ec681f3Smrg bool uses_persp_sample_color; 3757ec681f3Smrg bool uses_persp_center; 3767ec681f3Smrg bool uses_persp_centroid; 3777ec681f3Smrg bool uses_persp_sample; 3787ec681f3Smrg bool uses_linear_center; 3797ec681f3Smrg bool uses_linear_centroid; 3807ec681f3Smrg bool uses_linear_sample; 3817ec681f3Smrg bool uses_interp_at_sample; 3827ec681f3Smrg bool uses_instanceid; 3837ec681f3Smrg bool uses_base_vertex; 3847ec681f3Smrg bool uses_base_instance; 3857ec681f3Smrg bool uses_drawid; 3867ec681f3Smrg bool uses_primid; 3877ec681f3Smrg bool uses_frontface; 3887ec681f3Smrg bool uses_invocationid; 3897ec681f3Smrg bool uses_thread_id[3]; 3907ec681f3Smrg bool uses_block_id[3]; 3917ec681f3Smrg bool uses_variable_block_size; 3927ec681f3Smrg bool uses_grid_size; 3937ec681f3Smrg bool uses_subgroup_info; 3947ec681f3Smrg bool writes_position; 3957ec681f3Smrg bool writes_psize; 3967ec681f3Smrg bool writes_clipvertex; 3977ec681f3Smrg bool writes_primid; 3987ec681f3Smrg bool writes_viewport_index; 3997ec681f3Smrg bool writes_layer; 4007ec681f3Smrg bool uses_bindless_samplers; 4017ec681f3Smrg bool uses_bindless_images; 4027ec681f3Smrg bool uses_indirect_descriptor; 4037ec681f3Smrg 4047ec681f3Smrg bool uses_vmem_return_type_sampler_or_bvh; 4057ec681f3Smrg bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */ 4067ec681f3Smrg 4077ec681f3Smrg /** Whether all codepaths write tess factors in all invocations. 
*/ 4087ec681f3Smrg bool tessfactors_are_def_in_all_invocs; 4097ec681f3Smrg 4107ec681f3Smrg /* A flag to check if vrs2x2 can be enabled to reduce number of 4117ec681f3Smrg * fragment shader invocations if flat shading. 4127ec681f3Smrg */ 4137ec681f3Smrg bool allow_flat_shading; 4147ec681f3Smrg 4157ec681f3Smrg /* Optimization: if the texture bound to this texunit has been cleared to 1, 4167ec681f3Smrg * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the 4177ec681f3Smrg * value is 0xff (undetermined) and can be later changed to 0 (= false) or 4187ec681f3Smrg * texunit + 1. 4197ec681f3Smrg */ 4207ec681f3Smrg uint8_t writes_1_if_tex_is_1; 42101e04c3fSmrg}; 42201e04c3fSmrg 42301e04c3fSmrg/* A shader selector is a gallium CSO and contains shader variants and 4247ec681f3Smrg * binaries for one NIR program. This can be shared by multiple contexts. 42501e04c3fSmrg */ 42601e04c3fSmrgstruct si_shader_selector { 4277ec681f3Smrg struct util_live_shader base; 4287ec681f3Smrg struct si_screen *screen; 4297ec681f3Smrg struct util_queue_fence ready; 4307ec681f3Smrg struct si_compiler_ctx_state compiler_ctx_state; 4317ec681f3Smrg 4327ec681f3Smrg simple_mtx_t mutex; 4337ec681f3Smrg struct si_shader *first_variant; /* immutable after the first variant */ 4347ec681f3Smrg struct si_shader *last_variant; /* mutable */ 4357ec681f3Smrg 4367ec681f3Smrg /* The compiled NIR shader without a prolog and/or epilog (not 4377ec681f3Smrg * uploaded to a buffer object). 
4387ec681f3Smrg */ 4397ec681f3Smrg struct si_shader *main_shader_part; 4407ec681f3Smrg struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ 4417ec681f3Smrg struct si_shader *main_shader_part_es; /* as_es is set in the key */ 4427ec681f3Smrg struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ 4437ec681f3Smrg struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ 4447ec681f3Smrg 4457ec681f3Smrg struct si_shader *gs_copy_shader; 4467ec681f3Smrg 4477ec681f3Smrg struct nir_shader *nir; 4487ec681f3Smrg void *nir_binary; 4497ec681f3Smrg unsigned nir_size; 4507ec681f3Smrg 4517ec681f3Smrg struct pipe_stream_output_info so; 4527ec681f3Smrg struct si_shader_info info; 4537ec681f3Smrg 4547ec681f3Smrg enum pipe_shader_type pipe_shader_type; 4557ec681f3Smrg ubyte const_and_shader_buf_descriptors_index; 4567ec681f3Smrg ubyte sampler_and_images_descriptors_index; 4577ec681f3Smrg bool vs_needs_prolog; 4587ec681f3Smrg ubyte cs_shaderbufs_sgpr_index; 4597ec681f3Smrg ubyte cs_num_shaderbufs_in_user_sgprs; 4607ec681f3Smrg ubyte cs_images_sgpr_index; 4617ec681f3Smrg ubyte cs_images_num_sgprs; 4627ec681f3Smrg ubyte cs_num_images_in_user_sgprs; 4637ec681f3Smrg ubyte num_vs_inputs; 4647ec681f3Smrg ubyte num_vbos_in_user_sgprs; 4657ec681f3Smrg unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ 4667ec681f3Smrg ubyte clipdist_mask; 4677ec681f3Smrg ubyte culldist_mask; 4687ec681f3Smrg enum pipe_prim_type rast_prim; 4697ec681f3Smrg 4707ec681f3Smrg /* ES parameters. */ 4717ec681f3Smrg uint16_t esgs_itemsize; /* vertex stride */ 4727ec681f3Smrg uint16_t lshs_vertex_stride; 4737ec681f3Smrg 4747ec681f3Smrg /* GS parameters. */ 4757ec681f3Smrg uint16_t gsvs_vertex_size; 4767ec681f3Smrg ubyte gs_input_verts_per_prim; 4777ec681f3Smrg unsigned max_gsvs_emit_size; 4787ec681f3Smrg uint16_t enabled_streamout_buffer_mask; 4797ec681f3Smrg bool tess_turns_off_ngg; 4807ec681f3Smrg 4817ec681f3Smrg /* PS parameters. 
*/ 4827ec681f3Smrg ubyte color_attr_index[2]; 4837ec681f3Smrg unsigned db_shader_control; 4847ec681f3Smrg /* Set 0xf or 0x0 (4 bits) per each written output. 4857ec681f3Smrg * ANDed with spi_shader_col_format. 4867ec681f3Smrg */ 4877ec681f3Smrg unsigned colors_written_4bit; 4887ec681f3Smrg 4897ec681f3Smrg uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ 4907ec681f3Smrg uint64_t outputs_written; /* "get_unique_index" bits */ 4917ec681f3Smrg uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ 4927ec681f3Smrg 4937ec681f3Smrg uint64_t inputs_read; /* "get_unique_index" bits */ 4947ec681f3Smrg uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ 4957ec681f3Smrg 4967ec681f3Smrg /* bitmasks of used descriptor slots */ 4977ec681f3Smrg uint64_t active_const_and_shader_buffers; 4987ec681f3Smrg uint64_t active_samplers_and_images; 499af69d88dSmrg}; 500af69d88dSmrg 50101e04c3fSmrg/* Valid shader configurations: 50201e04c3fSmrg * 5037ec681f3Smrg * API shaders VS | TCS | TES | GS |pass| PS 5047ec681f3Smrg * are compiled as: | | | |thru| 5057ec681f3Smrg * | | | | | 5067ec681f3Smrg * Only VS & PS: VS | | | | | PS 5077ec681f3Smrg * GFX6 - with GS: ES | | | GS | VS | PS 5087ec681f3Smrg * - with tess: LS | HS | VS | | | PS 5097ec681f3Smrg * - with both: LS | HS | ES | GS | VS | PS 5107ec681f3Smrg * GFX9 - with GS: -> | | | GS | VS | PS 5117ec681f3Smrg * - with tess: -> | HS | VS | | | PS 5127ec681f3Smrg * - with both: -> | HS | -> | GS | VS | PS 5137ec681f3Smrg * | | | | | 5147ec681f3Smrg * NGG - VS & PS: GS | | | | | PS 5157ec681f3Smrg * (GFX10+) - with GS: -> | | | GS | | PS 5167ec681f3Smrg * - with tess: -> | HS | GS | | | PS 5177ec681f3Smrg * - with both: -> | HS | -> | GS | | PS 51801e04c3fSmrg * 51901e04c3fSmrg * -> = merged with the next stage 52001e04c3fSmrg */ 521af69d88dSmrg 52201e04c3fSmrg/* Use the byte alignment for all following structure members for optimal 52301e04c3fSmrg * shader key memory footprint. 
52401e04c3fSmrg */ 52501e04c3fSmrg#pragma pack(push, 1) 526af69d88dSmrg 52701e04c3fSmrg/* Common VS bits between the shader key and the prolog key. */ 52801e04c3fSmrgstruct si_vs_prolog_bits { 5297ec681f3Smrg /* - If neither "is_one" nor "is_fetched" has a bit set, the instance 5307ec681f3Smrg * divisor is 0. 5317ec681f3Smrg * - If "is_one" has a bit set, the instance divisor is 1. 5327ec681f3Smrg * - If "is_fetched" has a bit set, the instance divisor will be loaded 5337ec681f3Smrg * from the constant buffer. 5347ec681f3Smrg */ 5357ec681f3Smrg uint16_t instance_divisor_is_one; /* bitmask of inputs */ 5367ec681f3Smrg uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ 5377ec681f3Smrg unsigned ls_vgpr_fix : 1; 53801e04c3fSmrg}; 539af69d88dSmrg 54001e04c3fSmrg/* Common TCS bits between the shader key and the epilog key. */ 54101e04c3fSmrgstruct si_tcs_epilog_bits { 5427ec681f3Smrg unsigned prim_mode : 3; 5437ec681f3Smrg unsigned invoc0_tess_factors_are_def : 1; 5447ec681f3Smrg unsigned tes_reads_tess_factors : 1; 545af69d88dSmrg}; 546af69d88dSmrg 54701e04c3fSmrgstruct si_gs_prolog_bits { 5487ec681f3Smrg unsigned tri_strip_adj_fix : 1; 54901e04c3fSmrg}; 55001e04c3fSmrg 55101e04c3fSmrg/* Common PS bits between the shader key and the prolog key. */ 55201e04c3fSmrgstruct si_ps_prolog_bits { 5537ec681f3Smrg unsigned color_two_side : 1; 5547ec681f3Smrg unsigned flatshade_colors : 1; 5557ec681f3Smrg unsigned poly_stipple : 1; 5567ec681f3Smrg unsigned force_persp_sample_interp : 1; 5577ec681f3Smrg unsigned force_linear_sample_interp : 1; 5587ec681f3Smrg unsigned force_persp_center_interp : 1; 5597ec681f3Smrg unsigned force_linear_center_interp : 1; 5607ec681f3Smrg unsigned bc_optimize_for_persp : 1; 5617ec681f3Smrg unsigned bc_optimize_for_linear : 1; 5627ec681f3Smrg unsigned samplemask_log_ps_iter : 3; 56301e04c3fSmrg}; 56401e04c3fSmrg 56501e04c3fSmrg/* Common PS bits between the shader key and the epilog key. 
*/ 56601e04c3fSmrgstruct si_ps_epilog_bits { 5677ec681f3Smrg unsigned spi_shader_col_format; 5687ec681f3Smrg unsigned color_is_int8 : 8; 5697ec681f3Smrg unsigned color_is_int10 : 8; 5707ec681f3Smrg unsigned last_cbuf : 3; 5717ec681f3Smrg unsigned alpha_func : 3; 5727ec681f3Smrg unsigned alpha_to_one : 1; 5737ec681f3Smrg unsigned poly_line_smoothing : 1; 5747ec681f3Smrg unsigned clamp_color : 1; 57501e04c3fSmrg}; 57601e04c3fSmrg 57701e04c3fSmrgunion si_shader_part_key { 5787ec681f3Smrg struct { 5797ec681f3Smrg struct si_vs_prolog_bits states; 5807ec681f3Smrg unsigned num_input_sgprs : 6; 5817ec681f3Smrg /* For merged stages such as LS-HS, HS input VGPRs are first. */ 5827ec681f3Smrg unsigned num_merged_next_stage_vgprs : 3; 5837ec681f3Smrg unsigned num_inputs : 5; 5847ec681f3Smrg unsigned as_ls : 1; 5857ec681f3Smrg unsigned as_es : 1; 5867ec681f3Smrg unsigned as_ngg : 1; 5877ec681f3Smrg unsigned load_vgprs_after_culling : 1; 5887ec681f3Smrg /* Prologs for monolithic shaders shouldn't set EXEC. */ 5897ec681f3Smrg unsigned is_monolithic : 1; 5907ec681f3Smrg } vs_prolog; 5917ec681f3Smrg struct { 5927ec681f3Smrg struct si_tcs_epilog_bits states; 5937ec681f3Smrg } tcs_epilog; 5947ec681f3Smrg struct { 5957ec681f3Smrg struct si_gs_prolog_bits states; 5967ec681f3Smrg unsigned as_ngg : 1; 5977ec681f3Smrg } gs_prolog; 5987ec681f3Smrg struct { 5997ec681f3Smrg struct si_ps_prolog_bits states; 6007ec681f3Smrg unsigned num_input_sgprs : 6; 6017ec681f3Smrg unsigned num_input_vgprs : 5; 6027ec681f3Smrg /* Color interpolation and two-side color selection. 
*/ 6037ec681f3Smrg unsigned colors_read : 8; /* color input components read */ 6047ec681f3Smrg unsigned num_interp_inputs : 5; /* BCOLOR is at this location */ 6057ec681f3Smrg unsigned face_vgpr_index : 5; 6067ec681f3Smrg unsigned ancillary_vgpr_index : 5; 6077ec681f3Smrg unsigned wqm : 1; 6087ec681f3Smrg char color_attr_index[2]; 6097ec681f3Smrg signed char color_interp_vgpr_index[2]; /* -1 == constant */ 6107ec681f3Smrg } ps_prolog; 6117ec681f3Smrg struct { 6127ec681f3Smrg struct si_ps_epilog_bits states; 6137ec681f3Smrg unsigned colors_written : 8; 6147ec681f3Smrg unsigned color_types : 16; 6157ec681f3Smrg unsigned writes_z : 1; 6167ec681f3Smrg unsigned writes_stencil : 1; 6177ec681f3Smrg unsigned writes_samplemask : 1; 6187ec681f3Smrg } ps_epilog; 619af69d88dSmrg}; 620af69d88dSmrg 62101e04c3fSmrgstruct si_shader_key { 6227ec681f3Smrg /* Prolog and epilog flags. */ 6237ec681f3Smrg union { 6247ec681f3Smrg struct { 6257ec681f3Smrg struct si_vs_prolog_bits prolog; 6267ec681f3Smrg } vs; 6277ec681f3Smrg struct { 6287ec681f3Smrg struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ 6297ec681f3Smrg struct si_shader_selector *ls; /* for merged LS-HS */ 6307ec681f3Smrg struct si_tcs_epilog_bits epilog; 6317ec681f3Smrg } tcs; /* tessellation control shader */ 6327ec681f3Smrg struct { 6337ec681f3Smrg struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ 6347ec681f3Smrg struct si_shader_selector *es; /* for merged ES-GS */ 6357ec681f3Smrg struct si_gs_prolog_bits prolog; 6367ec681f3Smrg } gs; 6377ec681f3Smrg struct { 6387ec681f3Smrg struct si_ps_prolog_bits prolog; 6397ec681f3Smrg struct si_ps_epilog_bits epilog; 6407ec681f3Smrg } ps; 6417ec681f3Smrg } part; 6427ec681f3Smrg 6437ec681f3Smrg /* These three are initially set according to the NEXT_SHADER property, 6447ec681f3Smrg * or guessed if the property doesn't seem correct. 
6457ec681f3Smrg */ 6467ec681f3Smrg unsigned as_es : 1; /* whether it's a shader before GS */ 6477ec681f3Smrg unsigned as_ls : 1; /* whether it's VS before TCS */ 6487ec681f3Smrg unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled, 6497ec681f3Smrg also set for the stage right before GS */ 6507ec681f3Smrg 6517ec681f3Smrg /* Flags for monolithic compilation only. */ 6527ec681f3Smrg struct { 6537ec681f3Smrg /* Whether fetch should be opencoded according to vs_fix_fetch. 6547ec681f3Smrg * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw 6557ec681f3Smrg * with minimal fixups is used. */ 6567ec681f3Smrg uint16_t vs_fetch_opencode; 6577ec681f3Smrg union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; 6587ec681f3Smrg 6597ec681f3Smrg union { 6607ec681f3Smrg uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ 6617ec681f3Smrg /* When PS needs PrimID and GS is disabled. */ 6627ec681f3Smrg unsigned vs_export_prim_id : 1; 6637ec681f3Smrg struct { 6647ec681f3Smrg unsigned interpolate_at_sample_force_center : 1; 6657ec681f3Smrg unsigned fbfetch_msaa : 1; 6667ec681f3Smrg unsigned fbfetch_is_1D : 1; 6677ec681f3Smrg unsigned fbfetch_layered : 1; 6687ec681f3Smrg } ps; 6697ec681f3Smrg } u; 6707ec681f3Smrg } mono; 6717ec681f3Smrg 6727ec681f3Smrg /* Optimization flags for asynchronous compilation only. */ 6737ec681f3Smrg struct { 6747ec681f3Smrg /* For HW VS (it can be VS, TES, GS) */ 6757ec681f3Smrg uint64_t kill_outputs; /* "get_unique_index" bits */ 6767ec681f3Smrg unsigned kill_clip_distances : 8; 6777ec681f3Smrg unsigned kill_pointsize : 1; 6787ec681f3Smrg 6797ec681f3Smrg /* For NGG VS and TES. */ 6807ec681f3Smrg unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ 6817ec681f3Smrg 6827ec681f3Smrg /* For shaders where monolithic variants have better code. 
6837ec681f3Smrg * 6847ec681f3Smrg * This is a flag that has no effect on code generation, 6857ec681f3Smrg * but forces monolithic shaders to be used as soon as 6867ec681f3Smrg * possible, because it's in the "opt" group. 6877ec681f3Smrg */ 6887ec681f3Smrg unsigned prefer_mono : 1; 6897ec681f3Smrg 6907ec681f3Smrg /* VS and TCS have the same number of patch vertices. */ 6917ec681f3Smrg unsigned same_patch_vertices:1; 6927ec681f3Smrg 6937ec681f3Smrg unsigned inline_uniforms:1; 6947ec681f3Smrg 6957ec681f3Smrg /* This must be kept last to limit the number of variants 6967ec681f3Smrg * depending only on the uniform values. 6977ec681f3Smrg */ 6987ec681f3Smrg uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; 6997ec681f3Smrg } opt; 70001e04c3fSmrg}; 70101e04c3fSmrg 70201e04c3fSmrg/* Restore the pack alignment to default. */ 70301e04c3fSmrg#pragma pack(pop) 70401e04c3fSmrg 7057ec681f3Smrg/* GCN-specific shader info. */ 7067ec681f3Smrgstruct si_shader_binary_info { 7077ec681f3Smrg ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; 7087ec681f3Smrg uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; 7097ec681f3Smrg ubyte num_input_sgprs; 7107ec681f3Smrg ubyte num_input_vgprs; 7117ec681f3Smrg signed char face_vgpr_index; 7127ec681f3Smrg signed char ancillary_vgpr_index; 7137ec681f3Smrg bool uses_instanceid; 7147ec681f3Smrg ubyte nr_pos_exports; 7157ec681f3Smrg ubyte nr_param_exports; 7167ec681f3Smrg unsigned private_mem_vgprs; 7177ec681f3Smrg unsigned max_simd_waves; 71801e04c3fSmrg}; 71901e04c3fSmrg 7207ec681f3Smrgstruct si_shader_binary { 7217ec681f3Smrg const char *elf_buffer; 7227ec681f3Smrg size_t elf_size; 7237ec681f3Smrg 7247ec681f3Smrg char *uploaded_code; 7257ec681f3Smrg size_t uploaded_code_size; 7267ec681f3Smrg 7277ec681f3Smrg char *llvm_ir_string; 7287ec681f3Smrg}; 7297ec681f3Smrg 7307ec681f3Smrgstruct gfx9_gs_info { 7317ec681f3Smrg unsigned es_verts_per_subgroup; 7327ec681f3Smrg unsigned gs_prims_per_subgroup; 7337ec681f3Smrg unsigned 
gs_inst_prims_in_subgroup; 7347ec681f3Smrg unsigned max_prims_per_subgroup; 7357ec681f3Smrg unsigned esgs_ring_size; /* in bytes */ 7367ec681f3Smrg}; 7377ec681f3Smrg 7387ec681f3Smrg#define SI_NUM_VGT_STAGES_KEY_BITS 5 7397ec681f3Smrg#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) 7407ec681f3Smrg 7417ec681f3Smrg/* The VGT_SHADER_STAGES key used to index the table of precomputed values. 7427ec681f3Smrg * Some fields are set by state-change calls, most are set by draw_vbo. 7437ec681f3Smrg */ 7447ec681f3Smrgunion si_vgt_stages_key { 7457ec681f3Smrg struct { 7467ec681f3Smrg#if UTIL_ARCH_LITTLE_ENDIAN 7477ec681f3Smrg uint8_t tess : 1; 7487ec681f3Smrg uint8_t gs : 1; 7497ec681f3Smrg uint8_t ngg_passthrough : 1; 7507ec681f3Smrg uint8_t ngg : 1; /* gfx10+ */ 7517ec681f3Smrg uint8_t streamout : 1; /* only used with NGG */ 7527ec681f3Smrg uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; 7537ec681f3Smrg#else /* UTIL_ARCH_BIG_ENDIAN */ 7547ec681f3Smrg uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; 7557ec681f3Smrg uint8_t streamout : 1; 7567ec681f3Smrg uint8_t ngg : 1; 7577ec681f3Smrg uint8_t ngg_passthrough : 1; 7587ec681f3Smrg uint8_t gs : 1; 7597ec681f3Smrg uint8_t tess : 1; 7607ec681f3Smrg#endif 7617ec681f3Smrg } u; 7627ec681f3Smrg uint8_t index; 76301e04c3fSmrg}; 76401e04c3fSmrg 76501e04c3fSmrgstruct si_shader { 7667ec681f3Smrg struct si_pm4_state pm4; /* base class */ 7677ec681f3Smrg struct si_compiler_ctx_state compiler_ctx_state; 7687ec681f3Smrg 7697ec681f3Smrg struct si_shader_selector *selector; 7707ec681f3Smrg struct si_shader_selector *previous_stage_sel; /* for refcounting */ 7717ec681f3Smrg struct si_shader *next_variant; 7727ec681f3Smrg 7737ec681f3Smrg struct si_shader_part *prolog; 7747ec681f3Smrg struct si_shader *previous_stage; /* for GFX9 */ 7757ec681f3Smrg struct si_shader_part *prolog2; 7767ec681f3Smrg struct si_shader_part *epilog; 7777ec681f3Smrg 7787ec681f3Smrg struct si_resource *bo; 7797ec681f3Smrg struct si_resource *scratch_bo; 
7807ec681f3Smrg struct si_shader_key key; 7817ec681f3Smrg struct util_queue_fence ready; 7827ec681f3Smrg bool compilation_failed; 7837ec681f3Smrg bool is_monolithic; 7847ec681f3Smrg bool is_optimized; 7857ec681f3Smrg bool is_binary_shared; 7867ec681f3Smrg bool is_gs_copy_shader; 7877ec681f3Smrg 7887ec681f3Smrg /* The following data is all that's needed for binary shaders. */ 7897ec681f3Smrg struct si_shader_binary binary; 7907ec681f3Smrg struct ac_shader_config config; 7917ec681f3Smrg struct si_shader_binary_info info; 7927ec681f3Smrg 7937ec681f3Smrg /* SI_SGPR_VS_STATE_BITS */ 7947ec681f3Smrg bool uses_vs_state_provoking_vertex; 7957ec681f3Smrg bool uses_vs_state_outprim; 7967ec681f3Smrg 7977ec681f3Smrg bool uses_base_instance; 7987ec681f3Smrg 7997ec681f3Smrg struct { 8007ec681f3Smrg uint16_t ngg_emit_size; /* in dwords */ 8017ec681f3Smrg uint16_t hw_max_esverts; 8027ec681f3Smrg uint16_t max_gsprims; 8037ec681f3Smrg uint16_t max_out_verts; 8047ec681f3Smrg uint16_t prim_amp_factor; 8057ec681f3Smrg bool max_vert_out_per_gs_instance; 8067ec681f3Smrg } ngg; 8077ec681f3Smrg 8087ec681f3Smrg /* Shader key + LLVM IR + disassembly + statistics. 8097ec681f3Smrg * Generated for debug contexts only. 8107ec681f3Smrg */ 8117ec681f3Smrg char *shader_log; 8127ec681f3Smrg size_t shader_log_size; 8137ec681f3Smrg 8147ec681f3Smrg struct gfx9_gs_info gs_info; 8157ec681f3Smrg 8167ec681f3Smrg /* For save precompute context registers values. 
*/ 8177ec681f3Smrg union { 8187ec681f3Smrg struct { 8197ec681f3Smrg unsigned vgt_gsvs_ring_offset_1; 8207ec681f3Smrg unsigned vgt_gsvs_ring_offset_2; 8217ec681f3Smrg unsigned vgt_gsvs_ring_offset_3; 8227ec681f3Smrg unsigned vgt_gsvs_ring_itemsize; 8237ec681f3Smrg unsigned vgt_gs_max_vert_out; 8247ec681f3Smrg unsigned vgt_gs_vert_itemsize; 8257ec681f3Smrg unsigned vgt_gs_vert_itemsize_1; 8267ec681f3Smrg unsigned vgt_gs_vert_itemsize_2; 8277ec681f3Smrg unsigned vgt_gs_vert_itemsize_3; 8287ec681f3Smrg unsigned vgt_gs_instance_cnt; 8297ec681f3Smrg unsigned vgt_gs_onchip_cntl; 8307ec681f3Smrg unsigned vgt_gs_max_prims_per_subgroup; 8317ec681f3Smrg unsigned vgt_esgs_ring_itemsize; 8327ec681f3Smrg unsigned spi_shader_pgm_rsrc3_gs; 8337ec681f3Smrg unsigned spi_shader_pgm_rsrc4_gs; 8347ec681f3Smrg } gs; 8357ec681f3Smrg 8367ec681f3Smrg struct { 8377ec681f3Smrg unsigned ge_max_output_per_subgroup; 8387ec681f3Smrg unsigned ge_ngg_subgrp_cntl; 8397ec681f3Smrg unsigned vgt_primitiveid_en; 8407ec681f3Smrg unsigned vgt_gs_onchip_cntl; 8417ec681f3Smrg unsigned vgt_gs_instance_cnt; 8427ec681f3Smrg unsigned vgt_esgs_ring_itemsize; 8437ec681f3Smrg unsigned spi_vs_out_config; 8447ec681f3Smrg unsigned spi_shader_idx_format; 8457ec681f3Smrg unsigned spi_shader_pos_format; 8467ec681f3Smrg unsigned pa_cl_vte_cntl; 8477ec681f3Smrg unsigned pa_cl_ngg_cntl; 8487ec681f3Smrg unsigned vgt_gs_max_vert_out; /* for API GS */ 8497ec681f3Smrg unsigned ge_pc_alloc; /* uconfig register */ 8507ec681f3Smrg unsigned spi_shader_pgm_rsrc3_gs; 8517ec681f3Smrg unsigned spi_shader_pgm_rsrc4_gs; 8527ec681f3Smrg union si_vgt_stages_key vgt_stages; 8537ec681f3Smrg } ngg; 8547ec681f3Smrg 8557ec681f3Smrg struct { 8567ec681f3Smrg unsigned vgt_gs_mode; 8577ec681f3Smrg unsigned vgt_primitiveid_en; 8587ec681f3Smrg unsigned vgt_reuse_off; 8597ec681f3Smrg unsigned spi_vs_out_config; 8607ec681f3Smrg unsigned spi_shader_pos_format; 8617ec681f3Smrg unsigned pa_cl_vte_cntl; 8627ec681f3Smrg unsigned ge_pc_alloc; /* uconfig 
register */ 8637ec681f3Smrg } vs; 8647ec681f3Smrg 8657ec681f3Smrg struct { 8667ec681f3Smrg unsigned spi_ps_input_ena; 8677ec681f3Smrg unsigned spi_ps_input_addr; 8687ec681f3Smrg unsigned spi_baryc_cntl; 8697ec681f3Smrg unsigned spi_ps_in_control; 8707ec681f3Smrg unsigned spi_shader_z_format; 8717ec681f3Smrg unsigned spi_shader_col_format; 8727ec681f3Smrg unsigned cb_shader_mask; 8737ec681f3Smrg unsigned num_interp; 8747ec681f3Smrg } ps; 8757ec681f3Smrg } ctx_reg; 8767ec681f3Smrg 8777ec681f3Smrg /*For save precompute registers value */ 8787ec681f3Smrg unsigned vgt_tf_param; /* VGT_TF_PARAM */ 8797ec681f3Smrg unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ 8807ec681f3Smrg unsigned pa_cl_vs_out_cntl; 8817ec681f3Smrg unsigned ge_cntl; 882af69d88dSmrg}; 883af69d88dSmrg 88401e04c3fSmrgstruct si_shader_part { 8857ec681f3Smrg struct si_shader_part *next; 8867ec681f3Smrg union si_shader_part_key key; 8877ec681f3Smrg struct si_shader_binary binary; 8887ec681f3Smrg struct ac_shader_config config; 88901e04c3fSmrg}; 89001e04c3fSmrg 89101e04c3fSmrg/* si_shader.c */ 8927ec681f3Smrgbool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 8937ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug); 8947ec681f3Smrgbool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 8957ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug); 89601e04c3fSmrgvoid si_shader_destroy(struct si_shader *shader); 8977ec681f3Smrgunsigned si_shader_io_get_unique_index_patch(unsigned semantic); 8987ec681f3Smrgunsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying); 8997ec681f3Smrgbool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, 9007ec681f3Smrg uint64_t scratch_va); 9017ec681f3Smrgvoid si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, 9027ec681f3Smrg struct pipe_debug_callback *debug, FILE *f, bool check_debug_option); 
9037ec681f3Smrgvoid si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, 9047ec681f3Smrg struct pipe_debug_callback *debug); 9057ec681f3Smrgvoid si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); 9067ec681f3Smrgconst char *si_get_shader_name(const struct si_shader *shader); 9077ec681f3Smrgvoid si_shader_binary_clean(struct si_shader_binary *binary); 9087ec681f3Smrg 9097ec681f3Smrg/* si_shader_llvm_gs.c */ 9107ec681f3Smrgstruct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, 9117ec681f3Smrg struct ac_llvm_compiler *compiler, 9127ec681f3Smrg struct si_shader_selector *gs_selector, 9137ec681f3Smrg struct pipe_debug_callback *debug); 91401e04c3fSmrg 91501e04c3fSmrg/* si_shader_nir.c */ 9167ec681f3Smrgvoid si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); 9177ec681f3Smrgvoid si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); 9187ec681f3Smrgvoid si_nir_late_opts(nir_shader *nir); 9197ec681f3Smrgchar *si_finalize_nir(struct pipe_screen *screen, void *nirptr); 9207ec681f3Smrg 9217ec681f3Smrg/* si_state_shaders.c */ 9227ec681f3Smrgvoid gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, 9237ec681f3Smrg struct gfx9_gs_info *out); 9247ec681f3Smrgbool gfx10_is_ngg_passthrough(struct si_shader *shader); 92501e04c3fSmrg 92601e04c3fSmrg/* Inline helpers. */ 92701e04c3fSmrg 92801e04c3fSmrg/* Return the pointer to the main shader part's pointer. 
*/ 9297ec681f3Smrgstatic inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, 9307ec681f3Smrg const struct si_shader_key *key) 931af69d88dSmrg{ 9327ec681f3Smrg if (key->as_ls) 9337ec681f3Smrg return &sel->main_shader_part_ls; 9347ec681f3Smrg if (key->as_es && key->as_ngg) 9357ec681f3Smrg return &sel->main_shader_part_ngg_es; 9367ec681f3Smrg if (key->as_es) 9377ec681f3Smrg return &sel->main_shader_part_es; 9387ec681f3Smrg if (key->as_ngg) 9397ec681f3Smrg return &sel->main_shader_part_ngg; 9407ec681f3Smrg return &sel->main_shader_part; 941af69d88dSmrg} 942af69d88dSmrg 9437ec681f3Smrgstatic inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) 94401e04c3fSmrg{ 9457ec681f3Smrg return selector ? selector->info.uses_bindless_samplers : false; 94601e04c3fSmrg} 94701e04c3fSmrg 9487ec681f3Smrgstatic inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector) 94901e04c3fSmrg{ 9507ec681f3Smrg return selector ? selector->info.uses_bindless_images : false; 95101e04c3fSmrg} 95201e04c3fSmrg 9537ec681f3Smrgstatic inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) 9547ec681f3Smrg{ 9557ec681f3Smrg if (shader->selector->info.stage == MESA_SHADER_VERTEX && 9567ec681f3Smrg !shader->selector->info.base.vs.blit_sgprs_amd && 9577ec681f3Smrg !(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES)) 9587ec681f3Smrg return true; 9597ec681f3Smrg 9607ec681f3Smrg return false; 9617ec681f3Smrg} 96201e04c3fSmrg 9637ec681f3Smrgstatic inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) 96401e04c3fSmrg{ 9657ec681f3Smrg return gfx10_edgeflags_have_effect(shader) && 9667ec681f3Smrg shader->selector->info.writes_edgeflag; 9677ec681f3Smrg} 96801e04c3fSmrg 9697ec681f3Smrg#ifdef __cplusplus 97001e04c3fSmrg} 9717ec681f3Smrg#endif 972af69d88dSmrg 973af69d88dSmrg#endif 974