101e04c3fSmrg/* 201e04c3fSmrg * Copyright 2012 Advanced Micro Devices, Inc. 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg */ 2301e04c3fSmrg 247ec681f3Smrg#include "ac_shader_util.h" 257ec681f3Smrg#include "ac_gpu_info.h" 267ec681f3Smrg 277ec681f3Smrg#include "sid.h" 287ec681f3Smrg#include "u_math.h" 297ec681f3Smrg 3001e04c3fSmrg#include <assert.h> 3101e04c3fSmrg#include <stdlib.h> 3201e04c3fSmrg#include <string.h> 3301e04c3fSmrg 347ec681f3Smrgunsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask) 3501e04c3fSmrg{ 367ec681f3Smrg if (writes_z) { 377ec681f3Smrg /* Z needs 32 bits. */ 387ec681f3Smrg if (writes_samplemask) 397ec681f3Smrg return V_028710_SPI_SHADER_32_ABGR; 407ec681f3Smrg else if (writes_stencil) 417ec681f3Smrg return V_028710_SPI_SHADER_32_GR; 427ec681f3Smrg else 437ec681f3Smrg return V_028710_SPI_SHADER_32_R; 447ec681f3Smrg } else if (writes_stencil || writes_samplemask) { 457ec681f3Smrg /* Both stencil and sample mask need only 16 bits. */ 467ec681f3Smrg return V_028710_SPI_SHADER_UINT16_ABGR; 477ec681f3Smrg } else { 487ec681f3Smrg return V_028710_SPI_SHADER_ZERO; 497ec681f3Smrg } 5001e04c3fSmrg} 5101e04c3fSmrg 527ec681f3Smrgunsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format) 5301e04c3fSmrg{ 547ec681f3Smrg unsigned i, cb_shader_mask = 0; 557ec681f3Smrg 567ec681f3Smrg for (i = 0; i < 8; i++) { 577ec681f3Smrg switch ((spi_shader_col_format >> (i * 4)) & 0xf) { 587ec681f3Smrg case V_028714_SPI_SHADER_ZERO: 597ec681f3Smrg break; 607ec681f3Smrg case V_028714_SPI_SHADER_32_R: 617ec681f3Smrg cb_shader_mask |= 0x1 << (i * 4); 627ec681f3Smrg break; 637ec681f3Smrg case V_028714_SPI_SHADER_32_GR: 647ec681f3Smrg cb_shader_mask |= 0x3 << (i * 4); 657ec681f3Smrg break; 667ec681f3Smrg case V_028714_SPI_SHADER_32_AR: 677ec681f3Smrg cb_shader_mask |= 0x9u << (i * 4); 687ec681f3Smrg break; 697ec681f3Smrg case V_028714_SPI_SHADER_FP16_ABGR: 707ec681f3Smrg case V_028714_SPI_SHADER_UNORM16_ABGR: 717ec681f3Smrg case V_028714_SPI_SHADER_SNORM16_ABGR: 727ec681f3Smrg case V_028714_SPI_SHADER_UINT16_ABGR: 737ec681f3Smrg case V_028714_SPI_SHADER_SINT16_ABGR: 747ec681f3Smrg case V_028714_SPI_SHADER_32_ABGR: 757ec681f3Smrg cb_shader_mask |= 0xfu << (i * 4); 767ec681f3Smrg break; 777ec681f3Smrg default: 787ec681f3Smrg assert(0); 797ec681f3Smrg } 807ec681f3Smrg } 817ec681f3Smrg return cb_shader_mask; 8201e04c3fSmrg} 8301e04c3fSmrg 8401e04c3fSmrg/** 8501e04c3fSmrg * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a 8601e04c3fSmrg * geometry shader. 8701e04c3fSmrg */ 887ec681f3Smrguint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class) 897ec681f3Smrg{ 907ec681f3Smrg unsigned cut_mode; 917ec681f3Smrg 927ec681f3Smrg if (gs_max_vert_out <= 128) { 937ec681f3Smrg cut_mode = V_028A40_GS_CUT_128; 947ec681f3Smrg } else if (gs_max_vert_out <= 256) { 957ec681f3Smrg cut_mode = V_028A40_GS_CUT_256; 967ec681f3Smrg } else if (gs_max_vert_out <= 512) { 977ec681f3Smrg cut_mode = V_028A40_GS_CUT_512; 987ec681f3Smrg } else { 997ec681f3Smrg assert(gs_max_vert_out <= 1024); 1007ec681f3Smrg cut_mode = V_028A40_GS_CUT_1024; 1017ec681f3Smrg } 1027ec681f3Smrg 1037ec681f3Smrg return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) | 1047ec681f3Smrg S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) | 1057ec681f3Smrg S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0); 1067ec681f3Smrg} 1077ec681f3Smrg 1087ec681f3Smrg/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format 1097ec681f3Smrg/// value for LLVM8+ tbuffer intrinsics. 1107ec681f3Smrgunsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt) 1117ec681f3Smrg{ 1127ec681f3Smrg // Some games try to access vertex buffers without a valid format. 1137ec681f3Smrg // This is a game bug, but we should still handle it gracefully. 1147ec681f3Smrg if (dfmt == V_008F0C_GFX10_FORMAT_INVALID) 1157ec681f3Smrg return V_008F0C_GFX10_FORMAT_INVALID; 1167ec681f3Smrg 1177ec681f3Smrg if (chip_class >= GFX10) { 1187ec681f3Smrg unsigned format; 1197ec681f3Smrg switch (dfmt) { 1207ec681f3Smrg default: 1217ec681f3Smrg unreachable("bad dfmt"); 1227ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_INVALID: 1237ec681f3Smrg format = V_008F0C_GFX10_FORMAT_INVALID; 1247ec681f3Smrg break; 1257ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_8: 1267ec681f3Smrg format = V_008F0C_GFX10_FORMAT_8_UINT; 1277ec681f3Smrg break; 1287ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_8_8: 1297ec681f3Smrg format = V_008F0C_GFX10_FORMAT_8_8_UINT; 1307ec681f3Smrg break; 1317ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: 1327ec681f3Smrg format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT; 1337ec681f3Smrg break; 1347ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_16: 1357ec681f3Smrg format = V_008F0C_GFX10_FORMAT_16_UINT; 1367ec681f3Smrg break; 1377ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_16_16: 1387ec681f3Smrg format = V_008F0C_GFX10_FORMAT_16_16_UINT; 1397ec681f3Smrg break; 1407ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: 1417ec681f3Smrg format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT; 1427ec681f3Smrg break; 1437ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_32: 1447ec681f3Smrg format = V_008F0C_GFX10_FORMAT_32_UINT; 1457ec681f3Smrg break; 1467ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_32_32: 1477ec681f3Smrg format = V_008F0C_GFX10_FORMAT_32_32_UINT; 1487ec681f3Smrg break; 1497ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_32_32_32: 1507ec681f3Smrg format = V_008F0C_GFX10_FORMAT_32_32_32_UINT; 1517ec681f3Smrg break; 1527ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: 1537ec681f3Smrg format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT; 1547ec681f3Smrg break; 1557ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: 1567ec681f3Smrg format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT; 1577ec681f3Smrg break; 1587ec681f3Smrg case V_008F0C_BUF_DATA_FORMAT_10_11_11: 1597ec681f3Smrg format = V_008F0C_GFX10_FORMAT_10_11_11_UINT; 1607ec681f3Smrg break; 1617ec681f3Smrg } 1627ec681f3Smrg 1637ec681f3Smrg // Use the regularity properties of the combined format enum. 1647ec681f3Smrg // 1657ec681f3Smrg // Note: float is incompatible with 8-bit data formats, 1667ec681f3Smrg // [us]{norm,scaled} are incomparible with 32-bit data formats. 1677ec681f3Smrg // [us]scaled are not writable. 1687ec681f3Smrg switch (nfmt) { 1697ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_UNORM: 1707ec681f3Smrg format -= 4; 1717ec681f3Smrg break; 1727ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_SNORM: 1737ec681f3Smrg format -= 3; 1747ec681f3Smrg break; 1757ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_USCALED: 1767ec681f3Smrg format -= 2; 1777ec681f3Smrg break; 1787ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_SSCALED: 1797ec681f3Smrg format -= 1; 1807ec681f3Smrg break; 1817ec681f3Smrg default: 1827ec681f3Smrg unreachable("bad nfmt"); 1837ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_UINT: 1847ec681f3Smrg break; 1857ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_SINT: 1867ec681f3Smrg format += 1; 1877ec681f3Smrg break; 1887ec681f3Smrg case V_008F0C_BUF_NUM_FORMAT_FLOAT: 1897ec681f3Smrg format += 2; 1907ec681f3Smrg break; 1917ec681f3Smrg } 1927ec681f3Smrg 1937ec681f3Smrg return format; 1947ec681f3Smrg } else { 1957ec681f3Smrg return dfmt | (nfmt << 4); 1967ec681f3Smrg } 1977ec681f3Smrg} 1987ec681f3Smrg 1997ec681f3Smrgstatic const struct ac_data_format_info data_format_table[] = { 2007ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID}, 2017ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8}, 2027ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16}, 2037ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8}, 2047ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32}, 2057ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16}, 2067ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11}, 2077ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10}, 2087ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2}, 2097ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10}, 2107ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8}, 2117ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32}, 2127ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16}, 2137ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32}, 2147ec681f3Smrg [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32}, 2157ec681f3Smrg}; 2167ec681f3Smrg 2177ec681f3Smrgconst struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt) 2187ec681f3Smrg{ 2197ec681f3Smrg assert(dfmt < ARRAY_SIZE(data_format_table)); 2207ec681f3Smrg return &data_format_table[dfmt]; 2217ec681f3Smrg} 2227ec681f3Smrg 2237ec681f3Smrgenum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, 2247ec681f3Smrg bool is_array) 2257ec681f3Smrg{ 2267ec681f3Smrg switch (dim) { 2277ec681f3Smrg case GLSL_SAMPLER_DIM_1D: 2287ec681f3Smrg if (chip_class == GFX9) 2297ec681f3Smrg return is_array ? ac_image_2darray : ac_image_2d; 2307ec681f3Smrg return is_array ? ac_image_1darray : ac_image_1d; 2317ec681f3Smrg case GLSL_SAMPLER_DIM_2D: 2327ec681f3Smrg case GLSL_SAMPLER_DIM_RECT: 2337ec681f3Smrg case GLSL_SAMPLER_DIM_EXTERNAL: 2347ec681f3Smrg return is_array ? ac_image_2darray : ac_image_2d; 2357ec681f3Smrg case GLSL_SAMPLER_DIM_3D: 2367ec681f3Smrg return ac_image_3d; 2377ec681f3Smrg case GLSL_SAMPLER_DIM_CUBE: 2387ec681f3Smrg return ac_image_cube; 2397ec681f3Smrg case GLSL_SAMPLER_DIM_MS: 2407ec681f3Smrg return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; 2417ec681f3Smrg case GLSL_SAMPLER_DIM_SUBPASS: 2427ec681f3Smrg return ac_image_2darray; 2437ec681f3Smrg case GLSL_SAMPLER_DIM_SUBPASS_MS: 2447ec681f3Smrg return ac_image_2darraymsaa; 2457ec681f3Smrg default: 2467ec681f3Smrg unreachable("bad sampler dim"); 2477ec681f3Smrg } 2487ec681f3Smrg} 2497ec681f3Smrg 2507ec681f3Smrgenum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, 2517ec681f3Smrg bool is_array) 2527ec681f3Smrg{ 2537ec681f3Smrg enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array); 2547ec681f3Smrg 2557ec681f3Smrg /* Match the resource type set in the descriptor. */ 2567ec681f3Smrg if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d)) 2577ec681f3Smrg dim = ac_image_2darray; 2587ec681f3Smrg else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) { 2597ec681f3Smrg /* When a single layer of a 3D texture is bound, the shader 2607ec681f3Smrg * will refer to a 2D target, but the descriptor has a 3D type. 2617ec681f3Smrg * Since the HW ignores BASE_ARRAY in this case, we need to 2627ec681f3Smrg * send 3 coordinates. This doesn't hurt when the underlying 2637ec681f3Smrg * texture is non-3D. 2647ec681f3Smrg */ 2657ec681f3Smrg dim = ac_image_3d; 2667ec681f3Smrg } 2677ec681f3Smrg 2687ec681f3Smrg return dim; 2697ec681f3Smrg} 2707ec681f3Smrg 2717ec681f3Smrgunsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, 2727ec681f3Smrg signed char *face_vgpr_index_ptr, 2737ec681f3Smrg signed char *ancillary_vgpr_index_ptr) 2747ec681f3Smrg{ 2757ec681f3Smrg unsigned num_input_vgprs = 0; 2767ec681f3Smrg signed char face_vgpr_index = -1; 2777ec681f3Smrg signed char ancillary_vgpr_index = -1; 2787ec681f3Smrg 2797ec681f3Smrg if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) 2807ec681f3Smrg num_input_vgprs += 2; 2817ec681f3Smrg if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) 2827ec681f3Smrg num_input_vgprs += 2; 2837ec681f3Smrg if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) 2847ec681f3Smrg num_input_vgprs += 2; 2857ec681f3Smrg if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) 2867ec681f3Smrg num_input_vgprs += 3; 2877ec681f3Smrg if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) 2887ec681f3Smrg num_input_vgprs += 2; 2897ec681f3Smrg if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) 2907ec681f3Smrg num_input_vgprs += 2; 2917ec681f3Smrg if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) 2927ec681f3Smrg num_input_vgprs += 2; 2937ec681f3Smrg if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) 2947ec681f3Smrg num_input_vgprs += 1; 2957ec681f3Smrg if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) 2967ec681f3Smrg num_input_vgprs += 1; 2977ec681f3Smrg if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) 2987ec681f3Smrg num_input_vgprs += 1; 2997ec681f3Smrg if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) 3007ec681f3Smrg num_input_vgprs += 1; 3017ec681f3Smrg if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) 3027ec681f3Smrg num_input_vgprs += 1; 3037ec681f3Smrg if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) { 3047ec681f3Smrg face_vgpr_index = num_input_vgprs; 3057ec681f3Smrg num_input_vgprs += 1; 3067ec681f3Smrg } 3077ec681f3Smrg if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) { 3087ec681f3Smrg ancillary_vgpr_index = num_input_vgprs; 3097ec681f3Smrg num_input_vgprs += 1; 3107ec681f3Smrg } 3117ec681f3Smrg if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) 3127ec681f3Smrg num_input_vgprs += 1; 3137ec681f3Smrg if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) 3147ec681f3Smrg num_input_vgprs += 1; 3157ec681f3Smrg 3167ec681f3Smrg if (face_vgpr_index_ptr) 3177ec681f3Smrg *face_vgpr_index_ptr = face_vgpr_index; 3187ec681f3Smrg if (ancillary_vgpr_index_ptr) 3197ec681f3Smrg *ancillary_vgpr_index_ptr = ancillary_vgpr_index; 3207ec681f3Smrg 3217ec681f3Smrg return num_input_vgprs; 3227ec681f3Smrg} 3237ec681f3Smrg 3247ec681f3Smrgvoid ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, 3257ec681f3Smrg bool is_depth, bool use_rbplus, 3267ec681f3Smrg struct ac_spi_color_formats *formats) 32701e04c3fSmrg{ 3287ec681f3Smrg /* Alpha is needed for alpha-to-coverage. 3297ec681f3Smrg * Blending may be with or without alpha. 3307ec681f3Smrg */ 3317ec681f3Smrg unsigned normal = 0; /* most optimal, may not support blending or export alpha */ 3327ec681f3Smrg unsigned alpha = 0; /* exports alpha, but may not support blending */ 3337ec681f3Smrg unsigned blend = 0; /* supports blending, but may not export alpha */ 3347ec681f3Smrg unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ 3357ec681f3Smrg 3367ec681f3Smrg /* Choose the SPI color formats. These are required values for RB+. 3377ec681f3Smrg * Other chips have multiple choices, though they are not necessarily better. 3387ec681f3Smrg */ 3397ec681f3Smrg switch (format) { 3407ec681f3Smrg case V_028C70_COLOR_5_6_5: 3417ec681f3Smrg case V_028C70_COLOR_1_5_5_5: 3427ec681f3Smrg case V_028C70_COLOR_5_5_5_1: 3437ec681f3Smrg case V_028C70_COLOR_4_4_4_4: 3447ec681f3Smrg case V_028C70_COLOR_10_11_11: 3457ec681f3Smrg case V_028C70_COLOR_11_11_10: 3467ec681f3Smrg case V_028C70_COLOR_5_9_9_9: 3477ec681f3Smrg case V_028C70_COLOR_8: 3487ec681f3Smrg case V_028C70_COLOR_8_8: 3497ec681f3Smrg case V_028C70_COLOR_8_8_8_8: 3507ec681f3Smrg case V_028C70_COLOR_10_10_10_2: 3517ec681f3Smrg case V_028C70_COLOR_2_10_10_10: 3527ec681f3Smrg if (ntype == V_028C70_NUMBER_UINT) 3537ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; 3547ec681f3Smrg else if (ntype == V_028C70_NUMBER_SINT) 3557ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; 3567ec681f3Smrg else 3577ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; 3587ec681f3Smrg 3597ec681f3Smrg if (!use_rbplus && format == V_028C70_COLOR_8 && 3607ec681f3Smrg ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ { 3617ec681f3Smrg /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x 3627ec681f3Smrg * exporting performance. Otherwise, use 32_R to remove useless 3637ec681f3Smrg * instructions needed for 16-bit compressed exports. 3647ec681f3Smrg */ 3657ec681f3Smrg blend = normal = V_028714_SPI_SHADER_32_R; 3667ec681f3Smrg } 3677ec681f3Smrg break; 3687ec681f3Smrg 3697ec681f3Smrg case V_028C70_COLOR_16: 3707ec681f3Smrg case V_028C70_COLOR_16_16: 3717ec681f3Smrg case V_028C70_COLOR_16_16_16_16: 3727ec681f3Smrg if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) { 3737ec681f3Smrg /* UNORM16 and SNORM16 don't support blending */ 3747ec681f3Smrg if (ntype == V_028C70_NUMBER_UNORM) 3757ec681f3Smrg normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; 3767ec681f3Smrg else 3777ec681f3Smrg normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; 3787ec681f3Smrg 3797ec681f3Smrg /* Use 32 bits per channel for blending. */ 3807ec681f3Smrg if (format == V_028C70_COLOR_16) { 3817ec681f3Smrg if (swap == V_028C70_SWAP_STD) { /* R */ 3827ec681f3Smrg blend = V_028714_SPI_SHADER_32_R; 3837ec681f3Smrg blend_alpha = V_028714_SPI_SHADER_32_AR; 3847ec681f3Smrg } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ 3857ec681f3Smrg blend = blend_alpha = V_028714_SPI_SHADER_32_AR; 3867ec681f3Smrg else 3877ec681f3Smrg assert(0); 3887ec681f3Smrg } else if (format == V_028C70_COLOR_16_16) { 3897ec681f3Smrg if (swap == V_028C70_SWAP_STD) { /* RG */ 3907ec681f3Smrg blend = V_028714_SPI_SHADER_32_GR; 3917ec681f3Smrg blend_alpha = V_028714_SPI_SHADER_32_ABGR; 3927ec681f3Smrg } else if (swap == V_028C70_SWAP_ALT) /* RA */ 3937ec681f3Smrg blend = blend_alpha = V_028714_SPI_SHADER_32_AR; 3947ec681f3Smrg else 3957ec681f3Smrg assert(0); 3967ec681f3Smrg } else /* 16_16_16_16 */ 3977ec681f3Smrg blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; 3987ec681f3Smrg } else if (ntype == V_028C70_NUMBER_UINT) 3997ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; 4007ec681f3Smrg else if (ntype == V_028C70_NUMBER_SINT) 4017ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; 4027ec681f3Smrg else if (ntype == V_028C70_NUMBER_FLOAT) 4037ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; 4047ec681f3Smrg else 4057ec681f3Smrg assert(0); 4067ec681f3Smrg break; 4077ec681f3Smrg 4087ec681f3Smrg case V_028C70_COLOR_32: 4097ec681f3Smrg if (swap == V_028C70_SWAP_STD) { /* R */ 4107ec681f3Smrg blend = normal = V_028714_SPI_SHADER_32_R; 4117ec681f3Smrg alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; 4127ec681f3Smrg } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ 4137ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; 4147ec681f3Smrg else 4157ec681f3Smrg assert(0); 4167ec681f3Smrg break; 4177ec681f3Smrg 4187ec681f3Smrg case V_028C70_COLOR_32_32: 4197ec681f3Smrg if (swap == V_028C70_SWAP_STD) { /* RG */ 4207ec681f3Smrg blend = normal = V_028714_SPI_SHADER_32_GR; 4217ec681f3Smrg alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; 4227ec681f3Smrg } else if (swap == V_028C70_SWAP_ALT) /* RA */ 4237ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; 4247ec681f3Smrg else 4257ec681f3Smrg assert(0); 4267ec681f3Smrg break; 4277ec681f3Smrg 4287ec681f3Smrg case V_028C70_COLOR_32_32_32_32: 4297ec681f3Smrg case V_028C70_COLOR_8_24: 4307ec681f3Smrg case V_028C70_COLOR_24_8: 4317ec681f3Smrg case V_028C70_COLOR_X24_8_32_FLOAT: 4327ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; 4337ec681f3Smrg break; 4347ec681f3Smrg 4357ec681f3Smrg default: 4367ec681f3Smrg assert(0); 4377ec681f3Smrg return; 4387ec681f3Smrg } 4397ec681f3Smrg 4407ec681f3Smrg /* The DB->CB copy needs 32_ABGR. */ 4417ec681f3Smrg if (is_depth) 4427ec681f3Smrg alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; 4437ec681f3Smrg 4447ec681f3Smrg formats->normal = normal; 4457ec681f3Smrg formats->alpha = alpha; 4467ec681f3Smrg formats->blend = blend; 4477ec681f3Smrg formats->blend_alpha = blend_alpha; 44801e04c3fSmrg} 44901e04c3fSmrg 4507ec681f3Smrgvoid ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling, 4517ec681f3Smrg bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask) 45201e04c3fSmrg{ 4537ec681f3Smrg *late_alloc_wave64 = 0; /* The limit is per SA. */ 4547ec681f3Smrg *cu_mask = 0xffff; 4557ec681f3Smrg 4567ec681f3Smrg /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */ 4577ec681f3Smrg if (info->min_good_cu_per_sa <= 2) 4587ec681f3Smrg return; 4597ec681f3Smrg 4607ec681f3Smrg /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more 4617ec681f3Smrg * complicated computation is needed to enable late alloc with scratch (see PAL). 4627ec681f3Smrg */ 4637ec681f3Smrg if (uses_scratch) 4647ec681f3Smrg return; 4657ec681f3Smrg 4667ec681f3Smrg /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ 4677ec681f3Smrg if (ngg && info->family == CHIP_NAVI14) 4687ec681f3Smrg return; 4697ec681f3Smrg 4707ec681f3Smrg if (info->chip_class >= GFX10) { 4717ec681f3Smrg /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32. 4727ec681f3Smrg * These limits are estimated because they are all safe but they vary in performance. 4737ec681f3Smrg */ 4747ec681f3Smrg if (ngg_culling) 4757ec681f3Smrg *late_alloc_wave64 = info->min_good_cu_per_sa * 10; 4767ec681f3Smrg else 4777ec681f3Smrg *late_alloc_wave64 = info->min_good_cu_per_sa * 4; 4787ec681f3Smrg 4797ec681f3Smrg /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */ 4807ec681f3Smrg if (info->chip_class == GFX10 && ngg) 4817ec681f3Smrg *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64); 4827ec681f3Smrg 4837ec681f3Smrg /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock. 4847ec681f3Smrg * Others: CU1 must be disabled to prevent a hw deadlock. 4857ec681f3Smrg * 4867ec681f3Smrg * The deadlock is caused by late alloc, which usually increases performance. 4877ec681f3Smrg */ 4887ec681f3Smrg *cu_mask &= info->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) : 4897ec681f3Smrg ~BITFIELD_RANGE(1, 1); 4907ec681f3Smrg } else { 4917ec681f3Smrg if (info->min_good_cu_per_sa <= 4) { 4927ec681f3Smrg /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us 4937ec681f3Smrg * more than late VS allocation would help. 4947ec681f3Smrg * 4957ec681f3Smrg * 2 is the highest safe number that allows us to keep all CUs enabled. 4967ec681f3Smrg */ 4977ec681f3Smrg *late_alloc_wave64 = 2; 4987ec681f3Smrg } else { 4997ec681f3Smrg /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2. 5007ec681f3Smrg */ 5017ec681f3Smrg *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4; 5027ec681f3Smrg } 5037ec681f3Smrg 5047ec681f3Smrg /* VS can't execute on one CU if the limit is > 2. */ 5057ec681f3Smrg if (*late_alloc_wave64 > 2) 5067ec681f3Smrg *cu_mask = 0xfffe; /* 1 CU disabled */ 5077ec681f3Smrg } 5087ec681f3Smrg 5097ec681f3Smrg /* Max number that fits into the register field. */ 5107ec681f3Smrg if (ngg) /* GS */ 5117ec681f3Smrg *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u)); 5127ec681f3Smrg else /* VS */ 5137ec681f3Smrg *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u)); 5147ec681f3Smrg} 5157ec681f3Smrg 5167ec681f3Smrgunsigned ac_compute_cs_workgroup_size(uint16_t sizes[3], bool variable, unsigned max) 5177ec681f3Smrg{ 5187ec681f3Smrg if (variable) 5197ec681f3Smrg return max; 5207ec681f3Smrg 5217ec681f3Smrg return sizes[0] * sizes[1] * sizes[2]; 5227ec681f3Smrg} 5237ec681f3Smrg 5247ec681f3Smrgunsigned ac_compute_lshs_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, 5257ec681f3Smrg unsigned tess_num_patches, 5267ec681f3Smrg unsigned tess_patch_in_vtx, 5277ec681f3Smrg unsigned tess_patch_out_vtx) 5287ec681f3Smrg{ 5297ec681f3Smrg /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS. 5307ec681f3Smrg * These two HW stages are merged on GFX9+. 5317ec681f3Smrg */ 5327ec681f3Smrg 5337ec681f3Smrg bool merged_shaders = chip_class >= GFX9; 5347ec681f3Smrg unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx; 5357ec681f3Smrg unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx; 5367ec681f3Smrg 5377ec681f3Smrg if (merged_shaders) 5387ec681f3Smrg return MAX2(ls_workgroup_size, hs_workgroup_size); 5397ec681f3Smrg else if (stage == MESA_SHADER_VERTEX) 5407ec681f3Smrg return ls_workgroup_size; 5417ec681f3Smrg else if (stage == MESA_SHADER_TESS_CTRL) 5427ec681f3Smrg return hs_workgroup_size; 5437ec681f3Smrg else 5447ec681f3Smrg unreachable("invalid LSHS shader stage"); 5457ec681f3Smrg} 5467ec681f3Smrg 5477ec681f3Smrgunsigned ac_compute_esgs_workgroup_size(enum chip_class chip_class, unsigned wave_size, 5487ec681f3Smrg unsigned es_verts, unsigned gs_inst_prims) 5497ec681f3Smrg{ 5507ec681f3Smrg /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled. 5517ec681f3Smrg * 5527ec681f3Smrg * GFX6: Not possible in the HW. 5537ec681f3Smrg * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa. 5547ec681f3Smrg * GFX9+ (merged): implemented in Mesa. 5557ec681f3Smrg */ 5567ec681f3Smrg 5577ec681f3Smrg if (chip_class <= GFX8) 5587ec681f3Smrg return wave_size; 5597ec681f3Smrg 5607ec681f3Smrg unsigned workgroup_size = MAX2(es_verts, gs_inst_prims); 5617ec681f3Smrg return CLAMP(workgroup_size, 1, 256); 5627ec681f3Smrg} 5637ec681f3Smrg 5647ec681f3Smrgunsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, 5657ec681f3Smrg unsigned max_vtx_out, unsigned prim_amp_factor) 5667ec681f3Smrg{ 5677ec681f3Smrg /* NGG always operates in workgroups. 5687ec681f3Smrg * 5697ec681f3Smrg * For API VS/TES/GS: 5707ec681f3Smrg * - 1 invocation per input vertex 5717ec681f3Smrg * - 1 invocation per input primitive 5727ec681f3Smrg * 5737ec681f3Smrg * The same invocation can process both an input vertex and primitive, 5747ec681f3Smrg * however 1 invocation can only output up to 1 vertex and 1 primitive. 5757ec681f3Smrg */ 5767ec681f3Smrg 5777ec681f3Smrg unsigned max_vtx_in = es_verts < 256 ? es_verts : 3 * gs_inst_prims; 5787ec681f3Smrg unsigned max_prim_in = gs_inst_prims; 5797ec681f3Smrg unsigned max_prim_out = gs_inst_prims * prim_amp_factor; 5807ec681f3Smrg unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out); 5817ec681f3Smrg 5827ec681f3Smrg return CLAMP(workgroup_size, 1, 256); 58301e04c3fSmrg} 584