/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based on si_state.c
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
26b8e80941Smrg */ 27b8e80941Smrg 28b8e80941Smrg/* command buffer handling for SI */ 29b8e80941Smrg 30b8e80941Smrg#include "radv_private.h" 31b8e80941Smrg#include "radv_shader.h" 32b8e80941Smrg#include "radv_cs.h" 33b8e80941Smrg#include "sid.h" 34b8e80941Smrg#include "gfx9d.h" 35b8e80941Smrg#include "radv_util.h" 36b8e80941Smrg#include "main/macros.h" 37b8e80941Smrg 38b8e80941Smrgstatic void 39b8e80941Smrgsi_write_harvested_raster_configs(struct radv_physical_device *physical_device, 40b8e80941Smrg struct radeon_cmdbuf *cs, 41b8e80941Smrg unsigned raster_config, 42b8e80941Smrg unsigned raster_config_1) 43b8e80941Smrg{ 44b8e80941Smrg unsigned num_se = MAX2(physical_device->rad_info.max_se, 1); 45b8e80941Smrg unsigned raster_config_se[4]; 46b8e80941Smrg unsigned se; 47b8e80941Smrg 48b8e80941Smrg ac_get_harvested_configs(&physical_device->rad_info, 49b8e80941Smrg raster_config, 50b8e80941Smrg &raster_config_1, 51b8e80941Smrg raster_config_se); 52b8e80941Smrg 53b8e80941Smrg for (se = 0; se < num_se; se++) { 54b8e80941Smrg /* GRBM_GFX_INDEX has a different offset on SI and CI+ */ 55b8e80941Smrg if (physical_device->rad_info.chip_class < CIK) 56b8e80941Smrg radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX, 57b8e80941Smrg S_00802C_SE_INDEX(se) | 58b8e80941Smrg S_00802C_SH_BROADCAST_WRITES(1) | 59b8e80941Smrg S_00802C_INSTANCE_BROADCAST_WRITES(1)); 60b8e80941Smrg else 61b8e80941Smrg radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, 62b8e80941Smrg S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) | 63b8e80941Smrg S_030800_INSTANCE_BROADCAST_WRITES(1)); 64b8e80941Smrg radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); 65b8e80941Smrg } 66b8e80941Smrg 67b8e80941Smrg /* GRBM_GFX_INDEX has a different offset on SI and CI+ */ 68b8e80941Smrg if (physical_device->rad_info.chip_class < CIK) 69b8e80941Smrg radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX, 70b8e80941Smrg S_00802C_SE_BROADCAST_WRITES(1) | 71b8e80941Smrg 
S_00802C_SH_BROADCAST_WRITES(1) | 72b8e80941Smrg S_00802C_INSTANCE_BROADCAST_WRITES(1)); 73b8e80941Smrg else 74b8e80941Smrg radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, 75b8e80941Smrg S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | 76b8e80941Smrg S_030800_INSTANCE_BROADCAST_WRITES(1)); 77b8e80941Smrg 78b8e80941Smrg if (physical_device->rad_info.chip_class >= CIK) 79b8e80941Smrg radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); 80b8e80941Smrg} 81b8e80941Smrg 82b8e80941Smrgvoid 83b8e80941Smrgsi_emit_compute(struct radv_physical_device *physical_device, 84b8e80941Smrg struct radeon_cmdbuf *cs) 85b8e80941Smrg{ 86b8e80941Smrg radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); 87b8e80941Smrg radeon_emit(cs, 0); 88b8e80941Smrg radeon_emit(cs, 0); 89b8e80941Smrg radeon_emit(cs, 0); 90b8e80941Smrg 91b8e80941Smrg radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); 92b8e80941Smrg /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */ 93b8e80941Smrg radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); 94b8e80941Smrg radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff)); 95b8e80941Smrg 96b8e80941Smrg if (physical_device->rad_info.chip_class >= CIK) { 97b8e80941Smrg /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ 98b8e80941Smrg radeon_set_sh_reg_seq(cs, 99b8e80941Smrg R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); 100b8e80941Smrg radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) | 101b8e80941Smrg S_00B864_SH1_CU_EN(0xffff)); 102b8e80941Smrg radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) | 103b8e80941Smrg S_00B868_SH1_CU_EN(0xffff)); 104b8e80941Smrg } 105b8e80941Smrg 106b8e80941Smrg /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID 107b8e80941Smrg * and is now per pipe, so it should be handled in the 108b8e80941Smrg * kernel if we want to use something other than the default value, 109b8e80941Smrg * which is now 0x22f. 
110b8e80941Smrg */ 111b8e80941Smrg if (physical_device->rad_info.chip_class <= SI) { 112b8e80941Smrg /* XXX: This should be: 113b8e80941Smrg * (number of compute units) * 4 * (waves per simd) - 1 */ 114b8e80941Smrg 115b8e80941Smrg radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 116b8e80941Smrg 0x190 /* Default value */); 117b8e80941Smrg } 118b8e80941Smrg} 119b8e80941Smrg 120b8e80941Smrg/* 12.4 fixed-point */ 121b8e80941Smrgstatic unsigned radv_pack_float_12p4(float x) 122b8e80941Smrg{ 123b8e80941Smrg return x <= 0 ? 0 : 124b8e80941Smrg x >= 4096 ? 0xffff : x * 16; 125b8e80941Smrg} 126b8e80941Smrg 127b8e80941Smrgstatic void 128b8e80941Smrgsi_set_raster_config(struct radv_physical_device *physical_device, 129b8e80941Smrg struct radeon_cmdbuf *cs) 130b8e80941Smrg{ 131b8e80941Smrg unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16); 132b8e80941Smrg unsigned rb_mask = physical_device->rad_info.enabled_rb_mask; 133b8e80941Smrg unsigned raster_config, raster_config_1; 134b8e80941Smrg 135b8e80941Smrg ac_get_raster_config(&physical_device->rad_info, 136b8e80941Smrg &raster_config, 137b8e80941Smrg &raster_config_1, NULL); 138b8e80941Smrg 139b8e80941Smrg /* Always use the default config when all backends are enabled 140b8e80941Smrg * (or when we failed to determine the enabled backends). 
141b8e80941Smrg */ 142b8e80941Smrg if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { 143b8e80941Smrg radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, 144b8e80941Smrg raster_config); 145b8e80941Smrg if (physical_device->rad_info.chip_class >= CIK) 146b8e80941Smrg radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, 147b8e80941Smrg raster_config_1); 148b8e80941Smrg } else { 149b8e80941Smrg si_write_harvested_raster_configs(physical_device, cs, 150b8e80941Smrg raster_config, 151b8e80941Smrg raster_config_1); 152b8e80941Smrg } 153b8e80941Smrg} 154b8e80941Smrg 155b8e80941Smrgvoid 156b8e80941Smrgsi_emit_graphics(struct radv_physical_device *physical_device, 157b8e80941Smrg struct radeon_cmdbuf *cs) 158b8e80941Smrg{ 159b8e80941Smrg int i; 160b8e80941Smrg 161b8e80941Smrg /* Only SI can disable CLEAR_STATE for now. */ 162b8e80941Smrg assert(physical_device->has_clear_state || 163b8e80941Smrg physical_device->rad_info.chip_class == SI); 164b8e80941Smrg 165b8e80941Smrg radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); 166b8e80941Smrg radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); 167b8e80941Smrg radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); 168b8e80941Smrg 169b8e80941Smrg if (physical_device->has_clear_state) { 170b8e80941Smrg radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0)); 171b8e80941Smrg radeon_emit(cs, 0); 172b8e80941Smrg } 173b8e80941Smrg 174b8e80941Smrg if (physical_device->rad_info.chip_class <= VI) 175b8e80941Smrg si_set_raster_config(physical_device, cs); 176b8e80941Smrg 177b8e80941Smrg radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); 178b8e80941Smrg if (!physical_device->has_clear_state) 179b8e80941Smrg radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); 180b8e80941Smrg 181b8e80941Smrg /* FIXME calculate these values somehow ??? 
*/ 182b8e80941Smrg if (physical_device->rad_info.chip_class <= VI) { 183b8e80941Smrg radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); 184b8e80941Smrg radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40); 185b8e80941Smrg } 186b8e80941Smrg 187b8e80941Smrg if (!physical_device->has_clear_state) { 188b8e80941Smrg radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2); 189b8e80941Smrg radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); 190b8e80941Smrg radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); 191b8e80941Smrg } 192b8e80941Smrg 193b8e80941Smrg radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); 194b8e80941Smrg if (!physical_device->has_clear_state) 195b8e80941Smrg radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0); 196b8e80941Smrg if (physical_device->rad_info.chip_class < CIK) 197b8e80941Smrg radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | 198b8e80941Smrg S_008A14_CLIP_VTX_REORDER_ENA(1)); 199b8e80941Smrg 200b8e80941Smrg radeon_set_context_reg(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210); 201b8e80941Smrg radeon_set_context_reg(cs, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98); 202b8e80941Smrg 203b8e80941Smrg if (!physical_device->has_clear_state) 204b8e80941Smrg radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); 205b8e80941Smrg 206b8e80941Smrg /* CLEAR_STATE doesn't clear these correctly on certain generations. 207b8e80941Smrg * I don't know why. Deduced by trial and error. 
208b8e80941Smrg */ 209b8e80941Smrg if (physical_device->rad_info.chip_class <= CIK) { 210b8e80941Smrg radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); 211b8e80941Smrg radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, 212b8e80941Smrg S_028204_WINDOW_OFFSET_DISABLE(1)); 213b8e80941Smrg radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL, 214b8e80941Smrg S_028240_WINDOW_OFFSET_DISABLE(1)); 215b8e80941Smrg radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR, 216b8e80941Smrg S_028244_BR_X(16384) | S_028244_BR_Y(16384)); 217b8e80941Smrg radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); 218b8e80941Smrg radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR, 219b8e80941Smrg S_028034_BR_X(16384) | S_028034_BR_Y(16384)); 220b8e80941Smrg } 221b8e80941Smrg 222b8e80941Smrg if (!physical_device->has_clear_state) { 223b8e80941Smrg for (i = 0; i < 16; i++) { 224b8e80941Smrg radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0); 225b8e80941Smrg radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0)); 226b8e80941Smrg } 227b8e80941Smrg } 228b8e80941Smrg 229b8e80941Smrg if (!physical_device->has_clear_state) { 230b8e80941Smrg radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF); 231b8e80941Smrg radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA); 232b8e80941Smrg /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */ 233b8e80941Smrg radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0); 234b8e80941Smrg radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0); 235b8e80941Smrg radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); 236b8e80941Smrg radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); 237b8e80941Smrg radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0); 238b8e80941Smrg } 239b8e80941Smrg 240b8e80941Smrg radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE, 241b8e80941Smrg 
S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | 242b8e80941Smrg S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE)); 243b8e80941Smrg 244b8e80941Smrg if (physical_device->rad_info.chip_class >= GFX9) { 245b8e80941Smrg radeon_set_uconfig_reg(cs, R_030920_VGT_MAX_VTX_INDX, ~0); 246b8e80941Smrg radeon_set_uconfig_reg(cs, R_030924_VGT_MIN_VTX_INDX, 0); 247b8e80941Smrg radeon_set_uconfig_reg(cs, R_030928_VGT_INDX_OFFSET, 0); 248b8e80941Smrg } else { 249b8e80941Smrg /* These registers, when written, also overwrite the 250b8e80941Smrg * CLEAR_STATE context, so we can't rely on CLEAR_STATE setting 251b8e80941Smrg * them. It would be an issue if there was another UMD 252b8e80941Smrg * changing them. 253b8e80941Smrg */ 254b8e80941Smrg radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0); 255b8e80941Smrg radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0); 256b8e80941Smrg radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0); 257b8e80941Smrg } 258b8e80941Smrg 259b8e80941Smrg if (physical_device->rad_info.chip_class >= CIK) { 260b8e80941Smrg if (physical_device->rad_info.chip_class >= GFX9) { 261b8e80941Smrg radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 262b8e80941Smrg S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); 263b8e80941Smrg } else { 264b8e80941Smrg radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, 265b8e80941Smrg S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); 266b8e80941Smrg radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 267b8e80941Smrg S_00B41C_WAVE_LIMIT(0x3F)); 268b8e80941Smrg radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, 269b8e80941Smrg S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); 270b8e80941Smrg /* If this is 0, Bonaire can hang even if GS isn't being used. 271b8e80941Smrg * Other chips are unaffected. These are suboptimal values, 272b8e80941Smrg * but we don't use on-chip GS. 
273b8e80941Smrg */ 274b8e80941Smrg radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL, 275b8e80941Smrg S_028A44_ES_VERTS_PER_SUBGRP(64) | 276b8e80941Smrg S_028A44_GS_PRIMS_PER_SUBGRP(4)); 277b8e80941Smrg } 278b8e80941Smrg radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 279b8e80941Smrg S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); 280b8e80941Smrg 281b8e80941Smrg if (physical_device->rad_info.num_good_cu_per_sh <= 4) { 282b8e80941Smrg /* Too few available compute units per SH. Disallowing 283b8e80941Smrg * VS to run on CU0 could hurt us more than late VS 284b8e80941Smrg * allocation would help. 285b8e80941Smrg * 286b8e80941Smrg * LATE_ALLOC_VS = 2 is the highest safe number. 287b8e80941Smrg */ 288b8e80941Smrg radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 289b8e80941Smrg S_00B118_CU_EN(0xffff) | S_00B118_WAVE_LIMIT(0x3F) ); 290b8e80941Smrg radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2)); 291b8e80941Smrg } else { 292b8e80941Smrg /* Set LATE_ALLOC_VS == 31. It should be less than 293b8e80941Smrg * the number of scratch waves. Limitations: 294b8e80941Smrg * - VS can't execute on CU0. 295b8e80941Smrg * - If HS writes outputs to LDS, LS can't execute on CU0. 
296b8e80941Smrg */ 297b8e80941Smrg radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 298b8e80941Smrg S_00B118_CU_EN(0xfffe) | S_00B118_WAVE_LIMIT(0x3F)); 299b8e80941Smrg radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31)); 300b8e80941Smrg } 301b8e80941Smrg 302b8e80941Smrg radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 303b8e80941Smrg S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); 304b8e80941Smrg } 305b8e80941Smrg 306b8e80941Smrg if (physical_device->rad_info.chip_class >= VI) { 307b8e80941Smrg uint32_t vgt_tess_distribution; 308b8e80941Smrg 309b8e80941Smrg vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | 310b8e80941Smrg S_028B50_ACCUM_TRI(11) | 311b8e80941Smrg S_028B50_ACCUM_QUAD(11) | 312b8e80941Smrg S_028B50_DONUT_SPLIT(16); 313b8e80941Smrg 314b8e80941Smrg if (physical_device->rad_info.family == CHIP_FIJI || 315b8e80941Smrg physical_device->rad_info.family >= CHIP_POLARIS10) 316b8e80941Smrg vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); 317b8e80941Smrg 318b8e80941Smrg radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION, 319b8e80941Smrg vgt_tess_distribution); 320b8e80941Smrg } else if (!physical_device->has_clear_state) { 321b8e80941Smrg radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); 322b8e80941Smrg radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); 323b8e80941Smrg } 324b8e80941Smrg 325b8e80941Smrg if (physical_device->rad_info.chip_class >= GFX9) { 326b8e80941Smrg unsigned num_se = physical_device->rad_info.max_se; 327b8e80941Smrg unsigned pc_lines = 0; 328b8e80941Smrg 329b8e80941Smrg switch (physical_device->rad_info.family) { 330b8e80941Smrg case CHIP_VEGA10: 331b8e80941Smrg case CHIP_VEGA12: 332b8e80941Smrg case CHIP_VEGA20: 333b8e80941Smrg pc_lines = 4096; 334b8e80941Smrg break; 335b8e80941Smrg case CHIP_RAVEN: 336b8e80941Smrg case CHIP_RAVEN2: 337b8e80941Smrg pc_lines = 1024; 338b8e80941Smrg break; 339b8e80941Smrg default: 340b8e80941Smrg assert(0); 341b8e80941Smrg 
} 342b8e80941Smrg 343b8e80941Smrg radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1, 344b8e80941Smrg S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) | 345b8e80941Smrg S_028C48_MAX_PRIM_PER_BATCH(1023)); 346b8e80941Smrg radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, 347b8e80941Smrg S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); 348b8e80941Smrg radeon_set_uconfig_reg(cs, R_030968_VGT_INSTANCE_BASE_ID, 0); 349b8e80941Smrg } 350b8e80941Smrg 351b8e80941Smrg unsigned tmp = (unsigned)(1.0 * 8.0); 352b8e80941Smrg radeon_set_context_reg_seq(cs, R_028A00_PA_SU_POINT_SIZE, 1); 353b8e80941Smrg radeon_emit(cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); 354b8e80941Smrg radeon_set_context_reg_seq(cs, R_028A04_PA_SU_POINT_MINMAX, 1); 355b8e80941Smrg radeon_emit(cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) | 356b8e80941Smrg S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); 357b8e80941Smrg 358b8e80941Smrg if (!physical_device->has_clear_state) { 359b8e80941Smrg radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL, 360b8e80941Smrg S_028004_ZPASS_INCREMENT_DISABLE(1)); 361b8e80941Smrg } 362b8e80941Smrg 363b8e80941Smrg /* Enable the Polaris small primitive filter control. 364b8e80941Smrg * XXX: There is possibly an issue when MSAA is off (see RadeonSI 365b8e80941Smrg * has_msaa_sample_loc_bug). But this doesn't seem to regress anything, 366b8e80941Smrg * and AMDVLK doesn't have a workaround as well. 367b8e80941Smrg */ 368b8e80941Smrg if (physical_device->rad_info.family >= CHIP_POLARIS10) { 369b8e80941Smrg unsigned small_prim_filter_cntl = 370b8e80941Smrg S_028830_SMALL_PRIM_FILTER_ENABLE(1) | 371b8e80941Smrg /* Workaround for a hw line bug. 
*/ 372b8e80941Smrg S_028830_LINE_FILTER_DISABLE(physical_device->rad_info.family <= CHIP_POLARIS12); 373b8e80941Smrg 374b8e80941Smrg radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, 375b8e80941Smrg small_prim_filter_cntl); 376b8e80941Smrg } 377b8e80941Smrg 378b8e80941Smrg si_emit_compute(physical_device, cs); 379b8e80941Smrg} 380b8e80941Smrg 381b8e80941Smrgvoid 382b8e80941Smrgcik_create_gfx_config(struct radv_device *device) 383b8e80941Smrg{ 384b8e80941Smrg struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, RING_GFX); 385b8e80941Smrg if (!cs) 386b8e80941Smrg return; 387b8e80941Smrg 388b8e80941Smrg si_emit_graphics(device->physical_device, cs); 389b8e80941Smrg 390b8e80941Smrg while (cs->cdw & 7) { 391b8e80941Smrg if (device->physical_device->rad_info.gfx_ib_pad_with_type2) 392b8e80941Smrg radeon_emit(cs, 0x80000000); 393b8e80941Smrg else 394b8e80941Smrg radeon_emit(cs, 0xffff1000); 395b8e80941Smrg } 396b8e80941Smrg 397b8e80941Smrg device->gfx_init = device->ws->buffer_create(device->ws, 398b8e80941Smrg cs->cdw * 4, 4096, 399b8e80941Smrg RADEON_DOMAIN_GTT, 400b8e80941Smrg RADEON_FLAG_CPU_ACCESS| 401b8e80941Smrg RADEON_FLAG_NO_INTERPROCESS_SHARING | 402b8e80941Smrg RADEON_FLAG_READ_ONLY, 403b8e80941Smrg RADV_BO_PRIORITY_CS); 404b8e80941Smrg if (!device->gfx_init) 405b8e80941Smrg goto fail; 406b8e80941Smrg 407b8e80941Smrg void *map = device->ws->buffer_map(device->gfx_init); 408b8e80941Smrg if (!map) { 409b8e80941Smrg device->ws->buffer_destroy(device->gfx_init); 410b8e80941Smrg device->gfx_init = NULL; 411b8e80941Smrg goto fail; 412b8e80941Smrg } 413b8e80941Smrg memcpy(map, cs->buf, cs->cdw * 4); 414b8e80941Smrg 415b8e80941Smrg device->ws->buffer_unmap(device->gfx_init); 416b8e80941Smrg device->gfx_init_size_dw = cs->cdw; 417b8e80941Smrgfail: 418b8e80941Smrg device->ws->cs_destroy(cs); 419b8e80941Smrg} 420b8e80941Smrg 421b8e80941Smrgstatic void 422b8e80941Smrgget_viewport_xform(const VkViewport *viewport, 423b8e80941Smrg float scale[3], 
float translate[3]) 424b8e80941Smrg{ 425b8e80941Smrg float x = viewport->x; 426b8e80941Smrg float y = viewport->y; 427b8e80941Smrg float half_width = 0.5f * viewport->width; 428b8e80941Smrg float half_height = 0.5f * viewport->height; 429b8e80941Smrg double n = viewport->minDepth; 430b8e80941Smrg double f = viewport->maxDepth; 431b8e80941Smrg 432b8e80941Smrg scale[0] = half_width; 433b8e80941Smrg translate[0] = half_width + x; 434b8e80941Smrg scale[1] = half_height; 435b8e80941Smrg translate[1] = half_height + y; 436b8e80941Smrg 437b8e80941Smrg scale[2] = (f - n); 438b8e80941Smrg translate[2] = n; 439b8e80941Smrg} 440b8e80941Smrg 441b8e80941Smrgvoid 442b8e80941Smrgsi_write_viewport(struct radeon_cmdbuf *cs, int first_vp, 443b8e80941Smrg int count, const VkViewport *viewports) 444b8e80941Smrg{ 445b8e80941Smrg int i; 446b8e80941Smrg 447b8e80941Smrg assert(count); 448b8e80941Smrg radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 449b8e80941Smrg first_vp * 4 * 6, count * 6); 450b8e80941Smrg 451b8e80941Smrg for (i = 0; i < count; i++) { 452b8e80941Smrg float scale[3], translate[3]; 453b8e80941Smrg 454b8e80941Smrg 455b8e80941Smrg get_viewport_xform(&viewports[i], scale, translate); 456b8e80941Smrg radeon_emit(cs, fui(scale[0])); 457b8e80941Smrg radeon_emit(cs, fui(translate[0])); 458b8e80941Smrg radeon_emit(cs, fui(scale[1])); 459b8e80941Smrg radeon_emit(cs, fui(translate[1])); 460b8e80941Smrg radeon_emit(cs, fui(scale[2])); 461b8e80941Smrg radeon_emit(cs, fui(translate[2])); 462b8e80941Smrg } 463b8e80941Smrg 464b8e80941Smrg radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + 465b8e80941Smrg first_vp * 4 * 2, count * 2); 466b8e80941Smrg for (i = 0; i < count; i++) { 467b8e80941Smrg float zmin = MIN2(viewports[i].minDepth, viewports[i].maxDepth); 468b8e80941Smrg float zmax = MAX2(viewports[i].minDepth, viewports[i].maxDepth); 469b8e80941Smrg radeon_emit(cs, fui(zmin)); 470b8e80941Smrg radeon_emit(cs, fui(zmax)); 471b8e80941Smrg } 472b8e80941Smrg} 
473b8e80941Smrg 474b8e80941Smrgstatic VkRect2D si_scissor_from_viewport(const VkViewport *viewport) 475b8e80941Smrg{ 476b8e80941Smrg float scale[3], translate[3]; 477b8e80941Smrg VkRect2D rect; 478b8e80941Smrg 479b8e80941Smrg get_viewport_xform(viewport, scale, translate); 480b8e80941Smrg 481b8e80941Smrg rect.offset.x = translate[0] - fabs(scale[0]); 482b8e80941Smrg rect.offset.y = translate[1] - fabs(scale[1]); 483b8e80941Smrg rect.extent.width = ceilf(translate[0] + fabs(scale[0])) - rect.offset.x; 484b8e80941Smrg rect.extent.height = ceilf(translate[1] + fabs(scale[1])) - rect.offset.y; 485b8e80941Smrg 486b8e80941Smrg return rect; 487b8e80941Smrg} 488b8e80941Smrg 489b8e80941Smrgstatic VkRect2D si_intersect_scissor(const VkRect2D *a, const VkRect2D *b) { 490b8e80941Smrg VkRect2D ret; 491b8e80941Smrg ret.offset.x = MAX2(a->offset.x, b->offset.x); 492b8e80941Smrg ret.offset.y = MAX2(a->offset.y, b->offset.y); 493b8e80941Smrg ret.extent.width = MIN2(a->offset.x + a->extent.width, 494b8e80941Smrg b->offset.x + b->extent.width) - ret.offset.x; 495b8e80941Smrg ret.extent.height = MIN2(a->offset.y + a->extent.height, 496b8e80941Smrg b->offset.y + b->extent.height) - ret.offset.y; 497b8e80941Smrg return ret; 498b8e80941Smrg} 499b8e80941Smrg 500b8e80941Smrgvoid 501b8e80941Smrgsi_write_scissors(struct radeon_cmdbuf *cs, int first, 502b8e80941Smrg int count, const VkRect2D *scissors, 503b8e80941Smrg const VkViewport *viewports, bool can_use_guardband) 504b8e80941Smrg{ 505b8e80941Smrg int i; 506b8e80941Smrg float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY; 507b8e80941Smrg const float max_range = 32767.0f; 508b8e80941Smrg if (!count) 509b8e80941Smrg return; 510b8e80941Smrg 511b8e80941Smrg radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2); 512b8e80941Smrg for (i = 0; i < count; i++) { 513b8e80941Smrg VkRect2D viewport_scissor = si_scissor_from_viewport(viewports + i); 514b8e80941Smrg VkRect2D scissor = 
si_intersect_scissor(&scissors[i], &viewport_scissor); 515b8e80941Smrg 516b8e80941Smrg get_viewport_xform(viewports + i, scale, translate); 517b8e80941Smrg scale[0] = fabsf(scale[0]); 518b8e80941Smrg scale[1] = fabsf(scale[1]); 519b8e80941Smrg 520b8e80941Smrg if (scale[0] < 0.5) 521b8e80941Smrg scale[0] = 0.5; 522b8e80941Smrg if (scale[1] < 0.5) 523b8e80941Smrg scale[1] = 0.5; 524b8e80941Smrg 525b8e80941Smrg guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]); 526b8e80941Smrg guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]); 527b8e80941Smrg 528b8e80941Smrg radeon_emit(cs, S_028250_TL_X(scissor.offset.x) | 529b8e80941Smrg S_028250_TL_Y(scissor.offset.y) | 530b8e80941Smrg S_028250_WINDOW_OFFSET_DISABLE(1)); 531b8e80941Smrg radeon_emit(cs, S_028254_BR_X(scissor.offset.x + scissor.extent.width) | 532b8e80941Smrg S_028254_BR_Y(scissor.offset.y + scissor.extent.height)); 533b8e80941Smrg } 534b8e80941Smrg if (!can_use_guardband) { 535b8e80941Smrg guardband_x = 1.0; 536b8e80941Smrg guardband_y = 1.0; 537b8e80941Smrg } 538b8e80941Smrg 539b8e80941Smrg radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4); 540b8e80941Smrg radeon_emit(cs, fui(guardband_y)); 541b8e80941Smrg radeon_emit(cs, fui(1.0)); 542b8e80941Smrg radeon_emit(cs, fui(guardband_x)); 543b8e80941Smrg radeon_emit(cs, fui(1.0)); 544b8e80941Smrg} 545b8e80941Smrg 546b8e80941Smrgstatic inline unsigned 547b8e80941Smrgradv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num) 548b8e80941Smrg{ 549b8e80941Smrg if (num == 0) 550b8e80941Smrg return 0; 551b8e80941Smrg 552b8e80941Smrg if (info->incr == 0) 553b8e80941Smrg return 0; 554b8e80941Smrg 555b8e80941Smrg if (num < info->min) 556b8e80941Smrg return 0; 557b8e80941Smrg 558b8e80941Smrg return 1 + ((num - info->min) / info->incr); 559b8e80941Smrg} 560b8e80941Smrg 561b8e80941Smrguint32_t 562b8e80941Smrgsi_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, 563b8e80941Smrg bool 
instanced_draw, bool indirect_draw, 564b8e80941Smrg bool count_from_stream_output, 565b8e80941Smrg uint32_t draw_vertex_count) 566b8e80941Smrg{ 567b8e80941Smrg enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; 568b8e80941Smrg enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family; 569b8e80941Smrg struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 570b8e80941Smrg const unsigned max_primgroup_in_wave = 2; 571b8e80941Smrg /* SWITCH_ON_EOP(0) is always preferable. */ 572b8e80941Smrg bool wd_switch_on_eop = false; 573b8e80941Smrg bool ia_switch_on_eop = false; 574b8e80941Smrg bool ia_switch_on_eoi = false; 575b8e80941Smrg bool partial_vs_wave = false; 576b8e80941Smrg bool partial_es_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_es_wave; 577b8e80941Smrg bool multi_instances_smaller_than_primgroup; 578b8e80941Smrg 579b8e80941Smrg multi_instances_smaller_than_primgroup = indirect_draw; 580b8e80941Smrg if (!multi_instances_smaller_than_primgroup && instanced_draw) { 581b8e80941Smrg uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count); 582b8e80941Smrg if (num_prims < cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.primgroup_size) 583b8e80941Smrg multi_instances_smaller_than_primgroup = true; 584b8e80941Smrg } 585b8e80941Smrg 586b8e80941Smrg ia_switch_on_eoi = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.ia_switch_on_eoi; 587b8e80941Smrg partial_vs_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_vs_wave; 588b8e80941Smrg 589b8e80941Smrg if (chip_class >= CIK) { 590b8e80941Smrg wd_switch_on_eop = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.wd_switch_on_eop; 591b8e80941Smrg 592b8e80941Smrg /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. 
593b8e80941Smrg * We don't know that for indirect drawing, so treat it as 594b8e80941Smrg * always problematic. */ 595b8e80941Smrg if (family == CHIP_HAWAII && 596b8e80941Smrg (instanced_draw || indirect_draw)) 597b8e80941Smrg wd_switch_on_eop = true; 598b8e80941Smrg 599b8e80941Smrg /* Performance recommendation for 4 SE Gfx7-8 parts if 600b8e80941Smrg * instances are smaller than a primgroup. 601b8e80941Smrg * Assume indirect draws always use small instances. 602b8e80941Smrg * This is needed for good VS wave utilization. 603b8e80941Smrg */ 604b8e80941Smrg if (chip_class <= VI && 605b8e80941Smrg info->max_se == 4 && 606b8e80941Smrg multi_instances_smaller_than_primgroup) 607b8e80941Smrg wd_switch_on_eop = true; 608b8e80941Smrg 609b8e80941Smrg /* Required on CIK and later. */ 610b8e80941Smrg if (info->max_se > 2 && !wd_switch_on_eop) 611b8e80941Smrg ia_switch_on_eoi = true; 612b8e80941Smrg 613b8e80941Smrg /* Required by Hawaii and, for some special cases, by VI. */ 614b8e80941Smrg if (ia_switch_on_eoi && 615b8e80941Smrg (family == CHIP_HAWAII || 616b8e80941Smrg (chip_class == VI && 617b8e80941Smrg /* max primgroup in wave is always 2 - leave this for documentation */ 618b8e80941Smrg (radv_pipeline_has_gs(cmd_buffer->state.pipeline) || max_primgroup_in_wave != 2)))) 619b8e80941Smrg partial_vs_wave = true; 620b8e80941Smrg 621b8e80941Smrg /* Instancing bug on Bonaire. */ 622b8e80941Smrg if (family == CHIP_BONAIRE && ia_switch_on_eoi && 623b8e80941Smrg (instanced_draw || indirect_draw)) 624b8e80941Smrg partial_vs_wave = true; 625b8e80941Smrg 626b8e80941Smrg /* Hardware requirement when drawing primitives from a stream 627b8e80941Smrg * output buffer. 628b8e80941Smrg */ 629b8e80941Smrg if (count_from_stream_output) 630b8e80941Smrg wd_switch_on_eop = true; 631b8e80941Smrg 632b8e80941Smrg /* If the WD switch is false, the IA switch must be false too. 
*/ 633b8e80941Smrg assert(wd_switch_on_eop || !ia_switch_on_eop); 634b8e80941Smrg } 635b8e80941Smrg /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ 636b8e80941Smrg if (chip_class <= VI && ia_switch_on_eoi) 637b8e80941Smrg partial_es_wave = true; 638b8e80941Smrg 639b8e80941Smrg if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) { 640b8e80941Smrg /* GS hw bug with single-primitive instances and SWITCH_ON_EOI. 641b8e80941Smrg * The hw doc says all multi-SE chips are affected, but amdgpu-pro Vulkan 642b8e80941Smrg * only applies it to Hawaii. Do what amdgpu-pro Vulkan does. 643b8e80941Smrg */ 644b8e80941Smrg if (family == CHIP_HAWAII && ia_switch_on_eoi) { 645b8e80941Smrg bool set_vgt_flush = indirect_draw; 646b8e80941Smrg if (!set_vgt_flush && instanced_draw) { 647b8e80941Smrg uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count); 648b8e80941Smrg if (num_prims <= 1) 649b8e80941Smrg set_vgt_flush = true; 650b8e80941Smrg } 651b8e80941Smrg if (set_vgt_flush) 652b8e80941Smrg cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH; 653b8e80941Smrg } 654b8e80941Smrg } 655b8e80941Smrg 656b8e80941Smrg return cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.base | 657b8e80941Smrg S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | 658b8e80941Smrg S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | 659b8e80941Smrg S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | 660b8e80941Smrg S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | 661b8e80941Smrg S_028AA8_WD_SWITCH_ON_EOP(chip_class >= CIK ? 
wd_switch_on_eop : 0); 662b8e80941Smrg 663b8e80941Smrg} 664b8e80941Smrg 665b8e80941Smrgvoid si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs, 666b8e80941Smrg enum chip_class chip_class, 667b8e80941Smrg bool is_mec, 668b8e80941Smrg unsigned event, unsigned event_flags, 669b8e80941Smrg unsigned data_sel, 670b8e80941Smrg uint64_t va, 671b8e80941Smrg uint32_t new_fence, 672b8e80941Smrg uint64_t gfx9_eop_bug_va) 673b8e80941Smrg{ 674b8e80941Smrg unsigned op = EVENT_TYPE(event) | 675b8e80941Smrg EVENT_INDEX(5) | 676b8e80941Smrg event_flags; 677b8e80941Smrg unsigned is_gfx8_mec = is_mec && chip_class < GFX9; 678b8e80941Smrg unsigned sel = EOP_DATA_SEL(data_sel); 679b8e80941Smrg 680b8e80941Smrg /* Wait for write confirmation before writing data, but don't send 681b8e80941Smrg * an interrupt. */ 682b8e80941Smrg if (data_sel != EOP_DATA_SEL_DISCARD) 683b8e80941Smrg sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM); 684b8e80941Smrg 685b8e80941Smrg if (chip_class >= GFX9 || is_gfx8_mec) { 686b8e80941Smrg /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion 687b8e80941Smrg * counters) must immediately precede every timestamp event to 688b8e80941Smrg * prevent a GPU hang on GFX9. 689b8e80941Smrg */ 690b8e80941Smrg if (chip_class == GFX9 && !is_mec) { 691b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 692b8e80941Smrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); 693b8e80941Smrg radeon_emit(cs, gfx9_eop_bug_va); 694b8e80941Smrg radeon_emit(cs, gfx9_eop_bug_va >> 32); 695b8e80941Smrg } 696b8e80941Smrg 697b8e80941Smrg radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 
5 : 6, false)); 698b8e80941Smrg radeon_emit(cs, op); 699b8e80941Smrg radeon_emit(cs, sel); 700b8e80941Smrg radeon_emit(cs, va); /* address lo */ 701b8e80941Smrg radeon_emit(cs, va >> 32); /* address hi */ 702b8e80941Smrg radeon_emit(cs, new_fence); /* immediate data lo */ 703b8e80941Smrg radeon_emit(cs, 0); /* immediate data hi */ 704b8e80941Smrg if (!is_gfx8_mec) 705b8e80941Smrg radeon_emit(cs, 0); /* unused */ 706b8e80941Smrg } else { 707b8e80941Smrg if (chip_class == CIK || 708b8e80941Smrg chip_class == VI) { 709b8e80941Smrg /* Two EOP events are required to make all engines go idle 710b8e80941Smrg * (and optional cache flushes executed) before the timestamp 711b8e80941Smrg * is written. 712b8e80941Smrg */ 713b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false)); 714b8e80941Smrg radeon_emit(cs, op); 715b8e80941Smrg radeon_emit(cs, va); 716b8e80941Smrg radeon_emit(cs, ((va >> 32) & 0xffff) | sel); 717b8e80941Smrg radeon_emit(cs, 0); /* immediate data */ 718b8e80941Smrg radeon_emit(cs, 0); /* unused */ 719b8e80941Smrg } 720b8e80941Smrg 721b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false)); 722b8e80941Smrg radeon_emit(cs, op); 723b8e80941Smrg radeon_emit(cs, va); 724b8e80941Smrg radeon_emit(cs, ((va >> 32) & 0xffff) | sel); 725b8e80941Smrg radeon_emit(cs, new_fence); /* immediate data */ 726b8e80941Smrg radeon_emit(cs, 0); /* unused */ 727b8e80941Smrg } 728b8e80941Smrg} 729b8e80941Smrg 730b8e80941Smrgvoid 731b8e80941Smrgradv_cp_wait_mem(struct radeon_cmdbuf *cs, uint32_t op, uint64_t va, 732b8e80941Smrg uint32_t ref, uint32_t mask) 733b8e80941Smrg{ 734b8e80941Smrg assert(op == WAIT_REG_MEM_EQUAL || 735b8e80941Smrg op == WAIT_REG_MEM_NOT_EQUAL || 736b8e80941Smrg op == WAIT_REG_MEM_GREATER_OR_EQUAL); 737b8e80941Smrg 738b8e80941Smrg radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false)); 739b8e80941Smrg radeon_emit(cs, op | WAIT_REG_MEM_MEM_SPACE(1)); 740b8e80941Smrg radeon_emit(cs, va); 741b8e80941Smrg radeon_emit(cs, va >> 32); 
742b8e80941Smrg radeon_emit(cs, ref); /* reference value */ 743b8e80941Smrg radeon_emit(cs, mask); /* mask */ 744b8e80941Smrg radeon_emit(cs, 4); /* poll interval */ 745b8e80941Smrg} 746b8e80941Smrg 747b8e80941Smrgstatic void 748b8e80941Smrgsi_emit_acquire_mem(struct radeon_cmdbuf *cs, 749b8e80941Smrg bool is_mec, 750b8e80941Smrg bool is_gfx9, 751b8e80941Smrg unsigned cp_coher_cntl) 752b8e80941Smrg{ 753b8e80941Smrg if (is_mec || is_gfx9) { 754b8e80941Smrg uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff; 755b8e80941Smrg radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, false) | 756b8e80941Smrg PKT3_SHADER_TYPE_S(is_mec)); 757b8e80941Smrg radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ 758b8e80941Smrg radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 759b8e80941Smrg radeon_emit(cs, hi_val); /* CP_COHER_SIZE_HI */ 760b8e80941Smrg radeon_emit(cs, 0); /* CP_COHER_BASE */ 761b8e80941Smrg radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ 762b8e80941Smrg radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 763b8e80941Smrg } else { 764b8e80941Smrg /* ACQUIRE_MEM is only required on a compute ring. 
*/ 765b8e80941Smrg radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, false)); 766b8e80941Smrg radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ 767b8e80941Smrg radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 768b8e80941Smrg radeon_emit(cs, 0); /* CP_COHER_BASE */ 769b8e80941Smrg radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 770b8e80941Smrg } 771b8e80941Smrg} 772b8e80941Smrg 773b8e80941Smrgvoid 774b8e80941Smrgsi_cs_emit_cache_flush(struct radeon_cmdbuf *cs, 775b8e80941Smrg enum chip_class chip_class, 776b8e80941Smrg uint32_t *flush_cnt, 777b8e80941Smrg uint64_t flush_va, 778b8e80941Smrg bool is_mec, 779b8e80941Smrg enum radv_cmd_flush_bits flush_bits, 780b8e80941Smrg uint64_t gfx9_eop_bug_va) 781b8e80941Smrg{ 782b8e80941Smrg unsigned cp_coher_cntl = 0; 783b8e80941Smrg uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | 784b8e80941Smrg RADV_CMD_FLAG_FLUSH_AND_INV_DB); 785b8e80941Smrg 786b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) 787b8e80941Smrg cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); 788b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1) 789b8e80941Smrg cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); 790b8e80941Smrg 791b8e80941Smrg if (chip_class <= VI) { 792b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { 793b8e80941Smrg cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | 794b8e80941Smrg S_0085F0_CB0_DEST_BASE_ENA(1) | 795b8e80941Smrg S_0085F0_CB1_DEST_BASE_ENA(1) | 796b8e80941Smrg S_0085F0_CB2_DEST_BASE_ENA(1) | 797b8e80941Smrg S_0085F0_CB3_DEST_BASE_ENA(1) | 798b8e80941Smrg S_0085F0_CB4_DEST_BASE_ENA(1) | 799b8e80941Smrg S_0085F0_CB5_DEST_BASE_ENA(1) | 800b8e80941Smrg S_0085F0_CB6_DEST_BASE_ENA(1) | 801b8e80941Smrg S_0085F0_CB7_DEST_BASE_ENA(1); 802b8e80941Smrg 803b8e80941Smrg /* Necessary for DCC */ 804b8e80941Smrg if (chip_class >= VI) { 805b8e80941Smrg si_cs_emit_write_event_eop(cs, 806b8e80941Smrg chip_class, 807b8e80941Smrg is_mec, 808b8e80941Smrg V_028A90_FLUSH_AND_INV_CB_DATA_TS, 809b8e80941Smrg 0, 
810b8e80941Smrg EOP_DATA_SEL_DISCARD, 811b8e80941Smrg 0, 0, 812b8e80941Smrg gfx9_eop_bug_va); 813b8e80941Smrg } 814b8e80941Smrg } 815b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { 816b8e80941Smrg cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | 817b8e80941Smrg S_0085F0_DB_DEST_BASE_ENA(1); 818b8e80941Smrg } 819b8e80941Smrg } 820b8e80941Smrg 821b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { 822b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 823b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); 824b8e80941Smrg } 825b8e80941Smrg 826b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) { 827b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 828b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); 829b8e80941Smrg } 830b8e80941Smrg 831b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { 832b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 833b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 834b8e80941Smrg } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { 835b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 836b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 837b8e80941Smrg } 838b8e80941Smrg 839b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { 840b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 841b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 842b8e80941Smrg } 843b8e80941Smrg 844b8e80941Smrg if (chip_class >= GFX9 && flush_cb_db) { 845b8e80941Smrg unsigned cb_db_event, tc_flags; 846b8e80941Smrg 847b8e80941Smrg /* Set the CB/DB flush event. */ 848b8e80941Smrg cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; 849b8e80941Smrg 850b8e80941Smrg /* These are the only allowed combinations. 
If you need to 851b8e80941Smrg * do multiple operations at once, do them separately. 852b8e80941Smrg * All operations that invalidate L2 also seem to invalidate 853b8e80941Smrg * metadata. Volatile (VOL) and WC flushes are not listed here. 854b8e80941Smrg * 855b8e80941Smrg * TC | TC_WB = writeback & invalidate L2 & L1 856b8e80941Smrg * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC 857b8e80941Smrg * TC_WB | TC_NC = writeback L2 for MTYPE == NC 858b8e80941Smrg * TC | TC_NC = invalidate L2 for MTYPE == NC 859b8e80941Smrg * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) 860b8e80941Smrg * TCL1 = invalidate L1 861b8e80941Smrg */ 862b8e80941Smrg tc_flags = EVENT_TC_ACTION_ENA | 863b8e80941Smrg EVENT_TC_MD_ACTION_ENA; 864b8e80941Smrg 865b8e80941Smrg /* Ideally flush TC together with CB/DB. */ 866b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) { 867b8e80941Smrg /* Writeback and invalidate everything in L2 & L1. */ 868b8e80941Smrg tc_flags = EVENT_TC_ACTION_ENA | 869b8e80941Smrg EVENT_TC_WB_ACTION_ENA; 870b8e80941Smrg 871b8e80941Smrg 872b8e80941Smrg /* Clear the flags. 
*/ 873b8e80941Smrg flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 | 874b8e80941Smrg RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 | 875b8e80941Smrg RADV_CMD_FLAG_INV_VMEM_L1); 876b8e80941Smrg } 877b8e80941Smrg assert(flush_cnt); 878b8e80941Smrg (*flush_cnt)++; 879b8e80941Smrg 880b8e80941Smrg si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 881b8e80941Smrg EOP_DATA_SEL_VALUE_32BIT, 882b8e80941Smrg flush_va, *flush_cnt, 883b8e80941Smrg gfx9_eop_bug_va); 884b8e80941Smrg radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, flush_va, 885b8e80941Smrg *flush_cnt, 0xffffffff); 886b8e80941Smrg } 887b8e80941Smrg 888b8e80941Smrg /* VGT state sync */ 889b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { 890b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 891b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); 892b8e80941Smrg } 893b8e80941Smrg 894b8e80941Smrg /* VGT streamout state sync */ 895b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_VGT_STREAMOUT_SYNC) { 896b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 897b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); 898b8e80941Smrg } 899b8e80941Smrg 900b8e80941Smrg /* Make sure ME is idle (it executes most packets) before continuing. 901b8e80941Smrg * This prevents read-after-write hazards between PFP and ME. 
902b8e80941Smrg */ 903b8e80941Smrg if ((cp_coher_cntl || 904b8e80941Smrg (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | 905b8e80941Smrg RADV_CMD_FLAG_INV_VMEM_L1 | 906b8e80941Smrg RADV_CMD_FLAG_INV_GLOBAL_L2 | 907b8e80941Smrg RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) && 908b8e80941Smrg !is_mec) { 909b8e80941Smrg radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 910b8e80941Smrg radeon_emit(cs, 0); 911b8e80941Smrg } 912b8e80941Smrg 913b8e80941Smrg if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) || 914b8e80941Smrg (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) { 915b8e80941Smrg si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, 916b8e80941Smrg cp_coher_cntl | 917b8e80941Smrg S_0085F0_TC_ACTION_ENA(1) | 918b8e80941Smrg S_0085F0_TCL1_ACTION_ENA(1) | 919b8e80941Smrg S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI)); 920b8e80941Smrg cp_coher_cntl = 0; 921b8e80941Smrg } else { 922b8e80941Smrg if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) { 923b8e80941Smrg /* WB = write-back 924b8e80941Smrg * NC = apply to non-coherent MTYPEs 925b8e80941Smrg * (i.e. MTYPE <= 1, which is what we use everywhere) 926b8e80941Smrg * 927b8e80941Smrg * WB doesn't work without NC. 928b8e80941Smrg */ 929b8e80941Smrg si_emit_acquire_mem(cs, is_mec, 930b8e80941Smrg chip_class >= GFX9, 931b8e80941Smrg cp_coher_cntl | 932b8e80941Smrg S_0301F0_TC_WB_ACTION_ENA(1) | 933b8e80941Smrg S_0301F0_TC_NC_ACTION_ENA(1)); 934b8e80941Smrg cp_coher_cntl = 0; 935b8e80941Smrg } 936b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) { 937b8e80941Smrg si_emit_acquire_mem(cs, is_mec, 938b8e80941Smrg chip_class >= GFX9, 939b8e80941Smrg cp_coher_cntl | 940b8e80941Smrg S_0085F0_TCL1_ACTION_ENA(1)); 941b8e80941Smrg cp_coher_cntl = 0; 942b8e80941Smrg } 943b8e80941Smrg } 944b8e80941Smrg 945b8e80941Smrg /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. 946b8e80941Smrg * Therefore, it should be last. Done in PFP. 
947b8e80941Smrg */ 948b8e80941Smrg if (cp_coher_cntl) 949b8e80941Smrg si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl); 950b8e80941Smrg 951b8e80941Smrg if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) { 952b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 953b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | 954b8e80941Smrg EVENT_INDEX(0)); 955b8e80941Smrg } else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) { 956b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 957b8e80941Smrg radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | 958b8e80941Smrg EVENT_INDEX(0)); 959b8e80941Smrg } 960b8e80941Smrg} 961b8e80941Smrg 962b8e80941Smrgvoid 963b8e80941Smrgsi_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) 964b8e80941Smrg{ 965b8e80941Smrg bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE; 966b8e80941Smrg 967b8e80941Smrg if (is_compute) 968b8e80941Smrg cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | 969b8e80941Smrg RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | 970b8e80941Smrg RADV_CMD_FLAG_FLUSH_AND_INV_DB | 971b8e80941Smrg RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | 972b8e80941Smrg RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 973b8e80941Smrg RADV_CMD_FLAG_VS_PARTIAL_FLUSH | 974b8e80941Smrg RADV_CMD_FLAG_VGT_FLUSH | 975b8e80941Smrg RADV_CMD_FLAG_START_PIPELINE_STATS | 976b8e80941Smrg RADV_CMD_FLAG_STOP_PIPELINE_STATS); 977b8e80941Smrg 978b8e80941Smrg if (!cmd_buffer->state.flush_bits) 979b8e80941Smrg return; 980b8e80941Smrg 981b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); 982b8e80941Smrg 983b8e80941Smrg si_cs_emit_cache_flush(cmd_buffer->cs, 984b8e80941Smrg cmd_buffer->device->physical_device->rad_info.chip_class, 985b8e80941Smrg &cmd_buffer->gfx9_fence_idx, 986b8e80941Smrg cmd_buffer->gfx9_fence_va, 987b8e80941Smrg radv_cmd_buffer_uses_mec(cmd_buffer), 988b8e80941Smrg cmd_buffer->state.flush_bits, 989b8e80941Smrg cmd_buffer->gfx9_eop_bug_va); 
990b8e80941Smrg 991b8e80941Smrg 992b8e80941Smrg if (unlikely(cmd_buffer->device->trace_bo)) 993b8e80941Smrg radv_cmd_buffer_trace_emit(cmd_buffer); 994b8e80941Smrg 995b8e80941Smrg cmd_buffer->state.flush_bits = 0; 996b8e80941Smrg 997b8e80941Smrg /* If the driver used a compute shader for resetting a query pool, it 998b8e80941Smrg * should be finished at this point. 999b8e80941Smrg */ 1000b8e80941Smrg cmd_buffer->pending_reset_query = false; 1001b8e80941Smrg} 1002b8e80941Smrg 1003b8e80941Smrg/* sets the CP predication state using a boolean stored at va */ 1004b8e80941Smrgvoid 1005b8e80941Smrgsi_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, 1006b8e80941Smrg bool draw_visible, uint64_t va) 1007b8e80941Smrg{ 1008b8e80941Smrg uint32_t op = 0; 1009b8e80941Smrg 1010b8e80941Smrg if (va) { 1011b8e80941Smrg op = PRED_OP(PREDICATION_OP_BOOL64); 1012b8e80941Smrg 1013b8e80941Smrg /* PREDICATION_DRAW_VISIBLE means that if the 32-bit value is 1014b8e80941Smrg * zero, all rendering commands are discarded. Otherwise, they 1015b8e80941Smrg * are discarded if the value is non zero. 1016b8e80941Smrg */ 1017b8e80941Smrg op |= draw_visible ? PREDICATION_DRAW_VISIBLE : 1018b8e80941Smrg PREDICATION_DRAW_NOT_VISIBLE; 1019b8e80941Smrg } 1020b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 1021b8e80941Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); 1022b8e80941Smrg radeon_emit(cmd_buffer->cs, op); 1023b8e80941Smrg radeon_emit(cmd_buffer->cs, va); 1024b8e80941Smrg radeon_emit(cmd_buffer->cs, va >> 32); 1025b8e80941Smrg } else { 1026b8e80941Smrg radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); 1027b8e80941Smrg radeon_emit(cmd_buffer->cs, va); 1028b8e80941Smrg radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF)); 1029b8e80941Smrg } 1030b8e80941Smrg} 1031b8e80941Smrg 1032b8e80941Smrg/* Set this if you want the 3D engine to wait until CP DMA is done. 
1033b8e80941Smrg * It should be set on the last CP DMA packet. */ 1034b8e80941Smrg#define CP_DMA_SYNC (1 << 0) 1035b8e80941Smrg 1036b8e80941Smrg/* Set this if the source data was used as a destination in a previous CP DMA 1037b8e80941Smrg * packet. It's for preventing a read-after-write (RAW) hazard between two 1038b8e80941Smrg * CP DMA packets. */ 1039b8e80941Smrg#define CP_DMA_RAW_WAIT (1 << 1) 1040b8e80941Smrg#define CP_DMA_USE_L2 (1 << 2) 1041b8e80941Smrg#define CP_DMA_CLEAR (1 << 3) 1042b8e80941Smrg 1043b8e80941Smrg/* Alignment for optimal performance. */ 1044b8e80941Smrg#define SI_CPDMA_ALIGNMENT 32 1045b8e80941Smrg 1046b8e80941Smrg/* The max number of bytes that can be copied per packet. */ 1047b8e80941Smrgstatic inline unsigned cp_dma_max_byte_count(struct radv_cmd_buffer *cmd_buffer) 1048b8e80941Smrg{ 1049b8e80941Smrg unsigned max = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 ? 1050b8e80941Smrg S_414_BYTE_COUNT_GFX9(~0u) : 1051b8e80941Smrg S_414_BYTE_COUNT_GFX6(~0u); 1052b8e80941Smrg 1053b8e80941Smrg /* make it aligned for optimal performance */ 1054b8e80941Smrg return max & ~(SI_CPDMA_ALIGNMENT - 1); 1055b8e80941Smrg} 1056b8e80941Smrg 1057b8e80941Smrg/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear 1058b8e80941Smrg * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit 1059b8e80941Smrg * clear value. 
1060b8e80941Smrg */ 1061b8e80941Smrgstatic void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer, 1062b8e80941Smrg uint64_t dst_va, uint64_t src_va, 1063b8e80941Smrg unsigned size, unsigned flags) 1064b8e80941Smrg{ 1065b8e80941Smrg struct radeon_cmdbuf *cs = cmd_buffer->cs; 1066b8e80941Smrg uint32_t header = 0, command = 0; 1067b8e80941Smrg 1068b8e80941Smrg assert(size <= cp_dma_max_byte_count(cmd_buffer)); 1069b8e80941Smrg 1070b8e80941Smrg radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); 1071b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) 1072b8e80941Smrg command |= S_414_BYTE_COUNT_GFX9(size); 1073b8e80941Smrg else 1074b8e80941Smrg command |= S_414_BYTE_COUNT_GFX6(size); 1075b8e80941Smrg 1076b8e80941Smrg /* Sync flags. */ 1077b8e80941Smrg if (flags & CP_DMA_SYNC) 1078b8e80941Smrg header |= S_411_CP_SYNC(1); 1079b8e80941Smrg else { 1080b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) 1081b8e80941Smrg command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); 1082b8e80941Smrg else 1083b8e80941Smrg command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); 1084b8e80941Smrg } 1085b8e80941Smrg 1086b8e80941Smrg if (flags & CP_DMA_RAW_WAIT) 1087b8e80941Smrg command |= S_414_RAW_WAIT(1); 1088b8e80941Smrg 1089b8e80941Smrg /* Src and dst flags. 
*/ 1090b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 && 1091b8e80941Smrg !(flags & CP_DMA_CLEAR) && 1092b8e80941Smrg src_va == dst_va) 1093b8e80941Smrg header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ 1094b8e80941Smrg else if (flags & CP_DMA_USE_L2) 1095b8e80941Smrg header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2); 1096b8e80941Smrg 1097b8e80941Smrg if (flags & CP_DMA_CLEAR) 1098b8e80941Smrg header |= S_411_SRC_SEL(V_411_DATA); 1099b8e80941Smrg else if (flags & CP_DMA_USE_L2) 1100b8e80941Smrg header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); 1101b8e80941Smrg 1102b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { 1103b8e80941Smrg radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, cmd_buffer->state.predicating)); 1104b8e80941Smrg radeon_emit(cs, header); 1105b8e80941Smrg radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ 1106b8e80941Smrg radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ 1107b8e80941Smrg radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ 1108b8e80941Smrg radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ 1109b8e80941Smrg radeon_emit(cs, command); 1110b8e80941Smrg } else { 1111b8e80941Smrg assert(!(flags & CP_DMA_USE_L2)); 1112b8e80941Smrg header |= S_411_SRC_ADDR_HI(src_va >> 32); 1113b8e80941Smrg radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, cmd_buffer->state.predicating)); 1114b8e80941Smrg radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ 1115b8e80941Smrg radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ 1116b8e80941Smrg radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ 1117b8e80941Smrg radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ 1118b8e80941Smrg radeon_emit(cs, command); 1119b8e80941Smrg } 1120b8e80941Smrg 1121b8e80941Smrg /* CP DMA is executed in ME, but index buffers are read by PFP. 1122b8e80941Smrg * This ensures that ME (CP DMA) is idle before PFP starts fetching 1123b8e80941Smrg * indices. 
If we wanted to execute CP DMA in PFP, this packet 1124b8e80941Smrg * should precede it. 1125b8e80941Smrg */ 1126b8e80941Smrg if (flags & CP_DMA_SYNC) { 1127b8e80941Smrg if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) { 1128b8e80941Smrg radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 1129b8e80941Smrg radeon_emit(cs, 0); 1130b8e80941Smrg } 1131b8e80941Smrg 1132b8e80941Smrg /* CP will see the sync flag and wait for all DMAs to complete. */ 1133b8e80941Smrg cmd_buffer->state.dma_is_busy = false; 1134b8e80941Smrg } 1135b8e80941Smrg 1136b8e80941Smrg if (unlikely(cmd_buffer->device->trace_bo)) 1137b8e80941Smrg radv_cmd_buffer_trace_emit(cmd_buffer); 1138b8e80941Smrg} 1139b8e80941Smrg 1140b8e80941Smrgvoid si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va, 1141b8e80941Smrg unsigned size) 1142b8e80941Smrg{ 1143b8e80941Smrg uint64_t aligned_va = va & ~(SI_CPDMA_ALIGNMENT - 1); 1144b8e80941Smrg uint64_t aligned_size = ((va + size + SI_CPDMA_ALIGNMENT -1) & ~(SI_CPDMA_ALIGNMENT - 1)) - aligned_va; 1145b8e80941Smrg 1146b8e80941Smrg si_emit_cp_dma(cmd_buffer, aligned_va, aligned_va, 1147b8e80941Smrg aligned_size, CP_DMA_USE_L2); 1148b8e80941Smrg} 1149b8e80941Smrg 1150b8e80941Smrgstatic void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count, 1151b8e80941Smrg uint64_t remaining_size, unsigned *flags) 1152b8e80941Smrg{ 1153b8e80941Smrg 1154b8e80941Smrg /* Flush the caches for the first copy only. 1155b8e80941Smrg * Also wait for the previous CP DMA operations. 1156b8e80941Smrg */ 1157b8e80941Smrg if (cmd_buffer->state.flush_bits) { 1158b8e80941Smrg si_emit_cache_flush(cmd_buffer); 1159b8e80941Smrg *flags |= CP_DMA_RAW_WAIT; 1160b8e80941Smrg } 1161b8e80941Smrg 1162b8e80941Smrg /* Do the synchronization after the last dma, so that all data 1163b8e80941Smrg * is written to memory. 
1164b8e80941Smrg */ 1165b8e80941Smrg if (byte_count == remaining_size) 1166b8e80941Smrg *flags |= CP_DMA_SYNC; 1167b8e80941Smrg} 1168b8e80941Smrg 1169b8e80941Smrgstatic void si_cp_dma_realign_engine(struct radv_cmd_buffer *cmd_buffer, unsigned size) 1170b8e80941Smrg{ 1171b8e80941Smrg uint64_t va; 1172b8e80941Smrg uint32_t offset; 1173b8e80941Smrg unsigned dma_flags = 0; 1174b8e80941Smrg unsigned buf_size = SI_CPDMA_ALIGNMENT * 2; 1175b8e80941Smrg void *ptr; 1176b8e80941Smrg 1177b8e80941Smrg assert(size < SI_CPDMA_ALIGNMENT); 1178b8e80941Smrg 1179b8e80941Smrg radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, SI_CPDMA_ALIGNMENT, &offset, &ptr); 1180b8e80941Smrg 1181b8e80941Smrg va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 1182b8e80941Smrg va += offset; 1183b8e80941Smrg 1184b8e80941Smrg si_cp_dma_prepare(cmd_buffer, size, size, &dma_flags); 1185b8e80941Smrg 1186b8e80941Smrg si_emit_cp_dma(cmd_buffer, va, va + SI_CPDMA_ALIGNMENT, size, 1187b8e80941Smrg dma_flags); 1188b8e80941Smrg} 1189b8e80941Smrg 1190b8e80941Smrgvoid si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, 1191b8e80941Smrg uint64_t src_va, uint64_t dest_va, 1192b8e80941Smrg uint64_t size) 1193b8e80941Smrg{ 1194b8e80941Smrg uint64_t main_src_va, main_dest_va; 1195b8e80941Smrg uint64_t skipped_size = 0, realign_size = 0; 1196b8e80941Smrg 1197b8e80941Smrg /* Assume that we are not going to sync after the last DMA operation. */ 1198b8e80941Smrg cmd_buffer->state.dma_is_busy = true; 1199b8e80941Smrg 1200b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO || 1201b8e80941Smrg cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) { 1202b8e80941Smrg /* If the size is not aligned, we must add a dummy copy at the end 1203b8e80941Smrg * just to align the internal counter. Otherwise, the DMA engine 1204b8e80941Smrg * would slow down by an order of magnitude for following copies. 
1205b8e80941Smrg */ 1206b8e80941Smrg if (size % SI_CPDMA_ALIGNMENT) 1207b8e80941Smrg realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT); 1208b8e80941Smrg 1209b8e80941Smrg /* If the copy begins unaligned, we must start copying from the next 1210b8e80941Smrg * aligned block and the skipped part should be copied after everything 1211b8e80941Smrg * else has been copied. Only the src alignment matters, not dst. 1212b8e80941Smrg */ 1213b8e80941Smrg if (src_va % SI_CPDMA_ALIGNMENT) { 1214b8e80941Smrg skipped_size = SI_CPDMA_ALIGNMENT - (src_va % SI_CPDMA_ALIGNMENT); 1215b8e80941Smrg /* The main part will be skipped if the size is too small. */ 1216b8e80941Smrg skipped_size = MIN2(skipped_size, size); 1217b8e80941Smrg size -= skipped_size; 1218b8e80941Smrg } 1219b8e80941Smrg } 1220b8e80941Smrg main_src_va = src_va + skipped_size; 1221b8e80941Smrg main_dest_va = dest_va + skipped_size; 1222b8e80941Smrg 1223b8e80941Smrg while (size) { 1224b8e80941Smrg unsigned dma_flags = 0; 1225b8e80941Smrg unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer)); 1226b8e80941Smrg 1227b8e80941Smrg si_cp_dma_prepare(cmd_buffer, byte_count, 1228b8e80941Smrg size + skipped_size + realign_size, 1229b8e80941Smrg &dma_flags); 1230b8e80941Smrg 1231b8e80941Smrg dma_flags &= ~CP_DMA_SYNC; 1232b8e80941Smrg 1233b8e80941Smrg si_emit_cp_dma(cmd_buffer, main_dest_va, main_src_va, 1234b8e80941Smrg byte_count, dma_flags); 1235b8e80941Smrg 1236b8e80941Smrg size -= byte_count; 1237b8e80941Smrg main_src_va += byte_count; 1238b8e80941Smrg main_dest_va += byte_count; 1239b8e80941Smrg } 1240b8e80941Smrg 1241b8e80941Smrg if (skipped_size) { 1242b8e80941Smrg unsigned dma_flags = 0; 1243b8e80941Smrg 1244b8e80941Smrg si_cp_dma_prepare(cmd_buffer, skipped_size, 1245b8e80941Smrg size + skipped_size + realign_size, 1246b8e80941Smrg &dma_flags); 1247b8e80941Smrg 1248b8e80941Smrg si_emit_cp_dma(cmd_buffer, dest_va, src_va, 1249b8e80941Smrg skipped_size, dma_flags); 1250b8e80941Smrg } 
1251b8e80941Smrg if (realign_size) 1252b8e80941Smrg si_cp_dma_realign_engine(cmd_buffer, realign_size); 1253b8e80941Smrg} 1254b8e80941Smrg 1255b8e80941Smrgvoid si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, 1256b8e80941Smrg uint64_t size, unsigned value) 1257b8e80941Smrg{ 1258b8e80941Smrg 1259b8e80941Smrg if (!size) 1260b8e80941Smrg return; 1261b8e80941Smrg 1262b8e80941Smrg assert(va % 4 == 0 && size % 4 == 0); 1263b8e80941Smrg 1264b8e80941Smrg /* Assume that we are not going to sync after the last DMA operation. */ 1265b8e80941Smrg cmd_buffer->state.dma_is_busy = true; 1266b8e80941Smrg 1267b8e80941Smrg while (size) { 1268b8e80941Smrg unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer)); 1269b8e80941Smrg unsigned dma_flags = CP_DMA_CLEAR; 1270b8e80941Smrg 1271b8e80941Smrg si_cp_dma_prepare(cmd_buffer, byte_count, size, &dma_flags); 1272b8e80941Smrg 1273b8e80941Smrg /* Emit the clear packet. */ 1274b8e80941Smrg si_emit_cp_dma(cmd_buffer, va, value, byte_count, 1275b8e80941Smrg dma_flags); 1276b8e80941Smrg 1277b8e80941Smrg size -= byte_count; 1278b8e80941Smrg va += byte_count; 1279b8e80941Smrg } 1280b8e80941Smrg} 1281b8e80941Smrg 1282b8e80941Smrgvoid si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer) 1283b8e80941Smrg{ 1284b8e80941Smrg if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK) 1285b8e80941Smrg return; 1286b8e80941Smrg 1287b8e80941Smrg if (!cmd_buffer->state.dma_is_busy) 1288b8e80941Smrg return; 1289b8e80941Smrg 1290b8e80941Smrg /* Issue a dummy DMA that copies zero bytes. 1291b8e80941Smrg * 1292b8e80941Smrg * The DMA engine will see that there's no work to do and skip this 1293b8e80941Smrg * DMA request, however, the CP will see the sync flag and still wait 1294b8e80941Smrg * for all DMAs to complete. 
1295b8e80941Smrg */ 1296b8e80941Smrg si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC); 1297b8e80941Smrg 1298b8e80941Smrg cmd_buffer->state.dma_is_busy = false; 1299b8e80941Smrg} 1300b8e80941Smrg 1301b8e80941Smrg/* For MSAA sample positions. */ 1302b8e80941Smrg#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ 1303b8e80941Smrg (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \ 1304b8e80941Smrg (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ 1305b8e80941Smrg (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ 1306b8e80941Smrg (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) 1307b8e80941Smrg 1308b8e80941Smrg 1309b8e80941Smrg/* 2xMSAA 1310b8e80941Smrg * There are two locations (4, 4), (-4, -4). */ 1311b8e80941Smrgconst uint32_t eg_sample_locs_2x[4] = { 1312b8e80941Smrg FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), 1313b8e80941Smrg FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), 1314b8e80941Smrg FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), 1315b8e80941Smrg FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), 1316b8e80941Smrg}; 1317b8e80941Smrgconst unsigned eg_max_dist_2x = 4; 1318b8e80941Smrg/* 4xMSAA 1319b8e80941Smrg * There are 4 locations: (-2, 6), (6, -2), (-6, 2), (2, 6). 
*/ 1320b8e80941Smrgconst uint32_t eg_sample_locs_4x[4] = { 1321b8e80941Smrg FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), 1322b8e80941Smrg FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), 1323b8e80941Smrg FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), 1324b8e80941Smrg FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), 1325b8e80941Smrg}; 1326b8e80941Smrgconst unsigned eg_max_dist_4x = 6; 1327b8e80941Smrg 1328b8e80941Smrg/* Cayman 8xMSAA */ 1329b8e80941Smrgstatic const uint32_t cm_sample_locs_8x[] = { 1330b8e80941Smrg FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), 1331b8e80941Smrg FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), 1332b8e80941Smrg FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), 1333b8e80941Smrg FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), 1334b8e80941Smrg FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), 1335b8e80941Smrg FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), 1336b8e80941Smrg FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), 1337b8e80941Smrg FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), 1338b8e80941Smrg}; 1339b8e80941Smrgstatic const unsigned cm_max_dist_8x = 8; 1340b8e80941Smrg/* Cayman 16xMSAA */ 1341b8e80941Smrgstatic const uint32_t cm_sample_locs_16x[] = { 1342b8e80941Smrg FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), 1343b8e80941Smrg FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), 1344b8e80941Smrg FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), 1345b8e80941Smrg FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), 1346b8e80941Smrg FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), 1347b8e80941Smrg FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), 1348b8e80941Smrg FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), 1349b8e80941Smrg FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), 1350b8e80941Smrg FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), 1351b8e80941Smrg FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), 1352b8e80941Smrg FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), 1353b8e80941Smrg FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), 1354b8e80941Smrg FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), 1355b8e80941Smrg FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), 1356b8e80941Smrg FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), 1357b8e80941Smrg FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), 
1358b8e80941Smrg}; 1359b8e80941Smrgstatic const unsigned cm_max_dist_16x = 8; 1360b8e80941Smrg 1361b8e80941Smrgunsigned radv_cayman_get_maxdist(int log_samples) 1362b8e80941Smrg{ 1363b8e80941Smrg unsigned max_dist[] = { 1364b8e80941Smrg 0, 1365b8e80941Smrg eg_max_dist_2x, 1366b8e80941Smrg eg_max_dist_4x, 1367b8e80941Smrg cm_max_dist_8x, 1368b8e80941Smrg cm_max_dist_16x 1369b8e80941Smrg }; 1370b8e80941Smrg return max_dist[log_samples]; 1371b8e80941Smrg} 1372b8e80941Smrg 1373b8e80941Smrgvoid radv_cayman_emit_msaa_sample_locs(struct radeon_cmdbuf *cs, int nr_samples) 1374b8e80941Smrg{ 1375b8e80941Smrg switch (nr_samples) { 1376b8e80941Smrg default: 1377b8e80941Smrg case 1: 1378b8e80941Smrg radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0); 1379b8e80941Smrg radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0); 1380b8e80941Smrg radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0); 1381b8e80941Smrg radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0); 1382b8e80941Smrg break; 1383b8e80941Smrg case 2: 1384b8e80941Smrg radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]); 1385b8e80941Smrg radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]); 1386b8e80941Smrg radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_2x[2]); 1387b8e80941Smrg radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_2x[3]); 1388b8e80941Smrg break; 1389b8e80941Smrg case 4: 1390b8e80941Smrg radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_4x[0]); 1391b8e80941Smrg radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_4x[1]); 1392b8e80941Smrg radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_4x[2]); 1393b8e80941Smrg radeon_set_context_reg(cs, 
R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_4x[3]); 1394b8e80941Smrg break; 1395b8e80941Smrg case 8: 1396b8e80941Smrg radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14); 1397b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[0]); 1398b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[4]); 1399b8e80941Smrg radeon_emit(cs, 0); 1400b8e80941Smrg radeon_emit(cs, 0); 1401b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[1]); 1402b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[5]); 1403b8e80941Smrg radeon_emit(cs, 0); 1404b8e80941Smrg radeon_emit(cs, 0); 1405b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[2]); 1406b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[6]); 1407b8e80941Smrg radeon_emit(cs, 0); 1408b8e80941Smrg radeon_emit(cs, 0); 1409b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[3]); 1410b8e80941Smrg radeon_emit(cs, cm_sample_locs_8x[7]); 1411b8e80941Smrg break; 1412b8e80941Smrg case 16: 1413b8e80941Smrg radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16); 1414b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[0]); 1415b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[4]); 1416b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[8]); 1417b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[12]); 1418b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[1]); 1419b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[5]); 1420b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[9]); 1421b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[13]); 1422b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[2]); 1423b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[6]); 1424b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[10]); 1425b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[14]); 1426b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[3]); 1427b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[7]); 1428b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[11]); 1429b8e80941Smrg radeon_emit(cs, cm_sample_locs_16x[15]); 1430b8e80941Smrg break; 1431b8e80941Smrg } 
1432b8e80941Smrg} 1433b8e80941Smrg 1434b8e80941Smrgstatic void radv_cayman_get_sample_position(struct radv_device *device, 1435b8e80941Smrg unsigned sample_count, 1436b8e80941Smrg unsigned sample_index, float *out_value) 1437b8e80941Smrg{ 1438b8e80941Smrg int offset, index; 1439b8e80941Smrg struct { 1440b8e80941Smrg int idx:4; 1441b8e80941Smrg } val; 1442b8e80941Smrg switch (sample_count) { 1443b8e80941Smrg case 1: 1444b8e80941Smrg default: 1445b8e80941Smrg out_value[0] = out_value[1] = 0.5; 1446b8e80941Smrg break; 1447b8e80941Smrg case 2: 1448b8e80941Smrg offset = 4 * (sample_index * 2); 1449b8e80941Smrg val.idx = (eg_sample_locs_2x[0] >> offset) & 0xf; 1450b8e80941Smrg out_value[0] = (float)(val.idx + 8) / 16.0f; 1451b8e80941Smrg val.idx = (eg_sample_locs_2x[0] >> (offset + 4)) & 0xf; 1452b8e80941Smrg out_value[1] = (float)(val.idx + 8) / 16.0f; 1453b8e80941Smrg break; 1454b8e80941Smrg case 4: 1455b8e80941Smrg offset = 4 * (sample_index * 2); 1456b8e80941Smrg val.idx = (eg_sample_locs_4x[0] >> offset) & 0xf; 1457b8e80941Smrg out_value[0] = (float)(val.idx + 8) / 16.0f; 1458b8e80941Smrg val.idx = (eg_sample_locs_4x[0] >> (offset + 4)) & 0xf; 1459b8e80941Smrg out_value[1] = (float)(val.idx + 8) / 16.0f; 1460b8e80941Smrg break; 1461b8e80941Smrg case 8: 1462b8e80941Smrg offset = 4 * (sample_index % 4 * 2); 1463b8e80941Smrg index = (sample_index / 4) * 4; 1464b8e80941Smrg val.idx = (cm_sample_locs_8x[index] >> offset) & 0xf; 1465b8e80941Smrg out_value[0] = (float)(val.idx + 8) / 16.0f; 1466b8e80941Smrg val.idx = (cm_sample_locs_8x[index] >> (offset + 4)) & 0xf; 1467b8e80941Smrg out_value[1] = (float)(val.idx + 8) / 16.0f; 1468b8e80941Smrg break; 1469b8e80941Smrg case 16: 1470b8e80941Smrg offset = 4 * (sample_index % 4 * 2); 1471b8e80941Smrg index = (sample_index / 4) * 4; 1472b8e80941Smrg val.idx = (cm_sample_locs_16x[index] >> offset) & 0xf; 1473b8e80941Smrg out_value[0] = (float)(val.idx + 8) / 16.0f; 1474b8e80941Smrg val.idx = (cm_sample_locs_16x[index] >> (offset 
+ 4)) & 0xf; 1475b8e80941Smrg out_value[1] = (float)(val.idx + 8) / 16.0f; 1476b8e80941Smrg break; 1477b8e80941Smrg } 1478b8e80941Smrg} 1479b8e80941Smrg 1480b8e80941Smrgvoid radv_device_init_msaa(struct radv_device *device) 1481b8e80941Smrg{ 1482b8e80941Smrg int i; 1483b8e80941Smrg radv_cayman_get_sample_position(device, 1, 0, device->sample_locations_1x[0]); 1484b8e80941Smrg 1485b8e80941Smrg for (i = 0; i < 2; i++) 1486b8e80941Smrg radv_cayman_get_sample_position(device, 2, i, device->sample_locations_2x[i]); 1487b8e80941Smrg for (i = 0; i < 4; i++) 1488b8e80941Smrg radv_cayman_get_sample_position(device, 4, i, device->sample_locations_4x[i]); 1489b8e80941Smrg for (i = 0; i < 8; i++) 1490b8e80941Smrg radv_cayman_get_sample_position(device, 8, i, device->sample_locations_8x[i]); 1491b8e80941Smrg for (i = 0; i < 16; i++) 1492b8e80941Smrg radv_cayman_get_sample_position(device, 16, i, device->sample_locations_16x[i]); 1493b8e80941Smrg} 1494