1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based on si_state.c 6 * Copyright © 2015 Advanced Micro Devices, Inc. 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 
 */

/* command buffer handling for AMD GCN */

#include "radv_cs.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "sid.h"

/* Program PA_SC_RASTER_CONFIG for chips with harvested (disabled) render
 * backends.  ac_get_harvested_configs() computes one raster config value per
 * shader engine (and may rewrite raster_config_1, which is passed by
 * address); each value is written while GRBM_GFX_INDEX selects that SE, and
 * broadcast addressing is restored afterwards.
 */
static void
si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
                                  struct radeon_cmdbuf *cs, unsigned raster_config,
                                  unsigned raster_config_1)
{
   unsigned num_se = MAX2(physical_device->rad_info.max_se, 1);
   unsigned raster_config_se[4];
   unsigned se;

   ac_get_harvested_configs(&physical_device->rad_info, raster_config, &raster_config_1,
                            raster_config_se);

   for (se = 0; se < num_se; se++) {
      /* GRBM_GFX_INDEX has a different offset on GFX6 and GFX7+ */
      if (physical_device->rad_info.chip_class < GFX7)
         radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
                               S_00802C_SE_INDEX(se) | S_00802C_SH_BROADCAST_WRITES(1) |
                                  S_00802C_INSTANCE_BROADCAST_WRITES(1));
      else
         radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                                S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
                                   S_030800_INSTANCE_BROADCAST_WRITES(1));
      radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
   }

   /* Restore broadcast writes to all SEs/SHs/instances.
    * GRBM_GFX_INDEX has a different offset on GFX6 and GFX7+ */
   if (physical_device->rad_info.chip_class < GFX7)
      radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
                            S_00802C_SE_BROADCAST_WRITES(1) | S_00802C_SH_BROADCAST_WRITES(1) |
                               S_00802C_INSTANCE_BROADCAST_WRITES(1));
   else
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* RASTER_CONFIG_1 only exists on GFX7+ (written once, broadcast). */
   if (physical_device->rad_info.chip_class >= GFX7)
      radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
}

/* Emit the one-time initial state for the compute ring into cs: dispatch
 * origin, shader address high bits, CU enable masks, border color base
 * address (when allocated) and chip-specific defaults.  Also emitted at the
 * end of si_emit_graphics() so the graphics ring can dispatch compute.
 */
void
si_emit_compute(struct radv_device *device, struct radeon_cmdbuf *cs)
{
   /* Dispatches always start at (0, 0, 0). */
   radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
   radeon_emit(cs, 0);
   radeon_emit(cs, 0);
   radeon_emit(cs, 0);

   /* High 32 bits of the compute shader address space. */
   radeon_set_sh_reg(cs, R_00B834_COMPUTE_PGM_HI,
                     S_00B834_DATA(device->physical_device->rad_info.address32_hi >> 8));

   radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
   /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
    * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
   radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
   radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));

   if (device->physical_device->rad_info.chip_class >= GFX7) {
      /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
      radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
      radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
      radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));

      if (device->border_color_data.bo) {
         uint64_t bc_va = radv_buffer_get_va(device->border_color_data.bo);

         /* GFX7+: border color palette base for compute samplers. */
         radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
         radeon_emit(cs, bc_va >> 8);
         radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40));
      }
   }

   if (device->physical_device->rad_info.chip_class >= GFX9) {
      radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY,
                             device->physical_device->rad_info.chip_class >= GFX10 ? 0x20 : 0);
   }

   if (device->physical_device->rad_info.chip_class >= GFX10) {
      /* GFX10+: clear the user accumulator / RSRC3 registers. */
      radeon_set_sh_reg_seq(cs, R_00B890_COMPUTE_USER_ACCUM_0, 5);
      radeon_emit(cs, 0); /* R_00B890_COMPUTE_USER_ACCUM_0 */
      radeon_emit(cs, 0); /* R_00B894_COMPUTE_USER_ACCUM_1 */
      radeon_emit(cs, 0); /* R_00B898_COMPUTE_USER_ACCUM_2 */
      radeon_emit(cs, 0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */
      radeon_emit(cs, 0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */
   }

   /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
    * and is now per pipe, so it should be handled in the
    * kernel if we want to use something other than the default value,
    * which is now 0x22f.
    */
   if (device->physical_device->rad_info.chip_class <= GFX6) {
      /* XXX: This should be:
       * (number of compute units) * 4 * (waves per simd) - 1 */

      radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);

      if (device->border_color_data.bo) {
         uint64_t bc_va = radv_buffer_get_va(device->border_color_data.bo);
         /* GFX6: border color base is a config register. */
         radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
      }
   }

   /* Trap handler (TBA) and trap memory (TMA) addresses for the
    * debug/trap shader; only supported on GFX8 here. */
   if (device->tma_bo) {
      uint64_t tba_va, tma_va;

      assert(device->physical_device->rad_info.chip_class == GFX8);

      tba_va = radv_shader_variant_get_va(device->trap_handler_shader);
      tma_va = radv_buffer_get_va(device->tma_bo);

      radeon_set_sh_reg_seq(cs, R_00B838_COMPUTE_TBA_LO, 4);
      radeon_emit(cs, tba_va >> 8);
      radeon_emit(cs, tba_va >> 40);
      radeon_emit(cs, tma_va >> 8);
      radeon_emit(cs, tma_va >> 40);
   }
}

/* 12.4 fixed-point: convert a float to unsigned 12.4 fixed point,
 * clamping to [0, 0xffff]. */
static unsigned
radv_pack_float_12p4(float x)
{
   return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16;
}

/* Write PA_SC_RASTER_CONFIG(_1); used on GFX6-8, where CLEAR_STATE does not
 * cover these registers correctly for harvested configurations.
 */
static void
si_set_raster_config(struct radv_physical_device *physical_device, struct radeon_cmdbuf *cs)
{
   unsigned num_rb = MIN2(physical_device->rad_info.max_render_backends, 16);
   unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
   unsigned raster_config, raster_config_1;

   ac_get_raster_config(&physical_device->rad_info, &raster_config, &raster_config_1, NULL);

   /* Always use the default config when all backends are enabled
    * (or when we failed to determine the enabled backends).
    */
   if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
      radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config);
      if (physical_device->rad_info.chip_class >= GFX7)
         radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
   } else {
      /* Some RBs are harvested: per-SE values are required. */
      si_write_harvested_raster_configs(physical_device, cs, raster_config, raster_config_1);
   }
}

/* Emit the one-time initial graphics state into cs.  This is recorded once
 * into the device's gfx_init IB (see cik_create_gfx_config) and sets every
 * register RADV relies on but never re-emits per command buffer.  Ends by
 * chaining to si_emit_compute().
 */
void
si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
{
   struct radv_physical_device *physical_device = device->physical_device;

   bool has_clear_state = physical_device->rad_info.has_clear_state;
   int i;

   /* Enable shadowing of all context/register state by the CP. */
   radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
   radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
   radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));

   if (has_clear_state) {
      /* Reset context registers to their hardware defaults; registers
       * covered by CLEAR_STATE are skipped below. */
      radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0));
      radeon_emit(cs, 0);
   }

   if (physical_device->rad_info.chip_class <= GFX8)
      si_set_raster_config(physical_device, cs);

   radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
   if (!has_clear_state)
      radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));

   /* FIXME calculate these values somehow ???
    */
   if (physical_device->rad_info.chip_class <= GFX8) {
      radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
      radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40);
   }

   if (!has_clear_state) {
      radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2);
      radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
      radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
   }

   if (physical_device->rad_info.chip_class <= GFX9)
      radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
   if (!has_clear_state)
      radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0);
   if (physical_device->rad_info.chip_class < GFX7)
      radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE,
                            S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));

   if (!has_clear_state)
      radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);

   /* CLEAR_STATE doesn't clear these correctly on certain generations.
    * I don't know why. Deduced by trial and error.
    */
   if (physical_device->rad_info.chip_class <= GFX7 || !has_clear_state) {
      radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
      radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
                             S_028204_WINDOW_OFFSET_DISABLE(1));
      radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL,
                             S_028240_WINDOW_OFFSET_DISABLE(1));
      radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR,
                             S_028244_BR_X(16384) | S_028244_BR_Y(16384));
      radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
      radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
                             S_028034_BR_X(16384) | S_028034_BR_Y(16384));
   }

   if (!has_clear_state) {
      /* Default depth range [0, 1] for all 16 viewports. */
      for (i = 0; i < 16; i++) {
         radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i * 8, 0);
         radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i * 8, fui(1.0));
      }
   }

   if (!has_clear_state) {
      radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF);
      radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA);
      /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on GFX6 */
      radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
      radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0);
      radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
      radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
      radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
   }

   radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE,
                          S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
                             S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE));

   if (physical_device->rad_info.chip_class >= GFX10) {
      radeon_set_context_reg(cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
      radeon_set_uconfig_reg(cs, R_030964_GE_MAX_VTX_INDX, ~0);
      radeon_set_uconfig_reg(cs, R_030924_GE_MIN_VTX_INDX, 0);
      radeon_set_uconfig_reg(cs, R_030928_GE_INDX_OFFSET, 0);
      radeon_set_uconfig_reg(cs, R_03097C_GE_STEREO_CNTL, 0);
      radeon_set_uconfig_reg(cs, R_030988_GE_USER_VGPR_EN, 0);

      radeon_set_context_reg(cs, R_028038_DB_DFSM_CONTROL,
                             S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF) |
                                S_028038_POPS_DRAIN_PS_ON_OVERLAP(1));
   } else if (physical_device->rad_info.chip_class == GFX9) {
      radeon_set_uconfig_reg(cs, R_030920_VGT_MAX_VTX_INDX, ~0);
      radeon_set_uconfig_reg(cs, R_030924_VGT_MIN_VTX_INDX, 0);
      radeon_set_uconfig_reg(cs, R_030928_VGT_INDX_OFFSET, 0);

      radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
                             S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
                                S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
   } else {
      /* These registers, when written, also overwrite the
       * CLEAR_STATE context, so we can't rely on CLEAR_STATE setting
       * them. It would be an issue if there was another UMD
       * changing them.
       */
      radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0);
      radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0);
      radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0);
   }

   /* High 32 bits of the shader address space for the LS/ES stages; the
    * register offsets moved between generations. */
   if (device->physical_device->rad_info.chip_class >= GFX10) {
      radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS,
                        S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
      radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES,
                        S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
   } else if (device->physical_device->rad_info.chip_class == GFX9) {
      radeon_set_sh_reg(cs, R_00B414_SPI_SHADER_PGM_HI_LS,
                        S_00B414_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
      radeon_set_sh_reg(cs, R_00B214_SPI_SHADER_PGM_HI_ES,
                        S_00B214_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
   } else {
      radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS,
                        S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
      radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES,
                        S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8));
   }

   unsigned cu_mask_ps = 0xffffffff;

   /* It's wasteful to enable all CUs for PS if shader arrays have a
    * different number of CUs. The reason is that the hardware sends the
    * same number of PS waves to each shader array, so the slowest shader
    * array limits the performance. Disable the extra CUs for PS in
    * other shader arrays to save power and thus increase clocks for busy
    * CUs. In the future, we might disable or enable this tweak only for
    * certain apps.
    */
   if (physical_device->rad_info.chip_class >= GFX10_3)
      cu_mask_ps = u_bit_consecutive(0, physical_device->rad_info.min_good_cu_per_sa);

   if (physical_device->rad_info.chip_class >= GFX7) {
      if (physical_device->rad_info.chip_class >= GFX10) {
         /* Logical CUs 16 - 31 */
         radeon_set_sh_reg_idx(physical_device, cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, 3,
                               S_00B404_CU_EN(0xffff));
         radeon_set_sh_reg_idx(physical_device, cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, 3,
                               S_00B104_CU_EN(0xffff));
         radeon_set_sh_reg_idx(physical_device, cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, 3,
                               S_00B004_CU_EN(cu_mask_ps >> 16));
      }

      if (physical_device->rad_info.chip_class >= GFX9) {
         radeon_set_sh_reg_idx(physical_device, cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 3,
                               S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
      } else {
         radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
                           S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
         radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
         radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
                           S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
         /* If this is 0, Bonaire can hang even if GS isn't being used.
          * Other chips are unaffected. These are suboptimal values,
          * but we don't use on-chip GS.
          */
         radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL,
                                S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
      }

      radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
                            S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
   }

   if (physical_device->rad_info.chip_class >= GFX10) {
      /* Break up a pixel wave if it contains deallocs for more than
       * half the parameter cache.
       *
       * To avoid a deadlock where pixel waves aren't launched
       * because they're waiting for more pixels while the frontend
       * is stuck waiting for PC space, the maximum allowed value is
       * the size of the PC minus the largest possible allocation for
       * a single primitive shader subgroup.
       */
      radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
      radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);

      /* Vulkan doesn't support user edge flags and it also doesn't
       * need to prevent drawing lines on internal edges of
       * decomposed primitives (such as quads) with polygon mode = lines.
       */
      unsigned vertex_reuse_depth = physical_device->rad_info.chip_class >= GFX10_3 ? 30 : 0;
      radeon_set_context_reg(cs, R_028838_PA_CL_NGG_CNTL,
                             S_028838_INDEX_BUF_EDGE_FLAG_ENA(0) |
                                S_028838_VERTEX_REUSE_DEPTH(vertex_reuse_depth));

      /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips.
       */
      unsigned meta_write_policy, meta_read_policy;

      /* TODO: investigate whether LRU improves performance on other chips too */
      if (physical_device->rad_info.max_render_backends <= 4) {
         meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
         meta_read_policy = V_02807C_CACHE_LRU_RD;  /* cache reads */
      } else {
         meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */
         meta_read_policy = V_02807C_CACHE_NOA;     /* don't cache reads */
      }

      radeon_set_context_reg(
         cs, R_02807C_DB_RMI_L2_CACHE_CONTROL,
         S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) | S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) |
            S_02807C_HTILE_WR_POLICY(meta_write_policy) |
            S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) |
            S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA) | S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA) |
            S_02807C_HTILE_RD_POLICY(meta_read_policy));

      radeon_set_context_reg(
         cs, R_028410_CB_RMI_GL2_CACHE_CONTROL,
         S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) |
            S_028410_DCC_WR_POLICY(meta_write_policy) |
            S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM) |
            S_028410_CMASK_RD_POLICY(meta_read_policy) |
            S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
            S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA));
      radeon_set_context_reg(cs, R_028428_CB_COVERAGE_OUT_CONTROL, 0);

      /* Clear the user accumulator registers for all graphics stages. */
      radeon_set_sh_reg_seq(cs, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 4);
      radeon_emit(cs, 0); /* R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0 */
      radeon_emit(cs, 0); /* R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1 */
      radeon_emit(cs, 0); /* R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2 */
      radeon_emit(cs, 0); /* R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3 */
      radeon_set_sh_reg_seq(cs, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 4);
      radeon_emit(cs, 0); /* R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0 */
      radeon_emit(cs, 0); /* R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1 */
      radeon_emit(cs, 0); /* R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2 */
      radeon_emit(cs, 0); /* R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3 */
      radeon_set_sh_reg_seq(cs, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 4);
      radeon_emit(cs, 0); /* R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0 */
      radeon_emit(cs, 0); /* R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1 */
      radeon_emit(cs, 0); /* R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2 */
      radeon_emit(cs, 0); /* R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3 */
      radeon_set_sh_reg_seq(cs, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 4);
      radeon_emit(cs, 0); /* R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0 */
      radeon_emit(cs, 0); /* R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1 */
      radeon_emit(cs, 0); /* R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2 */
      radeon_emit(cs, 0); /* R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3 */

      radeon_set_sh_reg(cs, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
                        S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
      radeon_set_sh_reg(cs, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);

      if (physical_device->rad_info.chip_class >= GFX10_3) {
         radeon_set_context_reg(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff);
         /* This allows sample shading.
          */
         radeon_set_context_reg(
            cs, R_028848_PA_CL_VRS_CNTL,
            S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
      }
   }

   if (physical_device->rad_info.chip_class >= GFX9) {
      radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION,
                             S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) |
                                S_028B50_ACCUM_QUAD(24) | S_028B50_DONUT_SPLIT_GFX9(24) |
                                S_028B50_TRAP_SPLIT(6));
   } else if (physical_device->rad_info.chip_class >= GFX8) {
      uint32_t vgt_tess_distribution;

      vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
                              S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT_GFX81(16);

      /* Fiji and Polaris+ support trapezoid splitting. */
      if (physical_device->rad_info.family == CHIP_FIJI ||
          physical_device->rad_info.family >= CHIP_POLARIS10)
         vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);

      radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
   } else if (!has_clear_state) {
      radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
      radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
   }

   if (device->border_color_data.bo) {
      uint64_t border_color_va = radv_buffer_get_va(device->border_color_data.bo);

      radeon_set_context_reg(cs, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
      if (physical_device->rad_info.chip_class >= GFX7) {
         radeon_set_context_reg(cs, R_028084_TA_BC_BASE_ADDR_HI,
                                S_028084_ADDRESS(border_color_va >> 40));
      }
   }

   if (physical_device->rad_info.chip_class >= GFX9) {
      /* Primitive binning defaults. */
      radeon_set_context_reg(
         cs, R_028C48_PA_SC_BINNER_CNTL_1,
         S_028C48_MAX_ALLOC_COUNT(physical_device->rad_info.pbb_max_alloc_count - 1) |
            S_028C48_MAX_PRIM_PER_BATCH(1023));
      radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
                             S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
      radeon_set_uconfig_reg(cs, R_030968_VGT_INSTANCE_BASE_ID, 0);
   }

   /* Default point size; the field appears to be the half-size in 12.4
    * fixed point (1.0 * 8.0 == 0.5 * 16) — TODO confirm against the
    * register reference. */
   unsigned tmp = (unsigned)(1.0 * 8.0);
   radeon_set_context_reg(cs, R_028A00_PA_SU_POINT_SIZE,
                          S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
   radeon_set_context_reg(cs, R_028A04_PA_SU_POINT_MINMAX,
                          S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
                             S_028A04_MAX_SIZE(radv_pack_float_12p4(8191.875 / 2)));

   if (!has_clear_state) {
      radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL, S_028004_ZPASS_INCREMENT_DISABLE(1));
   }

   /* Enable the Polaris small primitive filter control.
    * XXX: There is possibly an issue when MSAA is off (see RadeonSI
    * has_msaa_sample_loc_bug). But this doesn't seem to regress anything,
    * and AMDVLK doesn't have a workaround as well.
    */
   if (physical_device->rad_info.family >= CHIP_POLARIS10) {
      unsigned small_prim_filter_cntl =
         S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
         /* Workaround for a hw line bug. */
         S_028830_LINE_FILTER_DISABLE(physical_device->rad_info.family <= CHIP_POLARIS12);

      radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl);
   }

   radeon_set_context_reg(
      cs, R_0286D4_SPI_INTERP_CONTROL_0,
      S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(1) |
         S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
         S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
         S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
         S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
         S_0286D4_PNT_SPRITE_TOP_1(0)); /* vulkan is top to bottom - 1.0 at bottom */

   radeon_set_context_reg(cs, R_028BE4_PA_SU_VTX_CNTL,
                          S_028BE4_PIX_CENTER(1) | S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) |
                             S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH));

   radeon_set_context_reg(cs, R_028818_PA_CL_VTE_CNTL,
                          S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) |
                             S_028818_VPORT_X_OFFSET_ENA(1) | S_028818_VPORT_Y_SCALE_ENA(1) |
                             S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) |
                             S_028818_VPORT_Z_OFFSET_ENA(1));

   /* Trap handler (TBA) / trap memory (TMA) addresses for every graphics
    * shader stage; only supported on GFX8 here (same as the compute path). */
   if (device->tma_bo) {
      uint64_t tba_va, tma_va;

      assert(device->physical_device->rad_info.chip_class == GFX8);

      tba_va = radv_shader_variant_get_va(device->trap_handler_shader);
      tma_va = radv_buffer_get_va(device->tma_bo);

      uint32_t regs[] = {R_00B000_SPI_SHADER_TBA_LO_PS, R_00B100_SPI_SHADER_TBA_LO_VS,
                         R_00B200_SPI_SHADER_TBA_LO_GS, R_00B300_SPI_SHADER_TBA_LO_ES,
                         R_00B400_SPI_SHADER_TBA_LO_HS, R_00B500_SPI_SHADER_TBA_LO_LS};

      for (i = 0; i < ARRAY_SIZE(regs); ++i) {
         radeon_set_sh_reg_seq(cs, regs[i], 4);
         radeon_emit(cs, tba_va >> 8);
         radeon_emit(cs, tba_va >> 40);
         radeon_emit(cs, tma_va >> 8);
         radeon_emit(cs, tma_va >> 40);
      }
   }

   /* The DX10 diamond test is unnecessary with Vulkan and it decreases line rasterization
    * performance.
    */
   radeon_set_context_reg(cs, R_028BDC_PA_SC_LINE_CNTL, 0);

   si_emit_compute(device, cs);
}

/* Record the initial graphics state into a read-only GPU buffer
 * (device->gfx_init) that the kernel/CP can execute as a preamble IB.
 * On failure, device->gfx_init is left NULL and the device still works
 * without a preamble.
 */
void
cik_create_gfx_config(struct radv_device *device)
{
   struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, RING_GFX);
   if (!cs)
      return;

   si_emit_graphics(device, cs);

   /* Pad the IB to a multiple of 8 DWs with the NOP packet the chip
    * expects (type-2 on older chips, type-3 otherwise). */
   while (cs->cdw & 7) {
      if (device->physical_device->rad_info.gfx_ib_pad_with_type2)
         radeon_emit(cs, PKT2_NOP_PAD);
      else
         radeon_emit(cs, PKT3_NOP_PAD);
   }

   VkResult result =
      device->ws->buffer_create(device->ws, cs->cdw * 4, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_CS, 0, &device->gfx_init);
   if (result != VK_SUCCESS)
      goto fail;

   void *map = device->ws->buffer_map(device->gfx_init);
   if (!map) {
      device->ws->buffer_destroy(device->ws, device->gfx_init);
      device->gfx_init = NULL;
      goto fail;
   }
   memcpy(map, cs->buf, cs->cdw * 4);

   device->ws->buffer_unmap(device->gfx_init);
device->gfx_init_size_dw = cs->cdw; 592fail: 593 device->ws->cs_destroy(cs); 594} 595 596void 597radv_get_viewport_xform(const VkViewport *viewport, float scale[3], float translate[3]) 598{ 599 float x = viewport->x; 600 float y = viewport->y; 601 float half_width = 0.5f * viewport->width; 602 float half_height = 0.5f * viewport->height; 603 double n = viewport->minDepth; 604 double f = viewport->maxDepth; 605 606 scale[0] = half_width; 607 translate[0] = half_width + x; 608 scale[1] = half_height; 609 translate[1] = half_height + y; 610 611 scale[2] = (f - n); 612 translate[2] = n; 613} 614 615static VkRect2D 616si_scissor_from_viewport(const VkViewport *viewport) 617{ 618 float scale[3], translate[3]; 619 VkRect2D rect; 620 621 radv_get_viewport_xform(viewport, scale, translate); 622 623 rect.offset.x = translate[0] - fabsf(scale[0]); 624 rect.offset.y = translate[1] - fabsf(scale[1]); 625 rect.extent.width = ceilf(translate[0] + fabsf(scale[0])) - rect.offset.x; 626 rect.extent.height = ceilf(translate[1] + fabsf(scale[1])) - rect.offset.y; 627 628 return rect; 629} 630 631static VkRect2D 632si_intersect_scissor(const VkRect2D *a, const VkRect2D *b) 633{ 634 VkRect2D ret; 635 ret.offset.x = MAX2(a->offset.x, b->offset.x); 636 ret.offset.y = MAX2(a->offset.y, b->offset.y); 637 ret.extent.width = 638 MIN2(a->offset.x + a->extent.width, b->offset.x + b->extent.width) - ret.offset.x; 639 ret.extent.height = 640 MIN2(a->offset.y + a->extent.height, b->offset.y + b->extent.height) - ret.offset.y; 641 return ret; 642} 643 644void 645si_write_scissors(struct radeon_cmdbuf *cs, int first, int count, const VkRect2D *scissors, 646 const VkViewport *viewports, bool can_use_guardband) 647{ 648 int i; 649 float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY; 650 const float max_range = 32767.0f; 651 if (!count) 652 return; 653 654 radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2); 655 for (i = 0; i < count; 
i++) { 656 VkRect2D viewport_scissor = si_scissor_from_viewport(viewports + i); 657 VkRect2D scissor = si_intersect_scissor(&scissors[i], &viewport_scissor); 658 659 radv_get_viewport_xform(viewports + i, scale, translate); 660 scale[0] = fabsf(scale[0]); 661 scale[1] = fabsf(scale[1]); 662 663 if (scale[0] < 0.5) 664 scale[0] = 0.5; 665 if (scale[1] < 0.5) 666 scale[1] = 0.5; 667 668 guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]); 669 guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]); 670 671 radeon_emit(cs, S_028250_TL_X(scissor.offset.x) | S_028250_TL_Y(scissor.offset.y) | 672 S_028250_WINDOW_OFFSET_DISABLE(1)); 673 radeon_emit(cs, S_028254_BR_X(scissor.offset.x + scissor.extent.width) | 674 S_028254_BR_Y(scissor.offset.y + scissor.extent.height)); 675 } 676 if (!can_use_guardband) { 677 guardband_x = 1.0; 678 guardband_y = 1.0; 679 } 680 681 radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4); 682 radeon_emit(cs, fui(guardband_y)); 683 radeon_emit(cs, fui(1.0)); 684 radeon_emit(cs, fui(guardband_x)); 685 radeon_emit(cs, fui(1.0)); 686} 687 688static inline unsigned 689radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num) 690{ 691 if (num == 0) 692 return 0; 693 694 if (info->incr == 0) 695 return 0; 696 697 if (num < info->min) 698 return 0; 699 700 return 1 + ((num - info->min) / info->incr); 701} 702 703static const struct radv_prim_vertex_count prim_size_table[] = { 704 [V_008958_DI_PT_NONE] = {0, 0}, [V_008958_DI_PT_POINTLIST] = {1, 1}, 705 [V_008958_DI_PT_LINELIST] = {2, 2}, [V_008958_DI_PT_LINESTRIP] = {2, 1}, 706 [V_008958_DI_PT_TRILIST] = {3, 3}, [V_008958_DI_PT_TRIFAN] = {3, 1}, 707 [V_008958_DI_PT_TRISTRIP] = {3, 1}, [V_008958_DI_PT_LINELIST_ADJ] = {4, 4}, 708 [V_008958_DI_PT_LINESTRIP_ADJ] = {4, 1}, [V_008958_DI_PT_TRILIST_ADJ] = {6, 6}, 709 [V_008958_DI_PT_TRISTRIP_ADJ] = {6, 2}, [V_008958_DI_PT_RECTLIST] = {3, 3}, 710 [V_008958_DI_PT_LINELOOP] = {2, 1}, 
   [V_008958_DI_PT_POLYGON] = {3, 1},
   [V_008958_DI_PT_2D_TRI_STRIP] = {0, 0},
};

/* Compute the IA_MULTI_VGT_PARAM value for a draw, applying the chip- and
 * family-specific hardware requirements and workarounds on top of the
 * pipeline's precomputed base value.  May also set RADV_CMD_FLAG_VGT_FLUSH
 * in cmd_buffer->state.flush_bits (Hawaii GS workaround).
 */
uint32_t
si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
                          bool indirect_draw, bool count_from_stream_output,
                          uint32_t draw_vertex_count, unsigned topology, bool prim_restart_enable)
{
   enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
   enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
   const unsigned max_primgroup_in_wave = 2;
   /* SWITCH_ON_EOP(0) is always preferable. */
   bool wd_switch_on_eop = false;
   bool ia_switch_on_eop = false;
   bool ia_switch_on_eoi = false;
   bool partial_vs_wave = false;
   bool partial_es_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_es_wave;
   bool multi_instances_smaller_than_primgroup;
   struct radv_prim_vertex_count prim_vertex_count = prim_size_table[topology];

   if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) {
      /* Patches: one primitive per patch_control_points vertices. */
      if (topology == V_008958_DI_PT_PATCH) {
         prim_vertex_count.min = cmd_buffer->state.pipeline->graphics.tess_patch_control_points;
         prim_vertex_count.incr = 1;
      }
   }

   /* Indirect draws are conservatively treated as small instances. */
   multi_instances_smaller_than_primgroup = indirect_draw;
   if (!multi_instances_smaller_than_primgroup && instanced_draw) {
      uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
      if (num_prims < cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.primgroup_size)
         multi_instances_smaller_than_primgroup = true;
   }

   ia_switch_on_eoi = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.ia_switch_on_eoi;
   partial_vs_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_vs_wave;

   if (chip_class >= GFX7) {
      /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
       * 4 shader engines. Set 1 to pass the assertion below.
       * The other cases are hardware requirements. */
      if (cmd_buffer->device->physical_device->rad_info.max_se < 4 ||
          topology == V_008958_DI_PT_POLYGON || topology == V_008958_DI_PT_LINELOOP ||
          topology == V_008958_DI_PT_TRIFAN || topology == V_008958_DI_PT_TRISTRIP_ADJ ||
          (prim_restart_enable &&
           (cmd_buffer->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
            (topology != V_008958_DI_PT_POINTLIST && topology != V_008958_DI_PT_LINESTRIP))))
         wd_switch_on_eop = true;

      /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
       * We don't know that for indirect drawing, so treat it as
       * always problematic. */
      if (family == CHIP_HAWAII && (instanced_draw || indirect_draw))
         wd_switch_on_eop = true;

      /* Performance recommendation for 4 SE Gfx7-8 parts if
       * instances are smaller than a primgroup.
       * Assume indirect draws always use small instances.
       * This is needed for good VS wave utilization.
       */
      if (chip_class <= GFX8 && info->max_se == 4 && multi_instances_smaller_than_primgroup)
         wd_switch_on_eop = true;

      /* Required on GFX7 and later. */
      if (info->max_se > 2 && !wd_switch_on_eop)
         ia_switch_on_eoi = true;

      /* Required by Hawaii and, for some special cases, by GFX8. */
      if (ia_switch_on_eoi &&
          (family == CHIP_HAWAII ||
           (chip_class == GFX8 &&
            /* max primgroup in wave is always 2 - leave this for documentation */
            (radv_pipeline_has_gs(cmd_buffer->state.pipeline) || max_primgroup_in_wave != 2))))
         partial_vs_wave = true;

      /* Instancing bug on Bonaire. */
      if (family == CHIP_BONAIRE && ia_switch_on_eoi && (instanced_draw || indirect_draw))
         partial_vs_wave = true;

      /* Hardware requirement when drawing primitives from a stream
       * output buffer.
       */
      if (count_from_stream_output)
         wd_switch_on_eop = true;

      /* If the WD switch is false, the IA switch must be false too. */
      assert(wd_switch_on_eop || !ia_switch_on_eop);
   }
   /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
   if (chip_class <= GFX8 && ia_switch_on_eoi)
      partial_es_wave = true;

   if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) {
      /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
       * The hw doc says all multi-SE chips are affected, but amdgpu-pro Vulkan
       * only applies it to Hawaii. Do what amdgpu-pro Vulkan does.
       */
      if (family == CHIP_HAWAII && ia_switch_on_eoi) {
         bool set_vgt_flush = indirect_draw;
         if (!set_vgt_flush && instanced_draw) {
            uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
            if (num_prims <= 1)
               set_vgt_flush = true;
         }
         if (set_vgt_flush)
            cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
      }
   }

   /* Workaround for a VGT hang when strip primitive types are used with
    * primitive restart.
    */
   if (prim_restart_enable &&
       (topology == V_008958_DI_PT_LINESTRIP || topology == V_008958_DI_PT_TRISTRIP ||
        topology == V_008958_DI_PT_LINESTRIP_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
      partial_vs_wave = true;
   }

   return cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.base |
          S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
          S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
          S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
          S_028AA8_WD_SWITCH_ON_EOP(chip_class >= GFX7 ?
wd_switch_on_eop : 0); 835} 836 837void 838si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs, enum chip_class chip_class, bool is_mec, 839 unsigned event, unsigned event_flags, unsigned dst_sel, 840 unsigned data_sel, uint64_t va, uint32_t new_fence, 841 uint64_t gfx9_eop_bug_va) 842{ 843 unsigned op = EVENT_TYPE(event) | 844 EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | 845 event_flags; 846 unsigned is_gfx8_mec = is_mec && chip_class < GFX9; 847 unsigned sel = EOP_DST_SEL(dst_sel) | EOP_DATA_SEL(data_sel); 848 849 /* Wait for write confirmation before writing data, but don't send 850 * an interrupt. */ 851 if (data_sel != EOP_DATA_SEL_DISCARD) 852 sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM); 853 854 if (chip_class >= GFX9 || is_gfx8_mec) { 855 /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion 856 * counters) must immediately precede every timestamp event to 857 * prevent a GPU hang on GFX9. 858 */ 859 if (chip_class == GFX9 && !is_mec) { 860 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 861 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); 862 radeon_emit(cs, gfx9_eop_bug_va); 863 radeon_emit(cs, gfx9_eop_bug_va >> 32); 864 } 865 866 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, false)); 867 radeon_emit(cs, op); 868 radeon_emit(cs, sel); 869 radeon_emit(cs, va); /* address lo */ 870 radeon_emit(cs, va >> 32); /* address hi */ 871 radeon_emit(cs, new_fence); /* immediate data lo */ 872 radeon_emit(cs, 0); /* immediate data hi */ 873 if (!is_gfx8_mec) 874 radeon_emit(cs, 0); /* unused */ 875 } else { 876 /* On GFX6, EOS events are always emitted with EVENT_WRITE_EOS. 877 * On GFX7+, EOS events are emitted with EVENT_WRITE_EOS on 878 * the graphics queue, and with RELEASE_MEM on the compute 879 * queue. 
880 */ 881 if (event == V_028B9C_CS_DONE || event == V_028B9C_PS_DONE) { 882 assert(event_flags == 0 && dst_sel == EOP_DST_SEL_MEM && 883 data_sel == EOP_DATA_SEL_VALUE_32BIT); 884 885 if (is_mec) { 886 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, false)); 887 radeon_emit(cs, op); 888 radeon_emit(cs, sel); 889 radeon_emit(cs, va); /* address lo */ 890 radeon_emit(cs, va >> 32); /* address hi */ 891 radeon_emit(cs, new_fence); /* immediate data lo */ 892 radeon_emit(cs, 0); /* immediate data hi */ 893 } else { 894 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, false)); 895 radeon_emit(cs, op); 896 radeon_emit(cs, va); 897 radeon_emit(cs, ((va >> 32) & 0xffff) | EOS_DATA_SEL(EOS_DATA_SEL_VALUE_32BIT)); 898 radeon_emit(cs, new_fence); 899 } 900 } else { 901 if (chip_class == GFX7 || chip_class == GFX8) { 902 /* Two EOP events are required to make all 903 * engines go idle (and optional cache flushes 904 * executed) before the timestamp is written. 905 */ 906 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false)); 907 radeon_emit(cs, op); 908 radeon_emit(cs, va); 909 radeon_emit(cs, ((va >> 32) & 0xffff) | sel); 910 radeon_emit(cs, 0); /* immediate data */ 911 radeon_emit(cs, 0); /* unused */ 912 } 913 914 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false)); 915 radeon_emit(cs, op); 916 radeon_emit(cs, va); 917 radeon_emit(cs, ((va >> 32) & 0xffff) | sel); 918 radeon_emit(cs, new_fence); /* immediate data */ 919 radeon_emit(cs, 0); /* unused */ 920 } 921 } 922} 923 924void 925radv_cp_wait_mem(struct radeon_cmdbuf *cs, uint32_t op, uint64_t va, uint32_t ref, uint32_t mask) 926{ 927 assert(op == WAIT_REG_MEM_EQUAL || op == WAIT_REG_MEM_NOT_EQUAL || 928 op == WAIT_REG_MEM_GREATER_OR_EQUAL); 929 930 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false)); 931 radeon_emit(cs, op | WAIT_REG_MEM_MEM_SPACE(1)); 932 radeon_emit(cs, va); 933 radeon_emit(cs, va >> 32); 934 radeon_emit(cs, ref); /* reference value */ 935 radeon_emit(cs, mask); /* mask */ 936 radeon_emit(cs, 4); /* poll 
interval */ 937} 938 939static void 940si_emit_acquire_mem(struct radeon_cmdbuf *cs, bool is_mec, bool is_gfx9, unsigned cp_coher_cntl) 941{ 942 if (is_mec || is_gfx9) { 943 uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff; 944 radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(is_mec)); 945 radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ 946 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 947 radeon_emit(cs, hi_val); /* CP_COHER_SIZE_HI */ 948 radeon_emit(cs, 0); /* CP_COHER_BASE */ 949 radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ 950 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 951 } else { 952 /* ACQUIRE_MEM is only required on a compute ring. */ 953 radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, false)); 954 radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ 955 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 956 radeon_emit(cs, 0); /* CP_COHER_BASE */ 957 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 958 } 959} 960 961static void 962gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t *flush_cnt, 963 uint64_t flush_va, bool is_mec, enum radv_cmd_flush_bits flush_bits, 964 enum rgp_flush_bits *sqtt_flush_bits, uint64_t gfx9_eop_bug_va) 965{ 966 uint32_t gcr_cntl = 0; 967 unsigned cb_db_event = 0; 968 969 /* We don't need these. */ 970 assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC))); 971 972 if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) { 973 gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); 974 975 *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE; 976 } 977 if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) { 978 /* TODO: When writing to the SMEM L1 cache, we need to set SEQ 979 * to FORWARD when both L1 and L2 are written out (WB or INV). 
980 */ 981 gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); 982 983 *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0; 984 } 985 if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) { 986 gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); 987 988 *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1; 989 } 990 if (flush_bits & RADV_CMD_FLAG_INV_L2) { 991 /* Writeback and invalidate everything in L2. */ 992 gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); 993 994 *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2; 995 } else if (flush_bits & RADV_CMD_FLAG_WB_L2) { 996 /* Writeback but do not invalidate. 997 * GLM doesn't support WB alone. If WB is set, INV must be set too. 998 */ 999 gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); 1000 1001 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2; 1002 } else if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) { 1003 gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); 1004 } 1005 1006 if (flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) { 1007 /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_CB_META */ 1008 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { 1009 /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ 1010 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1011 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); 1012 1013 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB; 1014 } 1015 1016 /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */ 1017 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { 1018 /* Flush HTILE. Will wait for idle later. */ 1019 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1020 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); 1021 1022 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; 1023 } 1024 1025 /* First flush CB/DB, then L1/L2. 
*/ 1026 gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); 1027 1028 if ((flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) == 1029 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) { 1030 cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; 1031 } else if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { 1032 cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; 1033 } else if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { 1034 cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; 1035 } else { 1036 assert(0); 1037 } 1038 } else { 1039 /* Wait for graphics shaders to go idle if requested. */ 1040 if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { 1041 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1042 radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1043 1044 *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH; 1045 } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { 1046 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1047 radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1048 1049 *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH; 1050 } 1051 } 1052 1053 if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { 1054 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1055 radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); 1056 1057 *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH; 1058 } 1059 1060 if (cb_db_event) { 1061 /* CB/DB flush and invalidate (or possibly just a wait for a 1062 * meta flush) via RELEASE_MEM. 1063 * 1064 * Combine this with other cache flushes when possible; this 1065 * requires affected shaders to be idle, so do it after the 1066 * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always 1067 * implied). 1068 */ 1069 /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. 
*/ 1070 unsigned glm_wb = G_586_GLM_WB(gcr_cntl); 1071 unsigned glm_inv = G_586_GLM_INV(gcr_cntl); 1072 unsigned glv_inv = G_586_GLV_INV(gcr_cntl); 1073 unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); 1074 assert(G_586_GL2_US(gcr_cntl) == 0); 1075 assert(G_586_GL2_RANGE(gcr_cntl) == 0); 1076 assert(G_586_GL2_DISCARD(gcr_cntl) == 0); 1077 unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); 1078 unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); 1079 unsigned gcr_seq = G_586_SEQ(gcr_cntl); 1080 1081 gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & 1082 C_586_GL2_WB; /* keep SEQ */ 1083 1084 assert(flush_cnt); 1085 (*flush_cnt)++; 1086 1087 si_cs_emit_write_event_eop( 1088 cs, chip_class, false, cb_db_event, 1089 S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | 1090 S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | 1091 S_490_SEQ(gcr_seq), 1092 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, flush_va, *flush_cnt, gfx9_eop_bug_va); 1093 1094 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, flush_va, *flush_cnt, 0xffffffff); 1095 } 1096 1097 /* VGT state sync */ 1098 if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { 1099 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1100 radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); 1101 } 1102 1103 /* Ignore fields that only modify the behavior of other fields. */ 1104 if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { 1105 /* Flush caches and wait for the caches to assert idle. 1106 * The cache flush is executed in the ME, but the PFP waits 1107 * for completion. 
1108 */ 1109 radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); 1110 radeon_emit(cs, 0); /* CP_COHER_CNTL */ 1111 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ 1112 radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ 1113 radeon_emit(cs, 0); /* CP_COHER_BASE */ 1114 radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ 1115 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ 1116 radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ 1117 } else if ((cb_db_event || 1118 (flush_bits & (RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 1119 RADV_CMD_FLAG_CS_PARTIAL_FLUSH))) && 1120 !is_mec) { 1121 /* We need to ensure that PFP waits as well. */ 1122 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 1123 radeon_emit(cs, 0); 1124 1125 *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME; 1126 } 1127 1128 if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) { 1129 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1130 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); 1131 } else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) { 1132 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1133 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); 1134 } 1135} 1136 1137void 1138si_cs_emit_cache_flush(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t *flush_cnt, 1139 uint64_t flush_va, bool is_mec, enum radv_cmd_flush_bits flush_bits, 1140 enum rgp_flush_bits *sqtt_flush_bits, uint64_t gfx9_eop_bug_va) 1141{ 1142 unsigned cp_coher_cntl = 0; 1143 uint32_t flush_cb_db = 1144 flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB); 1145 1146 if (chip_class >= GFX10) { 1147 /* GFX10 cache flush handling is quite different. 
*/ 1148 gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va, is_mec, flush_bits, 1149 sqtt_flush_bits, gfx9_eop_bug_va); 1150 return; 1151 } 1152 1153 if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) { 1154 cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); 1155 *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE; 1156 } 1157 if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) { 1158 cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); 1159 *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0; 1160 } 1161 1162 if (chip_class <= GFX8) { 1163 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { 1164 cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | 1165 S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | 1166 S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | 1167 S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | 1168 S_0085F0_CB7_DEST_BASE_ENA(1); 1169 1170 /* Necessary for DCC */ 1171 if (chip_class >= GFX8) { 1172 si_cs_emit_write_event_eop(cs, chip_class, is_mec, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, 1173 EOP_DST_SEL_MEM, EOP_DATA_SEL_DISCARD, 0, 0, 1174 gfx9_eop_bug_va); 1175 } 1176 1177 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB; 1178 } 1179 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { 1180 cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); 1181 1182 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; 1183 } 1184 } 1185 1186 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { 1187 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1188 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); 1189 1190 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB; 1191 } 1192 1193 if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) { 1194 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1195 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); 1196 1197 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; 1198 } 1199 
1200 if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { 1201 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1202 radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1203 1204 *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH; 1205 } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) { 1206 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1207 radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1208 1209 *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH; 1210 } 1211 1212 if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) { 1213 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1214 radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1215 1216 *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH; 1217 } 1218 1219 if (chip_class == GFX9 && flush_cb_db) { 1220 unsigned cb_db_event, tc_flags; 1221 1222 /* Set the CB/DB flush event. */ 1223 cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; 1224 1225 /* These are the only allowed combinations. If you need to 1226 * do multiple operations at once, do them separately. 1227 * All operations that invalidate L2 also seem to invalidate 1228 * metadata. Volatile (VOL) and WC flushes are not listed here. 1229 * 1230 * TC | TC_WB = writeback & invalidate L2 & L1 1231 * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC 1232 * TC_WB | TC_NC = writeback L2 for MTYPE == NC 1233 * TC | TC_NC = invalidate L2 for MTYPE == NC 1234 * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) 1235 * TCL1 = invalidate L1 1236 */ 1237 tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; 1238 1239 *sqtt_flush_bits |= 1240 RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB | RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB; 1241 1242 /* Ideally flush TC together with CB/DB. */ 1243 if (flush_bits & RADV_CMD_FLAG_INV_L2) { 1244 /* Writeback and invalidate everything in L2 & L1. */ 1245 tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; 1246 1247 /* Clear the flags. 
*/ 1248 flush_bits &= ~(RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_WB_L2 | RADV_CMD_FLAG_INV_VCACHE); 1249 1250 *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2; 1251 } 1252 1253 assert(flush_cnt); 1254 (*flush_cnt)++; 1255 1256 si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, EOP_DST_SEL_MEM, 1257 EOP_DATA_SEL_VALUE_32BIT, flush_va, *flush_cnt, gfx9_eop_bug_va); 1258 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, flush_va, *flush_cnt, 0xffffffff); 1259 } 1260 1261 /* VGT state sync */ 1262 if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { 1263 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1264 radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); 1265 } 1266 1267 /* VGT streamout state sync */ 1268 if (flush_bits & RADV_CMD_FLAG_VGT_STREAMOUT_SYNC) { 1269 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1270 radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); 1271 } 1272 1273 /* Make sure ME is idle (it executes most packets) before continuing. 1274 * This prevents read-after-write hazards between PFP and ME. 1275 */ 1276 if ((cp_coher_cntl || (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | 1277 RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_WB_L2))) && 1278 !is_mec) { 1279 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 1280 radeon_emit(cs, 0); 1281 1282 *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME; 1283 } 1284 1285 if ((flush_bits & RADV_CMD_FLAG_INV_L2) || 1286 (chip_class <= GFX7 && (flush_bits & RADV_CMD_FLAG_WB_L2))) { 1287 si_emit_acquire_mem(cs, is_mec, chip_class == GFX9, 1288 cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | 1289 S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8)); 1290 cp_coher_cntl = 0; 1291 1292 *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0; 1293 } else { 1294 if (flush_bits & RADV_CMD_FLAG_WB_L2) { 1295 /* WB = write-back 1296 * NC = apply to non-coherent MTYPEs 1297 * (i.e. 
MTYPE <= 1, which is what we use everywhere) 1298 * 1299 * WB doesn't work without NC. 1300 */ 1301 si_emit_acquire_mem( 1302 cs, is_mec, chip_class == GFX9, 1303 cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); 1304 cp_coher_cntl = 0; 1305 1306 *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0; 1307 } 1308 if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) { 1309 si_emit_acquire_mem(cs, is_mec, chip_class == GFX9, 1310 cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); 1311 cp_coher_cntl = 0; 1312 1313 *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0; 1314 } 1315 } 1316 1317 /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. 1318 * Therefore, it should be last. Done in PFP. 1319 */ 1320 if (cp_coher_cntl) 1321 si_emit_acquire_mem(cs, is_mec, chip_class == GFX9, cp_coher_cntl); 1322 1323 if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) { 1324 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1325 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); 1326 } else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) { 1327 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1328 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); 1329 } 1330} 1331 1332void 1333si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) 1334{ 1335 bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE; 1336 1337 if (is_compute) 1338 cmd_buffer->state.flush_bits &= 1339 ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | 1340 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | 1341 RADV_CMD_FLAG_INV_L2_METADATA | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 1342 RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_VGT_FLUSH | 1343 RADV_CMD_FLAG_START_PIPELINE_STATS | RADV_CMD_FLAG_STOP_PIPELINE_STATS); 1344 1345 if (!cmd_buffer->state.flush_bits) { 1346 radv_describe_barrier_end_delayed(cmd_buffer); 1347 return; 1348 } 1349 1350 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); 1351 1352 si_cs_emit_cache_flush(cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.chip_class, 1353 &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va, 1354 radv_cmd_buffer_uses_mec(cmd_buffer), cmd_buffer->state.flush_bits, 1355 &cmd_buffer->state.sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va); 1356 1357 if (unlikely(cmd_buffer->device->trace_bo)) 1358 radv_cmd_buffer_trace_emit(cmd_buffer); 1359 1360 if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2) 1361 cmd_buffer->state.rb_noncoherent_dirty = false; 1362 1363 /* Clear the caches that have been flushed to avoid syncing too much 1364 * when there is some pending active queries. 1365 */ 1366 cmd_buffer->active_query_flush_bits &= ~cmd_buffer->state.flush_bits; 1367 1368 cmd_buffer->state.flush_bits = 0; 1369 1370 /* If the driver used a compute shader for resetting a query pool, it 1371 * should be finished at this point. 1372 */ 1373 cmd_buffer->pending_reset_query = false; 1374 1375 radv_describe_barrier_end_delayed(cmd_buffer); 1376} 1377 1378/* sets the CP predication state using a boolean stored at va */ 1379void 1380si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, bool draw_visible, 1381 unsigned pred_op, uint64_t va) 1382{ 1383 uint32_t op = 0; 1384 1385 if (va) { 1386 assert(pred_op == PREDICATION_OP_BOOL32 || pred_op == PREDICATION_OP_BOOL64); 1387 1388 op = PRED_OP(pred_op); 1389 1390 /* PREDICATION_DRAW_VISIBLE means that if the 32-bit value is 1391 * zero, all rendering commands are discarded. Otherwise, they 1392 * are discarded if the value is non zero. 1393 */ 1394 op |= draw_visible ? 
PREDICATION_DRAW_VISIBLE : PREDICATION_DRAW_NOT_VISIBLE; 1395 } 1396 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { 1397 radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); 1398 radeon_emit(cmd_buffer->cs, op); 1399 radeon_emit(cmd_buffer->cs, va); 1400 radeon_emit(cmd_buffer->cs, va >> 32); 1401 } else { 1402 radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); 1403 radeon_emit(cmd_buffer->cs, va); 1404 radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF)); 1405 } 1406} 1407 1408/* Set this if you want the 3D engine to wait until CP DMA is done. 1409 * It should be set on the last CP DMA packet. */ 1410#define CP_DMA_SYNC (1 << 0) 1411 1412/* Set this if the source data was used as a destination in a previous CP DMA 1413 * packet. It's for preventing a read-after-write (RAW) hazard between two 1414 * CP DMA packets. */ 1415#define CP_DMA_RAW_WAIT (1 << 1) 1416#define CP_DMA_USE_L2 (1 << 2) 1417#define CP_DMA_CLEAR (1 << 3) 1418 1419/* Alignment for optimal performance. */ 1420#define SI_CPDMA_ALIGNMENT 32 1421 1422/* The max number of bytes that can be copied per packet. */ 1423static inline unsigned 1424cp_dma_max_byte_count(struct radv_cmd_buffer *cmd_buffer) 1425{ 1426 unsigned max = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 1427 ? S_415_BYTE_COUNT_GFX9(~0u) 1428 : S_415_BYTE_COUNT_GFX6(~0u); 1429 1430 /* make it aligned for optimal performance */ 1431 return max & ~(SI_CPDMA_ALIGNMENT - 1); 1432} 1433 1434/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear 1435 * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit 1436 * clear value. 
1437 */ 1438static void 1439si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer, uint64_t dst_va, uint64_t src_va, unsigned size, 1440 unsigned flags) 1441{ 1442 struct radeon_cmdbuf *cs = cmd_buffer->cs; 1443 uint32_t header = 0, command = 0; 1444 1445 assert(size <= cp_dma_max_byte_count(cmd_buffer)); 1446 1447 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9); 1448 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) 1449 command |= S_415_BYTE_COUNT_GFX9(size); 1450 else 1451 command |= S_415_BYTE_COUNT_GFX6(size); 1452 1453 /* Sync flags. */ 1454 if (flags & CP_DMA_SYNC) 1455 header |= S_411_CP_SYNC(1); 1456 else { 1457 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) 1458 command |= S_415_DISABLE_WR_CONFIRM_GFX9(1); 1459 else 1460 command |= S_415_DISABLE_WR_CONFIRM_GFX6(1); 1461 } 1462 1463 if (flags & CP_DMA_RAW_WAIT) 1464 command |= S_415_RAW_WAIT(1); 1465 1466 /* Src and dst flags. */ 1467 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 && 1468 !(flags & CP_DMA_CLEAR) && src_va == dst_va) 1469 header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ 1470 else if (flags & CP_DMA_USE_L2) 1471 header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2); 1472 1473 if (flags & CP_DMA_CLEAR) 1474 header |= S_411_SRC_SEL(V_411_DATA); 1475 else if (flags & CP_DMA_USE_L2) 1476 header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); 1477 1478 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) { 1479 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, cmd_buffer->state.predicating)); 1480 radeon_emit(cs, header); 1481 radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ 1482 radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ 1483 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ 1484 radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ 1485 radeon_emit(cs, command); 1486 } else { 1487 assert(!(flags & CP_DMA_USE_L2)); 1488 header |= S_411_SRC_ADDR_HI(src_va >> 32); 1489 radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 
cmd_buffer->state.predicating)); 1490 radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ 1491 radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ 1492 radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ 1493 radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ 1494 radeon_emit(cs, command); 1495 } 1496 1497 /* CP DMA is executed in ME, but index buffers are read by PFP. 1498 * This ensures that ME (CP DMA) is idle before PFP starts fetching 1499 * indices. If we wanted to execute CP DMA in PFP, this packet 1500 * should precede it. 1501 */ 1502 if (flags & CP_DMA_SYNC) { 1503 if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) { 1504 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 1505 radeon_emit(cs, 0); 1506 } 1507 1508 /* CP will see the sync flag and wait for all DMAs to complete. */ 1509 cmd_buffer->state.dma_is_busy = false; 1510 } 1511 1512 if (unlikely(cmd_buffer->device->trace_bo)) 1513 radv_cmd_buffer_trace_emit(cmd_buffer); 1514} 1515 1516void 1517si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va, unsigned size) 1518{ 1519 uint64_t aligned_va = va & ~(SI_CPDMA_ALIGNMENT - 1); 1520 uint64_t aligned_size = 1521 ((va + size + SI_CPDMA_ALIGNMENT - 1) & ~(SI_CPDMA_ALIGNMENT - 1)) - aligned_va; 1522 1523 si_emit_cp_dma(cmd_buffer, aligned_va, aligned_va, aligned_size, CP_DMA_USE_L2); 1524} 1525 1526static void 1527si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count, uint64_t remaining_size, 1528 unsigned *flags) 1529{ 1530 1531 /* Flush the caches for the first copy only. 1532 * Also wait for the previous CP DMA operations. 1533 */ 1534 if (cmd_buffer->state.flush_bits) { 1535 si_emit_cache_flush(cmd_buffer); 1536 *flags |= CP_DMA_RAW_WAIT; 1537 } 1538 1539 /* Do the synchronization after the last dma, so that all data 1540 * is written to memory. 
    */
   if (byte_count == remaining_size)
      *flags |= CP_DMA_SYNC;
}

/* Work around the CP DMA engine's internal-counter alignment requirement by
 * emitting a dummy copy of "size" bytes (size < SI_CPDMA_ALIGNMENT) between
 * the two halves of a freshly allocated 2 * SI_CPDMA_ALIGNMENT upload-buffer
 * slot. The copied data is never read; only the engine realignment matters.
 */
static void
si_cp_dma_realign_engine(struct radv_cmd_buffer *cmd_buffer, unsigned size)
{
   uint64_t va;
   uint32_t offset;
   unsigned dma_flags = 0;
   unsigned buf_size = SI_CPDMA_ALIGNMENT * 2;
   void *ptr;

   assert(size < SI_CPDMA_ALIGNMENT);

   /* Scratch allocation: dst = va (first half), src = va + SI_CPDMA_ALIGNMENT
    * (second half).
    */
   radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, &offset, &ptr);

   va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   va += offset;

   si_cp_dma_prepare(cmd_buffer, size, size, &dma_flags);

   si_emit_cp_dma(cmd_buffer, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags);
}

/* Copy "size" bytes from src_va to dest_va with the CP DMA engine, splitting
 * the transfer into chunks of at most cp_dma_max_byte_count() bytes and
 * applying the source-alignment workarounds that pre-GFX9 parts need.
 */
void
si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dest_va,
                      uint64_t size)
{
   uint64_t main_src_va, main_dest_va;
   uint64_t skipped_size = 0, realign_size = 0;

   /* Assume that we are not going to sync after the last DMA operation. */
   cmd_buffer->state.dma_is_busy = true;

   if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
      /* If the size is not aligned, we must add a dummy copy at the end
       * just to align the internal counter. Otherwise, the DMA engine
       * would slow down by an order of magnitude for following copies.
       */
      if (size % SI_CPDMA_ALIGNMENT)
         realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

      /* If the copy begins unaligned, we must start copying from the next
       * aligned block and the skipped part should be copied after everything
       * else has been copied. Only the src alignment matters, not dst.
       */
      if (src_va % SI_CPDMA_ALIGNMENT) {
         skipped_size = SI_CPDMA_ALIGNMENT - (src_va % SI_CPDMA_ALIGNMENT);
         /* The main part will be skipped if the size is too small. */
         skipped_size = MIN2(skipped_size, size);
         size -= skipped_size;
      }
   }
   main_src_va = src_va + skipped_size;
   main_dest_va = dest_va + skipped_size;

   while (size) {
      unsigned dma_flags = 0;
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));

      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
         /* DMA operations via L2 are coherent and faster.
          * TODO: GFX7-GFX8 should also support this but it
          * requires tests/benchmarks.
          *
          * Also enable on GFX9 so we can use L2 at rest on GFX9+. On Raven
          * this didn't seem to be worse.
          *
          * Note that we only use CP DMA for sizes < RADV_BUFFER_OPS_CS_THRESHOLD,
          * which is 4k at the moment, so this is really unlikely to cause
          * significant thrashing.
          */
         dma_flags |= CP_DMA_USE_L2;
      }

      si_cp_dma_prepare(cmd_buffer, byte_count, size + skipped_size + realign_size, &dma_flags);

      /* Never sync in the main loop: dma_is_busy was set above, and the
       * actual wait is deferred to si_cp_dma_wait_for_idle().
       */
      dma_flags &= ~CP_DMA_SYNC;

      si_emit_cp_dma(cmd_buffer, main_dest_va, main_src_va, byte_count, dma_flags);

      size -= byte_count;
      main_src_va += byte_count;
      main_dest_va += byte_count;
   }

   /* Copy the skipped (unaligned leading) part last. */
   if (skipped_size) {
      unsigned dma_flags = 0;

      si_cp_dma_prepare(cmd_buffer, skipped_size, size + skipped_size + realign_size, &dma_flags);

      si_emit_cp_dma(cmd_buffer, dest_va, src_va, skipped_size, dma_flags);
   }
   /* Dummy trailing copy to realign the DMA engine's internal counter. */
   if (realign_size)
      si_cp_dma_realign_engine(cmd_buffer, realign_size);
}

/* Fill "size" bytes at "va" with the 32-bit pattern "value" using CP DMA
 * clear packets. Both va and size must be 4-byte aligned.
 */
void
si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t size,
                       unsigned value)
{

   if (!size)
      return;

   assert(va % 4 == 0 && size % 4 == 0);

   /* Assume that we are not going to sync after the last DMA operation. */
   cmd_buffer->state.dma_is_busy = true;

   while (size) {
      unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
      unsigned dma_flags = CP_DMA_CLEAR;

      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
         /* DMA operations via L2 are coherent and faster.
          * TODO: GFX7-GFX8 should also support this but it
          * requires tests/benchmarks.
          *
          * Also enable on GFX9 so we can use L2 at rest on GFX9+.
          */
         dma_flags |= CP_DMA_USE_L2;
      }

      si_cp_dma_prepare(cmd_buffer, byte_count, size, &dma_flags);

      /* Emit the clear packet. For clears, "value" takes the place of the
       * source address in si_emit_cp_dma().
       */
      si_emit_cp_dma(cmd_buffer, va, value, byte_count, dma_flags);

      size -= byte_count;
      va += byte_count;
   }
}

/* Make the CP wait for all outstanding CP DMA operations to complete, if any
 * were issued since the last wait.
 */
void
si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
{
   /* NOTE(review): GFX6 returns early here, so the zero-byte sync below is
    * only used on GFX7+ — presumably GFX6 synchronizes via another path;
    * confirm against the GFX6 flush code.
    */
   if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX7)
      return;

   if (!cmd_buffer->state.dma_is_busy)
      return;

   /* Issue a dummy DMA that copies zero bytes.
    *
    * The DMA engine will see that there's no work to do and skip this
    * DMA request, however, the CP will see the sync flag and still wait
    * for all DMAs to complete.
    */
   si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);

   cmd_buffer->state.dma_is_busy = false;
}

/* For MSAA sample positions. Packs four (x, y) sample offsets, each a signed
 * 4-bit value in 1/16th-pixel units, into one 32-bit register value.
 */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                          \
   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) |   \
    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) |                                \
    (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))

/* For obtaining location coordinates from registers. */
/* Sign-extend a 4-bit two's-complement field to int. */
#define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
/* Extract the signed 4-bit field at nibble "index" of "reg". */
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
/* Each dword of "reg" holds four (x, y) nibble pairs; pick sample "index". */
#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

/* 1x MSAA */
static const uint32_t sample_locs_1x = FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0);
static const unsigned max_dist_1x = 0;
static const uint64_t centroid_priority_1x = 0x0000000000000000ull;

/* 2xMSAA */
static const uint32_t sample_locs_2x = FILL_SREG(4, 4, -4, -4, 0, 0, 0, 0);
static const unsigned max_dist_2x = 4;
static const uint64_t centroid_priority_2x = 0x1010101010101010ull;

/* 4xMSAA */
static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6);
static const unsigned max_dist_4x = 6;
static const uint64_t centroid_priority_4x = 0x3210321032103210ull;

/* 8xMSAA */
static const uint32_t sample_locs_8x[] = {
   FILL_SREG(1, -3, -1, 3, 5, 1, -3, -5),
   FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
   /* The following are unused by hardware, but we emit them to IBs
    * instead of multiple SET_CONTEXT_REG packets.
    */
   0,
   0,
};
static const unsigned max_dist_8x = 7;
static const uint64_t centroid_priority_8x = 0x7654321076543210ull;

/* Return the maximum sample distance (in 1/16th-pixel units) of the default
 * sample layout with 2^log_samples samples. log_samples must be in [0, 3];
 * out-of-range values index past the table.
 */
unsigned
radv_get_default_max_sample_dist(int log_samples)
{
   unsigned max_dist[] = {
      max_dist_1x,
      max_dist_2x,
      max_dist_4x,
      max_dist_8x,
   };
   return max_dist[log_samples];
}

/* Emit the centroid priorities and the default sample locations for
 * nr_samples (1, 2, 4 or 8; anything else falls back to 1) into the command
 * stream. All four pixel quadrants get the same locations.
 */
void
radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
{
   switch (nr_samples) {
   default:
   case 1:
      radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
      radeon_emit(cs, (uint32_t)centroid_priority_1x);
      radeon_emit(cs, centroid_priority_1x >> 32);
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_1x);
      break;
   case 2:
      radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
      radeon_emit(cs, (uint32_t)centroid_priority_2x);
      radeon_emit(cs, centroid_priority_2x >> 32);
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x);
      break;
   case 4:
      radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
      radeon_emit(cs, (uint32_t)centroid_priority_4x);
      radeon_emit(cs, centroid_priority_4x >> 32);
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x);
      break;
   case 8:
      radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
      radeon_emit(cs, (uint32_t)centroid_priority_8x);
      radeon_emit(cs, centroid_priority_8x >> 32);
      /* One 14-dword write (3 * 4 + 2) instead of four separate
       * SET_CONTEXT_REG packets; the two trailing zero dwords of
       * sample_locs_8x pad the quadrants.
       */
      radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 2);
      break;
   }
}

/* Decode the sample_index-th default sample location for sample_count samples
 * (1, 2, 4 or 8; anything else falls back to 1x) from the packed tables above,
 * converting the signed 1/16th-pixel offsets in [-8, 7] to floats in [0, 1).
 * "device" is unused.
 */
static void
radv_get_sample_position(struct radv_device *device, unsigned sample_count, unsigned sample_index,
                         float *out_value)
{
   const uint32_t *sample_locs;

   switch (sample_count) {
   case 1:
   default:
      sample_locs = &sample_locs_1x;
      break;
   case 2:
      sample_locs = &sample_locs_2x;
      break;
   case 4:
      sample_locs = &sample_locs_4x;
      break;
   case 8:
      sample_locs = sample_locs_8x;
      break;
   }

   out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
   out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
}

/* Precompute the per-sample-count sample-position tables cached on the
 * device (sample_locations_{1x,2x,4x,8x}).
 */
void
radv_device_init_msaa(struct radv_device *device)
{
   int i;

   radv_get_sample_position(device, 1, 0, device->sample_locations_1x[0]);

   for (i = 0; i < 2; i++)
      radv_get_sample_position(device, 2, i, device->sample_locations_2x[i]);
   for (i = 0; i < 4; i++)
      radv_get_sample_position(device, 4, i, device->sample_locations_4x[i]);
   for (i = 0; i < 8; i++)
      radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
}