si_cmd_buffer.c revision b8e80941
1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based on si_state.c 6 * Copyright © 2015 Advanced Micro Devices, Inc. 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 
 */

/* command buffer handling for SI */

#include "radv_private.h"
#include "radv_shader.h"
#include "radv_cs.h"
#include "sid.h"
#include "gfx9d.h"
#include "radv_util.h"
#include "main/macros.h"

/* Program the raster config for chips with harvested (disabled) render
 * backends: ac_get_harvested_configs() computes a per-shader-engine
 * PA_SC_RASTER_CONFIG value, which is written while steering register
 * writes to each SE via GRBM_GFX_INDEX, after which broadcast mode is
 * restored. PA_SC_RASTER_CONFIG_1 is only written on CI+.
 */
static void
si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
				  struct radeon_cmdbuf *cs,
				  unsigned raster_config,
				  unsigned raster_config_1)
{
	unsigned num_se = MAX2(physical_device->rad_info.max_se, 1);
	unsigned raster_config_se[4];
	unsigned se;

	ac_get_harvested_configs(&physical_device->rad_info,
				 raster_config,
				 &raster_config_1,
				 raster_config_se);

	for (se = 0; se < num_se; se++) {
		/* GRBM_GFX_INDEX has a different offset on SI and CI+ */
		if (physical_device->rad_info.chip_class < CIK)
			radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
					      S_00802C_SE_INDEX(se) |
					      S_00802C_SH_BROADCAST_WRITES(1) |
					      S_00802C_INSTANCE_BROADCAST_WRITES(1));
		else
			radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
					       S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
					       S_030800_INSTANCE_BROADCAST_WRITES(1));
		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
	}

	/* Restore broadcast writes to all SEs/SHs/instances.
	 * GRBM_GFX_INDEX has a different offset on SI and CI+ */
	if (physical_device->rad_info.chip_class < CIK)
		radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
				      S_00802C_SE_BROADCAST_WRITES(1) |
				      S_00802C_SH_BROADCAST_WRITES(1) |
				      S_00802C_INSTANCE_BROADCAST_WRITES(1));
	else
		radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
				       S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
				       S_030800_INSTANCE_BROADCAST_WRITES(1));

	if (physical_device->rad_info.chip_class >= CIK)
		radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
}

/* Emit the invariant compute-state preamble: zero the dispatch start
 * offsets, enable all CUs for compute on every SE, and set the legacy
 * COMPUTE_MAX_WAVE_ID on SI.
 */
void
si_emit_compute(struct radv_physical_device *physical_device,
		struct radeon_cmdbuf *cs)
{
	/* COMPUTE_START_X/Y/Z = 0 */
	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
	radeon_emit(cs,
		    0);
	radeon_emit(cs, 0);
	radeon_emit(cs, 0);

	/* Enable all CUs (two shader arrays per SE) for compute. */
	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));

	if (physical_device->rad_info.chip_class >= CIK) {
		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(cs,
				      R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
		radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
				S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
				S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (physical_device->rad_info.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */

		radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
				  0x190 /* Default value */);
	}
}

/* Convert x to unsigned 12.4 fixed point, clamped to [0, 0xffff]
 * (i.e. [0.0, 4096.0) in real values).
 */
static unsigned radv_pack_float_12p4(float x)
{
	return x <= 0 ? 0 :
	       x >= 4096 ? 0xffff : x * 16;
}

/* Emit PA_SC_RASTER_CONFIG (and _1 on CI+). Uses the default config
 * from ac_get_raster_config() when all render backends are enabled,
 * otherwise falls back to the harvested per-SE path.
 */
static void
si_set_raster_config(struct radv_physical_device *physical_device,
		     struct radeon_cmdbuf *cs)
{
	unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16);
	unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
	unsigned raster_config, raster_config_1;

	ac_get_raster_config(&physical_device->rad_info,
			     &raster_config,
			     &raster_config_1, NULL);

	/* Always use the default config when all backends are enabled
	 * (or when we failed to determine the enabled backends).
	 */
	if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG,
				       raster_config);
		if (physical_device->rad_info.chip_class >= CIK)
			radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1,
					       raster_config_1);
	} else {
		si_write_harvested_raster_configs(physical_device, cs,
						  raster_config,
						  raster_config_1);
	}
}

/* Emit the invariant graphics-state preamble (context/uconfig/SH
 * registers that never change at draw time). Recorded once into the
 * gfx_init IB by cik_create_gfx_config(). Ends by emitting the compute
 * preamble as well.
 */
void
si_emit_graphics(struct radv_physical_device *physical_device,
		 struct radeon_cmdbuf *cs)
{
	int i;

	/* Only SI can disable CLEAR_STATE for now. */
	assert(physical_device->has_clear_state ||
	       physical_device->rad_info.chip_class == SI);

	radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1));
	radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1));

	if (physical_device->has_clear_state) {
		radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0));
		radeon_emit(cs, 0);
	}

	if (physical_device->rad_info.chip_class <= VI)
		si_set_raster_config(physical_device, cs);

	radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
	if (!physical_device->has_clear_state)
		radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));

	/* FIXME calculate these values somehow ??? */
	if (physical_device->rad_info.chip_class <= VI) {
		radeon_set_context_reg(cs, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
		radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40);
	}

	if (!physical_device->has_clear_state) {
		radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2);
		radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
		radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
	}

	radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
	if (!physical_device->has_clear_state)
		radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0);
	if (physical_device->rad_info.chip_class < CIK)
		radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
				      S_008A14_CLIP_VTX_REORDER_ENA(1));

	radeon_set_context_reg(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210);
	radeon_set_context_reg(cs, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98);

	if (!physical_device->has_clear_state)
		radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);

	/* CLEAR_STATE doesn't clear these correctly on certain generations.
	 * I don't know why. Deduced by trial and error.
	 */
	if (physical_device->rad_info.chip_class <= CIK) {
		radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
		radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
				       S_028204_WINDOW_OFFSET_DISABLE(1));
		radeon_set_context_reg(cs, R_028240_PA_SC_GENERIC_SCISSOR_TL,
				       S_028240_WINDOW_OFFSET_DISABLE(1));
		radeon_set_context_reg(cs, R_028244_PA_SC_GENERIC_SCISSOR_BR,
				       S_028244_BR_X(16384) | S_028244_BR_Y(16384));
		radeon_set_context_reg(cs, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
		radeon_set_context_reg(cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
				       S_028034_BR_X(16384) | S_028034_BR_Y(16384));
	}

	if (!physical_device->has_clear_state) {
		/* Default per-viewport depth ranges. */
		for (i = 0; i < 16; i++) {
			radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0);
			radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0));
		}
	}

	if (!physical_device->has_clear_state) {
		radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF);
		radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA);
		/* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */
		radeon_set_context_reg(cs, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
		radeon_set_context_reg(cs, R_028820_PA_CL_NANINF_CNTL, 0);
		radeon_set_context_reg(cs, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
		radeon_set_context_reg(cs, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
		radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
	}

	radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE,
			       S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
			       S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE));

	if (physical_device->rad_info.chip_class >= GFX9) {
		radeon_set_uconfig_reg(cs, R_030920_VGT_MAX_VTX_INDX, ~0);
		radeon_set_uconfig_reg(cs, R_030924_VGT_MIN_VTX_INDX, 0);
		radeon_set_uconfig_reg(cs, R_030928_VGT_INDX_OFFSET, 0);
	} else {
		/* These registers, when written, also overwrite the
		 * CLEAR_STATE context, so we can't rely on CLEAR_STATE setting
		 * them. It would be an issue if there was another UMD
		 * changing them.
		 */
		radeon_set_context_reg(cs, R_028400_VGT_MAX_VTX_INDX, ~0);
		radeon_set_context_reg(cs, R_028404_VGT_MIN_VTX_INDX, 0);
		radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0);
	}

	if (physical_device->rad_info.chip_class >= CIK) {
		/* Per-stage CU masks and wave limits. */
		if (physical_device->rad_info.chip_class >= GFX9) {
			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
					  S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
		} else {
			radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
					  S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
					  S_00B41C_WAVE_LIMIT(0x3F));
			radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
					  S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
			/* If this is 0, Bonaire can hang even if GS isn't being used.
			 * Other chips are unaffected. These are suboptimal values,
			 * but we don't use on-chip GS.
			 */
			radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL,
					       S_028A44_ES_VERTS_PER_SUBGRP(64) |
					       S_028A44_GS_PRIMS_PER_SUBGRP(4));
		}
		radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
				  S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));

		if (physical_device->rad_info.num_good_cu_per_sh <= 4) {
			/* Too few available compute units per SH. Disallowing
			 * VS to run on CU0 could hurt us more than late VS
			 * allocation would help.
			 *
			 * LATE_ALLOC_VS = 2 is the highest safe number.
			 */
			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
					  S_00B118_CU_EN(0xffff) | S_00B118_WAVE_LIMIT(0x3F) );
			radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2));
		} else {
			/* Set LATE_ALLOC_VS == 31. It should be less than
			 * the number of scratch waves. Limitations:
			 * - VS can't execute on CU0.
			 * - If HS writes outputs to LDS, LS can't execute on CU0.
			 */
			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
					  S_00B118_CU_EN(0xfffe) | S_00B118_WAVE_LIMIT(0x3F));
			radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31));
		}

		radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
				  S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
	}

	if (physical_device->rad_info.chip_class >= VI) {
		uint32_t vgt_tess_distribution;

		vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) |
			S_028B50_ACCUM_TRI(11) |
			S_028B50_ACCUM_QUAD(11) |
			S_028B50_DONUT_SPLIT(16);

		if (physical_device->rad_info.family == CHIP_FIJI ||
		    physical_device->rad_info.family >= CHIP_POLARIS10)
			vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);

		radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION,
				       vgt_tess_distribution);
	} else if (!physical_device->has_clear_state) {
		radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
		radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
	}

	if (physical_device->rad_info.chip_class >= GFX9) {
		unsigned num_se = physical_device->rad_info.max_se;
		/* pc_lines = number of parameter-cache lines; per-family,
		 * any new GFX9 family must be added here. */
		unsigned pc_lines = 0;

		switch (physical_device->rad_info.family) {
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_VEGA20:
			pc_lines = 4096;
			break;
		case CHIP_RAVEN:
		case CHIP_RAVEN2:
			pc_lines = 1024;
			break;
		default:
			assert(0);
		}

		radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
				       S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
				       S_028C48_MAX_PRIM_PER_BATCH(1023));
		radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
				       S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
		radeon_set_uconfig_reg(cs, R_030968_VGT_INSTANCE_BASE_ID, 0);
	}

	/* NOTE(review): 1.0 * 8.0 == 8; the point-size registers presumably
	 * use 12.4 fixed point like PA_SU_POINT_MINMAX below — confirm. */
	unsigned tmp = (unsigned)(1.0 * 8.0);
	radeon_set_context_reg_seq(cs, R_028A00_PA_SU_POINT_SIZE, 1);
	radeon_emit(cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
	radeon_set_context_reg_seq(cs, R_028A04_PA_SU_POINT_MINMAX, 1);
	radeon_emit(cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
		    S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2)));

	if (!physical_device->has_clear_state) {
		radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL,
				       S_028004_ZPASS_INCREMENT_DISABLE(1));
	}

	/* Enable the Polaris small primitive filter control.
	 * XXX: There is possibly an issue when MSAA is off (see RadeonSI
	 * has_msaa_sample_loc_bug). But this doesn't seem to regress anything,
	 * and AMDVLK doesn't have a workaround as well.
	 */
	if (physical_device->rad_info.family >= CHIP_POLARIS10) {
		unsigned small_prim_filter_cntl =
			S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
			/* Workaround for a hw line bug. */
			S_028830_LINE_FILTER_DISABLE(physical_device->rad_info.family <= CHIP_POLARIS12);

		radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
				       small_prim_filter_cntl);
	}

	si_emit_compute(physical_device, cs);
}

/* Build the static graphics-init IB: record si_emit_graphics() into a
 * temporary CS, pad it to a multiple of 8 dwords with the appropriate
 * NOP encoding, and copy it into a GPU-readable buffer stored in
 * device->gfx_init (left NULL on any failure). Best-effort: returns
 * silently when allocation fails.
 */
void
cik_create_gfx_config(struct radv_device *device)
{
	struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, RING_GFX);
	if (!cs)
		return;

	si_emit_graphics(device->physical_device, cs);

	/* Pad the IB size to a multiple of 8 dwords. */
	while (cs->cdw & 7) {
		if (device->physical_device->rad_info.gfx_ib_pad_with_type2)
			radeon_emit(cs, 0x80000000);	/* type-2 NOP */
		else
			radeon_emit(cs, 0xffff1000);	/* type-3 NOP */
	}

	device->gfx_init = device->ws->buffer_create(device->ws,
						     cs->cdw * 4, 4096,
						     RADEON_DOMAIN_GTT,
						     RADEON_FLAG_CPU_ACCESS|
						     RADEON_FLAG_NO_INTERPROCESS_SHARING |
						     RADEON_FLAG_READ_ONLY,
						     RADV_BO_PRIORITY_CS);
	if (!device->gfx_init)
		goto fail;

	void *map = device->ws->buffer_map(device->gfx_init);
	if (!map) {
		device->ws->buffer_destroy(device->gfx_init);
		device->gfx_init = NULL;
		goto fail;
	}
	memcpy(map, cs->buf, cs->cdw * 4);

	device->ws->buffer_unmap(device->gfx_init);
	device->gfx_init_size_dw = cs->cdw;
fail:
	device->ws->cs_destroy(cs);
}

/* Compute the viewport transform for one VkViewport:
 * window coords = ndc * scale + translate, per axis.
 * Depth maps [-?] minDepth..maxDepth via scale[2]/translate[2].
 */
static void
get_viewport_xform(const VkViewport *viewport,
		   float scale[3], float translate[3])
{
	float x = viewport->x;
	float y = viewport->y;
	float half_width = 0.5f * viewport->width;
	float half_height = 0.5f * viewport->height;
	double n = viewport->minDepth;
	double f = viewport->maxDepth;

	scale[0] = half_width;
	translate[0] = half_width + x;
	scale[1] = half_height;
	translate[1] = half_height + y;

	scale[2] = (f - n);
	translate[2] = n;
}

/* Emit PA_CL_VPORT_* scale/translate pairs and PA_SC_VPORT_ZMIN/ZMAX
 * for 'count' viewports starting at index 'first_vp'. ZMIN/ZMAX are
 * ordered with MIN2/MAX2 so reversed depth ranges still emit a valid
 * min <= max pair.
 */
void
si_write_viewport(struct radeon_cmdbuf *cs, int first_vp,
		  int count, const VkViewport *viewports)
{
	int i;

	assert(count);
	/* 6 dwords per viewport: xscale/xoffset/yscale/yoffset/zscale/zoffset. */
	radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
				   first_vp * 4 * 6, count * 6);

	for (i = 0; i < count; i++) {
		float scale[3], translate[3];

		get_viewport_xform(&viewports[i], scale, translate);
		radeon_emit(cs, fui(scale[0]));
		radeon_emit(cs, fui(translate[0]));
		radeon_emit(cs, fui(scale[1]));
		radeon_emit(cs, fui(translate[1]));
		radeon_emit(cs, fui(scale[2]));
		radeon_emit(cs, fui(translate[2]));
	}

	/* 2 dwords per viewport: zmin/zmax. */
	radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
				   first_vp * 4 * 2, count * 2);
	for (i = 0; i < count; i++) {
		float zmin = MIN2(viewports[i].minDepth, viewports[i].maxDepth);
		float zmax = MAX2(viewports[i].minDepth, viewports[i].maxDepth);
		radeon_emit(cs, fui(zmin));
		radeon_emit(cs, fui(zmax));
	}
}

/* Smallest integer rect covering the viewport's screen footprint
 * (handles negative-height/width viewports via fabs).
 */
static VkRect2D si_scissor_from_viewport(const VkViewport *viewport)
{
	float scale[3], translate[3];
	VkRect2D rect;

	get_viewport_xform(viewport, scale, translate);

	rect.offset.x = translate[0] - fabs(scale[0]);
	rect.offset.y = translate[1] - fabs(scale[1]);
	rect.extent.width = ceilf(translate[0] + fabs(scale[0])) - rect.offset.x;
	rect.extent.height = ceilf(translate[1] + fabs(scale[1])) - rect.offset.y;

	return rect;
}

/* Intersection of two scissor rects; extent may underflow if the
 * rects are disjoint (callers pass overlapping rects).
 */
static VkRect2D si_intersect_scissor(const VkRect2D *a, const VkRect2D *b) {
	VkRect2D ret;
	ret.offset.x = MAX2(a->offset.x, b->offset.x);
	ret.offset.y = MAX2(a->offset.y, b->offset.y);
	ret.extent.width = MIN2(a->offset.x + a->extent.width,
				b->offset.x + b->extent.width) - ret.offset.x;
	ret.extent.height = MIN2(a->offset.y + a->extent.height,
				 b->offset.y + b->extent.height) - ret.offset.y;
	return ret;
}

/* Emit PA_SC_VPORT_SCISSOR_* for 'count' scissors starting at 'first',
 * each clamped to the matching viewport's footprint, then emit the
 * guardband clip adjust registers derived from the viewport transforms
 * (forced to 1.0 when the pipeline can't use a guardband).
 */
void
si_write_scissors(struct radeon_cmdbuf *cs, int first,
		  int count, const VkRect2D *scissors,
		  const VkViewport *viewports, bool can_use_guardband)
{
	int i;
	float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY;
	/* Max representable window coordinate for guardband purposes. */
	const float max_range = 32767.0f;
	if (!count)
		return;

	radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2);
	for (i = 0; i < count; i++) {
		VkRect2D viewport_scissor = si_scissor_from_viewport(viewports + i);
		VkRect2D scissor = si_intersect_scissor(&scissors[i], &viewport_scissor);

		get_viewport_xform(viewports + i, scale, translate);
		scale[0] = fabsf(scale[0]);
		scale[1] = fabsf(scale[1]);

		/* Avoid dividing by a tiny scale below. */
		if (scale[0] < 0.5)
			scale[0] = 0.5;
		if (scale[1] < 0.5)
			scale[1] = 0.5;

		/* Guardband = how far beyond the viewport geometry may extend
		 * before clipping, limited by the representable coord range. */
		guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]);
		guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]);

		radeon_emit(cs, S_028250_TL_X(scissor.offset.x) |
			    S_028250_TL_Y(scissor.offset.y) |
			    S_028250_WINDOW_OFFSET_DISABLE(1));
		radeon_emit(cs, S_028254_BR_X(scissor.offset.x + scissor.extent.width) |
			    S_028254_BR_Y(scissor.offset.y + scissor.extent.height));
	}
	if (!can_use_guardband) {
		guardband_x = 1.0;
		guardband_y = 1.0;
	}

	/* VERT_CLIP_ADJ, VERT_DISC_ADJ, HORZ_CLIP_ADJ, HORZ_DISC_ADJ. */
	radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
	radeon_emit(cs, fui(guardband_y));
	radeon_emit(cs, fui(1.0));
	radeon_emit(cs, fui(guardband_x));
	radeon_emit(cs, fui(1.0));
}

/* Number of primitives produced by 'num' vertices for a topology
 * described by info (min = vertices for the first prim, incr = extra
 * vertices per additional prim). Returns 0 when there are not enough
 * vertices or the topology has no increment.
 */
static inline unsigned
radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num)
{
	if (num == 0)
		return 0;

	if (info->incr == 0)
		return 0;

	if (num < info->min)
		return 0;

	return 1 + ((num - info->min) / info->incr);
}

/* Compute the IA_MULTI_VGT_PARAM value for a draw, starting from the
 * pipeline's precomputed base and applying per-family hardware
 * workarounds (Hawaii/Bonaire instancing, 4-SE performance hints,
 * stream-output requirements). Side effect: may set
 * RADV_CMD_FLAG_VGT_FLUSH in cmd_buffer->state.flush_bits for the
 * Hawaii GS single-primitive-instance bug.
 */
uint32_t
si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
			  bool instanced_draw, bool indirect_draw,
			  bool count_from_stream_output,
			  uint32_t draw_vertex_count)
{
	enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
	enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
	const unsigned max_primgroup_in_wave = 2;
	/* SWITCH_ON_EOP(0) is always preferable. */
	bool wd_switch_on_eop = false;
	bool ia_switch_on_eop = false;
	bool ia_switch_on_eoi = false;
	bool partial_vs_wave = false;
	bool partial_es_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_es_wave;
	bool multi_instances_smaller_than_primgroup;

	/* Indirect draws are treated as if their instances were always
	 * smaller than a primgroup (the real count is unknown here). */
	multi_instances_smaller_than_primgroup = indirect_draw;
	if (!multi_instances_smaller_than_primgroup && instanced_draw) {
		uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count);
		if (num_prims < cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.primgroup_size)
			multi_instances_smaller_than_primgroup = true;
	}

	ia_switch_on_eoi = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.ia_switch_on_eoi;
	partial_vs_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_vs_wave;

	if (chip_class >= CIK) {
		wd_switch_on_eop =
		    cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.wd_switch_on_eop;

		/* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
		 * We don't know that for indirect drawing, so treat it as
		 * always problematic. */
		if (family == CHIP_HAWAII &&
		    (instanced_draw || indirect_draw))
			wd_switch_on_eop = true;

		/* Performance recommendation for 4 SE Gfx7-8 parts if
		 * instances are smaller than a primgroup.
		 * Assume indirect draws always use small instances.
		 * This is needed for good VS wave utilization.
		 */
		if (chip_class <= VI &&
		    info->max_se == 4 &&
		    multi_instances_smaller_than_primgroup)
			wd_switch_on_eop = true;

		/* Required on CIK and later. */
		if (info->max_se > 2 && !wd_switch_on_eop)
			ia_switch_on_eoi = true;

		/* Required by Hawaii and, for some special cases, by VI. */
		if (ia_switch_on_eoi &&
		    (family == CHIP_HAWAII ||
		     (chip_class == VI &&
		      /* max primgroup in wave is always 2 - leave this for documentation */
		      (radv_pipeline_has_gs(cmd_buffer->state.pipeline) || max_primgroup_in_wave != 2))))
			partial_vs_wave = true;

		/* Instancing bug on Bonaire. */
		if (family == CHIP_BONAIRE && ia_switch_on_eoi &&
		    (instanced_draw || indirect_draw))
			partial_vs_wave = true;

		/* Hardware requirement when drawing primitives from a stream
		 * output buffer.
		 */
		if (count_from_stream_output)
			wd_switch_on_eop = true;

		/* If the WD switch is false, the IA switch must be false too. */
		assert(wd_switch_on_eop || !ia_switch_on_eop);
	}
	/* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
	if (chip_class <= VI && ia_switch_on_eoi)
		partial_es_wave = true;

	if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) {
		/* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
		 * The hw doc says all multi-SE chips are affected, but amdgpu-pro Vulkan
		 * only applies it to Hawaii. Do what amdgpu-pro Vulkan does.
		 */
		if (family == CHIP_HAWAII && ia_switch_on_eoi) {
			bool set_vgt_flush = indirect_draw;
			if (!set_vgt_flush && instanced_draw) {
				uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count);
				if (num_prims <= 1)
					set_vgt_flush = true;
			}
			if (set_vgt_flush)
				cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
		}
	}

	return cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.base |
	       S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
	       S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
	       S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
	       S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
	       S_028AA8_WD_SWITCH_ON_EOP(chip_class >= CIK ? wd_switch_on_eop : 0);

}

/* Emit an end-of-pipe event that (optionally) writes new_fence to va
 * once the event and its cache flushes (event_flags) complete. Uses
 * RELEASE_MEM on GFX9+ and on GFX8 compute (MEC) rings, and
 * EVENT_WRITE_EOP otherwise. gfx9_eop_bug_va is scratch memory for the
 * GFX9 ZPASS_DONE hang workaround.
 */
void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
				enum chip_class chip_class,
				bool is_mec,
				unsigned event, unsigned event_flags,
				unsigned data_sel,
				uint64_t va,
				uint32_t new_fence,
				uint64_t gfx9_eop_bug_va)
{
	unsigned op = EVENT_TYPE(event) |
		      EVENT_INDEX(5) |
		      event_flags;
	unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
	unsigned sel = EOP_DATA_SEL(data_sel);

	/* Wait for write confirmation before writing data, but don't send
	 * an interrupt. */
	if (data_sel != EOP_DATA_SEL_DISCARD)
		sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);

	if (chip_class >= GFX9 || is_gfx8_mec) {
		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
		 * counters) must immediately precede every timestamp event to
		 * prevent a GPU hang on GFX9.
		 */
		if (chip_class == GFX9 && !is_mec) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
			radeon_emit(cs, gfx9_eop_bug_va);
			radeon_emit(cs, gfx9_eop_bug_va >> 32);
		}

		/* RELEASE_MEM is one dword shorter on GFX8 MEC. */
		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ?
				     5 : 6, false));
		radeon_emit(cs, op);
		radeon_emit(cs, sel);
		radeon_emit(cs, va);		/* address lo */
		radeon_emit(cs, va >> 32);	/* address hi */
		radeon_emit(cs, new_fence);	/* immediate data lo */
		radeon_emit(cs, 0); /* immediate data hi */
		if (!is_gfx8_mec)
			radeon_emit(cs, 0); /* unused */
	} else {
		if (chip_class == CIK ||
		    chip_class == VI) {
			/* Two EOP events are required to make all engines go idle
			 * (and optional cache flushes executed) before the timestamp
			 * is written.
			 */
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
			radeon_emit(cs, op);
			radeon_emit(cs, va);
			radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
			radeon_emit(cs, 0); /* immediate data */
			radeon_emit(cs, 0); /* unused */
		}

		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
		radeon_emit(cs, op);
		radeon_emit(cs, va);
		radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
		radeon_emit(cs, new_fence); /* immediate data */
		radeon_emit(cs, 0); /* unused */
	}
}

/* Emit WAIT_REG_MEM: stall the CP until the 32-bit value at memory
 * address va satisfies "(*va & mask) <op> ref". Only the equality and
 * greater-or-equal compare ops are supported by callers.
 */
void
radv_cp_wait_mem(struct radeon_cmdbuf *cs, uint32_t op, uint64_t va,
		 uint32_t ref, uint32_t mask)
{
	assert(op == WAIT_REG_MEM_EQUAL ||
	       op == WAIT_REG_MEM_NOT_EQUAL ||
	       op == WAIT_REG_MEM_GREATER_OR_EQUAL);

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false));
	radeon_emit(cs, op | WAIT_REG_MEM_MEM_SPACE(1)); /* memory space, not register */
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, ref); /* reference value */
	radeon_emit(cs, mask); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}

/* Emit a full-range cache-coherency packet with the given CP_COHER_CNTL
 * flags: ACQUIRE_MEM on compute rings and GFX9+, SURFACE_SYNC otherwise.
 */
static void
si_emit_acquire_mem(struct radeon_cmdbuf *cs,
		    bool is_mec,
		    bool is_gfx9,
		    unsigned cp_coher_cntl)
{
	if (is_mec || is_gfx9) {
		uint32_t hi_val = is_gfx9 ?
		    0xffffff : 0xff;
		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, false) |
			    PKT3_SHADER_TYPE_S(is_mec));
		radeon_emit(cs, cp_coher_cntl);	/* CP_COHER_CNTL */
		radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
		radeon_emit(cs, hi_val);	/* CP_COHER_SIZE_HI */
		radeon_emit(cs, 0);		/* CP_COHER_BASE */
		radeon_emit(cs, 0);		/* CP_COHER_BASE_HI */
		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
	} else {
		/* ACQUIRE_MEM is only required on a compute ring. */
		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, false));
		radeon_emit(cs, cp_coher_cntl);	/* CP_COHER_CNTL */
		radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
		radeon_emit(cs, 0);		/* CP_COHER_BASE */
		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
	}
}

/* Translate the accumulated flush_bits into the PM4 packets that
 * realize them: cache invalidations (CP_COHER_CNTL), shader partial
 * flushes, CB/DB flush-and-invalidate events, VGT syncs and pipeline
 * stat start/stop. On GFX9, flush_cnt/flush_va form a fence used to
 * wait for the combined CB/DB+TC timestamp event; gfx9_eop_bug_va is
 * scratch for the GFX9 EOP hang workaround.
 */
void
si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
		       enum chip_class chip_class,
		       uint32_t *flush_cnt,
		       uint64_t flush_va,
		       bool is_mec,
		       enum radv_cmd_flush_bits flush_bits,
		       uint64_t gfx9_eop_bug_va)
{
	unsigned cp_coher_cntl = 0;
	uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					     RADV_CMD_FLAG_FLUSH_AND_INV_DB);

	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
	if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

	if (chip_class <= VI) {
		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
			cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
				S_0085F0_CB0_DEST_BASE_ENA(1) |
				S_0085F0_CB1_DEST_BASE_ENA(1) |
				S_0085F0_CB2_DEST_BASE_ENA(1) |
				S_0085F0_CB3_DEST_BASE_ENA(1) |
				S_0085F0_CB4_DEST_BASE_ENA(1) |
				S_0085F0_CB5_DEST_BASE_ENA(1) |
				S_0085F0_CB6_DEST_BASE_ENA(1) |
				S_0085F0_CB7_DEST_BASE_ENA(1);

			/* Necessary for DCC */
			if (chip_class >= VI) {
				si_cs_emit_write_event_eop(cs,
							   chip_class,
							   is_mec,
							   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
							   0,
							   EOP_DATA_SEL_DISCARD,
							   0, 0,
							   gfx9_eop_bug_va);
			}
		}
		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
			cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
				S_0085F0_DB_DEST_BASE_ENA(1);
		}
	}

	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
	}

	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
	}

	/* A PS partial flush implies a VS partial flush, so only one of
	 * the two needs to be emitted. */
	if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	}

	if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	}

	if (chip_class >= GFX9 && flush_cb_db) {
		unsigned cb_db_event, tc_flags;

		/* Set the CB/DB flush event. */
		cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;

		/* These are the only allowed combinations. If you need to
		 * do multiple operations at once, do them separately.
		 * All operations that invalidate L2 also seem to invalidate
		 * metadata. Volatile (VOL) and WC flushes are not listed here.
		 *
		 * TC | TC_WB = writeback & invalidate L2 & L1
		 * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
		 * TC_WB | TC_NC = writeback L2 for MTYPE == NC
		 * TC | TC_NC = invalidate L2 for MTYPE == NC
		 * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
		 * TCL1 = invalidate L1
		 */
		tc_flags = EVENT_TC_ACTION_ENA |
			   EVENT_TC_MD_ACTION_ENA;

		/* Ideally flush TC together with CB/DB. */
		if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
			/* Writeback and invalidate everything in L2 & L1. */
			tc_flags = EVENT_TC_ACTION_ENA |
				   EVENT_TC_WB_ACTION_ENA;

			/* Clear the flags (handled by this event now). */
			flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
					RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
					RADV_CMD_FLAG_INV_VMEM_L1);
		}
		assert(flush_cnt);
		(*flush_cnt)++;

		/* Write the incremented fence and wait for it, so the flush
		 * has fully completed before continuing. */
		si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags,
					   EOP_DATA_SEL_VALUE_32BIT,
					   flush_va, *flush_cnt,
					   gfx9_eop_bug_va);
		radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, flush_va,
				 *flush_cnt, 0xffffffff);
	}

	/* VGT state sync */
	if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
	}

	/* VGT streamout state sync */
	if (flush_bits & RADV_CMD_FLAG_VGT_STREAMOUT_SYNC) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
	}

	/* Make sure ME is idle (it executes most packets) before continuing.
	 * This prevents read-after-write hazards between PFP and ME.
	 */
	if ((cp_coher_cntl ||
	     (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
			    RADV_CMD_FLAG_INV_VMEM_L1 |
			    RADV_CMD_FLAG_INV_GLOBAL_L2 |
			    RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
	    !is_mec) {
		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cs, 0);
	}

	if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
	    (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
		si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
				    cp_coher_cntl |
				    S_0085F0_TC_ACTION_ENA(1) |
				    S_0085F0_TCL1_ACTION_ENA(1) |
				    S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
		cp_coher_cntl = 0;
	} else {
		if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
			/* WB = write-back
			 * NC = apply to non-coherent MTYPEs
			 * (i.e. MTYPE <= 1, which is what we use everywhere)
			 *
			 * WB doesn't work without NC.
			 */
			si_emit_acquire_mem(cs, is_mec,
					    chip_class >= GFX9,
					    cp_coher_cntl |
					    S_0301F0_TC_WB_ACTION_ENA(1) |
					    S_0301F0_TC_NC_ACTION_ENA(1));
			cp_coher_cntl = 0;
		}
		if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
			si_emit_acquire_mem(cs, is_mec,
					    chip_class >= GFX9,
					    cp_coher_cntl |
					    S_0085F0_TCL1_ACTION_ENA(1));
			cp_coher_cntl = 0;
		}
	}

	/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
	 * Therefore, it should be last. Done in PFP.
947 */ 948 if (cp_coher_cntl) 949 si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl); 950 951 if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) { 952 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 953 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | 954 EVENT_INDEX(0)); 955 } else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) { 956 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 957 radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | 958 EVENT_INDEX(0)); 959 } 960} 961 962void 963si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) 964{ 965 bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE; 966 967 if (is_compute) 968 cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | 969 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | 970 RADV_CMD_FLAG_FLUSH_AND_INV_DB | 971 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | 972 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 973 RADV_CMD_FLAG_VS_PARTIAL_FLUSH | 974 RADV_CMD_FLAG_VGT_FLUSH | 975 RADV_CMD_FLAG_START_PIPELINE_STATS | 976 RADV_CMD_FLAG_STOP_PIPELINE_STATS); 977 978 if (!cmd_buffer->state.flush_bits) 979 return; 980 981 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); 982 983 si_cs_emit_cache_flush(cmd_buffer->cs, 984 cmd_buffer->device->physical_device->rad_info.chip_class, 985 &cmd_buffer->gfx9_fence_idx, 986 cmd_buffer->gfx9_fence_va, 987 radv_cmd_buffer_uses_mec(cmd_buffer), 988 cmd_buffer->state.flush_bits, 989 cmd_buffer->gfx9_eop_bug_va); 990 991 992 if (unlikely(cmd_buffer->device->trace_bo)) 993 radv_cmd_buffer_trace_emit(cmd_buffer); 994 995 cmd_buffer->state.flush_bits = 0; 996 997 /* If the driver used a compute shader for resetting a query pool, it 998 * should be finished at this point. 
	 */
	cmd_buffer->pending_reset_query = false;
}

/* Sets the CP predication state using a boolean stored at va.
 *
 * When va is 0, predication is disabled. Otherwise the CP evaluates a
 * 64-bit boolean at va and discards (or keeps) subsequent rendering
 * commands according to draw_visible.
 */
void
si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
			      bool draw_visible, uint64_t va)
{
	uint32_t op = 0;

	if (va) {
		op = PRED_OP(PREDICATION_OP_BOOL64);

		/* PREDICATION_DRAW_VISIBLE means that if the 32-bit value is
		 * zero, all rendering commands are discarded. Otherwise, they
		 * are discarded if the value is non zero.
		 */
		op |= draw_visible ? PREDICATION_DRAW_VISIBLE :
				     PREDICATION_DRAW_NOT_VISIBLE;
	}
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		/* GFX9+ payload: op dword followed by the full 64-bit address. */
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
		radeon_emit(cmd_buffer->cs, op);
		radeon_emit(cmd_buffer->cs, va);
		radeon_emit(cmd_buffer->cs, va >> 32);
	} else {
		/* Pre-GFX9 packs the high 8 address bits into the op dword. */
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
		radeon_emit(cmd_buffer->cs, va);
		radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF));
	}
}

/* Set this if you want the 3D engine to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define CP_DMA_SYNC	(1 << 0)

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define CP_DMA_RAW_WAIT	(1 << 1)
/* Route the transfer through L2 (only valid with the CIK+ DMA_DATA packet;
 * see the assert in si_emit_cp_dma). */
#define CP_DMA_USE_L2	(1 << 2)
/* Treat src_va as a 32-bit clear value instead of a source address. */
#define CP_DMA_CLEAR	(1 << 3)

/* Alignment for optimal performance. */
#define SI_CPDMA_ALIGNMENT	32

/* The max number of bytes that can be copied per packet. */
static inline unsigned cp_dma_max_byte_count(struct radv_cmd_buffer *cmd_buffer)
{
	/* The BYTE_COUNT field width differs between the GFX6 and GFX9
	 * packet encodings, so the limit is chip-dependent.
	 */
	unsigned max = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 ?
		       S_414_BYTE_COUNT_GFX9(~0u) :
		       S_414_BYTE_COUNT_GFX6(~0u);

	/* make it aligned for optimal performance */
	return max & ~(SI_CPDMA_ALIGNMENT - 1);
}

/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
 * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
 * clear value.
 */
static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
			   uint64_t dst_va, uint64_t src_va,
			   unsigned size, unsigned flags)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint32_t header = 0, command = 0;

	assert(size <= cp_dma_max_byte_count(cmd_buffer));

	/* Worst case: 7-dword packet + 2-dword PFP_SYNC_ME. */
	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
		command |= S_414_BYTE_COUNT_GFX9(size);
	else
		command |= S_414_BYTE_COUNT_GFX6(size);

	/* Sync flags. */
	if (flags & CP_DMA_SYNC)
		header |= S_411_CP_SYNC(1);
	else {
		/* No sync requested: don't wait for the write confirmation. */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
			command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
		else
			command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
	}

	if (flags & CP_DMA_RAW_WAIT)
		command |= S_414_RAW_WAIT(1);

	/* Src and dst flags.
	 */
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
	    !(flags & CP_DMA_CLEAR) &&
	    src_va == dst_va)
		header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
	else if (flags & CP_DMA_USE_L2)
		header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);

	if (flags & CP_DMA_CLEAR)
		header |= S_411_SRC_SEL(V_411_DATA);
	else if (flags & CP_DMA_USE_L2)
		header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);

	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
		/* CIK+: 7-dword DMA_DATA packet with full 64-bit addresses. */
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, cmd_buffer->state.predicating));
		radeon_emit(cs, header);
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, command);
	} else {
		/* SI: legacy CP_DMA packet; high address bits live in the
		 * header/last dword, and the L2 path is unavailable.
		 */
		assert(!(flags & CP_DMA_USE_L2));
		header |= S_411_SRC_ADDR_HI(src_va >> 32);
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, cmd_buffer->state.predicating));
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, header);		/* SRC_ADDR_HI [15:0] + flags. */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
		radeon_emit(cs, command);
	}

	/* CP DMA is executed in ME, but index buffers are read by PFP.
	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
	 * indices. If we wanted to execute CP DMA in PFP, this packet
	 * should precede it.
	 */
	if (flags & CP_DMA_SYNC) {
		if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
			radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
			radeon_emit(cs, 0);
		}

		/* CP will see the sync flag and wait for all DMAs to complete.
		 */
		cmd_buffer->state.dma_is_busy = false;
	}

	if (unlikely(cmd_buffer->device->trace_bo))
		radv_cmd_buffer_trace_emit(cmd_buffer);
}

/* Prefetch [va, va + size) into L2 by issuing a CP DMA "copy to nowhere".
 * The range is widened to SI_CPDMA_ALIGNMENT boundaries on both ends.
 */
void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
			unsigned size)
{
	uint64_t aligned_va = va & ~(SI_CPDMA_ALIGNMENT - 1);
	uint64_t aligned_size = ((va + size + SI_CPDMA_ALIGNMENT -1) & ~(SI_CPDMA_ALIGNMENT - 1)) - aligned_va;

	/* src == dst triggers the GFX9 prefetch path in si_emit_cp_dma. */
	si_emit_cp_dma(cmd_buffer, aligned_va, aligned_va,
		       aligned_size, CP_DMA_USE_L2);
}

/* Decide the per-packet flags for one chunk of a multi-packet CP DMA
 * operation: flush caches before the first packet, sync after the last.
 */
static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
			      uint64_t remaining_size, unsigned *flags)
{

	/* Flush the caches for the first copy only.
	 * Also wait for the previous CP DMA operations.
	 */
	if (cmd_buffer->state.flush_bits) {
		si_emit_cache_flush(cmd_buffer);
		*flags |= CP_DMA_RAW_WAIT;
	}

	/* Do the synchronization after the last dma, so that all data
	 * is written to memory.
	 */
	if (byte_count == remaining_size)
		*flags |= CP_DMA_SYNC;
}

/* Issue a small dummy copy from an internal scratch buffer to realign the
 * DMA engine's internal counter after an unaligned transfer (see the
 * Carrizo/Stoney workaround in si_cp_dma_buffer_copy). size must be
 * smaller than SI_CPDMA_ALIGNMENT.
 */
static void si_cp_dma_realign_engine(struct radv_cmd_buffer *cmd_buffer, unsigned size)
{
	uint64_t va;
	uint32_t offset;
	unsigned dma_flags = 0;
	/* Two aligned slots: copy from the second half into the first. */
	unsigned buf_size = SI_CPDMA_ALIGNMENT * 2;
	void *ptr;

	assert(size < SI_CPDMA_ALIGNMENT);

	radv_cmd_buffer_upload_alloc(cmd_buffer, buf_size, SI_CPDMA_ALIGNMENT,  &offset, &ptr);

	va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
	va += offset;

	si_cp_dma_prepare(cmd_buffer, size, size, &dma_flags);

	si_emit_cp_dma(cmd_buffer, va, va + SI_CPDMA_ALIGNMENT, size,
		       dma_flags);
}

/* Copy size bytes from src_va to dest_va using as many CP DMA packets as
 * needed, applying the alignment workarounds required on some chips.
 */
void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
			   uint64_t src_va, uint64_t dest_va,
			   uint64_t size)
{
	uint64_t main_src_va, main_dest_va;
	uint64_t skipped_size = 0, realign_size = 0;

	/* Assume that we are not going to sync after the last DMA operation. */
	cmd_buffer->state.dma_is_busy = true;

	if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
	    cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
		/* If the size is not aligned, we must add a dummy copy at the end
		 * just to align the internal counter. Otherwise, the DMA engine
		 * would slow down by an order of magnitude for following copies.
		 */
		if (size % SI_CPDMA_ALIGNMENT)
			realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

		/* If the copy begins unaligned, we must start copying from the next
		 * aligned block and the skipped part should be copied after everything
		 * else has been copied. Only the src alignment matters, not dst.
		 */
		if (src_va % SI_CPDMA_ALIGNMENT) {
			skipped_size = SI_CPDMA_ALIGNMENT - (src_va % SI_CPDMA_ALIGNMENT);
			/* The main part will be skipped if the size is too small.
			 */
			skipped_size = MIN2(skipped_size, size);
			size -= skipped_size;
		}
	}
	main_src_va = src_va + skipped_size;
	main_dest_va = dest_va + skipped_size;

	/* Main aligned part, split into max-byte-count chunks. */
	while (size) {
		unsigned dma_flags = 0;
		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));

		si_cp_dma_prepare(cmd_buffer, byte_count,
				  size + skipped_size + realign_size,
				  &dma_flags);

		/* Syncing is handled by si_cp_dma_wait_for_idle via
		 * dma_is_busy, not per-packet here.
		 */
		dma_flags &= ~CP_DMA_SYNC;

		si_emit_cp_dma(cmd_buffer, main_dest_va, main_src_va,
			       byte_count, dma_flags);

		size -= byte_count;
		main_src_va += byte_count;
		main_dest_va += byte_count;
	}

	/* Copy the leading unaligned part that was skipped above. */
	if (skipped_size) {
		unsigned dma_flags = 0;

		si_cp_dma_prepare(cmd_buffer, skipped_size,
				  size + skipped_size + realign_size,
				  &dma_flags);

		si_emit_cp_dma(cmd_buffer, dest_va, src_va,
			       skipped_size, dma_flags);
	}
	if (realign_size)
		si_cp_dma_realign_engine(cmd_buffer, realign_size);
}

/* Fill [va, va + size) with a repeated 32-bit value using CP DMA.
 * va and size must be 4-byte aligned.
 */
void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
			    uint64_t size, unsigned value)
{

	if (!size)
		return;

	assert(va % 4 == 0 && size % 4 == 0);

	/* Assume that we are not going to sync after the last DMA operation. */
	cmd_buffer->state.dma_is_busy = true;

	while (size) {
		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
		unsigned dma_flags = CP_DMA_CLEAR;

		si_cp_dma_prepare(cmd_buffer, byte_count, size, &dma_flags);

		/* Emit the clear packet. */
		si_emit_cp_dma(cmd_buffer, va, value, byte_count,
			       dma_flags);

		size -= byte_count;
		va += byte_count;
	}
}

/* Make the CP wait until all previously issued CP DMA operations are done.
 * No-op on SI (< CIK) and when no DMA is outstanding.
 */
void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
{
	if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
		return;

	if (!cmd_buffer->state.dma_is_busy)
		return;

	/* Issue a dummy DMA that copies zero bytes.
	 *
	 * The DMA engine will see that there's no work to do and skip this
	 * DMA request, however, the CP will see the sync flag and still wait
	 * for all DMAs to complete.
	 */
	si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);

	cmd_buffer->state.dma_is_busy = false;
}

/* For MSAA sample positions.
 * Packs four (x, y) sample offsets into one 32-bit register value,
 * 4 bits per coordinate (two's complement, range [-8, 7]).
 */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
	(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |		   \
	(((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) |	   \
	(((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) |	   \
	 (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))


/* 2xMSAA
 * There are two locations (4, 4), (-4, -4). */
const uint32_t eg_sample_locs_2x[4] = {
	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
};
const unsigned eg_max_dist_2x = 4;
/* 4xMSAA
 * There are 4 locations: (-2, -6), (6, -2), (-6, 2), (2, 6). */
const uint32_t eg_sample_locs_4x[4] = {
	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
};
const unsigned eg_max_dist_4x = 6;

/* Cayman 8xMSAA */
static const uint32_t cm_sample_locs_8x[] = {
	FILL_SREG( 1, -3, -1,  3,  5,  1, -3, -5),
	FILL_SREG( 1, -3, -1,  3,  5,  1, -3, -5),
	FILL_SREG( 1, -3, -1,  3,  5,  1, -3, -5),
	FILL_SREG( 1, -3, -1,  3,  5,  1, -3, -5),
	FILL_SREG(-5,  5, -7, -1,  3,  7,  7, -7),
	FILL_SREG(-5,  5, -7, -1,  3,  7,  7, -7),
	FILL_SREG(-5,  5, -7, -1,  3,  7,  7, -7),
	FILL_SREG(-5,  5, -7, -1,  3,  7,  7, -7),
};
static const unsigned cm_max_dist_8x = 8;
/* Cayman 16xMSAA */
static const uint32_t cm_sample_locs_16x[] = {
	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
};
static const unsigned cm_max_dist_16x = 8;

/* Return the maximum sample distance for 2^log_samples samples.
 * log_samples must be in [0, 4]; out-of-range values index past the table.
 */
unsigned radv_cayman_get_maxdist(int log_samples)
{
	unsigned max_dist[] = {
		0,
		eg_max_dist_2x,
		eg_max_dist_4x,
		cm_max_dist_8x,
		cm_max_dist_16x
	};
	return max_dist[log_samples];
}

/* Program the PA_SC_AA_SAMPLE_LOCS_PIXEL_* registers for the given sample
 * count. Unknown counts fall through to the 1x (all-zero) layout.
 */
void radv_cayman_emit_msaa_sample_locs(struct radeon_cmdbuf *cs, int nr_samples)
{
	switch (nr_samples) {
	default:
	case 1:
		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0);
		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0);
		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0);
		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0);
		break;
	case 2:
		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]);
		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]);
		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_2x[2]);
		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_2x[3]);
		break;
	case 4:
		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_4x[0]);
		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_4x[1]);
		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_4x[2]);
		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_4x[3]);
		break;
	case 8:
		/* Two dwords per pixel quadrant (samples 0-3 and 4-7), with
		 * two unused dwords between quadrants.
		 */
		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
		radeon_emit(cs, cm_sample_locs_8x[0]);
		radeon_emit(cs, cm_sample_locs_8x[4]);
		radeon_emit(cs, 0);
		radeon_emit(cs, 0);
		radeon_emit(cs, cm_sample_locs_8x[1]);
		radeon_emit(cs, cm_sample_locs_8x[5]);
		radeon_emit(cs, 0);
		radeon_emit(cs, 0);
		radeon_emit(cs, cm_sample_locs_8x[2]);
		radeon_emit(cs, cm_sample_locs_8x[6]);
		radeon_emit(cs, 0);
		radeon_emit(cs, 0);
		radeon_emit(cs, cm_sample_locs_8x[3]);
		radeon_emit(cs, cm_sample_locs_8x[7]);
		break;
	case 16:
		/* Four dwords per pixel quadrant (samples 0-3, 4-7, 8-11, 12-15). */
		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16);
		radeon_emit(cs, cm_sample_locs_16x[0]);
		radeon_emit(cs, cm_sample_locs_16x[4]);
		radeon_emit(cs, cm_sample_locs_16x[8]);
		radeon_emit(cs, cm_sample_locs_16x[12]);
		radeon_emit(cs, cm_sample_locs_16x[1]);
		radeon_emit(cs, cm_sample_locs_16x[5]);
		radeon_emit(cs, cm_sample_locs_16x[9]);
		radeon_emit(cs, cm_sample_locs_16x[13]);
		radeon_emit(cs, cm_sample_locs_16x[2]);
		radeon_emit(cs, cm_sample_locs_16x[6]);
		radeon_emit(cs, cm_sample_locs_16x[10]);
		radeon_emit(cs, cm_sample_locs_16x[14]);
		radeon_emit(cs, cm_sample_locs_16x[3]);
		radeon_emit(cs, cm_sample_locs_16x[7]);
		radeon_emit(cs, cm_sample_locs_16x[11]);
		radeon_emit(cs, cm_sample_locs_16x[15]);
		break;
	}
}

/* Decode the packed sample location for (sample_count, sample_index) into
 * out_value[0..1], mapped from the [-8, 7] grid to [0, 1). device is unused.
 */
static void radv_cayman_get_sample_position(struct radv_device *device,
					    unsigned sample_count,
					    unsigned sample_index, float *out_value)
{
	int offset, index;
	/* 4-bit signed bitfield: sign-extends the packed two's-complement
	 * coordinate nibble.
	 */
	struct {
		int idx:4;
	} val;
	switch (sample_count) {
	case 1:
	default:
		out_value[0] = out_value[1] = 0.5;
		break;
	case 2:
		offset = 4 * (sample_index * 2);
		val.idx = (eg_sample_locs_2x[0] >> offset) & 0xf;
		out_value[0] = (float)(val.idx + 8) / 16.0f;
		val.idx = (eg_sample_locs_2x[0] >> (offset + 4)) & 0xf;
		out_value[1] = (float)(val.idx + 8) / 16.0f;
		break;
	case 4:
		offset = 4 * (sample_index * 2);
		val.idx = (eg_sample_locs_4x[0] >> offset) & 0xf;
		out_value[0] = (float)(val.idx + 8) / 16.0f;
		val.idx = (eg_sample_locs_4x[0] >> (offset + 4)) & 0xf;
		out_value[1] = (float)(val.idx + 8) / 16.0f;
		break;
	case 8:
		/* Four samples per table row; rows are replicated in groups
		 * of four, hence the index stride of 4.
		 */
		offset = 4 * (sample_index % 4 * 2);
		index = (sample_index / 4) * 4;
		val.idx = (cm_sample_locs_8x[index] >> offset) & 0xf;
		out_value[0] = (float)(val.idx + 8) / 16.0f;
		val.idx = (cm_sample_locs_8x[index] >> (offset + 4)) & 0xf;
		out_value[1] = (float)(val.idx + 8) / 16.0f;
		break;
	case 16:
		offset = 4 * (sample_index % 4 * 2);
		index = (sample_index / 4) * 4;
		val.idx = (cm_sample_locs_16x[index] >> offset) & 0xf;
		out_value[0] = (float)(val.idx + 8) / 16.0f;
		val.idx = (cm_sample_locs_16x[index] >> (offset + 4)) & 0xf;
		out_value[1] = (float)(val.idx + 8) / 16.0f;
		break;
	}
}

/* Precompute the standard sample positions for all supported sample counts
 * into the device at init time.
 */
void radv_device_init_msaa(struct radv_device *device)
{
	int i;

	radv_cayman_get_sample_position(device, 1, 0, device->sample_locations_1x[0]);

	for (i = 0; i < 2; i++)
		radv_cayman_get_sample_position(device, 2, i, device->sample_locations_2x[i]);
	for (i = 0; i < 4; i++)
		radv_cayman_get_sample_position(device, 4, i, device->sample_locations_4x[i]);
	for (i = 0; i < 8; i++)
		radv_cayman_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
	for (i = 0; i < 16; i++)
		radv_cayman_get_sample_position(device, 16, i, device->sample_locations_16x[i]);
}