1/* 2 * Copyright 2017 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25/* This file handles register programming of primitive binning. */ 26 27#include "si_build_pm4.h" 28#include "gfx9d.h" 29 30struct uvec2 { 31 unsigned x, y; 32}; 33 34struct si_bin_size_map { 35 unsigned start; 36 unsigned bin_size_x; 37 unsigned bin_size_y; 38}; 39 40typedef struct si_bin_size_map si_bin_size_subtable[3][10]; 41 42/* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */ 43static struct uvec2 si_find_bin_size(struct si_screen *sscreen, 44 const si_bin_size_subtable table[], 45 unsigned sum) 46{ 47 unsigned log_num_rb_per_se = 48 util_logbase2_ceil(sscreen->info.num_render_backends / 49 sscreen->info.max_se); 50 unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); 51 unsigned i; 52 53 /* Get the chip-specific subtable. */ 54 const struct si_bin_size_map *subtable = 55 &table[log_num_rb_per_se][log_num_se][0]; 56 57 for (i = 0; subtable[i].bin_size_x != 0; i++) { 58 if (sum >= subtable[i].start && sum < subtable[i + 1].start) 59 break; 60 } 61 62 struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; 63 return size; 64} 65 66static struct uvec2 si_get_color_bin_size(struct si_context *sctx, 67 unsigned cb_target_enabled_4bit) 68{ 69 unsigned num_fragments = sctx->framebuffer.nr_color_samples; 70 unsigned sum = 0; 71 72 /* Compute the sum of all Bpp. */ 73 for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { 74 if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) 75 continue; 76 77 struct si_texture *tex = 78 (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; 79 sum += tex->surface.bpe; 80 } 81 82 /* Multiply the sum by some function of the number of samples. */ 83 if (num_fragments >= 2) { 84 if (si_get_ps_iter_samples(sctx) >= 2) 85 sum *= num_fragments; 86 else 87 sum *= 2; 88 } 89 90 static const si_bin_size_subtable table[] = { 91 { 92 /* One RB / SE */ 93 { 94 /* One shader engine */ 95 { 0, 128, 128 }, 96 { 1, 64, 128 }, 97 { 2, 32, 128 }, 98 { 3, 16, 128 }, 99 { 17, 0, 0 }, 100 }, 101 { 102 /* Two shader engines */ 103 { 0, 128, 128 }, 104 { 2, 64, 128 }, 105 { 3, 32, 128 }, 106 { 5, 16, 128 }, 107 { 17, 0, 0 }, 108 }, 109 { 110 /* Four shader engines */ 111 { 0, 128, 128 }, 112 { 3, 64, 128 }, 113 { 5, 16, 128 }, 114 { 17, 0, 0 }, 115 }, 116 }, 117 { 118 /* Two RB / SE */ 119 { 120 /* One shader engine */ 121 { 0, 128, 128 }, 122 { 2, 64, 128 }, 123 { 3, 32, 128 }, 124 { 9, 16, 128 }, 125 { 33, 0, 0 }, 126 }, 127 { 128 /* Two shader engines */ 129 { 0, 128, 128 }, 130 { 3, 64, 128 }, 131 { 5, 32, 128 }, 132 { 9, 16, 128 }, 133 { 33, 0, 0 }, 134 }, 135 { 136 /* Four shader engines */ 137 { 0, 256, 256 }, 138 { 2, 128, 256 }, 139 { 3, 128, 128 }, 140 { 5, 64, 128 }, 141 { 9, 16, 128 }, 142 { 33, 0, 0 }, 143 }, 144 }, 145 { 146 /* Four RB / SE */ 147 { 148 /* One shader engine */ 149 { 0, 128, 256 }, 150 { 2, 128, 128 }, 151 { 3, 64, 128 }, 152 { 5, 32, 128 }, 153 { 9, 16, 128 }, 154 { 17, 0, 0 }, 155 }, 156 { 157 /* Two shader engines */ 158 { 0, 256, 256 }, 159 { 2, 128, 256 }, 160 { 3, 128, 128 }, 161 { 5, 64, 128 }, 162 { 9, 32, 128 }, 163 { 17, 16, 128 }, 164 { 33, 0, 0 }, 165 }, 166 { 167 /* Four shader engines */ 168 { 0, 256, 512 }, 169 { 2, 128, 512 }, 170 { 3, 64, 512 }, 171 { 5, 32, 512 }, 172 { 9, 32, 256 }, 173 { 17, 32, 128 }, 174 { 33, 0, 0 }, 175 }, 176 }, 177 }; 178 179 return si_find_bin_size(sctx->screen, table, sum); 180} 181 182static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) 183{ 184 struct si_state_dsa *dsa = sctx->queued.named.dsa; 185 186 if (!sctx->framebuffer.state.zsbuf || 187 (!dsa->depth_enabled && !dsa->stencil_enabled)) { 188 /* Return the max size. */ 189 struct uvec2 size = {512, 512}; 190 return size; 191 } 192 193 struct si_texture *tex = 194 (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; 195 unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; 196 unsigned stencil_coeff = tex->surface.has_stencil && 197 dsa->stencil_enabled ? 1 : 0; 198 unsigned sum = 4 * (depth_coeff + stencil_coeff) * 199 tex->buffer.b.b.nr_samples; 200 201 static const si_bin_size_subtable table[] = { 202 { 203 // One RB / SE 204 { 205 // One shader engine 206 { 0, 64, 512 }, 207 { 2, 64, 256 }, 208 { 4, 64, 128 }, 209 { 7, 32, 128 }, 210 { 13, 16, 128 }, 211 { 49, 0, 0 }, 212 }, 213 { 214 // Two shader engines 215 { 0, 128, 512 }, 216 { 2, 64, 512 }, 217 { 4, 64, 256 }, 218 { 7, 64, 128 }, 219 { 13, 32, 128 }, 220 { 25, 16, 128 }, 221 { 49, 0, 0 }, 222 }, 223 { 224 // Four shader engines 225 { 0, 256, 512 }, 226 { 2, 128, 512 }, 227 { 4, 64, 512 }, 228 { 7, 64, 256 }, 229 { 13, 64, 128 }, 230 { 25, 16, 128 }, 231 { 49, 0, 0 }, 232 }, 233 }, 234 { 235 // Two RB / SE 236 { 237 // One shader engine 238 { 0, 128, 512 }, 239 { 2, 64, 512 }, 240 { 4, 64, 256 }, 241 { 7, 64, 128 }, 242 { 13, 32, 128 }, 243 { 25, 16, 128 }, 244 { 97, 0, 0 }, 245 }, 246 { 247 // Two shader engines 248 { 0, 256, 512 }, 249 { 2, 128, 512 }, 250 { 4, 64, 512 }, 251 { 7, 64, 256 }, 252 { 13, 64, 128 }, 253 { 25, 32, 128 }, 254 { 49, 16, 128 }, 255 { 97, 0, 0 }, 256 }, 257 { 258 // Four shader engines 259 { 0, 512, 512 }, 260 { 2, 256, 512 }, 261 { 4, 128, 512 }, 262 { 7, 64, 512 }, 263 { 13, 64, 256 }, 264 { 25, 64, 128 }, 265 { 49, 16, 128 }, 266 { 97, 0, 0 }, 267 }, 268 }, 269 { 270 // Four RB / SE 271 { 272 // One shader engine 273 { 0, 256, 512 }, 274 { 2, 128, 512 }, 275 { 4, 64, 512 }, 276 { 7, 64, 256 }, 277 { 13, 64, 128 }, 278 { 25, 32, 128 }, 279 { 49, 16, 128 }, 280 { 193, 0, 0 }, 281 }, 282 { 283 // Two shader engines 284 { 0, 512, 512 }, 285 { 2, 256, 512 }, 286 { 4, 128, 512 }, 287 { 7, 64, 512 }, 288 { 13, 64, 256 }, 289 { 25, 64, 128 }, 290 { 49, 32, 128 }, 291 { 97, 16, 128 }, 292 { 193, 0, 0 }, 293 }, 294 { 295 // Four shader engines 296 { 0, 512, 512 }, 297 { 4, 256, 512 }, 298 { 7, 128, 512 }, 299 { 13, 64, 512 }, 300 { 25, 32, 512 }, 301 { 49, 32, 256 }, 302 { 97, 16, 128 }, 303 { 193, 0, 0 }, 304 }, 305 }, 306 }; 307 308 return si_find_bin_size(sctx->screen, table, sum); 309} 310 311static void si_emit_dpbb_disable(struct si_context *sctx) 312{ 313 unsigned initial_cdw = sctx->gfx_cs->current.cdw; 314 315 radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, 316 SI_TRACKED_PA_SC_BINNER_CNTL_0, 317 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | 318 S_028C44_DISABLE_START_OF_PRIM(1)); 319 radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL, 320 SI_TRACKED_DB_DFSM_CONTROL, 321 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | 322 S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); 323 if (initial_cdw != sctx->gfx_cs->current.cdw) 324 sctx->context_roll = true; 325} 326 327void si_emit_dpbb_state(struct si_context *sctx) 328{ 329 struct si_screen *sscreen = sctx->screen; 330 struct si_state_blend *blend = sctx->queued.named.blend; 331 struct si_state_dsa *dsa = sctx->queued.named.dsa; 332 unsigned db_shader_control = sctx->ps_db_shader_control; 333 334 assert(sctx->chip_class >= GFX9); 335 336 if (!sscreen->dpbb_allowed || !blend || !dsa || sctx->dpbb_force_off) { 337 si_emit_dpbb_disable(sctx); 338 return; 339 } 340 341 bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) || 342 G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || 343 G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || 344 blend->alpha_to_coverage; 345 346 bool db_can_reject_z_trivially = 347 !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || 348 G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || 349 G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); 350 351 /* Disable DPBB when it's believed to be inefficient. */ 352 if (ps_can_kill && 353 db_can_reject_z_trivially && 354 sctx->framebuffer.state.zsbuf && 355 dsa->db_can_write) { 356 si_emit_dpbb_disable(sctx); 357 return; 358 } 359 360 /* Compute the bin size. */ 361 /* TODO: We could also look at enabled pixel shader outputs. */ 362 unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit & 363 blend->cb_target_enabled_4bit; 364 struct uvec2 color_bin_size = 365 si_get_color_bin_size(sctx, cb_target_enabled_4bit); 366 struct uvec2 depth_bin_size = si_get_depth_bin_size(sctx); 367 368 unsigned color_area = color_bin_size.x * color_bin_size.y; 369 unsigned depth_area = depth_bin_size.x * depth_bin_size.y; 370 371 struct uvec2 bin_size = color_area < depth_area ? color_bin_size 372 : depth_bin_size; 373 374 if (!bin_size.x || !bin_size.y) { 375 si_emit_dpbb_disable(sctx); 376 return; 377 } 378 379 /* Enable DFSM if it's preferred. */ 380 unsigned punchout_mode = V_028060_FORCE_OFF; 381 bool disable_start_of_prim = true; 382 bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 && 383 sctx->framebuffer.state.zsbuf && 384 sctx->framebuffer.nr_samples != 385 MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples); 386 387 if (sscreen->dfsm_allowed && 388 !zs_eqaa_dfsm_bug && 389 cb_target_enabled_4bit && 390 !G_02880C_KILL_ENABLE(db_shader_control) && 391 /* These two also imply that DFSM is disabled when PS writes to memory. */ 392 !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && 393 !G_02880C_EXEC_ON_NOOP(db_shader_control) && 394 G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { 395 punchout_mode = V_028060_AUTO; 396 disable_start_of_prim = (cb_target_enabled_4bit & 397 blend->blend_enable_4bit) != 0; 398 } 399 400 /* Tunable parameters. Also test with DFSM enabled/disabled. */ 401 unsigned context_states_per_bin; /* allowed range: [0, 5] */ 402 unsigned persistent_states_per_bin; /* allowed range: [0, 31] */ 403 unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ 404 405 switch (sctx->family) { 406 case CHIP_VEGA10: 407 case CHIP_VEGA12: 408 case CHIP_VEGA20: 409 case CHIP_RAVEN: 410 case CHIP_RAVEN2: 411 /* Tuned for Raven. Vega might need different values. */ 412 context_states_per_bin = 5; 413 persistent_states_per_bin = 31; 414 fpovs_per_batch = 63; 415 break; 416 default: 417 assert(0); 418 } 419 420 /* Emit registers. */ 421 struct uvec2 bin_size_extend = {}; 422 if (bin_size.x >= 32) 423 bin_size_extend.x = util_logbase2(bin_size.x) - 5; 424 if (bin_size.y >= 32) 425 bin_size_extend.y = util_logbase2(bin_size.y) - 5; 426 427 unsigned initial_cdw = sctx->gfx_cs->current.cdw; 428 radeon_opt_set_context_reg( 429 sctx, R_028C44_PA_SC_BINNER_CNTL_0, 430 SI_TRACKED_PA_SC_BINNER_CNTL_0, 431 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | 432 S_028C44_BIN_SIZE_X(bin_size.x == 16) | 433 S_028C44_BIN_SIZE_Y(bin_size.y == 16) | 434 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | 435 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | 436 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) | 437 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) | 438 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | 439 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | 440 S_028C44_OPTIMAL_BIN_SELECTION(1)); 441 radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL, 442 SI_TRACKED_DB_DFSM_CONTROL, 443 S_028060_PUNCHOUT_MODE(punchout_mode) | 444 S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); 445 if (initial_cdw != sctx->gfx_cs->current.cdw) 446 sctx->context_roll = true; 447} 448