1/* 2 * Copyright © 2013 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "util/ralloc.h" 25 26#include "main/macros.h" /* Needed for MAX3 and MAX2 for format_rgb9e5 */ 27#include "util/format_rgb9e5.h" 28#include "util/format_srgb.h" 29 30#include "blorp_priv.h" 31#include "compiler/brw_eu_defines.h" 32#include "dev/intel_debug.h" 33 34#include "blorp_nir_builder.h" 35 36#define FILE_DEBUG_FLAG DEBUG_BLORP 37 38#pragma pack(push, 1) 39struct brw_blorp_const_color_prog_key 40{ 41 struct brw_blorp_base_key base; 42 bool use_simd16_replicated_data; 43 bool clear_rgb_as_red; 44 uint8_t local_y; 45}; 46#pragma pack(pop) 47 48static bool 49blorp_params_get_clear_kernel_fs(struct blorp_batch *batch, 50 struct blorp_params *params, 51 bool use_replicated_data, 52 bool clear_rgb_as_red) 53{ 54 struct blorp_context *blorp = batch->blorp; 55 56 const struct brw_blorp_const_color_prog_key blorp_key = { 57 .base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_CLEAR), 58 .base.shader_pipeline = BLORP_SHADER_PIPELINE_RENDER, 59 .use_simd16_replicated_data = use_replicated_data, 60 .clear_rgb_as_red = clear_rgb_as_red, 61 .local_y = 0, 62 }; 63 64 if (blorp->lookup_shader(batch, &blorp_key, sizeof(blorp_key), 65 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) 66 return true; 67 68 void *mem_ctx = ralloc_context(NULL); 69 70 nir_builder b; 71 blorp_nir_init_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, 72 blorp_shader_type_to_name(blorp_key.base.shader_type)); 73 74 nir_variable *v_color = 75 BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type()); 76 nir_ssa_def *color = nir_load_var(&b, v_color); 77 78 if (clear_rgb_as_red) { 79 nir_ssa_def *pos = nir_f2i32(&b, nir_load_frag_coord(&b)); 80 nir_ssa_def *comp = nir_umod(&b, nir_channel(&b, pos, 0), 81 nir_imm_int(&b, 3)); 82 color = nir_pad_vec4(&b, nir_vector_extract(&b, color, comp)); 83 } 84 85 nir_variable *frag_color = nir_variable_create(b.shader, nir_var_shader_out, 86 glsl_vec4_type(), 87 "gl_FragColor"); 88 frag_color->data.location = FRAG_RESULT_COLOR; 89 nir_store_var(&b, frag_color, color, 0xf); 90 91 struct brw_wm_prog_key wm_key; 92 brw_blorp_init_wm_prog_key(&wm_key); 93 94 struct brw_wm_prog_data prog_data; 95 const unsigned *program = 96 blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, use_replicated_data, 97 &prog_data); 98 99 bool result = 100 blorp->upload_shader(batch, MESA_SHADER_FRAGMENT, 101 &blorp_key, sizeof(blorp_key), 102 program, prog_data.base.program_size, 103 &prog_data.base, sizeof(prog_data), 104 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); 105 106 ralloc_free(mem_ctx); 107 return result; 108} 109 110static bool 111blorp_params_get_clear_kernel_cs(struct blorp_batch *batch, 112 struct blorp_params *params, 113 bool clear_rgb_as_red) 114{ 115 struct blorp_context *blorp = batch->blorp; 116 117 const struct brw_blorp_const_color_prog_key blorp_key = { 118 .base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_CLEAR), 119 .base.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE, 120 .use_simd16_replicated_data = false, 121 .clear_rgb_as_red = clear_rgb_as_red, 122 .local_y = blorp_get_cs_local_y(params), 123 }; 124 125 if (blorp->lookup_shader(batch, &blorp_key, sizeof(blorp_key), 126 ¶ms->cs_prog_kernel, ¶ms->cs_prog_data)) 127 return true; 128 129 void *mem_ctx = ralloc_context(NULL); 130 131 nir_builder b; 132 blorp_nir_init_shader(&b, mem_ctx, MESA_SHADER_COMPUTE, "BLORP-gpgpu-clear"); 133 blorp_set_cs_dims(b.shader, blorp_key.local_y); 134 135 nir_ssa_def *dst_pos = nir_load_global_invocation_id(&b, 32); 136 137 nir_variable *v_color = 138 BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type()); 139 nir_ssa_def *color = nir_load_var(&b, v_color); 140 141 nir_variable *v_bounds_rect = 142 BLORP_CREATE_NIR_INPUT(b.shader, bounds_rect, glsl_vec4_type()); 143 nir_ssa_def *bounds_rect = nir_load_var(&b, v_bounds_rect); 144 nir_ssa_def *in_bounds = blorp_check_in_bounds(&b, bounds_rect, dst_pos); 145 146 if (clear_rgb_as_red) { 147 nir_ssa_def *comp = nir_umod(&b, nir_channel(&b, dst_pos, 0), 148 nir_imm_int(&b, 3)); 149 color = nir_pad_vec4(&b, nir_vector_extract(&b, color, comp)); 150 } 151 152 nir_push_if(&b, in_bounds); 153 154 nir_image_store(&b, nir_imm_int(&b, 0), 155 nir_pad_vector_imm_int(&b, dst_pos, 0, 4), 156 nir_imm_int(&b, 0), 157 nir_pad_vector_imm_int(&b, color, 0, 4), 158 nir_imm_int(&b, 0), 159 .image_dim = GLSL_SAMPLER_DIM_2D, 160 .image_array = true, 161 .access = ACCESS_NON_READABLE); 162 163 nir_pop_if(&b, NULL); 164 165 struct brw_cs_prog_key cs_key; 166 brw_blorp_init_cs_prog_key(&cs_key); 167 168 struct brw_cs_prog_data prog_data; 169 const unsigned *program = 170 blorp_compile_cs(blorp, mem_ctx, b.shader, &cs_key, &prog_data); 171 172 bool result = 173 blorp->upload_shader(batch, MESA_SHADER_COMPUTE, 174 &blorp_key, sizeof(blorp_key), 175 program, prog_data.base.program_size, 176 &prog_data.base, sizeof(prog_data), 177 ¶ms->cs_prog_kernel, ¶ms->cs_prog_data); 178 179 ralloc_free(mem_ctx); 180 return result; 181} 182 183static bool 184blorp_params_get_clear_kernel(struct blorp_batch *batch, 185 struct blorp_params *params, 186 bool use_replicated_data, 187 bool clear_rgb_as_red) 188{ 189 if (batch->flags & BLORP_BATCH_USE_COMPUTE) { 190 assert(!use_replicated_data); 191 return blorp_params_get_clear_kernel_cs(batch, params, clear_rgb_as_red); 192 } else { 193 return blorp_params_get_clear_kernel_fs(batch, params, 194 use_replicated_data, 195 clear_rgb_as_red); 196 } 197} 198 199#pragma pack(push, 1) 200struct layer_offset_vs_key { 201 struct brw_blorp_base_key base; 202 unsigned num_inputs; 203}; 204#pragma pack(pop) 205 206/* In the case of doing attachment clears, we are using a surface state that 207 * is handed to us so we can't set (and don't even know) the base array layer. 208 * In order to do a layered clear in this scenario, we need some way of adding 209 * the base array layer to the instance id. Unfortunately, our hardware has 210 * no real concept of "base instance", so we have to do it manually in a 211 * vertex shader. 212 */ 213static bool 214blorp_params_get_layer_offset_vs(struct blorp_batch *batch, 215 struct blorp_params *params) 216{ 217 struct blorp_context *blorp = batch->blorp; 218 struct layer_offset_vs_key blorp_key = { 219 .base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_LAYER_OFFSET_VS), 220 }; 221 222 if (params->wm_prog_data) 223 blorp_key.num_inputs = params->wm_prog_data->num_varying_inputs; 224 225 if (blorp->lookup_shader(batch, &blorp_key, sizeof(blorp_key), 226 ¶ms->vs_prog_kernel, ¶ms->vs_prog_data)) 227 return true; 228 229 void *mem_ctx = ralloc_context(NULL); 230 231 nir_builder b; 232 blorp_nir_init_shader(&b, mem_ctx, MESA_SHADER_VERTEX, 233 blorp_shader_type_to_name(blorp_key.base.shader_type)); 234 235 const struct glsl_type *uvec4_type = glsl_vector_type(GLSL_TYPE_UINT, 4); 236 237 /* First we deal with the header which has instance and base instance */ 238 nir_variable *a_header = nir_variable_create(b.shader, nir_var_shader_in, 239 uvec4_type, "header"); 240 a_header->data.location = VERT_ATTRIB_GENERIC0; 241 242 nir_variable *v_layer = nir_variable_create(b.shader, nir_var_shader_out, 243 glsl_int_type(), "layer_id"); 244 v_layer->data.location = VARYING_SLOT_LAYER; 245 246 /* Compute the layer id */ 247 nir_ssa_def *header = nir_load_var(&b, a_header); 248 nir_ssa_def *base_layer = nir_channel(&b, header, 0); 249 nir_ssa_def *instance = nir_channel(&b, header, 1); 250 nir_store_var(&b, v_layer, nir_iadd(&b, instance, base_layer), 0x1); 251 252 /* Then we copy the vertex from the next slot to VARYING_SLOT_POS */ 253 nir_variable *a_vertex = nir_variable_create(b.shader, nir_var_shader_in, 254 glsl_vec4_type(), "a_vertex"); 255 a_vertex->data.location = VERT_ATTRIB_GENERIC1; 256 257 nir_variable *v_pos = nir_variable_create(b.shader, nir_var_shader_out, 258 glsl_vec4_type(), "v_pos"); 259 v_pos->data.location = VARYING_SLOT_POS; 260 261 nir_copy_var(&b, v_pos, a_vertex); 262 263 /* Then we copy everything else */ 264 for (unsigned i = 0; i < blorp_key.num_inputs; i++) { 265 nir_variable *a_in = nir_variable_create(b.shader, nir_var_shader_in, 266 uvec4_type, "input"); 267 a_in->data.location = VERT_ATTRIB_GENERIC2 + i; 268 269 nir_variable *v_out = nir_variable_create(b.shader, nir_var_shader_out, 270 uvec4_type, "output"); 271 v_out->data.location = VARYING_SLOT_VAR0 + i; 272 273 nir_copy_var(&b, v_out, a_in); 274 } 275 276 struct brw_vs_prog_data vs_prog_data; 277 memset(&vs_prog_data, 0, sizeof(vs_prog_data)); 278 279 const unsigned *program = 280 blorp_compile_vs(blorp, mem_ctx, b.shader, &vs_prog_data); 281 282 bool result = 283 blorp->upload_shader(batch, MESA_SHADER_VERTEX, 284 &blorp_key, sizeof(blorp_key), 285 program, vs_prog_data.base.base.program_size, 286 &vs_prog_data.base.base, sizeof(vs_prog_data), 287 ¶ms->vs_prog_kernel, ¶ms->vs_prog_data); 288 289 ralloc_free(mem_ctx); 290 return result; 291} 292 293/* The x0, y0, x1, and y1 parameters must already be populated with the render 294 * area of the framebuffer to be cleared. 295 */ 296static void 297get_fast_clear_rect(const struct isl_device *dev, 298 const struct isl_surf *aux_surf, 299 unsigned *x0, unsigned *y0, 300 unsigned *x1, unsigned *y1) 301{ 302 unsigned int x_align, y_align; 303 unsigned int x_scaledown, y_scaledown; 304 305 /* Only single sampled surfaces need to (and actually can) be resolved. */ 306 if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) { 307 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render 308 * Target(s)", beneath the "Fast Color Clear" bullet (p327): 309 * 310 * Clear pass must have a clear rectangle that must follow 311 * alignment rules in terms of pixels and lines as shown in the 312 * table below. Further, the clear-rectangle height and width 313 * must be multiple of the following dimensions. If the height 314 * and width of the render target being cleared do not meet these 315 * requirements, an MCS buffer can be created such that it 316 * follows the requirement and covers the RT. 317 * 318 * The alignment size in the table that follows is related to the 319 * alignment size that is baked into the CCS surface format but with X 320 * alignment multiplied by 16 and Y alignment multiplied by 32. 321 */ 322 x_align = isl_format_get_layout(aux_surf->format)->bw; 323 y_align = isl_format_get_layout(aux_surf->format)->bh; 324 325 x_align *= 16; 326 327 /* The line alignment requirement for Y-tiled is halved at SKL and again 328 * at TGL. 329 */ 330 if (dev->info->ver >= 12) 331 y_align *= 8; 332 else if (dev->info->ver >= 9) 333 y_align *= 16; 334 else 335 y_align *= 32; 336 337 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render 338 * Target(s)", beneath the "Fast Color Clear" bullet (p327): 339 * 340 * In order to optimize the performance MCS buffer (when bound to 341 * 1X RT) clear similarly to MCS buffer clear for MSRT case, 342 * clear rect is required to be scaled by the following factors 343 * in the horizontal and vertical directions: 344 * 345 * The X and Y scale down factors in the table that follows are each 346 * equal to half the alignment value computed above. 347 */ 348 x_scaledown = x_align / 2; 349 y_scaledown = y_align / 2; 350 351 if (ISL_DEV_IS_HASWELL(dev)) { 352 /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel 353 * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color 354 * Clear of Non-MultiSampled Render Target Restrictions": 355 * 356 * Clear rectangle must be aligned to two times the number of 357 * pixels in the table shown below due to 16x16 hashing across the 358 * slice. 359 * 360 * This restriction is only documented to exist on HSW GT3 but 361 * empirical evidence suggests that it's also needed GT2. 362 */ 363 x_align *= 2; 364 y_align *= 2; 365 } 366 } else { 367 assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT); 368 369 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render 370 * Target(s)", beneath the "MSAA Compression" bullet (p326): 371 * 372 * Clear pass for this case requires that scaled down primitive 373 * is sent down with upper left co-ordinate to coincide with 374 * actual rectangle being cleared. For MSAA, clear rectangle’s 375 * height and width need to as show in the following table in 376 * terms of (width,height) of the RT. 377 * 378 * MSAA Width of Clear Rect Height of Clear Rect 379 * 2X Ceil(1/8*width) Ceil(1/2*height) 380 * 4X Ceil(1/8*width) Ceil(1/2*height) 381 * 8X Ceil(1/2*width) Ceil(1/2*height) 382 * 16X width Ceil(1/2*height) 383 * 384 * The text "with upper left co-ordinate to coincide with actual 385 * rectangle being cleared" is a little confusing--it seems to imply 386 * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to 387 * feed the pipeline using the rectangle (x,y) to 388 * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on 389 * the number of samples. Experiments indicate that this is not 390 * quite correct; actually, what the hardware appears to do is to 391 * align whatever rectangle is sent down the pipeline to the nearest 392 * multiple of 2x2 blocks, and then scale it up by a factor of N 393 * horizontally and 2 vertically. So the resulting alignment is 4 394 * vertically and either 4 or 16 horizontally, and the scaledown 395 * factor is 2 vertically and either 2 or 8 horizontally. 396 */ 397 switch (aux_surf->format) { 398 case ISL_FORMAT_MCS_2X: 399 case ISL_FORMAT_MCS_4X: 400 x_scaledown = 8; 401 break; 402 case ISL_FORMAT_MCS_8X: 403 x_scaledown = 2; 404 break; 405 case ISL_FORMAT_MCS_16X: 406 x_scaledown = 1; 407 break; 408 default: 409 unreachable("Unexpected MCS format for fast clear"); 410 } 411 y_scaledown = 2; 412 x_align = x_scaledown * 2; 413 y_align = y_scaledown * 2; 414 } 415 416 *x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown; 417 *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown; 418 *x1 = ALIGN(*x1, x_align) / x_scaledown; 419 *y1 = ALIGN(*y1, y_align) / y_scaledown; 420} 421 422void 423blorp_fast_clear(struct blorp_batch *batch, 424 const struct blorp_surf *surf, 425 enum isl_format format, struct isl_swizzle swizzle, 426 uint32_t level, uint32_t start_layer, uint32_t num_layers, 427 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) 428{ 429 struct blorp_params params; 430 blorp_params_init(¶ms); 431 params.num_layers = num_layers; 432 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 433 434 params.x0 = x0; 435 params.y0 = y0; 436 params.x1 = x1; 437 params.y1 = y1; 438 439 memset(¶ms.wm_inputs.clear_color, 0xff, 4*sizeof(float)); 440 params.fast_clear_op = ISL_AUX_OP_FAST_CLEAR; 441 442 get_fast_clear_rect(batch->blorp->isl_dev, surf->aux_surf, 443 ¶ms.x0, ¶ms.y0, ¶ms.x1, ¶ms.y1); 444 445 if (!blorp_params_get_clear_kernel(batch, ¶ms, true, false)) 446 return; 447 448 brw_blorp_surface_info_init(batch, ¶ms.dst, surf, level, 449 start_layer, format, true); 450 params.num_samples = params.dst.surf.samples; 451 452 assert(params.num_samples != 0); 453 if (params.num_samples == 1) 454 params.snapshot_type = INTEL_SNAPSHOT_CCS_COLOR_CLEAR; 455 else 456 params.snapshot_type = INTEL_SNAPSHOT_MCS_COLOR_CLEAR; 457 458 /* If a swizzle was provided, we need to swizzle the clear color so that 459 * the hardware color format conversion will work properly. 460 */ 461 params.dst.clear_color = 462 isl_color_value_swizzle_inv(params.dst.clear_color, swizzle); 463 464 batch->blorp->exec(batch, ¶ms); 465} 466 467bool 468blorp_clear_supports_compute(struct blorp_context *blorp, 469 uint8_t color_write_disable, bool blend_enabled, 470 enum isl_aux_usage aux_usage) 471{ 472 if (blorp->isl_dev->info->ver < 7) 473 return false; 474 if (color_write_disable != 0 || blend_enabled) 475 return false; 476 if (blorp->isl_dev->info->ver >= 12) { 477 return aux_usage == ISL_AUX_USAGE_GFX12_CCS_E || 478 aux_usage == ISL_AUX_USAGE_CCS_E || 479 aux_usage == ISL_AUX_USAGE_NONE; 480 } else { 481 return aux_usage == ISL_AUX_USAGE_NONE; 482 } 483} 484 485void 486blorp_clear(struct blorp_batch *batch, 487 const struct blorp_surf *surf, 488 enum isl_format format, struct isl_swizzle swizzle, 489 uint32_t level, uint32_t start_layer, uint32_t num_layers, 490 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, 491 union isl_color_value clear_color, 492 uint8_t color_write_disable) 493{ 494 struct blorp_params params; 495 blorp_params_init(¶ms); 496 params.snapshot_type = INTEL_SNAPSHOT_SLOW_COLOR_CLEAR; 497 498 const bool compute = batch->flags & BLORP_BATCH_USE_COMPUTE; 499 if (compute) 500 assert(blorp_clear_supports_compute(batch->blorp, color_write_disable, 501 false, surf->aux_usage)); 502 503 /* Manually apply the clear destination swizzle. This way swizzled clears 504 * will work for swizzles which we can't normally use for rendering and it 505 * also ensures that they work on pre-Haswell hardware which can't swizlle 506 * at all. 507 */ 508 clear_color = isl_color_value_swizzle_inv(clear_color, swizzle); 509 swizzle = ISL_SWIZZLE_IDENTITY; 510 511 bool clear_rgb_as_red = false; 512 if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) { 513 clear_color.u32[0] = float3_to_rgb9e5(clear_color.f32); 514 format = ISL_FORMAT_R32_UINT; 515 } else if (format == ISL_FORMAT_L8_UNORM_SRGB) { 516 clear_color.f32[0] = util_format_linear_to_srgb_float(clear_color.f32[0]); 517 format = ISL_FORMAT_R8_UNORM; 518 } else if (format == ISL_FORMAT_A4B4G4R4_UNORM) { 519 /* Broadwell and earlier cannot render to this format so we need to work 520 * around it by swapping the colors around and using B4G4R4A4 instead. 521 */ 522 const struct isl_swizzle ARGB = ISL_SWIZZLE(ALPHA, RED, GREEN, BLUE); 523 clear_color = isl_color_value_swizzle_inv(clear_color, ARGB); 524 format = ISL_FORMAT_B4G4R4A4_UNORM; 525 } else if (isl_format_get_layout(format)->bpb % 3 == 0) { 526 clear_rgb_as_red = true; 527 if (format == ISL_FORMAT_R8G8B8_UNORM_SRGB) { 528 clear_color.f32[0] = util_format_linear_to_srgb_float(clear_color.f32[0]); 529 clear_color.f32[1] = util_format_linear_to_srgb_float(clear_color.f32[1]); 530 clear_color.f32[2] = util_format_linear_to_srgb_float(clear_color.f32[2]); 531 } 532 } 533 534 memcpy(¶ms.wm_inputs.clear_color, clear_color.f32, sizeof(float) * 4); 535 536 bool use_simd16_replicated_data = true; 537 538 /* From the SNB PRM (Vol4_Part1): 539 * 540 * "Replicated data (Message Type = 111) is only supported when 541 * accessing tiled memory. Using this Message Type to access linear 542 * (untiled) memory is UNDEFINED." 543 */ 544 if (surf->surf->tiling == ISL_TILING_LINEAR) 545 use_simd16_replicated_data = false; 546 547 /* Replicated clears don't work yet before gfx6 */ 548 if (batch->blorp->isl_dev->info->ver < 6) 549 use_simd16_replicated_data = false; 550 551 if (compute) 552 use_simd16_replicated_data = false; 553 554 /* Constant color writes ignore everyting in blend and color calculator 555 * state. This is not documented. 556 */ 557 params.color_write_disable = color_write_disable & BITFIELD_MASK(4); 558 if (color_write_disable) 559 use_simd16_replicated_data = false; 560 561 if (!blorp_params_get_clear_kernel(batch, ¶ms, 562 use_simd16_replicated_data, 563 clear_rgb_as_red)) 564 return; 565 566 if (!compute && !blorp_ensure_sf_program(batch, ¶ms)) 567 return; 568 569 while (num_layers > 0) { 570 brw_blorp_surface_info_init(batch, ¶ms.dst, surf, level, 571 start_layer, format, true); 572 params.dst.view.swizzle = swizzle; 573 574 params.x0 = x0; 575 params.y0 = y0; 576 params.x1 = x1; 577 params.y1 = y1; 578 579 if (compute) { 580 params.wm_inputs.bounds_rect.x0 = x0; 581 params.wm_inputs.bounds_rect.y0 = y0; 582 params.wm_inputs.bounds_rect.x1 = x1; 583 params.wm_inputs.bounds_rect.y1 = y1; 584 } 585 586 if (params.dst.tile_x_sa || params.dst.tile_y_sa) { 587 assert(params.dst.surf.samples == 1); 588 assert(num_layers == 1); 589 params.x0 += params.dst.tile_x_sa; 590 params.y0 += params.dst.tile_y_sa; 591 params.x1 += params.dst.tile_x_sa; 592 params.y1 += params.dst.tile_y_sa; 593 } 594 595 /* The MinLOD and MinimumArrayElement don't work properly for cube maps. 596 * Convert them to a single slice on gfx4. 597 */ 598 if (batch->blorp->isl_dev->info->ver == 4 && 599 (params.dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT)) { 600 blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms.dst); 601 } 602 603 if (clear_rgb_as_red) { 604 surf_fake_rgb_with_red(batch->blorp->isl_dev, ¶ms.dst); 605 params.x0 *= 3; 606 params.x1 *= 3; 607 } 608 609 if (isl_format_is_compressed(params.dst.surf.format)) { 610 blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.dst, 611 NULL, NULL, NULL, NULL); 612 //&dst_x, &dst_y, &dst_w, &dst_h); 613 } 614 615 if (params.dst.tile_x_sa || params.dst.tile_y_sa) { 616 /* Either we're on gfx4 where there is no multisampling or the 617 * surface is compressed which also implies no multisampling. 618 * Therefore, sa == px and we don't need to do a conversion. 619 */ 620 assert(params.dst.surf.samples == 1); 621 params.x0 += params.dst.tile_x_sa; 622 params.y0 += params.dst.tile_y_sa; 623 params.x1 += params.dst.tile_x_sa; 624 params.y1 += params.dst.tile_y_sa; 625 } 626 627 params.num_samples = params.dst.surf.samples; 628 629 /* We may be restricted on the number of layers we can bind at any one 630 * time. In particular, Sandy Bridge has a maximum number of layers of 631 * 512 but a maximum 3D texture size is much larger. 632 */ 633 params.num_layers = MIN2(params.dst.view.array_len, num_layers); 634 635 const unsigned max_image_width = 16 * 1024; 636 if (params.dst.surf.logical_level0_px.width > max_image_width) { 637 /* Clearing an RGB image as red multiplies the surface width by 3 638 * so it may now be too wide for the hardware surface limits. We 639 * have to break the clear up into pieces in order to clear wide 640 * images. 641 */ 642 assert(clear_rgb_as_red); 643 assert(params.dst.surf.dim == ISL_SURF_DIM_2D); 644 assert(params.dst.surf.tiling == ISL_TILING_LINEAR); 645 assert(params.dst.surf.logical_level0_px.depth == 1); 646 assert(params.dst.surf.logical_level0_px.array_len == 1); 647 assert(params.dst.surf.levels == 1); 648 assert(params.dst.surf.samples == 1); 649 assert(params.dst.tile_x_sa == 0 || params.dst.tile_y_sa == 0); 650 assert(params.dst.aux_usage == ISL_AUX_USAGE_NONE); 651 652 /* max_image_width rounded down to a multiple of 3 */ 653 const unsigned max_fake_rgb_width = (max_image_width / 3) * 3; 654 const unsigned cpp = 655 isl_format_get_layout(params.dst.surf.format)->bpb / 8; 656 657 params.dst.surf.logical_level0_px.width = max_fake_rgb_width; 658 params.dst.surf.phys_level0_sa.width = max_fake_rgb_width; 659 660 uint32_t orig_x0 = params.x0, orig_x1 = params.x1; 661 uint64_t orig_offset = params.dst.addr.offset; 662 for (uint32_t x = orig_x0; x < orig_x1; x += max_fake_rgb_width) { 663 /* Offset to the surface. It's easy because we're linear */ 664 params.dst.addr.offset = orig_offset + x * cpp; 665 666 params.x0 = 0; 667 params.x1 = MIN2(orig_x1 - x, max_image_width); 668 669 batch->blorp->exec(batch, ¶ms); 670 } 671 } else { 672 batch->blorp->exec(batch, ¶ms); 673 } 674 675 start_layer += params.num_layers; 676 num_layers -= params.num_layers; 677 } 678} 679 680static bool 681blorp_clear_stencil_as_rgba(struct blorp_batch *batch, 682 const struct blorp_surf *surf, 683 uint32_t level, uint32_t start_layer, 684 uint32_t num_layers, 685 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, 686 uint8_t stencil_mask, uint8_t stencil_value) 687{ 688 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 689 690 /* We only support separate W-tiled stencil for now */ 691 if (surf->surf->format != ISL_FORMAT_R8_UINT || 692 surf->surf->tiling != ISL_TILING_W) 693 return false; 694 695 /* Stencil mask support would require piles of shader magic */ 696 if (stencil_mask != 0xff) 697 return false; 698 699 if (surf->surf->samples > 1) { 700 /* Adjust x0, y0, x1, and y1 to be in units of samples */ 701 assert(surf->surf->msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); 702 struct isl_extent2d msaa_px_size_sa = 703 isl_get_interleaved_msaa_px_size_sa(surf->surf->samples); 704 705 x0 *= msaa_px_size_sa.w; 706 y0 *= msaa_px_size_sa.h; 707 x1 *= msaa_px_size_sa.w; 708 y1 *= msaa_px_size_sa.h; 709 } 710 711 /* W-tiles and Y-tiles have the same layout as far as cache lines are 712 * concerned: both are 8x8 cache lines laid out Y-major. The difference is 713 * entirely in how the data is arranged withing the cache line. W-tiling 714 * is 8x8 pixels in a swizzled pattern while Y-tiling is 16B by 4 rows 715 * regardless of image format size. As long as everything is aligned to 8, 716 * we can just treat the W-tiled image as Y-tiled, ignore the layout 717 * difference within a cache line, and blast out data. 718 */ 719 if (x0 % 8 != 0 || y0 % 8 != 0 || x1 % 8 != 0 || y1 % 8 != 0) 720 return false; 721 722 struct blorp_params params; 723 blorp_params_init(¶ms); 724 params.snapshot_type = INTEL_SNAPSHOT_SLOW_DEPTH_CLEAR; 725 726 if (!blorp_params_get_clear_kernel(batch, ¶ms, true, false)) 727 return false; 728 729 memset(¶ms.wm_inputs.clear_color, stencil_value, 730 sizeof(params.wm_inputs.clear_color)); 731 732 /* The Sandy Bridge PRM Vol. 4 Pt. 2, section 2.11.2.1.1 has the 733 * following footnote to the format table: 734 * 735 * 128 BPE Formats cannot be Tiled Y when used as render targets 736 * 737 * We have to use RGBA16_UINT on SNB. 738 */ 739 enum isl_format wide_format; 740 if (ISL_GFX_VER(batch->blorp->isl_dev) <= 6) { 741 wide_format = ISL_FORMAT_R16G16B16A16_UINT; 742 743 /* For RGBA16_UINT, we need to mask the stencil value otherwise, we risk 744 * clamping giving us the wrong values 745 */ 746 for (unsigned i = 0; i < 4; i++) 747 params.wm_inputs.clear_color[i] &= 0xffff; 748 } else { 749 wide_format = ISL_FORMAT_R32G32B32A32_UINT; 750 } 751 752 for (uint32_t a = 0; a < num_layers; a++) { 753 uint32_t layer = start_layer + a; 754 755 brw_blorp_surface_info_init(batch, ¶ms.dst, surf, level, 756 layer, ISL_FORMAT_UNSUPPORTED, true); 757 758 if (surf->surf->samples > 1) 759 blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms.dst); 760 761 /* Make it Y-tiled */ 762 blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms.dst); 763 764 unsigned wide_Bpp = 765 isl_format_get_layout(wide_format)->bpb / 8; 766 767 params.dst.view.format = params.dst.surf.format = wide_format; 768 assert(params.dst.surf.logical_level0_px.width % wide_Bpp == 0); 769 params.dst.surf.logical_level0_px.width /= wide_Bpp; 770 assert(params.dst.tile_x_sa % wide_Bpp == 0); 771 params.dst.tile_x_sa /= wide_Bpp; 772 773 params.x0 = params.dst.tile_x_sa + x0 / (wide_Bpp / 2); 774 params.y0 = params.dst.tile_y_sa + y0 / 2; 775 params.x1 = params.dst.tile_x_sa + x1 / (wide_Bpp / 2); 776 params.y1 = params.dst.tile_y_sa + y1 / 2; 777 778 batch->blorp->exec(batch, ¶ms); 779 } 780 781 return true; 782} 783 784void 785blorp_clear_depth_stencil(struct blorp_batch *batch, 786 const struct blorp_surf *depth, 787 const struct blorp_surf *stencil, 788 uint32_t level, uint32_t start_layer, 789 uint32_t num_layers, 790 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, 791 bool clear_depth, float depth_value, 792 uint8_t stencil_mask, uint8_t stencil_value) 793{ 794 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 795 796 if (!clear_depth && blorp_clear_stencil_as_rgba(batch, stencil, level, 797 start_layer, num_layers, 798 x0, y0, x1, y1, 799 stencil_mask, 800 stencil_value)) 801 return; 802 803 struct blorp_params params; 804 blorp_params_init(¶ms); 805 params.snapshot_type = INTEL_SNAPSHOT_SLOW_DEPTH_CLEAR; 806 807 params.x0 = x0; 808 params.y0 = y0; 809 params.x1 = x1; 810 params.y1 = y1; 811 812 if (ISL_GFX_VER(batch->blorp->isl_dev) == 6) { 813 /* For some reason, Sandy Bridge gets occlusion queries wrong if we 814 * don't have a shader. In particular, it records samples even though 815 * we disable statistics in 3DSTATE_WM. Give it the usual clear shader 816 * to work around the issue. 817 */ 818 if (!blorp_params_get_clear_kernel(batch, ¶ms, false, false)) 819 return; 820 } 821 822 while (num_layers > 0) { 823 params.num_layers = num_layers; 824 825 if (stencil_mask) { 826 brw_blorp_surface_info_init(batch, ¶ms.stencil, stencil, 827 level, start_layer, 828 ISL_FORMAT_UNSUPPORTED, true); 829 params.stencil_mask = stencil_mask; 830 params.stencil_ref = stencil_value; 831 832 params.dst.surf.samples = params.stencil.surf.samples; 833 params.dst.surf.logical_level0_px = 834 params.stencil.surf.logical_level0_px; 835 params.dst.view = params.stencil.view; 836 837 params.num_samples = params.stencil.surf.samples; 838 839 /* We may be restricted on the number of layers we can bind at any 840 * one time. In particular, Sandy Bridge has a maximum number of 841 * layers of 512 but a maximum 3D texture size is much larger. 842 */ 843 if (params.stencil.view.array_len < params.num_layers) 844 params.num_layers = params.stencil.view.array_len; 845 } 846 847 if (clear_depth) { 848 brw_blorp_surface_info_init(batch, ¶ms.depth, depth, 849 level, start_layer, 850 ISL_FORMAT_UNSUPPORTED, true); 851 params.z = depth_value; 852 params.depth_format = 853 isl_format_get_depth_format(depth->surf->format, false); 854 855 params.dst.surf.samples = params.depth.surf.samples; 856 params.dst.surf.logical_level0_px = 857 params.depth.surf.logical_level0_px; 858 params.dst.view = params.depth.view; 859 860 params.num_samples = params.depth.surf.samples; 861 862 /* We may be restricted on the number of layers we can bind at any 863 * one time. In particular, Sandy Bridge has a maximum number of 864 * layers of 512 but a maximum 3D texture size is much larger. 865 */ 866 if (params.depth.view.array_len < params.num_layers) 867 params.num_layers = params.depth.view.array_len; 868 } 869 870 batch->blorp->exec(batch, ¶ms); 871 872 start_layer += params.num_layers; 873 num_layers -= params.num_layers; 874 } 875} 876 877bool 878blorp_can_hiz_clear_depth(const struct intel_device_info *devinfo, 879 const struct isl_surf *surf, 880 enum isl_aux_usage aux_usage, 881 uint32_t level, uint32_t layer, 882 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) 883{ 884 /* This function currently doesn't support any gen prior to gfx8 */ 885 assert(devinfo->ver >= 8); 886 887 if (devinfo->ver == 8 && surf->format == ISL_FORMAT_R16_UNORM) { 888 /* Apply the D16 alignment restrictions. On BDW, HiZ has an 8x4 sample 889 * block with the following property: as the number of samples increases, 890 * the number of pixels representable by this block decreases by a factor 891 * of the sample dimensions. Sample dimensions scale following the MSAA 892 * interleaved pattern. 893 * 894 * Sample|Sample|Pixel 895 * Count |Dim |Dim 896 * =================== 897 * 1 | 1x1 | 8x4 898 * 2 | 2x1 | 4x4 899 * 4 | 2x2 | 4x2 900 * 8 | 4x2 | 2x2 901 * 16 | 4x4 | 2x1 902 * 903 * Table: Pixel Dimensions in a HiZ Sample Block Pre-SKL 904 */ 905 const struct isl_extent2d sa_block_dim = 906 isl_get_interleaved_msaa_px_size_sa(surf->samples); 907 const uint8_t align_px_w = 8 / sa_block_dim.w; 908 const uint8_t align_px_h = 4 / sa_block_dim.h; 909 910 /* Fast depth clears clear an entire sample block at a time. As a result, 911 * the rectangle must be aligned to the dimensions of the encompassing 912 * pixel block for a successful operation. 913 * 914 * Fast clears can still work if the upper-left corner is aligned and the 915 * bottom-rigtht corner touches the edge of a depth buffer whose extent 916 * is unaligned. This is because each miplevel in the depth buffer is 917 * padded by the Pixel Dim (similar to a standard compressed texture). 918 * In this case, the clear rectangle could be padded by to match the full 919 * depth buffer extent but to support multiple clearing techniques, we 920 * chose to be unaware of the depth buffer's extent and thus don't handle 921 * this case. 922 */ 923 if (x0 % align_px_w || y0 % align_px_h || 924 x1 % align_px_w || y1 % align_px_h) 925 return false; 926 } else if (aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) { 927 /* We have to set the WM_HZ_OP::FullSurfaceDepthandStencilClear bit 928 * whenever we clear an uninitialized HIZ buffer (as some drivers 929 * currently do). However, this bit seems liable to clear 16x8 pixels in 930 * the ZCS on Gfx12 - greater than the slice alignments for depth 931 * buffers. 932 */ 933 assert(surf->image_alignment_el.w % 16 != 0 || 934 surf->image_alignment_el.h % 8 != 0); 935 936 /* This is the hypothesis behind some corruption that was seen with the 937 * amd_vertex_shader_layer-layered-depth-texture-render piglit test. 938 * 939 * From the Compressed Depth Buffers section of the Bspec, under the 940 * Gfx12 texture performant and ZCS columns: 941 * 942 * Update with clear at either 16x8 or 8x4 granularity, based on 943 * fs_clr or otherwise. 944 * 945 * There are a number of ways to avoid full surface CCS clears that 946 * overlap other slices, but for now we choose to disable fast-clears 947 * when an initializing clear could hit another miplevel. 948 * 949 * NOTE: Because the CCS compresses the depth buffer and not a version 950 * of it that has been rearranged with different alignments (like Gfx8+ 951 * HIZ), we have to make sure that the x0 and y0 are at least 16x8 952 * aligned in the context of the entire surface. 953 */ 954 uint32_t slice_x0, slice_y0, slice_z0, slice_a0; 955 isl_surf_get_image_offset_el(surf, level, 956 surf->dim == ISL_SURF_DIM_3D ? 0 : layer, 957 surf->dim == ISL_SURF_DIM_3D ? layer: 0, 958 &slice_x0, &slice_y0, &slice_z0, &slice_a0); 959 assert(slice_z0 == 0 && slice_a0 == 0); 960 const bool max_x1_y1 = 961 x1 == minify(surf->logical_level0_px.width, level) && 962 y1 == minify(surf->logical_level0_px.height, level); 963 const uint32_t haligned_x1 = ALIGN(x1, surf->image_alignment_el.w); 964 const uint32_t valigned_y1 = ALIGN(y1, surf->image_alignment_el.h); 965 const bool unaligned = (slice_x0 + x0) % 16 || (slice_y0 + y0) % 8 || 966 (max_x1_y1 ? haligned_x1 % 16 || valigned_y1 % 8 : 967 x1 % 16 || y1 % 8); 968 const bool partial_clear = x0 > 0 || y0 > 0 || !max_x1_y1; 969 const bool multislice_surf = surf->levels > 1 || 970 surf->logical_level0_px.depth > 1 || 971 surf->logical_level0_px.array_len > 1; 972 973 if (unaligned && (partial_clear || multislice_surf)) 974 return false; 975 } 976 977 return isl_aux_usage_has_hiz(aux_usage); 978} 979 980static bool 981blorp_can_clear_full_surface(const struct blorp_surf *depth, 982 const struct blorp_surf *stencil, 983 uint32_t level, 984 uint32_t x0, uint32_t y0, 985 uint32_t x1, uint32_t y1, 986 bool clear_depth, 987 bool clear_stencil) 988{ 989 uint32_t width = 0, height = 0; 990 if (clear_stencil) { 991 width = minify(stencil->surf->logical_level0_px.width, level); 992 height = minify(stencil->surf->logical_level0_px.height, level); 993 } 994 995 if (clear_depth && !(width || height)) { 996 width = minify(depth->surf->logical_level0_px.width, level); 997 height = minify(depth->surf->logical_level0_px.height, level); 998 } 999 1000 return x0 == 0 && y0 == 0 && width == x1 && height == y1; 1001} 1002 1003void 1004blorp_hiz_clear_depth_stencil(struct blorp_batch *batch, 1005 const struct blorp_surf *depth, 1006 const struct blorp_surf *stencil, 1007 uint32_t level, 1008 uint32_t start_layer, uint32_t num_layers, 1009 uint32_t x0, uint32_t y0, 1010 uint32_t x1, uint32_t y1, 1011 bool clear_depth, float depth_value, 1012 bool clear_stencil, uint8_t stencil_value) 1013{ 1014 struct blorp_params params; 1015 blorp_params_init(¶ms); 1016 params.snapshot_type = INTEL_SNAPSHOT_HIZ_CLEAR; 1017 1018 /* This requires WM_HZ_OP which only exists on gfx8+ */ 1019 assert(ISL_GFX_VER(batch->blorp->isl_dev) >= 8); 1020 1021 params.hiz_op = ISL_AUX_OP_FAST_CLEAR; 1022 /* From BSpec: 3DSTATE_WM_HZ_OP_BODY >> Full Surface Depth and Stencil Clear 1023 * 1024 * "Software must set this only when the APP requires the entire Depth 1025 * surface to be cleared." 1026 */ 1027 params.full_surface_hiz_op = 1028 blorp_can_clear_full_surface(depth, stencil, level, x0, y0, x1, y1, 1029 clear_depth, clear_stencil); 1030 params.num_layers = 1; 1031 1032 params.x0 = x0; 1033 params.y0 = y0; 1034 params.x1 = x1; 1035 params.y1 = y1; 1036 1037 for (uint32_t l = 0; l < num_layers; l++) { 1038 const uint32_t layer = start_layer + l; 1039 if (clear_stencil) { 1040 brw_blorp_surface_info_init(batch, ¶ms.stencil, stencil, 1041 level, layer, 1042 ISL_FORMAT_UNSUPPORTED, true); 1043 params.stencil_mask = 0xff; 1044 params.stencil_ref = stencil_value; 1045 params.num_samples = params.stencil.surf.samples; 1046 } 1047 1048 if (clear_depth) { 1049 /* If we're clearing depth, we must have HiZ */ 1050 assert(depth && isl_aux_usage_has_hiz(depth->aux_usage)); 1051 1052 brw_blorp_surface_info_init(batch, ¶ms.depth, depth, 1053 level, layer, 1054 ISL_FORMAT_UNSUPPORTED, true); 1055 params.depth.clear_color.f32[0] = depth_value; 1056 params.depth_format = 1057 isl_format_get_depth_format(depth->surf->format, false); 1058 params.num_samples = params.depth.surf.samples; 1059 } 1060 1061 batch->blorp->exec(batch, ¶ms); 1062 } 1063} 1064 1065/* Given a depth stencil attachment, this function performs a fast depth clear 1066 * on a depth portion and a regular clear on the stencil portion. When 1067 * performing a fast depth clear on the depth portion, the HiZ buffer is simply 1068 * tagged as cleared so the depth clear value is not actually needed. 1069 */ 1070void 1071blorp_gfx8_hiz_clear_attachments(struct blorp_batch *batch, 1072 uint32_t num_samples, 1073 uint32_t x0, uint32_t y0, 1074 uint32_t x1, uint32_t y1, 1075 bool clear_depth, bool clear_stencil, 1076 uint8_t stencil_value) 1077{ 1078 assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL); 1079 1080 struct blorp_params params; 1081 blorp_params_init(¶ms); 1082 params.snapshot_type = INTEL_SNAPSHOT_HIZ_CLEAR; 1083 params.num_layers = 1; 1084 params.hiz_op = ISL_AUX_OP_FAST_CLEAR; 1085 params.x0 = x0; 1086 params.y0 = y0; 1087 params.x1 = x1; 1088 params.y1 = y1; 1089 params.num_samples = num_samples; 1090 params.depth.enabled = clear_depth; 1091 params.stencil.enabled = clear_stencil; 1092 params.stencil_ref = stencil_value; 1093 batch->blorp->exec(batch, ¶ms); 1094} 1095 1096/** Clear active color/depth/stencili attachments 1097 * 1098 * This function performs a clear operation on the currently bound 1099 * color/depth/stencil attachments. It is assumed that any information passed 1100 * in here is valid, consistent, and in-bounds relative to the currently 1101 * attached depth/stencil. The binding_table_offset parameter is the 32-bit 1102 * offset relative to surface state base address where pre-baked binding table 1103 * that we are to use lives. If clear_color is false, binding_table_offset 1104 * must point to a binding table with one entry which is a valid null surface 1105 * that matches the currently bound depth and stencil. 1106 */ 1107void 1108blorp_clear_attachments(struct blorp_batch *batch, 1109 uint32_t binding_table_offset, 1110 enum isl_format depth_format, 1111 uint32_t num_samples, 1112 uint32_t start_layer, uint32_t num_layers, 1113 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, 1114 bool clear_color, union isl_color_value color_value, 1115 bool clear_depth, float depth_value, 1116 uint8_t stencil_mask, uint8_t stencil_value) 1117{ 1118 struct blorp_params params; 1119 blorp_params_init(¶ms); 1120 1121 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 1122 assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL); 1123 1124 params.x0 = x0; 1125 params.y0 = y0; 1126 params.x1 = x1; 1127 params.y1 = y1; 1128 1129 params.use_pre_baked_binding_table = true; 1130 params.pre_baked_binding_table_offset = binding_table_offset; 1131 1132 params.num_layers = num_layers; 1133 params.num_samples = num_samples; 1134 1135 if (clear_color) { 1136 params.dst.enabled = true; 1137 params.snapshot_type = INTEL_SNAPSHOT_SLOW_COLOR_CLEAR; 1138 1139 memcpy(¶ms.wm_inputs.clear_color, color_value.f32, sizeof(float) * 4); 1140 1141 /* Unfortunately, without knowing whether or not our destination surface 1142 * is tiled or not, we have to assume it may be linear. This means no 1143 * SIMD16_REPDATA for us. :-( 1144 */ 1145 if (!blorp_params_get_clear_kernel(batch, ¶ms, false, false)) 1146 return; 1147 } 1148 1149 if (clear_depth) { 1150 params.depth.enabled = true; 1151 params.snapshot_type = INTEL_SNAPSHOT_SLOW_DEPTH_CLEAR; 1152 1153 params.z = depth_value; 1154 params.depth_format = isl_format_get_depth_format(depth_format, false); 1155 } 1156 1157 if (stencil_mask) { 1158 params.stencil.enabled = true; 1159 params.snapshot_type = INTEL_SNAPSHOT_SLOW_DEPTH_CLEAR; 1160 1161 params.stencil_mask = stencil_mask; 1162 params.stencil_ref = stencil_value; 1163 } 1164 1165 if (!blorp_params_get_layer_offset_vs(batch, ¶ms)) 1166 return; 1167 1168 params.vs_inputs.base_layer = start_layer; 1169 1170 batch->blorp->exec(batch, ¶ms); 1171} 1172 1173void 1174blorp_ccs_resolve(struct blorp_batch *batch, 1175 struct blorp_surf *surf, uint32_t level, 1176 uint32_t start_layer, uint32_t num_layers, 1177 enum isl_format format, 1178 enum isl_aux_op resolve_op) 1179{ 1180 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 1181 struct blorp_params params; 1182 1183 blorp_params_init(¶ms); 1184 switch(resolve_op) { 1185 case ISL_AUX_OP_AMBIGUATE: 1186 params.snapshot_type = INTEL_SNAPSHOT_CCS_AMBIGUATE; 1187 break; 1188 case ISL_AUX_OP_FULL_RESOLVE: 1189 params.snapshot_type = INTEL_SNAPSHOT_CCS_RESOLVE; 1190 break; 1191 case ISL_AUX_OP_PARTIAL_RESOLVE: 1192 params.snapshot_type = INTEL_SNAPSHOT_CCS_PARTIAL_RESOLVE; 1193 break; 1194 default: 1195 assert(false); 1196 } 1197 brw_blorp_surface_info_init(batch, ¶ms.dst, surf, 1198 level, start_layer, format, true); 1199 1200 /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve": 1201 * 1202 * A rectangle primitive must be scaled down by the following factors 1203 * with respect to render target being resolved. 1204 * 1205 * The scaledown factors in the table that follows are related to the block 1206 * size of the CCS format. For IVB and HSW, we divide by two, for BDW we 1207 * multiply by 8 and 16. On Sky Lake, we multiply by 8. 1208 */ 1209 const struct isl_format_layout *aux_fmtl = 1210 isl_format_get_layout(params.dst.aux_surf.format); 1211 assert(aux_fmtl->txc == ISL_TXC_CCS); 1212 1213 unsigned x_scaledown, y_scaledown; 1214 if (ISL_GFX_VER(batch->blorp->isl_dev) >= 12) { 1215 x_scaledown = aux_fmtl->bw * 8; 1216 y_scaledown = aux_fmtl->bh * 4; 1217 } else if (ISL_GFX_VER(batch->blorp->isl_dev) >= 9) { 1218 x_scaledown = aux_fmtl->bw * 8; 1219 y_scaledown = aux_fmtl->bh * 8; 1220 } else if (ISL_GFX_VER(batch->blorp->isl_dev) >= 8) { 1221 x_scaledown = aux_fmtl->bw * 8; 1222 y_scaledown = aux_fmtl->bh * 16; 1223 } else { 1224 x_scaledown = aux_fmtl->bw / 2; 1225 y_scaledown = aux_fmtl->bh / 2; 1226 } 1227 params.x0 = params.y0 = 0; 1228 params.x1 = minify(params.dst.surf.logical_level0_px.width, level); 1229 params.y1 = minify(params.dst.surf.logical_level0_px.height, level); 1230 params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown; 1231 params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown; 1232 1233 if (batch->blorp->isl_dev->info->ver >= 10) { 1234 assert(resolve_op == ISL_AUX_OP_FULL_RESOLVE || 1235 resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE || 1236 resolve_op == ISL_AUX_OP_AMBIGUATE); 1237 } else if (batch->blorp->isl_dev->info->ver >= 9) { 1238 assert(resolve_op == ISL_AUX_OP_FULL_RESOLVE || 1239 resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); 1240 } else { 1241 /* Broadwell and earlier do not have a partial resolve */ 1242 assert(resolve_op == ISL_AUX_OP_FULL_RESOLVE); 1243 } 1244 params.fast_clear_op = resolve_op; 1245 params.num_layers = num_layers; 1246 1247 /* Note: there is no need to initialize push constants because it doesn't 1248 * matter what data gets dispatched to the render target. However, we must 1249 * ensure that the fragment shader delivers the data using the "replicated 1250 * color" message. 1251 */ 1252 1253 if (!blorp_params_get_clear_kernel(batch, ¶ms, true, false)) 1254 return; 1255 1256 batch->blorp->exec(batch, ¶ms); 1257} 1258 1259static nir_ssa_def * 1260blorp_nir_bit(nir_builder *b, nir_ssa_def *src, unsigned bit) 1261{ 1262 return nir_iand(b, nir_ushr(b, src, nir_imm_int(b, bit)), 1263 nir_imm_int(b, 1)); 1264} 1265 1266#pragma pack(push, 1) 1267struct blorp_mcs_partial_resolve_key 1268{ 1269 struct brw_blorp_base_key base; 1270 bool indirect_clear_color; 1271 bool int_format; 1272 uint32_t num_samples; 1273}; 1274#pragma pack(pop) 1275 1276static bool 1277blorp_params_get_mcs_partial_resolve_kernel(struct blorp_batch *batch, 1278 struct blorp_params *params) 1279{ 1280 struct blorp_context *blorp = batch->blorp; 1281 const struct blorp_mcs_partial_resolve_key blorp_key = { 1282 .base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE), 1283 .indirect_clear_color = params->dst.clear_color_addr.buffer != NULL, 1284 .int_format = isl_format_has_int_channel(params->dst.view.format), 1285 .num_samples = params->num_samples, 1286 }; 1287 1288 if (blorp->lookup_shader(batch, &blorp_key, sizeof(blorp_key), 1289 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) 1290 return true; 1291 1292 void *mem_ctx = ralloc_context(NULL); 1293 1294 nir_builder b; 1295 blorp_nir_init_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, 1296 blorp_shader_type_to_name(blorp_key.base.shader_type)); 1297 1298 nir_variable *v_color = 1299 BLORP_CREATE_NIR_INPUT(b.shader, clear_color, glsl_vec4_type()); 1300 1301 nir_variable *frag_color = 1302 nir_variable_create(b.shader, nir_var_shader_out, 1303 glsl_vec4_type(), "gl_FragColor"); 1304 frag_color->data.location = FRAG_RESULT_COLOR; 1305 1306 /* Do an MCS fetch and check if it is equal to the magic clear value */ 1307 nir_ssa_def *mcs = 1308 blorp_nir_txf_ms_mcs(&b, nir_f2i32(&b, nir_load_frag_coord(&b)), 1309 nir_load_layer_id(&b)); 1310 nir_ssa_def *is_clear = 1311 blorp_nir_mcs_is_clear_color(&b, mcs, blorp_key.num_samples); 1312 1313 /* If we aren't the clear value, discard. */ 1314 nir_discard_if(&b, nir_inot(&b, is_clear)); 1315 1316 nir_ssa_def *clear_color = nir_load_var(&b, v_color); 1317 if (blorp_key.indirect_clear_color && blorp->isl_dev->info->ver <= 8) { 1318 /* Gfx7-8 clear colors are stored as single 0/1 bits */ 1319 clear_color = nir_vec4(&b, blorp_nir_bit(&b, clear_color, 31), 1320 blorp_nir_bit(&b, clear_color, 30), 1321 blorp_nir_bit(&b, clear_color, 29), 1322 blorp_nir_bit(&b, clear_color, 28)); 1323 1324 if (!blorp_key.int_format) 1325 clear_color = nir_i2f32(&b, clear_color); 1326 } 1327 nir_store_var(&b, frag_color, clear_color, 0xf); 1328 1329 struct brw_wm_prog_key wm_key; 1330 brw_blorp_init_wm_prog_key(&wm_key); 1331 wm_key.base.tex.compressed_multisample_layout_mask = 1; 1332 wm_key.base.tex.msaa_16 = blorp_key.num_samples == 16; 1333 wm_key.multisample_fbo = true; 1334 1335 struct brw_wm_prog_data prog_data; 1336 const unsigned *program = 1337 blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, false, 1338 &prog_data); 1339 1340 bool result = 1341 blorp->upload_shader(batch, MESA_SHADER_FRAGMENT, 1342 &blorp_key, sizeof(blorp_key), 1343 program, prog_data.base.program_size, 1344 &prog_data.base, sizeof(prog_data), 1345 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); 1346 1347 ralloc_free(mem_ctx); 1348 return result; 1349} 1350 1351void 1352blorp_mcs_partial_resolve(struct blorp_batch *batch, 1353 struct blorp_surf *surf, 1354 enum isl_format format, 1355 uint32_t start_layer, uint32_t num_layers) 1356{ 1357 struct blorp_params params; 1358 blorp_params_init(¶ms); 1359 params.snapshot_type = INTEL_SNAPSHOT_MCS_PARTIAL_RESOLVE; 1360 1361 assert(batch->blorp->isl_dev->info->ver >= 7); 1362 1363 params.x0 = 0; 1364 params.y0 = 0; 1365 params.x1 = surf->surf->logical_level0_px.width; 1366 params.y1 = surf->surf->logical_level0_px.height; 1367 1368 brw_blorp_surface_info_init(batch, ¶ms.src, surf, 0, 1369 start_layer, format, false); 1370 brw_blorp_surface_info_init(batch, ¶ms.dst, surf, 0, 1371 start_layer, format, true); 1372 1373 params.num_samples = params.dst.surf.samples; 1374 params.num_layers = num_layers; 1375 params.dst_clear_color_as_input = surf->clear_color_addr.buffer != NULL; 1376 1377 memcpy(¶ms.wm_inputs.clear_color, 1378 surf->clear_color.f32, sizeof(float) * 4); 1379 1380 if (!blorp_params_get_mcs_partial_resolve_kernel(batch, ¶ms)) 1381 return; 1382 1383 batch->blorp->exec(batch, ¶ms); 1384} 1385 1386/** Clear a CCS to the "uncompressed" state 1387 * 1388 * This pass is the CCS equivalent of a "HiZ resolve". It sets the CCS values 1389 * for a given layer/level of a surface to 0x0 which is the "uncompressed" 1390 * state which tells the sampler to go look at the main surface. 1391 */ 1392void 1393blorp_ccs_ambiguate(struct blorp_batch *batch, 1394 struct blorp_surf *surf, 1395 uint32_t level, uint32_t layer) 1396{ 1397 assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); 1398 1399 if (ISL_GFX_VER(batch->blorp->isl_dev) >= 10) { 1400 /* On gfx10 and above, we have a hardware resolve op for this */ 1401 return blorp_ccs_resolve(batch, surf, level, layer, 1, 1402 surf->surf->format, ISL_AUX_OP_AMBIGUATE); 1403 } 1404 1405 struct blorp_params params; 1406 blorp_params_init(¶ms); 1407 params.snapshot_type = INTEL_SNAPSHOT_CCS_AMBIGUATE; 1408 1409 assert(ISL_GFX_VER(batch->blorp->isl_dev) >= 7); 1410 1411 const struct isl_format_layout *aux_fmtl = 1412 isl_format_get_layout(surf->aux_surf->format); 1413 assert(aux_fmtl->txc == ISL_TXC_CCS); 1414 1415 params.dst = (struct brw_blorp_surface_info) { 1416 .enabled = true, 1417 .addr = surf->aux_addr, 1418 .view = { 1419 .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT, 1420 .format = ISL_FORMAT_R32G32B32A32_UINT, 1421 .base_level = 0, 1422 .base_array_layer = 0, 1423 .levels = 1, 1424 .array_len = 1, 1425 .swizzle = ISL_SWIZZLE_IDENTITY, 1426 }, 1427 }; 1428 1429 uint32_t z = 0; 1430 if (surf->surf->dim == ISL_SURF_DIM_3D) { 1431 z = layer; 1432 layer = 0; 1433 } 1434 1435 uint64_t offset_B; 1436 uint32_t x_offset_el, y_offset_el; 1437 isl_surf_get_image_offset_B_tile_el(surf->aux_surf, level, layer, z, 1438 &offset_B, &x_offset_el, &y_offset_el); 1439 params.dst.addr.offset += offset_B; 1440 1441 const uint32_t width_px = 1442 minify(surf->aux_surf->logical_level0_px.width, level); 1443 const uint32_t height_px = 1444 minify(surf->aux_surf->logical_level0_px.height, level); 1445 const uint32_t width_el = DIV_ROUND_UP(width_px, aux_fmtl->bw); 1446 const uint32_t height_el = DIV_ROUND_UP(height_px, aux_fmtl->bh); 1447 1448 struct isl_tile_info ccs_tile_info; 1449 isl_surf_get_tile_info(surf->aux_surf, &ccs_tile_info); 1450 1451 /* We're going to map it as a regular RGBA32_UINT surface. We need to 1452 * downscale a good deal. We start by computing the area on the CCS to 1453 * clear in units of Y-tiled cache lines. 1454 */ 1455 uint32_t x_offset_cl, y_offset_cl, width_cl, height_cl; 1456 if (ISL_GFX_VER(batch->blorp->isl_dev) >= 8) { 1457 /* From the Sky Lake PRM Vol. 12 in the section on planes: 1458 * 1459 * "The Color Control Surface (CCS) contains the compression status 1460 * of the cache-line pairs. The compression state of the cache-line 1461 * pair is specified by 2 bits in the CCS. Each CCS cache-line 1462 * represents an area on the main surface of 16x16 sets of 128 byte 1463 * Y-tiled cache-line-pairs. CCS is always Y tiled." 1464 * 1465 * Each 2-bit surface element in the CCS corresponds to a single 1466 * cache-line pair in the main surface. This means that 16x16 el block 1467 * in the CCS maps to a Y-tiled cache line. Fortunately, CCS layouts 1468 * are calculated with a very large alignment so we can round up to a 1469 * whole cache line without worrying about overdraw. 1470 */ 1471 1472 /* On Broadwell and above, a CCS tile is the same as a Y tile when 1473 * viewed at the cache-line granularity. Fortunately, the horizontal 1474 * and vertical alignment requirements of the CCS are such that we can 1475 * align to an entire cache line without worrying about crossing over 1476 * from one LOD to another. 1477 */ 1478 const uint32_t x_el_per_cl = ccs_tile_info.logical_extent_el.w / 8; 1479 const uint32_t y_el_per_cl = ccs_tile_info.logical_extent_el.h / 8; 1480 assert(surf->aux_surf->image_alignment_el.w % x_el_per_cl == 0); 1481 assert(surf->aux_surf->image_alignment_el.h % y_el_per_cl == 0); 1482 1483 assert(x_offset_el % x_el_per_cl == 0); 1484 assert(y_offset_el % y_el_per_cl == 0); 1485 x_offset_cl = x_offset_el / x_el_per_cl; 1486 y_offset_cl = y_offset_el / y_el_per_cl; 1487 width_cl = DIV_ROUND_UP(width_el, x_el_per_cl); 1488 height_cl = DIV_ROUND_UP(height_el, y_el_per_cl); 1489 } else { 1490 /* On gfx7, the CCS tiling is not so nice. However, there we are 1491 * guaranteed that we only have a single level and slice so we don't 1492 * have to worry about it and can just align to a whole tile. 1493 */ 1494 assert(surf->aux_surf->logical_level0_px.depth == 1); 1495 assert(surf->aux_surf->logical_level0_px.array_len == 1); 1496 assert(x_offset_el == 0 && y_offset_el == 0); 1497 const uint32_t width_tl = 1498 DIV_ROUND_UP(width_el, ccs_tile_info.logical_extent_el.w); 1499 const uint32_t height_tl = 1500 DIV_ROUND_UP(height_el, ccs_tile_info.logical_extent_el.h); 1501 x_offset_cl = 0; 1502 y_offset_cl = 0; 1503 width_cl = width_tl * 8; 1504 height_cl = height_tl * 8; 1505 } 1506 1507 /* We're going to use a RGBA32 format so as to write data as quickly as 1508 * possible. A y-tiled cache line will then be 1x4 px. 1509 */ 1510 const uint32_t x_offset_rgba_px = x_offset_cl; 1511 const uint32_t y_offset_rgba_px = y_offset_cl * 4; 1512 const uint32_t width_rgba_px = width_cl; 1513 const uint32_t height_rgba_px = height_cl * 4; 1514 1515 ASSERTED bool ok = 1516 isl_surf_init(batch->blorp->isl_dev, ¶ms.dst.surf, 1517 .dim = ISL_SURF_DIM_2D, 1518 .format = ISL_FORMAT_R32G32B32A32_UINT, 1519 .width = width_rgba_px + x_offset_rgba_px, 1520 .height = height_rgba_px + y_offset_rgba_px, 1521 .depth = 1, 1522 .levels = 1, 1523 .array_len = 1, 1524 .samples = 1, 1525 .row_pitch_B = surf->aux_surf->row_pitch_B, 1526 .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT, 1527 .tiling_flags = ISL_TILING_Y0_BIT); 1528 assert(ok); 1529 1530 params.x0 = x_offset_rgba_px; 1531 params.y0 = y_offset_rgba_px; 1532 params.x1 = x_offset_rgba_px + width_rgba_px; 1533 params.y1 = y_offset_rgba_px + height_rgba_px; 1534 1535 /* A CCS value of 0 means "uncompressed." */ 1536 memset(¶ms.wm_inputs.clear_color, 0, 1537 sizeof(params.wm_inputs.clear_color)); 1538 1539 if (!blorp_params_get_clear_kernel(batch, ¶ms, true, false)) 1540 return; 1541 1542 batch->blorp->exec(batch, ¶ms); 1543} 1544