tu_clear_blit.c revision 7ec681f3
/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_tracepoints.h"

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)
{
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
      return R2D_FLOAT32;
   if (format == VK_FORMAT_S8_UINT)
      return R2D_INT8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = vk_format_is_int(format);
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (vk_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
      return 0;
   }
}

static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}

static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear =
val->color.float32[i]; 122 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3) 123 linear = util_format_linear_to_srgb_float(val->color.float32[i]); 124 125 if (ch->type == UTIL_FORMAT_TYPE_SIGNED) 126 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f); 127 else 128 clear_value[i] = tu_pack_float32_for_unorm(linear, 8); 129 } else if (ifmt == R2D_FLOAT16) { 130 clear_value[i] = _mesa_float_to_half(val->color.float32[i]); 131 } else { 132 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 || 133 ifmt == R2D_INT16 || ifmt == R2D_INT8); 134 clear_value[i] = val->color.uint32[i]; 135 } 136 } 137 break; 138 } 139 140 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); 141 tu_cs_emit_array(cs, clear_value, 4); 142} 143 144static void 145r2d_src(struct tu_cmd_buffer *cmd, 146 struct tu_cs *cs, 147 const struct tu_image_view *iview, 148 uint32_t layer, 149 VkFilter filter) 150{ 151 uint32_t src_info = iview->SP_PS_2D_SRC_INFO; 152 if (filter != VK_FILTER_NEAREST) 153 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER; 154 155 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); 156 tu_cs_emit(cs, src_info); 157 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); 158 tu_cs_image_ref_2d(cs, iview, layer, true); 159 160 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3); 161 tu_cs_image_flag_ref(cs, iview, layer); 162} 163 164static void 165r2d_src_stencil(struct tu_cmd_buffer *cmd, 166 struct tu_cs *cs, 167 const struct tu_image_view *iview, 168 uint32_t layer, 169 VkFilter filter) 170{ 171 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); 172 tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS); 173 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); 174 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); 175 /* SP_PS_2D_SRC_PITCH has shifted pitch field */ 176 tu_cs_emit(cs, iview->stencil_PITCH << 9); 177} 178 179static void 180r2d_src_buffer(struct tu_cmd_buffer *cmd, 181 struct tu_cs *cs, 182 VkFormat vk_format, 183 uint64_t va, uint32_t pitch, 184 uint32_t width, uint32_t height) 185{ 186 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); 187 188 tu_cs_emit_regs(cs, 189 A6XX_SP_PS_2D_SRC_INFO( 190 .color_format = format.fmt, 191 .color_swap = format.swap, 192 .srgb = vk_format_is_srgb(vk_format), 193 .unk20 = 1, 194 .unk22 = 1), 195 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height), 196 A6XX_SP_PS_2D_SRC(.qword = va), 197 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch)); 198} 199 200static void 201r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 202{ 203 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); 204 tu_cs_emit(cs, iview->RB_2D_DST_INFO); 205 tu_cs_image_ref_2d(cs, iview, layer, false); 206 207 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3); 208 tu_cs_image_flag_ref(cs, iview, layer); 209} 210 211static void 212r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 213{ 214 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); 215 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS); 216 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); 217 tu_cs_emit(cs, iview->stencil_PITCH); 218} 219 220static void 221r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) 222{ 223 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); 224 225 tu_cs_emit_regs(cs, 226 A6XX_RB_2D_DST_INFO( 227 .color_format = format.fmt, 228 .color_swap = 
format.swap, 229 .srgb = vk_format_is_srgb(vk_format)), 230 A6XX_RB_2D_DST(.qword = va), 231 A6XX_RB_2D_DST_PITCH(pitch)); 232} 233 234static void 235r2d_setup_common(struct tu_cmd_buffer *cmd, 236 struct tu_cs *cs, 237 VkFormat vk_format, 238 VkImageAspectFlags aspect_mask, 239 unsigned blit_param, 240 bool clear, 241 bool ubwc, 242 bool scissor) 243{ 244 enum a6xx_format format = tu6_base_format(vk_format); 245 enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format); 246 uint32_t unknown_8c01 = 0; 247 248 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || 249 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { 250 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; 251 } 252 253 /* note: the only format with partial clearing is D24S8 */ 254 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { 255 /* preserve stencil channel */ 256 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) 257 unknown_8c01 = 0x08000041; 258 /* preserve depth channels */ 259 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 260 unknown_8c01 = 0x00084001; 261 } 262 263 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); 264 tu_cs_emit(cs, unknown_8c01); 265 266 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL( 267 .scissor = scissor, 268 .rotate = blit_param, 269 .solid_color = clear, 270 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear, 271 .color_format = format, 272 .mask = 0xf, 273 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt, 274 ).value; 275 276 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); 277 tu_cs_emit(cs, blit_cntl); 278 279 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); 280 tu_cs_emit(cs, blit_cntl); 281 282 if (format == FMT6_10_10_10_2_UNORM_DEST) 283 format = FMT6_16_16_16_16_FLOAT; 284 285 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( 286 .sint = vk_format_is_sint(vk_format), 287 .uint = vk_format_is_uint(vk_format), 288 .color_format = format, 289 .srgb = vk_format_is_srgb(vk_format), 290 .mask = 0xf)); 291} 292 293static void 294r2d_setup(struct tu_cmd_buffer *cmd, 295 struct tu_cs *cs, 296 VkFormat vk_format, 297 VkImageAspectFlags aspect_mask, 298 unsigned blit_param, 299 bool clear, 300 bool ubwc, 301 VkSampleCountFlagBits samples) 302{ 303 assert(samples == VK_SAMPLE_COUNT_1_BIT); 304 305 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); 306 307 r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false); 308} 309 310static void 311r2d_teardown(struct tu_cmd_buffer *cmd, 312 struct tu_cs *cs) 313{ 314 /* nothing to do here */ 315} 316 317static void 318r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 319{ 320 tu_cs_emit_pkt7(cs, CP_BLIT, 1); 321 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); 322} 323 324/* r3d_ = shader path operations */ 325 326static nir_ssa_def * 327load_const(nir_builder *b, unsigned base, unsigned components) 328{ 329 return nir_load_uniform(b, components, 32, nir_imm_int(b, 0), 330 .base = base); 331} 332 333static nir_shader * 334build_blit_vs_shader(void) 335{ 336 nir_builder _b = 337 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs"); 338 nir_builder *b = &_b; 339 340 nir_variable *out_pos = 341 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 342 "gl_Position"); 343 out_pos->data.location = VARYING_SLOT_POS; 344 345 nir_ssa_def *vert0_pos = load_const(b, 0, 2); 346 nir_ssa_def *vert1_pos = load_const(b, 4, 2); 347 nir_ssa_def *vertex = nir_load_vertex_id(b); 348 349 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos); 350 pos = nir_vec4(b, nir_channel(b, pos, 0), 351 
nir_channel(b, pos, 1), 352 nir_imm_float(b, 0.0), 353 nir_imm_float(b, 1.0)); 354 355 nir_store_var(b, out_pos, pos, 0xf); 356 357 nir_variable *out_coords = 358 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3), 359 "coords"); 360 out_coords->data.location = VARYING_SLOT_VAR0; 361 362 nir_ssa_def *vert0_coords = load_const(b, 2, 2); 363 nir_ssa_def *vert1_coords = load_const(b, 6, 2); 364 365 /* Only used with "z scale" blit path which uses a 3d texture */ 366 nir_ssa_def *z_coord = load_const(b, 8, 1); 367 368 nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords); 369 coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1), 370 z_coord); 371 372 nir_store_var(b, out_coords, coords, 0x7); 373 374 return b->shader; 375} 376 377static nir_shader * 378build_clear_vs_shader(void) 379{ 380 nir_builder _b = 381 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs"); 382 nir_builder *b = &_b; 383 384 nir_variable *out_pos = 385 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 386 "gl_Position"); 387 out_pos->data.location = VARYING_SLOT_POS; 388 389 nir_ssa_def *vert0_pos = load_const(b, 0, 2); 390 nir_ssa_def *vert1_pos = load_const(b, 4, 2); 391 /* c0.z is used to clear depth */ 392 nir_ssa_def *depth = load_const(b, 2, 1); 393 nir_ssa_def *vertex = nir_load_vertex_id(b); 394 395 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos); 396 pos = nir_vec4(b, nir_channel(b, pos, 0), 397 nir_channel(b, pos, 1), 398 depth, nir_imm_float(b, 1.0)); 399 400 nir_store_var(b, out_pos, pos, 0xf); 401 402 nir_variable *out_layer = 403 nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(), 404 "gl_Layer"); 405 out_layer->data.location = VARYING_SLOT_LAYER; 406 nir_ssa_def *layer = load_const(b, 3, 1); 407 nir_store_var(b, out_layer, layer, 1); 408 409 return b->shader; 410} 411 412static nir_shader * 413build_blit_fs_shader(bool zscale) 414{ 415 nir_builder _b = 416 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 417 zscale ? "zscale blit fs" : "blit fs"); 418 nir_builder *b = &_b; 419 420 nir_variable *out_color = 421 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 422 "color0"); 423 out_color->data.location = FRAG_RESULT_DATA0; 424 425 unsigned coord_components = zscale ? 3 : 2; 426 nir_variable *in_coords = 427 nir_variable_create(b->shader, nir_var_shader_in, 428 glsl_vec_type(coord_components), 429 "coords"); 430 in_coords->data.location = VARYING_SLOT_VAR0; 431 432 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); 433 /* Note: since we're just copying data, we rely on the HW ignoring the 434 * dest_type. 435 */ 436 tex->dest_type = nir_type_int32; 437 tex->is_array = false; 438 tex->is_shadow = false; 439 tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D; 440 441 tex->texture_index = 0; 442 tex->sampler_index = 0; 443 444 b->shader->info.num_textures = 1; 445 BITSET_SET(b->shader->info.textures_used, 0); 446 447 tex->src[0].src_type = nir_tex_src_coord; 448 tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords)); 449 tex->coord_components = coord_components; 450 451 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 452 nir_builder_instr_insert(b, &tex->instr); 453 454 nir_store_var(b, out_color, &tex->dest.ssa, 0xf); 455 456 return b->shader; 457} 458 459/* We can only read multisample textures via txf_ms, so we need a separate 460 * variant for them. 
461 */ 462static nir_shader * 463build_ms_copy_fs_shader(void) 464{ 465 nir_builder _b = 466 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 467 "multisample copy fs"); 468 nir_builder *b = &_b; 469 470 nir_variable *out_color = 471 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 472 "color0"); 473 out_color->data.location = FRAG_RESULT_DATA0; 474 475 nir_variable *in_coords = 476 nir_variable_create(b->shader, nir_var_shader_in, 477 glsl_vec_type(2), 478 "coords"); 479 in_coords->data.location = VARYING_SLOT_VAR0; 480 481 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2); 482 483 tex->op = nir_texop_txf_ms; 484 485 /* Note: since we're just copying data, we rely on the HW ignoring the 486 * dest_type. 487 */ 488 tex->dest_type = nir_type_int32; 489 tex->is_array = false; 490 tex->is_shadow = false; 491 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 492 493 tex->texture_index = 0; 494 tex->sampler_index = 0; 495 496 b->shader->info.num_textures = 1; 497 BITSET_SET(b->shader->info.textures_used, 0); 498 BITSET_SET(b->shader->info.textures_used_by_txf, 0); 499 500 nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords)); 501 502 tex->src[0].src_type = nir_tex_src_coord; 503 tex->src[0].src = nir_src_for_ssa(coord); 504 tex->coord_components = 2; 505 506 tex->src[1].src_type = nir_tex_src_ms_index; 507 tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b)); 508 509 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 510 nir_builder_instr_insert(b, &tex->instr); 511 512 nir_store_var(b, out_color, &tex->dest.ssa, 0xf); 513 514 return b->shader; 515} 516 517static nir_shader * 518build_clear_fs_shader(unsigned mrts) 519{ 520 nir_builder _b = 521 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 522 "mrt%u clear fs", mrts); 523 nir_builder *b = &_b; 524 525 for (unsigned i = 0; i < mrts; i++) { 526 nir_variable *out_color = 527 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 528 "color"); 529 out_color->data.location = FRAG_RESULT_DATA0 + i; 530 531 nir_ssa_def *color = load_const(b, 4 * i, 4); 532 nir_store_var(b, out_color, color, 0xf); 533 } 534 535 return b->shader; 536} 537 538static void 539compile_shader(struct tu_device *dev, struct nir_shader *nir, 540 unsigned consts, unsigned *offset, enum global_shader idx) 541{ 542 nir->options = ir3_get_compiler_options(dev->compiler); 543 544 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage); 545 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage); 546 547 ir3_finalize_nir(dev->compiler, nir); 548 549 struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir, 550 align(consts, 4), NULL); 551 552 struct ir3_shader_key key = {}; 553 bool created; 554 struct ir3_shader_variant *so = 555 ir3_shader_get_variant(sh, &key, false, false, &created); 556 557 struct tu6_global *global = dev->global_bo.map; 558 559 assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders)); 560 dev->global_shaders[idx] = so; 561 memcpy(&global->shaders[*offset], so->bin, 562 sizeof(uint32_t) * so->info.sizedwords); 563 dev->global_shader_va[idx] = dev->global_bo.iova + 564 gb_offset(shaders[*offset]); 565 *offset += align(so->info.sizedwords, 32); 566} 567 568void 569tu_init_clear_blit_shaders(struct tu_device *dev) 570{ 571 unsigned offset = 0; 572 compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT); 573 compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR); 574 
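   /* Each binary is copied into the device's global BO by compile_shader(),
    * with *offset advancing in 32-dword-aligned steps.  The "consts"
    * argument is the number of vec4 uniform registers the shader reads:
    * the blit VS uses 3 (c0 = vert0 pos.xy / coords.xy, c1 = vert1 pos.xy /
    * coords.xy, c2.x = z coordinate) and the clear VS uses 2 (c0 = vert0
    * pos.xy, depth, layer; c1.xy = vert1 pos), matching the load_const()
    * bases above.
    */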
compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT); 575 compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE); 576 compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS); 577 578 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) { 579 compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset, 580 GLOBAL_SH_FS_CLEAR0 + num_rts); 581 } 582} 583 584void 585tu_destroy_clear_blit_shaders(struct tu_device *dev) 586{ 587 for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) { 588 if (dev->global_shaders[i]) 589 ir3_shader_destroy(dev->global_shaders[i]->shader); 590 } 591} 592 593static void 594r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, 595 uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples) 596{ 597 enum global_shader vs_id = 598 blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR; 599 600 struct ir3_shader_variant *vs = cmd->device->global_shaders[vs_id]; 601 uint64_t vs_iova = cmd->device->global_shader_va[vs_id]; 602 603 enum global_shader fs_id = GLOBAL_SH_FS_BLIT; 604 605 if (z_scale) 606 fs_id = GLOBAL_SH_FS_BLIT_ZSCALE; 607 else if (samples != VK_SAMPLE_COUNT_1_BIT) 608 fs_id = GLOBAL_SH_FS_COPY_MS; 609 610 unsigned num_rts = util_bitcount(rts_mask); 611 if (!blit) 612 fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts; 613 614 struct ir3_shader_variant *fs = cmd->device->global_shaders[fs_id]; 615 uint64_t fs_iova = cmd->device->global_shader_va[fs_id]; 616 617 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 618 .vs_state = true, 619 .hs_state = true, 620 .ds_state = true, 621 .gs_state = true, 622 .fs_state = true, 623 .cs_state = true, 624 .gfx_ibo = true, 625 .cs_ibo = true, 626 .gfx_shared_const = true, 627 .gfx_bindless = 0x1f, 628 .cs_bindless = 0x1f)); 629 630 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs); 631 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL); 632 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL); 633 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL); 634 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs); 635 636 struct tu_pvtmem_config pvtmem = {}; 637 tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova); 638 tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova); 639 640 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0()); 641 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0()); 642 643 if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) { 644 /* Copy what the blob does here. This will emit an extra 0x3f 645 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what 646 * this is working around yet. 647 */ 648 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); 649 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE)); 650 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL); 651 tu_cs_emit(cs, 0); 652 } else { 653 tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL()); 654 } 655 tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL()); 656 657 tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0); 658 659 /* REPL_MODE for varying with RECTLIST (2 vertices only) */ 660 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0)); 661 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0)); 662 663 tu6_emit_fs_inputs(cs, fs); 664 665 tu_cs_emit_regs(cs, 666 A6XX_GRAS_CL_CNTL( 667 .persp_division_disable = 1, 668 .vp_xform_disable = 1, 669 .vp_clip_code_ignore = 1, 670 .clip_disable = 1)); 671 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable? 
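   /* Viewport transform, clipping and perspective division were disabled in
    * GRAS_CL_CNTL above, so the two RECTLIST vertices coming out of the VS
    * are interpreted directly as window-space pixel coordinates; the
    * scissor windows below are simply opened to the maximum 0x7fff extent
    * so that only the emitted coordinates bound the blit rectangle.
    */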
672 673 tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL()); 674 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107()); 675 676 tu_cs_emit_regs(cs, 677 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0), 678 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); 679 tu_cs_emit_regs(cs, 680 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0), 681 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); 682 683 tu_cs_emit_regs(cs, 684 A6XX_VFD_INDEX_OFFSET(), 685 A6XX_VFD_INSTANCE_START_OFFSET()); 686 687 if (rts_mask) { 688 unsigned rts_count = util_last_bit(rts_mask); 689 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count); 690 unsigned rt = 0; 691 for (unsigned i = 0; i < rts_count; i++) { 692 unsigned regid = 0; 693 if (rts_mask & (1u << i)) 694 regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++); 695 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid)); 696 } 697 } 698 699 cmd->state.line_mode = RECTANGULAR; 700 tu6_emit_msaa(cs, samples, cmd->state.line_mode); 701} 702 703static void 704r3d_coords_raw(struct tu_cs *cs, const float *coords) 705{ 706 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8); 707 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 708 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 709 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 710 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | 711 CP_LOAD_STATE6_0_NUM_UNIT(2)); 712 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 713 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 714 tu_cs_emit_array(cs, (const uint32_t *) coords, 8); 715} 716 717/* z coordinate for "z scale" blit path which uses a 3d texture */ 718static void 719r3d_coord_z(struct tu_cs *cs, float z) 720{ 721 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4); 722 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) | 723 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 724 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 725 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | 726 CP_LOAD_STATE6_0_NUM_UNIT(1)); 727 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 728 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 729 tu_cs_emit(cs, fui(z)); 730 tu_cs_emit(cs, 0); 731 tu_cs_emit(cs, 0); 732 tu_cs_emit(cs, 0); 733} 734 735static void 736r3d_coords(struct tu_cs *cs, 737 const VkOffset2D *dst, 738 const VkOffset2D *src, 739 const VkExtent2D *extent) 740{ 741 int32_t src_x1 = src ? src->x : 0; 742 int32_t src_y1 = src ? 
src->y : 0; 743 r3d_coords_raw(cs, (float[]) { 744 dst->x, dst->y, 745 src_x1, src_y1, 746 dst->x + extent->width, dst->y + extent->height, 747 src_x1 + extent->width, src_y1 + extent->height, 748 }); 749} 750 751static void 752r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) 753{ 754 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4); 755 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 756 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 757 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 758 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | 759 CP_LOAD_STATE6_0_NUM_UNIT(1)); 760 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 761 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 762 switch (format) { 763 case VK_FORMAT_X8_D24_UNORM_PACK32: 764 case VK_FORMAT_D24_UNORM_S8_UINT: { 765 /* cleared as r8g8b8a8_unorm using special format */ 766 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); 767 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f)); 768 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f)); 769 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f)); 770 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f)); 771 } break; 772 case VK_FORMAT_D16_UNORM: 773 case VK_FORMAT_D32_SFLOAT: 774 tu_cs_emit(cs, fui(val->depthStencil.depth)); 775 tu_cs_emit(cs, 0); 776 tu_cs_emit(cs, 0); 777 tu_cs_emit(cs, 0); 778 break; 779 case VK_FORMAT_S8_UINT: 780 tu_cs_emit(cs, val->depthStencil.stencil & 0xff); 781 tu_cs_emit(cs, 0); 782 tu_cs_emit(cs, 0); 783 tu_cs_emit(cs, 0); 784 break; 785 default: 786 /* as color formats use clear value as-is */ 787 assert(!vk_format_is_depth_or_stencil(format)); 788 tu_cs_emit_array(cs, val->color.uint32, 4); 789 break; 790 } 791} 792 793static void 794r3d_src_common(struct tu_cmd_buffer *cmd, 795 struct tu_cs *cs, 796 const uint32_t *tex_const, 797 uint32_t offset_base, 798 uint32_t offset_ubwc, 799 VkFilter filter) 800{ 801 struct tu_cs_memory texture = { }; 802 VkResult result = tu_cs_alloc(&cmd->sub_cs, 803 2, /* allocate space for a sampler too */ 804 A6XX_TEX_CONST_DWORDS, &texture); 805 if (result != VK_SUCCESS) { 806 cmd->record_result = result; 807 return; 808 } 809 810 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4); 811 812 /* patch addresses for layer offset */ 813 *(uint64_t*) (texture.map + 4) += offset_base; 814 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc; 815 texture.map[7] = ubwc_addr; 816 texture.map[8] = ubwc_addr >> 32; 817 818 texture.map[A6XX_TEX_CONST_DWORDS + 0] = 819 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) | 820 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) | 821 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) | 822 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) | 823 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) | 824 0x60000; /* XXX used by blob, doesn't seem necessary */ 825 texture.map[A6XX_TEX_CONST_DWORDS + 1] = 826 0x1 | /* XXX used by blob, doesn't seem necessary */ 827 A6XX_TEX_SAMP_1_UNNORM_COORDS | 828 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR; 829 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0; 830 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0; 831 832 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); 833 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 834 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 835 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 836 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | 837 CP_LOAD_STATE6_0_NUM_UNIT(1)); 838 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4); 839 840 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = 
texture.iova + A6XX_TEX_CONST_DWORDS * 4)); 841 842 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); 843 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 844 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 845 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 846 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | 847 CP_LOAD_STATE6_0_NUM_UNIT(1)); 848 tu_cs_emit_qw(cs, texture.iova); 849 850 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova)); 851 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1)); 852} 853 854static void 855r3d_src(struct tu_cmd_buffer *cmd, 856 struct tu_cs *cs, 857 const struct tu_image_view *iview, 858 uint32_t layer, 859 VkFilter filter) 860{ 861 r3d_src_common(cmd, cs, iview->descriptor, 862 iview->layer_size * layer, 863 iview->ubwc_layer_size * layer, 864 filter); 865} 866 867static void 868r3d_src_buffer(struct tu_cmd_buffer *cmd, 869 struct tu_cs *cs, 870 VkFormat vk_format, 871 uint64_t va, uint32_t pitch, 872 uint32_t width, uint32_t height) 873{ 874 uint32_t desc[A6XX_TEX_CONST_DWORDS]; 875 876 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); 877 878 desc[0] = 879 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) | 880 A6XX_TEX_CONST_0_FMT(format.fmt) | 881 A6XX_TEX_CONST_0_SWAP(format.swap) | 882 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) | 883 // XXX to swizzle into .w for stencil buffer_to_image 884 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) | 885 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) | 886 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W); 887 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); 888 desc[2] = 889 A6XX_TEX_CONST_2_PITCH(pitch) | 890 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); 891 desc[3] = 0; 892 desc[4] = va; 893 desc[5] = va >> 32; 894 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++) 895 desc[i] = 0; 896 897 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); 898} 899 900static void 901r3d_src_gmem(struct tu_cmd_buffer *cmd, 902 struct tu_cs *cs, 903 const struct tu_image_view *iview, 904 VkFormat format, 905 uint32_t gmem_offset, 906 uint32_t cpp) 907{ 908 uint32_t desc[A6XX_TEX_CONST_DWORDS]; 909 memcpy(desc, iview->descriptor, sizeof(desc)); 910 911 /* patch the format so that depth/stencil get the right format */ 912 desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK; 913 desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt); 914 915 /* patched for gmem */ 916 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); 917 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); 918 desc[2] = 919 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | 920 A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); 921 desc[3] = 0; 922 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset; 923 desc[5] = A6XX_TEX_CONST_5_DEPTH(1); 924 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) 925 desc[i] = 0; 926 927 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); 928} 929 930static void 931r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 932{ 933 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); 934 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO); 935 tu_cs_image_ref(cs, iview, layer); 936 tu_cs_emit(cs, 0); 937 938 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3); 939 tu_cs_image_flag_ref(cs, iview, layer); 940 941 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); 942} 943 944static void 945r3d_dst_stencil(struct 
tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 946{ 947 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); 948 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO)); 949 tu_cs_image_stencil_ref(cs, iview, layer); 950 tu_cs_emit(cs, 0); 951 952 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); 953} 954 955static void 956r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) 957{ 958 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); 959 960 tu_cs_emit_regs(cs, 961 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap), 962 A6XX_RB_MRT_PITCH(0, pitch), 963 A6XX_RB_MRT_ARRAY_PITCH(0, 0), 964 A6XX_RB_MRT_BASE(0, .qword = va), 965 A6XX_RB_MRT_BASE_GMEM(0, 0)); 966 967 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); 968} 969 970static uint8_t 971aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask) 972{ 973 uint8_t mask = 0xf; 974 assert(aspect_mask); 975 /* note: the only format with partial writing is D24S8, 976 * clear/blit uses the _AS_R8G8B8A8 format to access it 977 */ 978 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { 979 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) 980 mask = 0x7; 981 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 982 mask = 0x8; 983 } 984 return mask; 985} 986 987static void 988r3d_setup(struct tu_cmd_buffer *cmd, 989 struct tu_cs *cs, 990 VkFormat vk_format, 991 VkImageAspectFlags aspect_mask, 992 unsigned blit_param, 993 bool clear, 994 bool ubwc, 995 VkSampleCountFlagBits samples) 996{ 997 enum a6xx_format format = tu6_base_format(vk_format); 998 999 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || 1000 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { 1001 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; 1002 } 1003 1004 if (!cmd->state.pass) { 1005 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); 1006 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff); 1007 } 1008 1009 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); 1010 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); 1011 1012 r3d_common(cmd, cs, !clear, 1, blit_param, samples); 1013 1014 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 1015 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | 1016 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) | 1017 0xfc000000); 1018 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1)); 1019 1020 tu_cs_emit_regs(cs, 1021 A6XX_RB_FS_OUTPUT_CNTL0(), 1022 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1)); 1023 1024 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL()); 1025 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff)); 1026 1027 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL()); 1028 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL()); 1029 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); 1030 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL()); 1031 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK()); 1032 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK()); 1033 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF()); 1034 1035 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf)); 1036 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); 1037 1038 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, 1039 .color_format = format, 1040 .color_sint = vk_format_is_sint(vk_format), 1041 .color_uint = vk_format_is_uint(vk_format))); 1042 1043 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, 1044 .component_enable = aspect_write_mask(vk_format, aspect_mask))); 1045 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format))); 1046 tu_cs_emit_regs(cs, 
A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format))); 1047 1048 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0)); 1049 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0)); 1050 1051 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL, 1052 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2)); 1053 1054 if (cmd->state.predication_active) { 1055 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); 1056 tu_cs_emit(cs, 0); 1057 } 1058} 1059 1060static void 1061r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1062{ 1063 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); 1064 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) | 1065 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | 1066 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY)); 1067 tu_cs_emit(cs, 1); /* instance count */ 1068 tu_cs_emit(cs, 2); /* vertex count */ 1069} 1070 1071static void 1072r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1073{ 1074 if (cmd->state.predication_active) { 1075 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); 1076 tu_cs_emit(cs, 1); 1077 } 1078} 1079 1080/* blit ops - common interface for 2d/shader paths */ 1081 1082struct blit_ops { 1083 void (*coords)(struct tu_cs *cs, 1084 const VkOffset2D *dst, 1085 const VkOffset2D *src, 1086 const VkExtent2D *extent); 1087 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val); 1088 void (*src)( 1089 struct tu_cmd_buffer *cmd, 1090 struct tu_cs *cs, 1091 const struct tu_image_view *iview, 1092 uint32_t layer, 1093 VkFilter filter); 1094 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, 1095 VkFormat vk_format, 1096 uint64_t va, uint32_t pitch, 1097 uint32_t width, uint32_t height); 1098 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1099 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch); 1100 void (*setup)(struct tu_cmd_buffer *cmd, 1101 struct tu_cs *cs, 1102 VkFormat vk_format, 1103 VkImageAspectFlags aspect_mask, 1104 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */ 1105 bool clear, 1106 bool ubwc, 1107 VkSampleCountFlagBits samples); 1108 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs); 1109 void (*teardown)(struct tu_cmd_buffer *cmd, 1110 struct tu_cs *cs); 1111}; 1112 1113static const struct blit_ops r2d_ops = { 1114 .coords = r2d_coords, 1115 .clear_value = r2d_clear_value, 1116 .src = r2d_src, 1117 .src_buffer = r2d_src_buffer, 1118 .dst = r2d_dst, 1119 .dst_buffer = r2d_dst_buffer, 1120 .setup = r2d_setup, 1121 .run = r2d_run, 1122 .teardown = r2d_teardown, 1123}; 1124 1125static const struct blit_ops r3d_ops = { 1126 .coords = r3d_coords, 1127 .clear_value = r3d_clear_value, 1128 .src = r3d_src, 1129 .src_buffer = r3d_src_buffer, 1130 .dst = r3d_dst, 1131 .dst_buffer = r3d_dst_buffer, 1132 .setup = r3d_setup, 1133 .run = r3d_run, 1134 .teardown = r3d_teardown, 1135}; 1136 1137/* passthrough set coords from 3D extents */ 1138static void 1139coords(const struct blit_ops *ops, 1140 struct tu_cs *cs, 1141 const VkOffset3D *dst, 1142 const VkOffset3D *src, 1143 const VkExtent3D *extent) 1144{ 1145 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent); 1146} 1147 1148/* Decides the VK format to treat our data as for a memcpy-style blit. We have 1149 * to be a bit careful because we have to pick a format with matching UBWC 1150 * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for 1151 * everything. 
1152 */ 1153static VkFormat 1154copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer) 1155{ 1156 if (vk_format_is_compressed(format)) { 1157 switch (vk_format_get_blocksize(format)) { 1158 case 1: return VK_FORMAT_R8_UINT; 1159 case 2: return VK_FORMAT_R16_UINT; 1160 case 4: return VK_FORMAT_R32_UINT; 1161 case 8: return VK_FORMAT_R32G32_UINT; 1162 case 16:return VK_FORMAT_R32G32B32A32_UINT; 1163 default: 1164 unreachable("unhandled format size"); 1165 } 1166 } 1167 1168 switch (format) { 1169 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat 1170 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81 1171 * (also -1.0), when we're supposed to be memcpying the bits. See 1172 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion. 1173 */ 1174 case VK_FORMAT_R8_SNORM: 1175 return VK_FORMAT_R8_UNORM; 1176 case VK_FORMAT_R8G8_SNORM: 1177 return VK_FORMAT_R8G8_UNORM; 1178 case VK_FORMAT_R8G8B8_SNORM: 1179 return VK_FORMAT_R8G8B8_UNORM; 1180 case VK_FORMAT_B8G8R8_SNORM: 1181 return VK_FORMAT_B8G8R8_UNORM; 1182 case VK_FORMAT_R8G8B8A8_SNORM: 1183 return VK_FORMAT_R8G8B8A8_UNORM; 1184 case VK_FORMAT_B8G8R8A8_SNORM: 1185 return VK_FORMAT_B8G8R8A8_UNORM; 1186 case VK_FORMAT_A8B8G8R8_SNORM_PACK32: 1187 return VK_FORMAT_A8B8G8R8_UNORM_PACK32; 1188 case VK_FORMAT_A2R10G10B10_SNORM_PACK32: 1189 return VK_FORMAT_A2R10G10B10_UNORM_PACK32; 1190 case VK_FORMAT_A2B10G10R10_SNORM_PACK32: 1191 return VK_FORMAT_A2B10G10R10_UNORM_PACK32; 1192 case VK_FORMAT_R16_SNORM: 1193 return VK_FORMAT_R16_UNORM; 1194 case VK_FORMAT_R16G16_SNORM: 1195 return VK_FORMAT_R16G16_UNORM; 1196 case VK_FORMAT_R16G16B16_SNORM: 1197 return VK_FORMAT_R16G16B16_UNORM; 1198 case VK_FORMAT_R16G16B16A16_SNORM: 1199 return VK_FORMAT_R16G16B16A16_UNORM; 1200 1201 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: 1202 return VK_FORMAT_R32_UINT; 1203 1204 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: 1205 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT) 1206 return VK_FORMAT_R8G8_UNORM; 1207 else 1208 return VK_FORMAT_R8_UNORM; 1209 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: 1210 return VK_FORMAT_R8_UNORM; 1211 1212 case VK_FORMAT_D24_UNORM_S8_UINT: 1213 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer) 1214 return VK_FORMAT_R8_UNORM; 1215 else 1216 return format; 1217 1218 case VK_FORMAT_D32_SFLOAT_S8_UINT: 1219 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 1220 return VK_FORMAT_S8_UINT; 1221 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); 1222 return VK_FORMAT_D32_SFLOAT; 1223 1224 default: 1225 return format; 1226 } 1227} 1228 1229void 1230tu6_clear_lrz(struct tu_cmd_buffer *cmd, 1231 struct tu_cs *cs, 1232 struct tu_image *image, 1233 const VkClearValue *value) 1234{ 1235 const struct blit_ops *ops = &r2d_ops; 1236 1237 ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, 1238 VK_SAMPLE_COUNT_1_BIT); 1239 ops->clear_value(cs, VK_FORMAT_D16_UNORM, value); 1240 ops->dst_buffer(cs, VK_FORMAT_D16_UNORM, 1241 image->bo->iova + image->bo_offset + image->lrz_offset, 1242 image->lrz_pitch * 2); 1243 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height}); 1244 ops->run(cmd, cs); 1245 ops->teardown(cmd, cs); 1246} 1247 1248static void 1249tu_image_view_copy_blit(struct tu_image_view *iview, 1250 struct tu_image *image, 1251 VkFormat format, 1252 const VkImageSubresourceLayers *subres, 1253 uint32_t layer, 1254 bool stencil_read, 1255 bool z_scale) 1256{ 1257 VkImageAspectFlags aspect_mask = 
subres->aspectMask; 1258 1259 /* always use the AS_R8G8B8A8 format for these */ 1260 if (format == VK_FORMAT_D24_UNORM_S8_UINT || 1261 format == VK_FORMAT_X8_D24_UNORM_PACK32) { 1262 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT; 1263 } 1264 1265 tu_image_view_init(iview, &(VkImageViewCreateInfo) { 1266 .image = tu_image_to_handle(image), 1267 .viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D, 1268 .format = format, 1269 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */ 1270 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R, 1271 .subresourceRange = { 1272 .aspectMask = aspect_mask, 1273 .baseMipLevel = subres->mipLevel, 1274 .levelCount = 1, 1275 .baseArrayLayer = subres->baseArrayLayer + layer, 1276 .layerCount = 1, 1277 }, 1278 }, false); 1279} 1280 1281static void 1282tu_image_view_copy(struct tu_image_view *iview, 1283 struct tu_image *image, 1284 VkFormat format, 1285 const VkImageSubresourceLayers *subres, 1286 uint32_t layer, 1287 bool stencil_read) 1288{ 1289 format = copy_format(format, subres->aspectMask, false); 1290 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false); 1291} 1292 1293static void 1294tu_image_view_blit(struct tu_image_view *iview, 1295 struct tu_image *image, 1296 const VkImageSubresourceLayers *subres, 1297 uint32_t layer) 1298{ 1299 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false); 1300} 1301 1302static void 1303tu6_blit_image(struct tu_cmd_buffer *cmd, 1304 struct tu_image *src_image, 1305 struct tu_image *dst_image, 1306 const VkImageBlit *info, 1307 VkFilter filter) 1308{ 1309 const struct blit_ops *ops = &r2d_ops; 1310 struct tu_cs *cs = &cmd->cs; 1311 bool z_scale = false; 1312 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z; 1313 1314 /* 2D blit can't do rotation mirroring from just coordinates */ 1315 static const enum a6xx_rotation rotate[2][2] = { 1316 {ROTATE_0, ROTATE_HFLIP}, 1317 {ROTATE_VFLIP, ROTATE_180}, 1318 }; 1319 1320 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) != 1321 (info->dstOffsets[1].x < info->dstOffsets[0].x); 1322 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) != 1323 (info->dstOffsets[1].y < info->dstOffsets[0].y); 1324 1325 int32_t src0_z = info->srcOffsets[0].z; 1326 int32_t src1_z = info->srcOffsets[1].z; 1327 1328 if ((info->srcOffsets[1].z - info->srcOffsets[0].z != 1329 info->dstOffsets[1].z - info->dstOffsets[0].z) || 1330 info->srcOffsets[1].z < info->srcOffsets[0].z) { 1331 z_scale = true; 1332 } 1333 1334 if (info->dstOffsets[1].z < info->dstOffsets[0].z) { 1335 layers = info->dstOffsets[0].z - info->dstOffsets[1].z; 1336 src0_z = info->srcOffsets[1].z; 1337 src1_z = info->srcOffsets[0].z; 1338 } 1339 1340 if (info->dstSubresource.layerCount > 1) { 1341 assert(layers <= 1); 1342 layers = info->dstSubresource.layerCount; 1343 } 1344 1345 /* BC1_RGB_* formats need to have their last components overriden with 1 1346 * when sampling, which is normally handled with the texture descriptor 1347 * swizzle. The 2d path can't handle that, so use the 3d path. 1348 * 1349 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with 1350 * the 2d path. 
1351 */ 1352 1353 unsigned blit_param = rotate[mirror_y][mirror_x]; 1354 if (dst_image->layout[0].nr_samples > 1 || 1355 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK || 1356 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK || 1357 filter == VK_FILTER_CUBIC_EXT || 1358 z_scale) { 1359 ops = &r3d_ops; 1360 blit_param = z_scale; 1361 } 1362 1363 /* use the right format in setup() for D32_S8 1364 * TODO: this probably should use a helper 1365 */ 1366 VkFormat format = dst_image->vk_format; 1367 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 1368 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) 1369 format = VK_FORMAT_D32_SFLOAT; 1370 else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) 1371 format = VK_FORMAT_S8_UINT; 1372 else 1373 unreachable("unexpected D32_S8 aspect mask in blit_image"); 1374 } 1375 1376 trace_start_blit(&cmd->trace, cs); 1377 1378 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, 1379 blit_param, false, dst_image->layout[0].ubwc, 1380 dst_image->layout[0].nr_samples); 1381 1382 if (ops == &r3d_ops) { 1383 r3d_coords_raw(cs, (float[]) { 1384 info->dstOffsets[0].x, info->dstOffsets[0].y, 1385 info->srcOffsets[0].x, info->srcOffsets[0].y, 1386 info->dstOffsets[1].x, info->dstOffsets[1].y, 1387 info->srcOffsets[1].x, info->srcOffsets[1].y 1388 }); 1389 } else { 1390 tu_cs_emit_regs(cs, 1391 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x), 1392 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)), 1393 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1, 1394 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1)); 1395 tu_cs_emit_regs(cs, 1396 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), 1397 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), 1398 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), 1399 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); 1400 } 1401 1402 struct tu_image_view dst, src; 1403 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, 1404 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z)); 1405 1406 if (z_scale) { 1407 tu_image_view_copy_blit(&src, src_image, src_image->vk_format, 1408 &info->srcSubresource, 0, false, true); 1409 ops->src(cmd, cs, &src, 0, filter); 1410 } else { 1411 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z); 1412 } 1413 1414 for (uint32_t i = 0; i < layers; i++) { 1415 if (z_scale) { 1416 float t = ((float) i + 0.5f) / (float) layers; 1417 r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z); 1418 } else { 1419 ops->src(cmd, cs, &src, i, filter); 1420 } 1421 ops->dst(cs, &dst, i); 1422 ops->run(cmd, cs); 1423 } 1424 1425 ops->teardown(cmd, cs); 1426 1427 trace_end_blit(&cmd->trace, cs, 1428 ops == &r3d_ops, 1429 src_image->vk_format, 1430 dst_image->vk_format, 1431 layers); 1432} 1433 1434VKAPI_ATTR void VKAPI_CALL 1435tu_CmdBlitImage(VkCommandBuffer commandBuffer, 1436 VkImage srcImage, 1437 VkImageLayout srcImageLayout, 1438 VkImage dstImage, 1439 VkImageLayout dstImageLayout, 1440 uint32_t regionCount, 1441 const VkImageBlit *pRegions, 1442 VkFilter filter) 1443 1444{ 1445 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1446 TU_FROM_HANDLE(tu_image, src_image, srcImage); 1447 TU_FROM_HANDLE(tu_image, dst_image, dstImage); 1448 1449 for (uint32_t i = 0; i < regionCount; ++i) { 1450 /* can't blit both depth and stencil at once with D32_S8 1451 * TODO: more 
advanced 3D blit path to support it instead?
       */
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
          dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageBlit region = pRegions[i];
         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
            region.srcSubresource.aspectMask = BIT(b);
            region.dstSubresource.aspectMask = BIT(b);
            tu6_blit_image(cmd, src_image, dst_image, &region, filter);
         }
         continue;
      }
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
   }
}

static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}

static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat src_format =
      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
   const struct blit_ops *ops = &r2d_ops;

   /* special case for buffer to stencil */
   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      ops = &r3d_ops;
   }

   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
    * which matters for UBWC.
buffer_to_image/etc can fail because of this 1513 */ 1514 1515 VkOffset3D offset = info->imageOffset; 1516 VkExtent3D extent = info->imageExtent; 1517 uint32_t src_width = info->bufferRowLength ?: extent.width; 1518 uint32_t src_height = info->bufferImageHeight ?: extent.height; 1519 1520 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height); 1521 1522 uint32_t pitch = src_width * vk_format_get_blocksize(src_format); 1523 uint32_t layer_size = src_height * pitch; 1524 1525 ops->setup(cmd, cs, 1526 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false), 1527 info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc, 1528 dst_image->layout[0].nr_samples); 1529 1530 struct tu_image_view dst; 1531 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false); 1532 1533 for (uint32_t i = 0; i < layers; i++) { 1534 ops->dst(cs, &dst, i); 1535 1536 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i; 1537 if ((src_va & 63) || (pitch & 63)) { 1538 for (uint32_t y = 0; y < extent.height; y++) { 1539 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format); 1540 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch, 1541 x + extent.width, 1); 1542 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x}, 1543 &(VkExtent2D) {extent.width, 1}); 1544 ops->run(cmd, cs); 1545 src_va += pitch; 1546 } 1547 } else { 1548 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height); 1549 coords(ops, cs, &offset, &(VkOffset3D){}, &extent); 1550 ops->run(cmd, cs); 1551 } 1552 } 1553 1554 ops->teardown(cmd, cs); 1555} 1556 1557VKAPI_ATTR void VKAPI_CALL 1558tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, 1559 VkBuffer srcBuffer, 1560 VkImage dstImage, 1561 VkImageLayout dstImageLayout, 1562 uint32_t regionCount, 1563 const VkBufferImageCopy *pRegions) 1564{ 1565 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1566 TU_FROM_HANDLE(tu_image, dst_image, dstImage); 1567 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); 1568 1569 for (unsigned i = 0; i < regionCount; ++i) 1570 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i); 1571} 1572 1573static void 1574tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, 1575 struct tu_image *src_image, 1576 struct tu_buffer *dst_buffer, 1577 const VkBufferImageCopy *info) 1578{ 1579 struct tu_cs *cs = &cmd->cs; 1580 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); 1581 VkFormat dst_format = 1582 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true); 1583 bool stencil_read = false; 1584 1585 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && 1586 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { 1587 stencil_read = true; 1588 } 1589 1590 const struct blit_ops *ops = stencil_read ? 
&r3d_ops : &r2d_ops; 1591 VkOffset3D offset = info->imageOffset; 1592 VkExtent3D extent = info->imageExtent; 1593 uint32_t dst_width = info->bufferRowLength ?: extent.width; 1594 uint32_t dst_height = info->bufferImageHeight ?: extent.height; 1595 1596 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height); 1597 1598 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format); 1599 uint32_t layer_size = pitch * dst_height; 1600 1601 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, 1602 VK_SAMPLE_COUNT_1_BIT); 1603 1604 struct tu_image_view src; 1605 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read); 1606 1607 for (uint32_t i = 0; i < layers; i++) { 1608 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); 1609 1610 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i; 1611 if ((dst_va & 63) || (pitch & 63)) { 1612 for (uint32_t y = 0; y < extent.height; y++) { 1613 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format); 1614 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0); 1615 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y}, 1616 &(VkExtent2D) {extent.width, 1}); 1617 ops->run(cmd, cs); 1618 dst_va += pitch; 1619 } 1620 } else { 1621 ops->dst_buffer(cs, dst_format, dst_va, pitch); 1622 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent); 1623 ops->run(cmd, cs); 1624 } 1625 } 1626 1627 ops->teardown(cmd, cs); 1628} 1629 1630VKAPI_ATTR void VKAPI_CALL 1631tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer, 1632 VkImage srcImage, 1633 VkImageLayout srcImageLayout, 1634 VkBuffer dstBuffer, 1635 uint32_t regionCount, 1636 const VkBufferImageCopy *pRegions) 1637{ 1638 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1639 TU_FROM_HANDLE(tu_image, src_image, srcImage); 1640 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer); 1641 1642 for (unsigned i = 0; i < regionCount; ++i) 1643 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i); 1644} 1645 1646/* Tiled formats don't support swapping, which means that we can't support 1647 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some 1648 * formats like B5G5R5A1 have a separate linear-only format when sampling. 1649 * Currently we fake support for tiled swapped formats and use the unswapped 1650 * format instead, but this means that reinterpreting copies to and from 1651 * swapped formats can't be performed correctly unless we can swizzle the 1652 * components by reinterpreting the other image as the "correct" swapped 1653 * format, i.e. only when the other image is linear. 1654 */ 1655 1656static bool 1657is_swapped_format(VkFormat format) 1658{ 1659 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR); 1660 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3); 1661 return linear.fmt != tiled.fmt || linear.swap != tiled.swap; 1662} 1663 1664/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and 1665 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice 1666 * versa). This should mirror the logic in fdl6_layout. 
1667 */ 1668static bool 1669image_is_r8g8(struct tu_image *image) 1670{ 1671 return image->layout[0].cpp == 2 && 1672 vk_format_get_nr_components(image->vk_format) == 2; 1673} 1674 1675static void 1676tu_copy_image_to_image(struct tu_cmd_buffer *cmd, 1677 struct tu_image *src_image, 1678 struct tu_image *dst_image, 1679 const VkImageCopy *info) 1680{ 1681 const struct blit_ops *ops = &r2d_ops; 1682 struct tu_cs *cs = &cmd->cs; 1683 1684 if (dst_image->layout[0].nr_samples > 1) 1685 ops = &r3d_ops; 1686 1687 VkFormat format = VK_FORMAT_UNDEFINED; 1688 VkOffset3D src_offset = info->srcOffset; 1689 VkOffset3D dst_offset = info->dstOffset; 1690 VkExtent3D extent = info->extent; 1691 uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount); 1692 1693 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between 1694 * Images": 1695 * 1696 * When copying between compressed and uncompressed formats the extent 1697 * members represent the texel dimensions of the source image and not 1698 * the destination. When copying from a compressed image to an 1699 * uncompressed image the image texel dimensions written to the 1700 * uncompressed image will be source extent divided by the compressed 1701 * texel block dimensions. When copying from an uncompressed image to a 1702 * compressed image the image texel dimensions written to the compressed 1703 * image will be the source extent multiplied by the compressed texel 1704 * block dimensions. 1705 * 1706 * This means we only have to adjust the extent if the source image is 1707 * compressed. 1708 */ 1709 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL); 1710 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL); 1711 1712 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false); 1713 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false); 1714 1715 bool use_staging_blit = false; 1716 1717 if (src_format == dst_format) { 1718 /* Images that share a format can always be copied directly because it's 1719 * the same as a blit. 1720 */ 1721 format = src_format; 1722 } else if (!src_image->layout[0].tile_mode) { 1723 /* If an image is linear, we can always safely reinterpret it with the 1724 * other image's format and then do a regular blit. 1725 */ 1726 format = dst_format; 1727 } else if (!dst_image->layout[0].tile_mode) { 1728 format = src_format; 1729 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) { 1730 /* We can't currently copy r8g8 images to/from other cpp=2 images, 1731 * due to the different tile layout. 1732 */ 1733 use_staging_blit = true; 1734 } else if (is_swapped_format(src_format) || 1735 is_swapped_format(dst_format)) { 1736 /* If either format has a non-identity swap, then we can't copy 1737 * to/from it. 1738 */ 1739 use_staging_blit = true; 1740 } else if (!src_image->layout[0].ubwc) { 1741 format = dst_format; 1742 } else if (!dst_image->layout[0].ubwc) { 1743 format = src_format; 1744 } else { 1745 /* Both formats use UBWC and so neither can be reinterpreted. 1746 * TODO: We could do an in-place decompression of the dst instead. 
   struct tu_image_view dst, src;

   if (use_staging_blit) {
      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);

      struct tu_image staging_image = {
         .base.type = VK_OBJECT_TYPE_IMAGE,
         .vk_format = src_format,
         .level_count = 1,
         .layer_count = info->srcSubresource.layerCount,
         .bo_offset = 0,
      };

      VkImageSubresourceLayers staging_subresource = {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = info->srcSubresource.layerCount,
      };

      VkOffset3D staging_offset = { 0 };

      staging_image.layout[0].tile_mode = TILE6_LINEAR;
      staging_image.layout[0].ubwc = false;

      fdl6_layout(&staging_image.layout[0],
                  vk_format_to_pipe_format(staging_image.vk_format),
                  src_image->layout[0].nr_samples,
                  extent.width,
                  extent.height,
                  extent.depth,
                  staging_image.level_count,
                  staging_image.layer_count,
                  extent.depth > 1,
                  NULL);

      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_image.layout[0].size,
                                          &staging_image.bo);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      struct tu_image_view staging;
      tu_image_view_copy(&staging, &staging_image, src_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &staging_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &staging, i);
         ops->run(cmd, cs);
      }

      /* If the app had recorded this as two separate copies there would have
       * to be a pipeline barrier between them; since we're doing both halves
       * internally we have to emit the equivalent flushes ourselves.
       */
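      /* Roughly the barrier being emulated by the events below, if the two
       * halves were separate vkCmdCopyImage calls recorded by the application
       * (illustrative only, not part of the original source):
       *
       *    VkImageMemoryBarrier barrier = {
       *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
       *       .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
       *       .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
       *       .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
       *       .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
       *       ...
       *    };
       *    vkCmdPipelineBarrier(commandBuffer,
       *                         VK_PIPELINE_STAGE_TRANSFER_BIT,
       *                         VK_PIPELINE_STAGE_TRANSFER_BIT,
       *                         0, 0, NULL, 0, NULL, 1, &barrier);
       */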
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
      tu_cs_emit_wfi(cs);

      tu_image_view_copy(&staging, &staging_image, dst_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
                 0, false, dst_image->layout[0].ubwc,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &dst_offset, &staging_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);

      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
                 0, false, dst_image->layout[0].ubwc,
                 dst_image->layout[0].nr_samples);
      coords(ops, cs, &dst_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage destImage,
                VkImageLayout destImageLayout,
                uint32_t regionCount,
                const VkImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, destImage);

   for (uint32_t i = 0; i < regionCount; ++i) {
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageCopy info = pRegions[i];
         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
            info.srcSubresource.aspectMask = BIT(b);
            info.dstSubresource.aspectMask = BIT(b);
            tu_copy_image_to_image(cmd, src_image, dst_image, &info);
         }
         continue;
      }

      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
   }
}

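/* Added note (not in the original source): copy_buffer() below drives the 2D
 * engine over raw buffer memory by treating it as a one-texel-tall image. The
 * base addresses it programs have to be 64-byte aligned and a single blit is
 * limited to 0x4000 texels of width, so the copy is emitted in chunks, with
 * the misaligned low bits of each address converted into an x offset in
 * texels.
 */
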
static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
              VK_SAMPLE_COUNT_1_BIT);

   while (blocks) {
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
      ops->dst_buffer(cs, format, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer srcBuffer,
                 VkBuffer dstBuffer,
                 uint32_t regionCount,
                 const VkBufferCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   for (unsigned i = 0; i < regionCount; ++i) {
      copy_buffer(cmd,
                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
                  pRegions[i].size, 1);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);

   struct tu_cs_memory tmp;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(tmp.map, pData, dataSize);
   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
}

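/* Added note (not in the original source): vkCmdFillBuffer requires dstOffset
 * and any explicit size to be multiples of 4, so the fill below can always be
 * done as R32_UINT with fillSize / 4 texels, using the same 64-byte alignment
 * and 0x4000-texel chunking scheme as copy_buffer() above.
 */
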
VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer dstBuffer,
                 VkDeviceSize dstOffset,
                 VkDeviceSize fillSize,
                 uint32_t data)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (fillSize == VK_WHOLE_SIZE)
      fillSize = buffer->size - dstOffset;

   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
   uint32_t blocks = fillSize / 4;

   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});

   while (blocks) {
      uint32_t dst_x = (dst_va & 63) / 4;
      uint32_t width = MIN2(blocks, 0x4000 - dst_x);

      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      dst_va += width * 4;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage(VkCommandBuffer commandBuffer,
                   VkImage srcImage,
                   VkImageLayout srcImageLayout,
                   VkImage dstImage,
                   VkImageLayout dstImageLayout,
                   uint32_t regionCount,
                   const VkImageResolve *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT);

   for (uint32_t i = 0; i < regionCount; ++i) {
      const VkImageResolve *info = &pRegions[i];
      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);

      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
      /* TODO: aspect masks possible ? */

      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);

      struct tu_image_view dst, src;
      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);

      for (uint32_t i = 0; i < layers; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
        layer++) \
      if (!layer_mask || (layer_mask & BIT(layer)))

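/* Added usage note (not in the original source): for_each_layer() iterates
 * either the layers selected by a multiview mask, or all "layers" array
 * layers when the mask is zero, e.g.
 *
 *    for_each_layer(i, 0x5, 4) { ... }   // visits layers 0 and 2
 *    for_each_layer(i, 0x0, 4) { ... }   // visits layers 0, 1, 2, 3
 */
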
static void
resolve_sysmem(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat format,
               const struct tu_image_view *src,
               const struct tu_image_view *dst,
               uint32_t layer_mask,
               uint32_t layers,
               const VkRect2D *rect,
               bool separate_stencil)
{
   const struct blit_ops *ops = &r2d_ops;

   trace_start_sysmem_resolve(&cmd->trace, cs);

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
              0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);

   for_each_layer(i, layer_mask, layers) {
      if (separate_stencil) {
         r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
         r2d_dst_stencil(cs, dst, i);
      } else {
         ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
         ops->dst(cs, dst, i);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_resolve(&cmd->trace, cs, format);
}

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *src,
                  const struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect)
{
   assert(src->image->vk_format == dst->image->vk_format);

   if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,
                     src, dst, layer_mask, layers, rect, false);
      resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,
                     src, dst, layer_mask, layers, rect, true);
   } else {
      resolve_sysmem(cmd, cs, dst->image->vk_format,
                     src, dst, layer_mask, layers, rect, false);
   }
}

static void
clear_image(struct tu_cmd_buffer *cmd,
            struct tu_image *image,
            const VkClearValue *clear_value,
            const VkImageSubresourceRange *range,
            VkImageAspectFlags aspect_mask)
{
   uint32_t level_count = tu_get_levelCount(image, range);
   uint32_t layer_count = tu_get_layerCount(image, range);
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      format = copy_format(format, aspect_mask, false);

   if (image->layout[0].depth0 > 1) {
      assert(layer_count == 1);
      assert(range->baseArrayLayer == 0);
   }

   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;

   ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc,
              image->layout[0].nr_samples);
   if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
   else
      ops->clear_value(cs, format, clear_value);

   for (unsigned j = 0; j < level_count; j++) {
      if (image->layout[0].depth0 > 1)
         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);

      ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
                  });

      struct tu_image_view dst;
      tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
                                 .aspectMask = aspect_mask,
                                 .mipLevel = range->baseMipLevel + j,
                                 .baseArrayLayer = range->baseArrayLayer,
                                 .layerCount = 1,
                              }, 0, false, false);

      for (uint32_t i = 0; i < layer_count; i++) {
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
                      VkImage image_h,
                      VkImageLayout imageLayout,
                      const VkClearColorValue *pColor,
                      uint32_t rangeCount,
                      const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++)
      clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}

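/* Added context (not in the original source): turnip stores
 * VK_FORMAT_D32_SFLOAT_S8_UINT as two separate planes, a D32_SFLOAT plane and
 * an S8_UINT plane, which is presumably why the depth/stencil clear below
 * (and several copy/store paths in this file) split the work per aspect
 * instead of touching both in one operation.
 */
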
VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
                             VkImage image_h,
                             VkImageLayout imageLayout,
                             const VkClearDepthStencilValue *pDepthStencil,
                             uint32_t rangeCount,
                             const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++) {
      const VkImageSubresourceRange *range = &pRanges[i];

      if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* can't clear both depth and stencil at once, split up the aspect mask */
         u_foreach_bit(b, range->aspectMask)
            clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
         continue;
      }

      clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
   }
}

static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
                            uint32_t attachment_count,
                            const VkClearAttachment *attachments,
                            uint32_t rect_count,
                            const VkClearRect *rects)
{
   /* the shader path here is special, it avoids changing MRT/etc state */
   const struct tu_subpass *subpass = cmd->state.subpass;
   const uint32_t mrt_count = subpass->color_count;
   struct tu_cs *cs = &cmd->draw_cs;
   uint32_t clear_value[MAX_RTS][4];
   float z_clear_val = 0.0f;
   uint8_t s_clear_val = 0;
   uint32_t clear_rts = 0, clear_components = 0;
   bool z_clear = false;
   bool s_clear = false;

   trace_start_sysmem_clear_all(&cmd->trace, cs);

   for (uint32_t i = 0; i < attachment_count; i++) {
      uint32_t a;
      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = attachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         clear_rts |= 1 << c;
         clear_components |= 0xf << (c * 4);
         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
      } else {
         a = subpass->depth_stencil_attachment.attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            z_clear = true;
            z_clear_val = attachments[i].clearValue.depthStencil.depth;
         }

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            s_clear = true;
            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
         }
      }
   }

   /* We may not know the multisample count if there are no attachments, so
    * just bail early to avoid corner cases later.
    */
   if (clear_rts == 0 && !z_clear && !s_clear)
      return;

   /* disable all draw states so they don't interfere
    * TODO: use and re-use draw states
    * we have to disable draw states individually to preserve
    * input attachment states, because a secondary command buffer
    * won't be able to restore them
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
         continue;
      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
                     CP_SET_DRAW_STATE__0_DISABLE);
      tu_cs_emit_qw(cs, 0);
   }
   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));

   r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
   for (uint32_t i = 0; i < mrt_count; i++) {
      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
         .component_enable = COND(clear_rts & (1 << i), 0xf)));
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
         .z_test_enable = z_clear,
         .z_write_enable = z_clear,
         .zfunc = FUNC_ALWAYS));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
         .stencil_enable = s_clear,
         .func = FUNC_ALWAYS,
         .zpass = STENCIL_REPLACE));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));

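   /* Added note (not in the original source): the per-MRT clear colors are
    * handed to the clear shader as fragment-shader constants, one vec4 per
    * render target being cleared, via the CP_LOAD_STATE6_FRAG packet below;
    * the shader set up by r3d_common() presumably reads those constants and
    * writes one to each enabled color output.
    */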
   unsigned num_rts = util_bitcount(clear_rts);
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   u_foreach_bit(b, clear_rts)
      tu_cs_emit_array(cs, clear_value[b], 4);

   for (uint32_t i = 0; i < rect_count; i++) {
      /* This should be true because of this valid usage for
       * vkCmdClearAttachments:
       *
       *    "If the render pass instance this is recorded in uses multiview,
       *    then baseArrayLayer must be zero and layerCount must be one"
       */
      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);

      /* a630 doesn't support multiview masks, which means that we can't use
       * the normal multiview path without potentially recompiling a shader
       * on-demand or using a more complicated variant that takes the mask as
       * a const. Just use the layered path instead, since it shouldn't be
       * much worse.
       */
      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
         r3d_coords_raw(cs, (float[]) {
            rects[i].rect.offset.x, rects[i].rect.offset.y,
            z_clear_val, uif(rects[i].baseArrayLayer + layer),
            rects[i].rect.offset.x + rects[i].rect.extent.width,
            rects[i].rect.offset.y + rects[i].rect.extent.height,
            z_clear_val, 1.0f,
         });
         r3d_run(cmd, cs);
      }
   }

   trace_end_sysmem_clear_all(&cmd->trace,
                              cs, mrt_count, rect_count);
}

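/* Added note (not in the original source): GMEM-path clears go through the
 * CP_EVENT_WRITE::BLIT event, which takes its clear color from the
 * RB_BLIT_CLEAR_COLOR_DW0..3 registers. Unlike the 2D/3D paths, the value has
 * to be packed into the attachment's actual bit layout up front, which is
 * what pack_gmem_clear_value() below does.
 */
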
static void
pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
{
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case VK_FORMAT_D16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case VK_FORMAT_D32_SFLOAT:
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (vk_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (vk_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (vk_format_is_int(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (vk_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (vk_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}

static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      VkFormat format,
                      uint8_t clear_mask,
                      uint32_t gmem_offset,
                      const VkClearValue *value)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
   tu_cs_emit(cs, gmem_offset);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
   tu_cs_emit(cs, 0);

   uint32_t clear_vals[4] = {};
   pack_gmem_clear_value(value, format, clear_vals);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
   tu_cs_emit_array(cs, clear_vals, 4);

   tu6_emit_event_write(cmd, cs, BLIT);
}

static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              uint32_t attachment,
                              VkImageAspectFlags mask,
                              const VkClearValue *value)
{
   const struct tu_render_pass_attachment *att =
      &cmd->state.pass->attachments[attachment];

   trace_start_gmem_clear(&cmd->trace, cs);

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
      return;
   }

   clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);

   trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
}

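/* Added note (not in the original source): the clear_mask passed to
 * clear_gmem_attachment() is a per-component write mask for the blit event.
 * The separate D32/S8 planes above can always use 0xf, while
 * aspect_write_mask() presumably narrows the mask for packed depth/stencil
 * formats such as D24_UNORM_S8_UINT so that a depth-only or stencil-only
 * clear leaves the other aspect's bits untouched.
 */
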
static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                          uint32_t attachment_count,
                          const VkClearAttachment *attachments,
                          uint32_t rect_count,
                          const VkClearRect *rects)
{
   const struct tu_subpass *subpass = cmd->state.subpass;
   struct tu_cs *cs = &cmd->draw_cs;

   /* TODO: swap the loops for smaller cmdstream */
   for (unsigned i = 0; i < rect_count; i++) {
      unsigned x1 = rects[i].rect.offset.x;
      unsigned y1 = rects[i].rect.offset.y;
      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
      unsigned y2 = y1 + rects[i].rect.extent.height - 1;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));

      for (unsigned j = 0; j < attachment_count; j++) {
         uint32_t a;
         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
         else
            a = subpass->depth_stencil_attachment.attachment;

         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                       &attachments[j].clearValue);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
                       uint32_t attachmentCount,
                       const VkClearAttachment *pAttachments,
                       uint32_t rectCount,
                       const VkClearRect *pRects)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* The sysmem path behaves like a draw, and we don't have a way of using
    * different flushes for sysmem/gmem, so this needs to be outside of the
    * cond_exec.
    */
   tu_emit_cache_flush_renderpass(cmd, cs);

   for (uint32_t j = 0; j < attachmentCount; j++) {
      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
         continue;
      cmd->state.lrz.valid = false;
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }

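   /* Added explanation (not in the original source): the depth clear here is
    * done with a draw (or blit event) that bypasses the LRZ buffer, so any
    * previously built LRZ data no longer matches the depth attachment's
    * contents and has to be marked invalid before the next draw.
    */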
   /* vkCmdClearAttachments is supposed to respect the predicate if active.
    * The easiest way to do this is to always use the 3d path, which always
    * works even with GMEM because it's just a simple draw using the existing
    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
    * skipped in the binning pass, since otherwise they produce binning data
    * which isn't consumed and leads to the wrong binning data being read, so
    * condition on GMEM | SYSMEM.
    */
   if (cmd->state.predication_active) {
      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
      tu_cond_exec_end(cs);
      return;
   }

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
   tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
   tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);
}

static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        VkFormat format,
                        VkImageAspectFlags clear_mask,
                        const VkRenderPassBeginInfo *info,
                        uint32_t a,
                        bool separate_stencil)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
   const struct blit_ops *ops = &r2d_ops;
   if (cmd->state.pass->attachments[a].samples > 1)
      ops = &r3d_ops;

   trace_start_sysmem_clear(&cmd->trace, cs);

   ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled,
              cmd->state.pass->attachments[a].samples);
   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
   ops->clear_value(cs, format, &info->pClearValues[a]);

   for_each_layer(i, clear_views, fb->layers) {
      if (separate_stencil) {
         if (ops == &r3d_ops)
            r3d_dst_stencil(cs, iview, i);
         else
            r2d_dst_stencil(cs, iview, i);
      } else {
         ops->dst(cs, iview, i);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_clear(&cmd->trace, cs,
                          format, ops == &r3d_ops,
                          cmd->state.pass->attachments[a].samples);
}

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, false);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, true);
      }
   } else {
      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
                              info, a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, and therefore barriers
    * aren't required inside the subpass/renderpass. Therefore we need to
    * flush CCU color into CCU depth here, just like with
    * vkCmdClearAttachments(). Note that because this only happens at the
    * beginning of a renderpass, and renderpass writes are considered
    * "incoherent", we shouldn't have to worry about syncing depth into color
    * beforehand as depth should already be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
   } else {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
   }

   if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
      tu_cs_emit_wfi(cs);
}

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
                                 &info->pClearValues[a]);
}

static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      .sample_0 = vk_format_is_int(attachment->format) |
                  vk_format_is_depth_or_stencil(attachment->format)));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (separate_stencil) {
      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
      tu_cs_emit_qw(cs, iview->stencil_base_addr);
      tu_cs_emit(cs, iview->stencil_PITCH);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
   } else {
      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
      tu_cs_image_ref_2d(cs, iview, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
      tu_cs_image_flag_ref(cs, iview, 0);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
   }

   tu6_emit_event_write(cmd, cs, BLIT);
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* blit event can only do resolve for simple cases:
    * averaging samples as unsigned integers or choosing only one sample
    */
   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
      return false;

   /* can't do formats with larger channel sizes
    * note: this includes all float formats
    * note2: single channel integer formats seem OK
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* for unknown reasons blit event can't msaa resolve these formats when tiled
    * likely related to these formats having different layout from other cpp=2 formats
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load)
{
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   trace_start_gmem_load(&cmd->trace, cs);

   if (attachment->load || force_load)
      tu_emit_blit(cmd, cs, iview, attachment, false, false);

   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
      tu_emit_blit(cmd, cs, iview, attachment, false, true);

   trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
}

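/* Added note (not in the original source): the two helpers below implement
 * the GMEM -> memory store at the end of a tile. store_cp_blit() uses the 2D
 * engine (CP_BLIT) with its source pointed directly at the GMEM aperture,
 * while store_3d_blit() falls back to a full-screen draw, which is what
 * tu_store_gmem_attachment() uses when the destination is multisampled.
 */
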
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              VkFormat format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                    iview->ubwc_enabled, true);
   if (separate_stencil)
      r2d_dst_stencil(cs, iview, 0);
   else
      r2d_dst(cs, iview, 0);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
                      .tile_mode = TILE6_2,
                      .srgb = vk_format_is_srgb(format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !vk_format_is_int(format) &&
                                         !vk_format_is_depth_or_stencil(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE(.width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t dst_samples,
              bool separate_stencil,
              VkFormat format,
              const VkRect2D *render_area,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
             iview->ubwc_enabled, dst_samples);

   r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

   if (separate_stencil)
      r3d_dst_stencil(cs, iview, 0);
   else
      r3d_dst(cs, iview, 0);

   r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp);

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   r3d_run(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 can be unaligned if equal to the size of the image, since it will
    * write into padding space. The one exception is linear levels which don't
    * have the required y padding in the layout (except for the last level).
    */
   bool need_y2_align =
      y2 != iview->extent.height || iview->need_y2_align;

   bool unaligned =
      x1 % phys_dev->info->gmem_align_w ||
      (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||
      y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);

   /* D32_SFLOAT_S8_UINT is a quite special format: it has two planes, one for
    * depth and the other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   trace_start_gmem_store(&cmd->trace, cs);

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
      if (dst->store)
         tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);
      if (dst->store_stencil)
         tu_emit_blit(cmd, cs, iview, src, true, true);

      trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
      return;
   }

   VkFormat format = src->format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
      format = VK_FORMAT_D32_SFLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (dst->store || dst->store_stencil)
         tu_disable_draw_states(cmd, cs);

      if (dst->store) {
         store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format,
                       render_area, src->gmem_offset, src->cpp);
      }
      if (dst->store_stencil) {
         store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT,
                       render_area, src->gmem_offset_stencil, src->samples);
      }
   } else {
      r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

      if (dst->store) {
         store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,
                       src->gmem_offset, src->cpp);
      }
      if (dst->store_stencil) {
         store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
                       src->gmem_offset_stencil, src->samples);
      }
   }

   trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
}