/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"

#include "vk_format.h"
#include "vk_util.h"

#include "tu_cs.h"

#include "tu_tracepoints.h"

/* Emit a CP_EVENT_WRITE packet for the given event. Timestamp-style (*_TS)
 * events additionally write a seqno value; we point them at the dummy seqno
 * slot in the global BO since we only care about the event itself here.
 */
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event)
{
   bool need_seqno = false;
   switch (event) {
   case CACHE_FLUSH_TS:
   case WT_DONE_TS:
   case RB_DONE_TS:
   case PC_CCU_FLUSH_DEPTH_TS:
   case PC_CCU_FLUSH_COLOR_TS:
   case PC_CCU_RESOLVE_TS:
      need_seqno = true;
      break;
   default:
      break;
   }

   /* TS events take 3 extra dwords: 64-bit address + 32-bit value. */
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
   if (need_seqno) {
      tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
      tu_cs_emit(cs, 0);
   }
}

/* Emit the flush/invalidate events and CP stalls requested in 'flushes'.
 * Debug flags can force extra flushing (FLUSHALL) or draw serialization
 * (SYNCDRAW) for bug triage.
 */
static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                 struct tu_cs *cs,
                 enum tu_cmd_flush_bits flushes)
{
   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL))
      flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;

   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW))
      flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
                 TU_CMD_FLAG_WAIT_FOR_IDLE |
                 TU_CMD_FLAG_WAIT_FOR_ME;

   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
                  TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                  TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
   if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
   if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   /* On parts with the CCU flush bug, a CCU flush also requires a WFI even
    * when the caller didn't ask for one.
    */
   if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) ||
       (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug &&
        (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH))))
      tu_cs_emit_wfi(cs);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}

/* "Normal" cache flushes, that don't require any special handling */

static void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
                    struct tu_cs *cs)
{
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
   cmd_buffer->state.cache.flush_bits = 0;
}

/* Renderpass cache flushes */

void
tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                               struct tu_cs *cs)
{
   /* Early-out only when no flushes are pending AND no debug flags are set,
    * since debug flags can add flushes inside tu6_emit_flushes().
    */
   if (!cmd_buffer->state.renderpass_cache.flush_bits &&
       likely(!cmd_buffer->device->physical_device->instance->debug_flags))
      return;
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
   cmd_buffer->state.renderpass_cache.flush_bits = 0;
}

/* Cache flushes for things that use the color/depth read/write path (i.e.
 * blits and draws). This deals with changing CCU state as well as the usual
 * cache flushing.
 */

void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state)
{
   enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;

   assert(ccu_state != TU_CMD_CCU_UNKNOWN);

   /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
    * the CCU may also contain data that we haven't flushed out yet, so we
    * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
    * emit a WFI as it isn't pipelined.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
         flushes |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH;
         /* The flush we're about to do satisfies any pending flush, so clear
          * the corresponding pending bits.
          */
         cmd_buffer->state.cache.pending_flush_bits &= ~(
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH);
      }
      flushes |=
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE;
      cmd_buffer->state.cache.pending_flush_bits &= ~(
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE);
   }

   tu6_emit_flushes(cmd_buffer, cs, flushes);
   cmd_buffer->state.cache.flush_bits = 0;

   if (ccu_state != cmd_buffer->state.ccu_state) {
      struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
      /* Select the GMEM-mode or bypass-mode CCU color offset. */
      tu_cs_emit_regs(cs,
                      A6XX_RB_CCU_CNTL(.color_offset =
                                          ccu_state == TU_CMD_CCU_GMEM ?
                                          phys_dev->ccu_offset_gmem :
                                          phys_dev->ccu_offset_bypass,
                                       .gmem = ccu_state == TU_CMD_CCU_GMEM));
      cmd_buffer->state.ccu_state = ccu_state;
   }
}

/* Emit depth/stencil buffer state for the subpass's depth-stencil
 * attachment, or program "no depth/stencil" when it is unused.
 */
static void
tu6_emit_zs(struct tu_cmd_buffer *cmd,
            const struct tu_subpass *subpass,
            struct tu_cs *cs)
{
   const uint32_t a = subpass->depth_stencil_attachment.attachment;
   if (a == VK_ATTACHMENT_UNUSED) {
      tu_cs_emit_regs(cs,
                      A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
                      A6XX_RB_DEPTH_BUFFER_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_BASE(0),
                      A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));

      return;
   }

   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
   enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
   tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
   tu_cs_image_ref(cs, iview, 0);
   tu_cs_emit(cs, attachment->gmem_offset);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
   tu_cs_image_flag_ref(cs, iview, 0);

   /* LRZ (low-resolution Z) buffer lives alongside the image in its BO. */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo,
                                                 .bo_offset = iview->image->bo_offset + iview->image->lrz_offset),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
       attachment->format == VK_FORMAT_S8_UINT) {

      /* Stencil is stored as a separate plane; D32S8 has a dedicated
       * stencil gmem offset, plain S8 reuses the main image ref.
       */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
      tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         tu_cs_image_stencil_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset_stencil);
      } else {
         tu_cs_image_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset);
      }
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_STENCIL_INFO(0));
   }
}

/* Emit color attachment (MRT) state for the subpass: buffer info/refs,
 * sRGB control, max layer index, and the feedback-loop workaround.
 */
static void
tu6_emit_mrt(struct tu_cmd_buffer *cmd,
             const struct tu_subpass *subpass,
             struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = cmd->state.attachments[a];

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
      tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
      tu_cs_image_ref(cs, iview, 0);
      tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);

      tu_cs_emit_regs(cs,
                      A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
      tu_cs_image_flag_ref(cs, iview, 0);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
   tu_cs_emit_regs(cs,
                   A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));

   /* With multiview, the layer count comes from the highest set view bit. */
   unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
   tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   /* If there is a feedback loop, then the shader can read the previous value
    * of a pixel being written out. It can also write some components and then
    * read different components without a barrier in between. This is a
    * problem in sysmem mode with UBWC, because the main buffer and flags
    * buffer can get out-of-sync if only one is flushed. We fix this by
    * setting the SINGLE_PRIM_MODE field to the same value that the blob does
    * for advanced_blend in sysmem mode if a feedback loop is detected.
    */
   if (subpass->feedback) {
      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
      tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                           A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
                           A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(
                              FLUSH_PER_OVERLAP_AND_OVERWRITE));
      tu_cond_exec_end(cs);
   }
}

/* Emit MSAA sample-count state. MSAA is disabled for single-sample
 * rendering and for Bresenham line rasterization.
 */
void
tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples,
              enum a5xx_line_mode line_mode)
{
   const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
   bool msaa_disable = (samples == MSAA_ONE) || (line_mode == BRESENHAM);

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_RAS_MSAA_CNTL(samples),
                   A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
                                             .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_RAS_MSAA_CNTL(samples),
                   A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
                                            .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_RAS_MSAA_CNTL(samples),
                   A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
                                          .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(samples));
}

/* Program the GMEM bin (tile) size plus mode flags into the three
 * bin-control registers.
 */
static void
tu6_emit_bin_size(struct tu_cs *cs,
                  uint32_t bin_w, uint32_t bin_h, uint32_t flags)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
                                         .binh = bin_h,
                                         .dword = flags));

   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL(.binw = bin_w,
                                       .binh = bin_h,
                                       .dword = flags));

   /* no flag for RB_BIN_CONTROL2... */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL2(.binw = bin_w,
                                        .binh = bin_h));
}

/* Emit RB_RENDER_CNTL for either the binning pass or the rendering pass,
 * including per-MRT and depth UBWC flag-buffer enables. On parts with
 * CP_REG_WRITE support the write goes through the RENDER_CNTL tracker.
 */
static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
                     const struct tu_subpass *subpass,
                     struct tu_cs *cs,
                     bool binning)
{
   /* doesn't RB_RENDER_CNTL set differently for binning pass: */
   bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
   uint32_t cntl = 0;
   cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2);
   if (binning) {
      if (no_track)
         return;
      cntl |= A6XX_RB_RENDER_CNTL_BINNING;
   } else {
      uint32_t mrts_ubwc_enable = 0;
      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            mrts_ubwc_enable |= 1 << i;
      }

      cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
      }

      if (no_track) {
         tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
         tu_cs_emit(cs, cntl);
         return;
      }

      /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
       * in order to set it correctly for the different subpasses. However,
       * that means the packets we're emitting also happen during binning. So
       * we need to guard the write on !BINNING at CP execution time.
       */
      tu_cs_reserve(cs, 3 + 4);
      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                     CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
   }

   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
   tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
   tu_cs_emit(cs, cntl);
}

/* Emit the blit scissor covering the render area, optionally aligned out to
 * GMEM alignment granules (needed for GMEM load/store blits).
 */
static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Avoid assertion fails with an empty render area at (0, 0) where the
    * subtraction below wraps around. Empty render areas should be forced to
    * the sysmem path by use_sysmem_rendering(). It's not even clear whether
    * an empty scissor here works, and the blob seems to force sysmem too as
    * it sets something wrong (non-empty) for the scissor.
    */
   if (render_area->extent.width == 0 ||
       render_area->extent.height == 0)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width - 1;
   uint32_t y2 = y1 + render_area->extent.height - 1;

   if (align) {
      /* Round (x1, y1) down and (x2, y2) up to the GMEM alignment. */
      x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
      y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
      x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
      y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}

/* Emit the window scissor (inclusive coordinates) for both the rasterizer
 * and the 2D resolve engine.
 */
void
tu6_emit_window_scissor(struct tu_cs *cs,
                        uint32_t x1,
                        uint32_t y1,
                        uint32_t x2,
                        uint32_t y2)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
                   A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}

/* Emit the per-tile window offset into all four blocks that need it. */
void
tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
}

void
tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                  uint32_t *rb_depth_cntl)
{
   if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk)
      return;

   /* On some GPUs it is necessary to enable z test for depth bounds test when
    * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to
    * pass z test. Relevant tests:
    * dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
    * dEQP-VK.dynamic_state.ds_state.depth_bounds_1
    */
   *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
                     A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS);
}

/* Emit one CP_SET_DRAW_STATE group, choosing which render modes
 * (GMEM/SYSMEM/BINNING) the group is enabled for based on its id.
 */
static void
tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
{
   uint32_t enable_mask;
   switch (id) {
   case TU_DRAW_STATE_PROGRAM:
   case TU_DRAW_STATE_VI:
   case TU_DRAW_STATE_FS_CONST:
   /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
    * when resources would actually be used in the binning shader.
    * Presumably the overhead of prefetching the resources isn't
    * worth it.
    */
   case TU_DRAW_STATE_DESC_SETS_LOAD:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   case TU_DRAW_STATE_PROGRAM_BINNING:
   case TU_DRAW_STATE_VI_BINNING:
      enable_mask = CP_SET_DRAW_STATE__0_BINNING;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
      enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   default:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM |
                    CP_SET_DRAW_STATE__0_BINNING;
      break;
   }

   STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);

   /* We need to reload the descriptors every time the descriptor sets
    * change. However, the commands we send only depend on the pipeline
    * because the whole point is to cache descriptors which are used by the
    * pipeline. There's a problem here, in that the firmware has an
    * "optimization" which skips executing groups that are set to the same
    * value as the last draw. This means that if the descriptor sets change
    * but not the pipeline, we'd try to re-execute the same buffer which
    * the firmware would ignore and we wouldn't pre-load the new
    * descriptors. Set the DIRTY bit to avoid this optimization
    */
   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;

   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
                  enable_mask |
                  CP_SET_DRAW_STATE__0_GROUP_ID(id) |
                  COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
   tu_cs_emit_qw(cs, state.iova);
}

/* Decide whether GMEM rendering should use hardware binning. */
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
    * with non-hw binning GMEM rendering. this is required because some of the
    * XFB commands need to only be executed once
    */
   if (cmd->state.xfb_used)
      return true;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
      return false;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
      return true;

   /* Binning only pays off with more than two tiles. */
   return (fb->tile_count.width * fb->tile_count.height) > 2;
}

/* Decide whether the renderpass must (or is forced to) render directly to
 * system memory instead of GMEM.
 */
static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd)
{
   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
      return true;

   /* can't fit attachments into gmem */
   if (!cmd->state.pass->gmem_pixels)
      return true;

   if (cmd->state.framebuffer->layers > 1)
      return true;

   /* Use sysmem for empty render areas */
   if (cmd->state.render_area.extent.width == 0 ||
       cmd->state.render_area.extent.height == 0)
      return true;

   if (cmd->state.has_tess)
      return true;

   if (cmd->state.disable_gmem)
      return true;

   return false;
}

/* Set up scissor/window-offset and visibility-stream state for rendering a
 * single tile (tx, ty), using the visibility data produced by the binning
 * pass when HW binning is enabled.
 */
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));

   const uint32_t x1 = fb->tile0.width * tx;
   const uint32_t y1 = fb->tile0.height * ty;
   const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
   const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
   tu6_emit_window_offset(cs, x1, y1);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);

      /* Point the CP at this tile's slot in the visibility streams. */
      tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
      tu_cs_emit(cs, fb->pipe_sizes[pipe] |
                     CP_SET_BIN_DATA5_0_VSC_N(slot));
      tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
      tu_cs_emit(cs, pipe * 4);
      tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);

      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   } else {
      /* No visibility data: override so all draws are executed. */
      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x1);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   }
}

/* Resolve one attachment pair (gmem_a -> a) through the sysmem blit path. */
static void
tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t layer_mask,
                        uint32_t a,
                        uint32_t gmem_a)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *dst = cmd->state.attachments[a];
   const struct tu_image_view *src = cmd->state.attachments[gmem_a];

   tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
}

/* Emit end-of-subpass multisample resolves for the sysmem rendering path. */
static void
tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         const struct tu_subpass *subpass)
{
   if (subpass->resolve_attachments) {
      /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
       * Commands":
       *
       *    End-of-subpass multisample resolves are treated as color
       *    attachment writes for the purposes of synchronization.
       *    This applies to resolve operations for both color and
       *    depth/stencil attachments. That is, they are considered to
       *    execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
       *    pipeline stage and their writes are synchronized with
       *    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
       *    rendering within a subpass and any resolve operations at the end
       *    of the subpass occurs automatically, without need for explicit
       *    dependencies or pipeline barriers. However, if the resolve
       *    attachment is also used in a different subpass, an explicit
       *    dependency is needed.
       *
       * We use the CP_BLIT path for sysmem resolves, which is really a
       * transfer command, so we have to manually flush similar to the gmem
       * resolve case. However, a flush afterwards isn't needed because of the
       * last sentence and the fact that we're in sysmem mode.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      if (subpass->resolve_depth_stencil)
         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);

      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      /* Wait for the flushes to land before using the 2D engine */
      tu_cs_emit_wfi(cs);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
      }
   }
}

/* Emit the end-of-renderpass GMEM stores: write back every attachment that
 * lives in GMEM, plus any resolve attachments of the last subpass.
 */
static void
tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_render_pass *pass = cmd->state.pass;
   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t a = 0; a < pass->attachment_count; ++a) {
      /* A negative gmem_offset means the attachment isn't stored in GMEM. */
      if (pass->attachments[a].gmem_offset >= 0)
         tu_store_gmem_attachment(cmd, cs, a, a);
   }

   if (subpass->resolve_attachments) {
      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a != VK_ATTACHMENT_UNUSED) {
            uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
         }
      }
   }
}

/* Disable all CP_SET_DRAW_STATE groups and mark draw state dirty so it gets
 * fully re-emitted on the next draw.
 */
void
tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                  CP_SET_DRAW_STATE__0_GROUP_ID(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
}

/* Emit the one-time static hardware setup at the start of a command buffer:
 * cache invalidation, a long list of "magic"/default register values, border
 * color tables, and VSC (visibility stream) buffer setup.
 */
static void
tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_device *dev = cmd->device;
   const struct tu_physical_device *phys_dev = dev->physical_device;

   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .cs_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu_cs_emit_wfi(cs);

   /* The invalidate + WFI we just emitted satisfy these pending bits. */
   cmd->state.cache.pending_flush_bits &=
      ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);

   tu_cs_emit_regs(cs,
                   A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
   cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
   /* Many of the writes below are unexplained ("UNKNOWN") default values
    * carried over from the blob / gallium driver.
    */
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL,
                        phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, 0x00000410);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);

   /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL,
                        0x000000a0 |
                        A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);

   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);

   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
   tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */

   tu_disable_draw_states(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                     .bo_offset = gb_offset(bcolor_builtin)));
   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                        .bo_offset = gb_offset(bcolor_builtin)));

   /* VSC buffers:
    * use vsc pitches from the largest values used so far with this device
    * if there hasn't been overflow, there will already be a scratch bo
    * allocated for these sizes
    *
    * if overflow is detected, the stream size is increased by 2x
    */
   mtx_lock(&dev->mutex);

   struct tu6_global *global = dev->global_bo.map;

   uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
   uint32_t vsc_prim_overflow = global->vsc_prim_overflow;

   if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
      dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
      dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
   cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;

   mtx_unlock(&dev->mutex);

   struct tu_bo *vsc_bo;
   /* Layout in the scratch BO: prim streams, then draw streams, then the
    * per-pipe draw stream size words.
    */
   uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
                    cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;

   tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
                                              .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));

   tu_cs_sanity_check(cs);
}

/* Program the VSC bin size/count, pipe configuration, and stream pitches
 * and limits for the binning pass.
 */
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
                                     .height = fb->tile0.height));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
                                      .ny = fb->tile_count.height));

   tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
   tu_cs_emit_array(cs, fb->pipe_config, 32);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
                   A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
                   A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
}

/* After binning, record into the global BO whether any VSC stream came close
 * to overflowing (size >= pitch - VSC_PAD), using CP_COND_WRITE5 per pipe.
 * tu6_init_hw() reads these values back to grow the streams next time.
 */
static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const uint32_t used_pipe_count =
      fb->pipe_count.width * fb->pipe_count.height;

   for (int i = 0; i < used_pipe_count; i++) {
      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));

      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
}

/* Emit the HW binning pass: run the draw commands in binning mode to build
 * the per-tile visibility streams, then test for VSC overflow.
 */
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_regs(cs,
                   A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));

   update_vsc_pipe(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2C);

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));

   trace_start_binning_ib(&cmd->trace, cs);

   /* emit IB to binning drawcmds: */
   tu_cs_emit_call(cs, &cmd->draw_cs);

   trace_end_binning_ib(&cmd->trace, cs);

   /* switching from binning pass to GMEM pass will cause a switch from
    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
    * so make sure these states are re-emitted
    * (eventually these states shouldn't exist at all with shader prologue)
    * only VS and GS are invalidated, as FS isn't emitted in binning pass,
    * and we don't use HW binning when tesselation is used
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE |
                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read the
    * visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
    * part of draws).
    */
   tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   emit_vsc_overflow_test(cmd, cs);

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);
}

static struct tu_draw_state
tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
                          const struct tu_subpass *subpass,
                          bool gmem)
{
   /* note: we can probably emit input attachments just once for the whole
    * renderpass, this would avoid emitting both sysmem/gmem versions
    *
    * emit two texture descriptors for each input, as a workaround for
    * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
    * in the pair
    * TODO: a smarter workaround
    */

   if (!subpass->input_count)
      return (struct tu_draw_state) {};

   struct tu_cs_memory texture;
   VkResult result =
      tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
                  A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      /* defer the failure; recording continues and the error is reported
       * from record_result later
       */
      cmd->record_result = result;
      return (struct tu_draw_state) {};
   }

   /* two descriptors per input attachment — see the comment at the top of
    * this function; odd descriptors (i % 2 == 1) are the integer/stencil
    * aliases
    */
   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
      uint32_t a = subpass->input_attachments[i / 2].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = cmd->state.attachments[a];
      const struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[a];
      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
      uint32_t gmem_offset = att->gmem_offset;
      uint32_t cpp = att->cpp;

      /* start from the image view's regular sysmem descriptor */
      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);

      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
         /* note this works because spec says fb and input attachments
          * must use identity swizzle
          */
         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
         if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
            /* no native z24uint_s8uint: read the packed texel as rgba8 uint
             * and swizzle the stencil byte (W) into X
             */
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         } else {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         }
      }

      if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* d32s8 keeps stencil in a separate plane: retarget the descriptor
          * at the stencil base address/pitch from the image view
          */
         dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
         dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
         dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
         dst[3] = 0;
         dst[4] = iview->stencil_base_addr;
         dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;

         /* stencil is one byte per sample, so bytes-per-pixel in GMEM is
          * just the sample count
          */
         cpp = att->samples;
         gmem_offset = att->gmem_offset_stencil;
      }

      if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
         continue;

      /* patched for gmem */
      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
      dst[2] =
         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
         A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
      dst[3] = 0;
      dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
      /* NOTE(review): this inner `i` shadows the outer loop counter; safe
       * because the outer `i` isn't used again in this iteration, but a
       * distinct name would be clearer
       */
      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
         dst[i] = 0;
   }

   struct tu_cs cs;
   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);

   /* point the FS texture state at the descriptor array built above */
   tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                   CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
   tu_cs_emit_qw(&cs, texture.iova);

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));

   assert(cs.cur == cs.end); /* validate draw state size */

   return ds;
}

/* Bind both the GMEM and sysmem variants of the subpass's input-attachment
 * descriptors as draw states; the hardware selects the right one at replay
 * time.  (continues past this chunk)
 */
static void
tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
                         tu_emit_input_attachments(cmd, subpass, true));
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
                         tu_emit_input_attachments(cmd, subpass,
                                                   false));
}

/* Emit the render pass begin work into draw_cs: GMEM loads/clears under a
 * GMEM conditional-exec block, and sysmem clears under a SYSMEM one, so the
 * same stream works for either rendering mode.
 */
static void
tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
                         const VkRenderPassBeginInfo *info)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_load_gmem_attachment(cmd, cs, i, false);

   tu6_emit_blit_scissor(cmd, cs, false);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_gmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_sysmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);
}

/* Set up direct-to-sysmem (bypass) rendering: full-framebuffer scissor,
 * bin size 0 with BUFFERS_IN_SYSMEM, and sysmem CCU state.
 */
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   assert(fb->width > 0 && fb->height > 0);
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
   tu6_emit_window_offset(cs, 0, 0);

   tu6_emit_bin_size(cs, 0, 0,
                     A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM));

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   /* enable stream-out, with sysmem there is only one pass: */
   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_sanity_check(cs);
}

/* Finish sysmem rendering: final-subpass resolves, the draw epilogue, and
 * an LRZ flush.
 */
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
    */
   tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_sanity_check(cs);
}

/* Set up tiled (GMEM) rendering, optionally running the hardware binning
 * pass first when use_hw_binning() says so.  (continues past this chunk)
 */
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);

   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   if (use_hw_binning(cmd)) {
      /* enable stream-out during binning pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) |
                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));

      tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

      tu6_emit_binning_pass(cmd, cs);

      /* and disable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS |
                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));

      tu_cs_emit_regs(cs,
                      A6XX_VFD_MODE_CNTL(0));

      tu_cs_emit_regs(cs,
                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

      tu_cs_emit_regs(cs,
                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      tu_cs_emit(cs, 0x1);
   } else {
      /* no binning pass, so enable stream-out for draw pass:: */
tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false)); 1286 1287 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 1288 A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); 1289 } 1290 1291 tu_cs_sanity_check(cs); 1292} 1293 1294static void 1295tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1296{ 1297 tu_cs_emit_call(cs, &cmd->draw_cs); 1298 1299 if (use_hw_binning(cmd)) { 1300 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); 1301 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); 1302 } 1303 1304 tu_cs_emit_call(cs, &cmd->tile_store_cs); 1305 1306 if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) { 1307 tu_cs_emit_wfi(cs); 1308 tu_cs_emit_pkt7(&cmd->cs, CP_WAIT_FOR_ME, 0); 1309 u_trace_clone_append(cmd->trace_renderpass_start, 1310 cmd->trace_renderpass_end, 1311 &cmd->trace, 1312 cs, tu_copy_timestamp_buffer); 1313 } 1314 1315 tu_cs_sanity_check(cs); 1316} 1317 1318static void 1319tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1320{ 1321 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); 1322 1323 tu_cs_emit_regs(cs, 1324 A6XX_GRAS_LRZ_CNTL(0)); 1325 1326 tu6_emit_event_write(cmd, cs, LRZ_FLUSH); 1327 1328 tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS); 1329 1330 tu_cs_sanity_check(cs); 1331} 1332 1333static void 1334tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) 1335{ 1336 const struct tu_framebuffer *fb = cmd->state.framebuffer; 1337 1338 tu6_tile_render_begin(cmd, &cmd->cs); 1339 1340 uint32_t pipe = 0; 1341 for (uint32_t py = 0; py < fb->pipe_count.height; py++) { 1342 for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) { 1343 uint32_t tx1 = px * fb->pipe0.width; 1344 uint32_t ty1 = py * fb->pipe0.height; 1345 uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width); 1346 uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height); 1347 uint32_t slot = 0; 1348 for (uint32_t ty = ty1; ty < ty2; ty++) { 1349 for (uint32_t tx = tx1; tx < tx2; tx++, slot++) { 1350 
               tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);

               trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
               tu6_render_tile(cmd, &cmd->cs);
               trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
            }
         }
      }
   }

   tu6_tile_render_end(cmd, &cmd->cs);

   trace_end_render_pass(&cmd->trace, &cmd->cs, fb);

   /* the original renderpass trace events were cloned per-tile in
    * tu6_render_tile, so disable the originals
    */
   if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
      u_trace_disable_event_range(cmd->trace_renderpass_start,
                                  cmd->trace_renderpass_end);
}

/* Replay the renderpass once, directly to sysmem (no tiling loop). */
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
{
   tu6_sysmem_render_begin(cmd, &cmd->cs);

   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);

   tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);

   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);

   tu6_sysmem_render_end(cmd, &cmd->cs);

   trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
}

/* Allocate and initialize a new command buffer, linking it into the pool's
 * list (or self-linking pool_link when there is no pool, so list_del in
 * destroy stays safe).  Returns VK_ERROR_OUT_OF_HOST_MEMORY on allocation
 * failure.  (continues past this chunk)
 */
static VkResult
tu_create_cmd_buffer(struct tu_device *device,
                     struct tu_cmd_pool *pool,
                     VkCommandBufferLevel level,
                     VkCommandBuffer *pCommandBuffer)
{
   struct tu_cmd_buffer *cmd_buffer;

   cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8,
                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, NULL, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   if (pool) {
      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
      cmd_buffer->queue_family_index = pool->queue_family_index;

   } else {
      /* Init the pool_link so we can safely call list_del when we destroy
       * the command buffer
       */
      list_inithead(&cmd_buffer->pool_link);
      cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
   }

   u_trace_init(&cmd_buffer->trace, &device->trace_context);

   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);

   *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

/* Tear down a command buffer: unlink from its pool list, release all command
 * streams and trace state, then free the allocation.
 */
static void
tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   tu_cs_finish(&cmd_buffer->cs);
   tu_cs_finish(&cmd_buffer->draw_cs);
   tu_cs_finish(&cmd_buffer->tile_store_cs);
   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
   tu_cs_finish(&cmd_buffer->sub_cs);

   u_trace_fini(&cmd_buffer->trace);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
            cmd_buffer);
}

/* Reset a command buffer back to the INITIAL state: clear record_result,
 * reset every command stream, wipe descriptor state, and re-create the
 * trace context.  Returns record_result (VK_SUCCESS, as it was just reset).
 */
static VkResult
tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->record_result = VK_SUCCESS;

   tu_cs_reset(&cmd_buffer->cs);
   tu_cs_reset(&cmd_buffer->draw_cs);
   tu_cs_reset(&cmd_buffer->tile_store_cs);
   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
   tu_cs_reset(&cmd_buffer->sub_cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
      memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
      /* the memset above also cleared the push set's base object type;
       * restore it
       */
      cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
   }

   u_trace_fini(&cmd_buffer->trace);
   u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

/* vkAllocateCommandBuffers: reuse command buffers from the pool's free list
 * when possible, otherwise create new ones; on any failure, free everything
 * created so far and null out the output array (per the spec, see below).
 * (continues past this chunk)
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         /* recycle a previously freed command buffer */
         struct tu_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = tu_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         /* re-initialize the vk base object for the recycled buffer */
         vk_command_buffer_finish(&cmd_buffer->vk);
         VkResult init_result =
            vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
         if (init_result != VK_SUCCESS)
            result = init_result;

         pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       *    "vkAllocateCommandBuffers can be used to create multiple
       *     command buffers. If the creation of any of those command
       *     buffers fails, the implementation must destroy all
       *     successfully created command buffer objects from this
       *     command, set all entries of the pCommandBuffers array to
       *     NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}

/* vkFreeCommandBuffers: pooled buffers go back on the pool's free list for
 * reuse; pool-less buffers are destroyed outright.
 */
VKAPI_ATTR void VKAPI_CALL
tu_FreeCommandBuffers(VkDevice device,
                      VkCommandPool commandPool,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (cmd_buffer) {
         if (cmd_buffer->pool) {
            list_del(&cmd_buffer->pool_link);
            list_addtail(&cmd_buffer->pool_link,
                         &cmd_buffer->pool->free_cmd_buffers);
         } else
            tu_cmd_buffer_destroy(cmd_buffer);
      }
   }
}

/* vkResetCommandBuffer entry point; thin wrapper over tu_reset_cmd_buffer. */
VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                      VkCommandBufferResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   return tu_reset_cmd_buffer(cmd_buffer);
}

/* Initialize the cache, assuming all necessary flushes have happened but *not*
 * invalidations.
 */
static void
tu_cache_init(struct tu_cache_state *cache)
{
   cache->flush_bits = 0;
   cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}

/* vkBeginCommandBuffer: reset if needed, clear per-recording state, begin
 * the command streams, and pick up inheritance info for secondary buffers.
 * (continues past this chunk)
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been resetted with
       * vkResetCommandBuffer, no need to do it again.
       */
      result = tu_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   cmd_buffer->state.index_size = 0xff; /* dirty restart index */
   cmd_buffer->state.line_mode = RECTANGULAR;

   tu_cache_init(&cmd_buffer->state.cache);
   tu_cache_init(&cmd_buffer->state.renderpass_cache);
   cmd_buffer->usage_flags = pBeginInfo->flags;

   tu_cs_begin(&cmd_buffer->cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->tile_store_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* setup initial configuration into command buffer */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      switch (cmd_buffer->queue_family_index) {
      case TU_QUEUE_GENERAL:
         tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
         break;
      default:
         break;
      }
   } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      assert(pBeginInfo->pInheritanceInfo);

      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
         switch (ext->sType) {
         /* NOTE(review): the default label below sits inside the case's
          * compound statement — legal C and behaviorally identical to a
          * conventional layout, but the braces look unbalanced at a glance
          */
         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
            break;
         default:
            break;
         }
         }
      }

      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
         cmd_buffer->state.subpass =
            &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      } else {
         /* When executing in the middle of another command buffer, the CCU
          * state is unknown.
          */
         cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
      }
   }

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

/* vkCmdBindVertexBuffers: forwarded to the 2EXT variant with no sizes or
 * strides.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                        uint32_t firstBinding,
                        uint32_t bindingCount,
                        const VkBuffer *pBuffers,
                        const VkDeviceSize *pOffsets)
{
   tu_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
                               pBuffers, pOffsets, NULL, NULL);
}

/* vkCmdBindVertexBuffers2EXT: record base/size (and optionally stride) for
 * the given bindings, then rebuild the VFD_FETCH draw state for all MAX_VBS
 * slots from the accumulated state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
                            uint32_t firstBinding,
                            uint32_t bindingCount,
                            const VkBuffer* pBuffers,
                            const VkDeviceSize* pOffsets,
                            const VkDeviceSize* pSizes,
                            const VkDeviceSize* pStrides)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;
   /* TODO: track a "max_vb" value for the cmdbuf to save a bit of memory */
   cmd->state.vertex_buffers.iova = tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * MAX_VBS).iova;

   for (uint32_t i = 0; i < bindingCount; i++) {
      if (pBuffers[i] == VK_NULL_HANDLE) {
         cmd->state.vb[firstBinding + i].base = 0;
         cmd->state.vb[firstBinding + i].size = 0;
      } else {
         struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
         cmd->state.vb[firstBinding + i].base = tu_buffer_iova(buf) + pOffsets[i];
         /* no explicit size means "rest of the buffer from the offset" */
         cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->size - pOffsets[i]);
      }

      if (pStrides)
         cmd->state.vb[firstBinding + i].stride = pStrides[i];
   }

   for (uint32_t i = 0; i < MAX_VBS; i++) {
      tu_cs_emit_regs(&cs,
                      A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
                      A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;

   if (pStrides) {
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
         tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * MAX_VBS).iova;

      for (uint32_t i = 0; i < MAX_VBS; i++)
         tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));

      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }
}

/* vkCmdBindIndexBuffer: record the index buffer address, max index count and
 * element size, emitting a new restart index when the index size changes.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                      VkBuffer buffer,
                      VkDeviceSize offset,
                      VkIndexType indexType)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, buffer);

   uint32_t index_size, index_shift, restart_index;

   switch (indexType) {
   case VK_INDEX_TYPE_UINT16:
      index_size = INDEX4_SIZE_16_BIT;
      index_shift = 1;
      restart_index = 0xffff;
      break;
   case VK_INDEX_TYPE_UINT32:
      index_size = INDEX4_SIZE_32_BIT;
      index_shift = 2;
      restart_index = 0xffffffff;
      break;
   case VK_INDEX_TYPE_UINT8_EXT:
      index_size = INDEX4_SIZE_8_BIT;
      index_shift = 0;
      restart_index = 0xff;
      break;
   default:
      unreachable("invalid VkIndexType");
   }

   /* initialize/update the restart index */
   if (cmd->state.index_size != index_size)
      tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));

   assert(buf->size >= offset);

   cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
   cmd->state.max_index_count = (buf->size - offset) >> index_shift;
   cmd->state.index_size = index_size;
}
/* vkCmdBindDescriptorSets: record the bound sets, patch dynamic
 * uniform/storage descriptors with their dynamic offsets, then emit the
 * bindless base registers (and, for graphics, the DESC_SETS draw state).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipelineLayout _layout,
                         uint32_t firstSet,
                         uint32_t descriptorSetCount,
                         const VkDescriptorSet *pDescriptorSets,
                         uint32_t dynamicOffsetCount,
                         const uint32_t *pDynamicOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   struct tu_descriptor_state *descriptors_state =
      tu_get_descriptors_state(cmd, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned idx = i + firstSet;
      TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);

      descriptors_state->sets[idx] = set;

      for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
         /* update the contents of the dynamic descriptor set */
         unsigned src_idx = j;
         unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
         assert(dyn_idx < dynamicOffsetCount);

         uint32_t *dst =
            &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t *src =
            &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t offset = pDynamicOffsets[dyn_idx];

         /* Patch the storage/uniform descriptors right away. */
         if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
            /* Note: we can assume here that the addition won't roll over and
             * change the SIZE field.
             */
            /* dynamic UBO: 64-bit address lives in dwords 0-1 */
            uint64_t va = src[0] | ((uint64_t)src[1] << 32);
            va += offset;
            dst[0] = va;
            dst[1] = va >> 32;
         } else {
            memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
            /* Note: A6XX_IBO_5_DEPTH is always 0 */
            /* dynamic SSBO: 64-bit address lives in dwords 4-5 */
            uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
            va += offset;
            dst[4] = va;
            dst[5] = va >> 32;
         }
      }
   }
   assert(dyn_idx == dynamicOffsetCount);

   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
   /* one extra slot for the dynamic descriptor set */
   uint64_t addr[MAX_SETS + 1] = {};
   struct tu_cs *cs, state_cs;

   for (uint32_t i = 0; i < MAX_SETS; i++) {
      struct tu_descriptor_set *set = descriptors_state->sets[i];
      if (set)
         addr[i] = set->va | 3;
   }

   if (layout->dynamic_offset_count) {
      /* allocate and fill out dynamic descriptor set */
      struct tu_cs_memory dynamic_desc_set;
      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
   }

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);

      cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
      cs = &state_cs;
   } else {
      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);

      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);

      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
      cs = &cmd->cs;
   }

   /* 5 qword addresses = 10 dwords per register block */
   tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      assert(cs->cur == cs->end); /* validate draw state size */
      /* note: this also avoids emitting draw states before renderpass clears,
       * which may use the 3D clear path (for MSAA cases)
       */
      if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
         tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
         tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      }
   }
}

/* vkCmdPushDescriptorSetKHR: allocate backing storage for the push set from
 * sub_cs, apply the writes, and bind it via tu_CmdBindDescriptorSets.
 * (continues past this chunk)
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
                           VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout,
                           uint32_t _set,
                           uint32_t descriptorWriteCount,
                           const VkWriteDescriptorSet *pDescriptorWrites)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
   struct tu_descriptor_set *set =
      &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;

   struct tu_cs_memory set_mem;
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
                                 A6XX_TEX_CONST_DWORDS, &set_mem);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   /* preserve previous content if the layout is the same: */
   if (set->layout == layout)
      memcpy(set_mem.map, set->mapped_ptr, layout->size);

   set->layout = layout;
   set->mapped_ptr = set_mem.map;
   set->va = set_mem.iova;

   tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
                             descriptorWriteCount, pDescriptorWrites, 0, NULL);

   tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,
                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
                            0, NULL);
}

/* vkCmdPushDescriptorSetWithTemplateKHR: like tu_CmdPushDescriptorSetKHR,
 * but the writes come from an update template + pData blob.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
                                       VkDescriptorUpdateTemplate descriptorUpdateTemplate,
                                       VkPipelineLayout _layout,
                                       uint32_t _set,
                                       const void* pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
   TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);
   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
   struct tu_descriptor_set *set =
      &tu_get_descriptors_state(cmd, templ->bind_point)->push_set;

   struct tu_cs_memory set_mem;
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
                                 A6XX_TEX_CONST_DWORDS, &set_mem);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   /* preserve previous content if the layout is the same: */
   if (set->layout == layout)
      memcpy(set_mem.map, set->mapped_ptr, layout->size);

   set->layout = layout;
   set->mapped_ptr = set_mem.map;
   set->va = set_mem.iova;

   tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);

   tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,
                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
                            0, NULL);
}

/* vkCmdBindTransformFeedbackBuffersEXT: program the VPC_SO buffer base/size
 * registers, remembering the sub-32-byte part of each address so Begin can
 * add it back to the streamout offset.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                      uint32_t firstBinding,
                                      uint32_t bindingCount,
                                      const VkBuffer *pBuffers,
                                      const VkDeviceSize *pOffsets,
                                      const VkDeviceSize *pSizes)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* using COND_REG_EXEC for xfb commands matches the blob behavior
    * presumably there isn't any benefit using a draw state when the
    * condition is (SYSMEM | BINNING)
    */
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   for (uint32_t i = 0; i < bindingCount; i++) {
      TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
      uint64_t iova = buf->bo->iova + pOffsets[i];
      uint32_t size = buf->bo->size - pOffsets[i];
      uint32_t idx = i + firstBinding;

      if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
         size = pSizes[i];

      /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
      uint32_t offset = iova & 0x1f;
      iova &= ~(uint64_t) 0x1f;

      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
      tu_cs_emit_qw(cs, iova);
      tu_cs_emit(cs, size + offset);

      cmd->state.streamout_offset[idx] = offset;
   }

   tu_cond_exec_end(cs);
}

/* vkCmdBeginTransformFeedbackEXT: reset the SO buffer offsets, then for any
 * provided counter buffers load the saved offset from memory via
 * CP_MEM_TO_REG (plus the alignment remainder via CP_REG_RMW).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount,
                                const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   /* TODO: only update offset for active buffers */
   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));

   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
      uint32_t idx = firstCounterBuffer + i;
      uint32_t offset = cmd->state.streamout_offset[idx];
      uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;

      if (!pCounterBuffers[i])
         continue;

      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);

      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
                     CP_MEM_TO_REG_0_UNK31 |
                     CP_MEM_TO_REG_0_CNT(1));
      tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);

      if (offset) {
         /* add the 32-byte-alignment remainder saved at bind time */
         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
                        CP_REG_RMW_0_SRC1_ADD);
         tu_cs_emit(cs, 0xffffffff);
         tu_cs_emit(cs, offset);
      }
   }

   tu_cond_exec_end(cs);
}

/* vkCmdEndTransformFeedbackEXT: flush the SO buffers and (past this chunk)
 * write the final offsets back to the counter buffers.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                              uint32_t firstCounterBuffer,
                              uint32_t counterBufferCount,
                              const VkBuffer *pCounterBuffers,
                              const VkDeviceSize *pCounterBufferOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   /* TODO: only flush buffers that need to be flushed */
   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
      /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
      tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
   }

   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
      uint32_t idx = firstCounterBuffer + i;
      uint32_t offset = cmd->state.streamout_offset[idx];
      uint64_t counter_buffer_offset = pCounterBufferOffsets ?
pCounterBufferOffsets[i] : 0u; 2055 2056 if (!pCounterBuffers[i]) 2057 continue; 2058 2059 TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]); 2060 2061 /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */ 2062 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); 2063 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) | 2064 CP_MEM_TO_REG_0_SHIFT_BY_2 | 2065 0x40000 | /* ??? */ 2066 CP_MEM_TO_REG_0_UNK31 | 2067 CP_MEM_TO_REG_0_CNT(1)); 2068 tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx])); 2069 2070 if (offset) { 2071 tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); 2072 tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) | 2073 CP_REG_RMW_0_SRC1_ADD); 2074 tu_cs_emit(cs, 0xffffffff); 2075 tu_cs_emit(cs, -offset); 2076 } 2077 2078 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); 2079 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) | 2080 CP_REG_TO_MEM_0_CNT(1)); 2081 tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset); 2082 } 2083 2084 tu_cond_exec_end(cs); 2085 2086 cmd->state.xfb_used = true; 2087} 2088 2089VKAPI_ATTR void VKAPI_CALL 2090tu_CmdPushConstants(VkCommandBuffer commandBuffer, 2091 VkPipelineLayout layout, 2092 VkShaderStageFlags stageFlags, 2093 uint32_t offset, 2094 uint32_t size, 2095 const void *pValues) 2096{ 2097 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2098 memcpy((void*) cmd->push_constants + offset, pValues, size); 2099 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; 2100} 2101 2102/* Flush everything which has been made available but we haven't actually 2103 * flushed yet. 
 */
static void
tu_flush_all_pending(struct tu_cache_state *cache)
{
   cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
   cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
}

/* vkEndCommandBuffer: emit the deferred end-of-recording cache flushes into
 * the appropriate command stream (draw_cs inside a render pass, the main cs
 * otherwise), close all sub-streams, and mark the buffer executable.
 * Returns any error recorded earlier during command recording.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);

   /* We currently flush CCU at the end of the command buffer, like
    * what the blob does. There's implicit synchronization around every
    * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
    * know yet if this command buffer will be the last in the submit so we
    * have to defensively flush everything else.
    *
    * TODO: We could definitely do better than this, since these flushes
    * aren't required by Vulkan, but we'd need kernel support to do that.
    * Ideally, we'd like the kernel to flush everything afterwards, so that we
    * wouldn't have to do any flushes here, and when submitting multiple
    * command buffers there wouldn't be any unnecessary flushes in between.
    */
   if (cmd_buffer->state.pass) {
      /* Secondary command buffer recorded inside a render pass: flushes go
       * into the render pass command stream.
       */
      tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
      tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
   } else {
      tu_flush_all_pending(&cmd_buffer->state.cache);
      cmd_buffer->state.cache.flush_bits |=
         TU_CMD_FLAG_CCU_FLUSH_COLOR |
         TU_CMD_FLAG_CCU_FLUSH_DEPTH;
      tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
   }

   /* Finalize every sub-stream so it can be submitted/executed later. */
   tu_cs_end(&cmd_buffer->cs);
   tu_cs_end(&cmd_buffer->draw_cs);
   tu_cs_end(&cmd_buffer->tile_store_cs);
   tu_cs_end(&cmd_buffer->draw_epilogue_cs);

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;

   return cmd_buffer->record_result;
}

/* Allocate a draw-state sub-stream of `size` dwords for dynamic state `id`,
 * record it in cmd->state.dynamic_state[], and (unless the whole draw-state
 * group is dirty and will be re-emitted at draw time anyway) emit a
 * CP_SET_DRAW_STATE packet pointing at it immediately. The returned cs is
 * filled in by the caller.
 */
static struct tu_cs
tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
{
   struct tu_cs cs;

   assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
   cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);

   /* note: this also avoids emitting draw states before renderpass clears,
    * which may use the 3D clear path (for MSAA cases)
    */
   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
      return cs;

   tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);

   return cs;
}

/* vkCmdBindPipeline: bind a compute or graphics pipeline. Compute pipelines
 * are emitted directly into the main cs; graphics pipelines update cached
 * draw state and dirty bits so state is (re-)emitted at the next draw.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
                   VkPipelineBindPoint pipelineBindPoint,
                   VkPipeline _pipeline)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
      cmd->state.compute_pipeline = pipeline;
      tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
      return;
   }

   assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);

   cmd->state.pipeline = pipeline;
   cmd->state.dirty |=
TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | 2188 TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS; 2189 2190 /* note: this also avoids emitting draw states before renderpass clears, 2191 * which may use the 3D clear path (for MSAA cases) 2192 */ 2193 if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { 2194 struct tu_cs *cs = &cmd->draw_cs; 2195 uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT); 2196 2197 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask))); 2198 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); 2199 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); 2200 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); 2201 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state); 2202 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state); 2203 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state); 2204 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state); 2205 2206 u_foreach_bit(i, mask) 2207 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]); 2208 } 2209 2210 if (cmd->state.line_mode != pipeline->line_mode) { 2211 cmd->state.line_mode = pipeline->line_mode; 2212 2213 /* We have to disable MSAA when bresenham lines are used, this is 2214 * a hardware limitation and spec allows it: 2215 * 2216 * When Bresenham lines are being rasterized, sample locations may 2217 * all be treated as being at the pixel center (this may affect 2218 * attribute and depth interpolation). 2219 */ 2220 if (cmd->state.subpass && cmd->state.subpass->samples) { 2221 tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode); 2222 } 2223 } 2224 2225 /* the vertex_buffers draw state always contains all the currently 2226 * bound vertex buffers. 
update its size to only emit the vbs which 2227 * are actually used by the pipeline 2228 * note there is a HW optimization which makes it so the draw state 2229 * is not re-executed completely when only the size changes 2230 */ 2231 if (cmd->state.vertex_buffers.size != pipeline->num_vbs * 4) { 2232 cmd->state.vertex_buffers.size = pipeline->num_vbs * 4; 2233 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS; 2234 } 2235 2236 if ((pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE)) && 2237 cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != pipeline->num_vbs * 2) { 2238 cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = pipeline->num_vbs * 2; 2239 cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE; 2240 } 2241 2242#define UPDATE_REG(X, Y) { \ 2243 /* note: would be better to have pipeline bits already masked */ \ 2244 uint32_t pipeline_bits = pipeline->X & pipeline->X##_mask; \ 2245 if ((cmd->state.X & pipeline->X##_mask) != pipeline_bits) { \ 2246 cmd->state.X &= ~pipeline->X##_mask; \ 2247 cmd->state.X |= pipeline_bits; \ 2248 cmd->state.dirty |= TU_CMD_DIRTY_##Y; \ 2249 } \ 2250 if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y))) \ 2251 cmd->state.dirty &= ~TU_CMD_DIRTY_##Y; \ 2252} 2253 2254 /* these registers can have bits set from both pipeline and dynamic state 2255 * this updates the bits set by the pipeline 2256 * if the pipeline doesn't use a dynamic state for the register, then 2257 * the relevant dirty bit is cleared to avoid overriding the non-dynamic 2258 * state with a dynamic state the next draw. 
 */
   UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL);
   UPDATE_REG(rb_depth_cntl, RB_DEPTH_CNTL);
   UPDATE_REG(rb_stencil_cntl, RB_STENCIL_CNTL);
   UPDATE_REG(pc_raster_cntl, RASTERIZER_DISCARD);
   UPDATE_REG(vpc_unknown_9107, RASTERIZER_DISCARD);
#undef UPDATE_REG

   /* rb_depth_cntl_disable forces depth state to be re-emitted even when the
    * register bits themselves did not change.
    */
   if (pipeline->rb_depth_cntl_disable)
      cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
}

/* vkCmdSetViewport: cache the viewports and re-emit the viewport draw state
 * covering all viewports set so far (max_viewport only ever grows).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetViewport(VkCommandBuffer commandBuffer,
                  uint32_t firstViewport,
                  uint32_t viewportCount,
                  const VkViewport *pViewports)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;

   memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));
   cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);

   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
   tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport);
}

/* vkCmdSetScissor: cache the scissors and re-emit the scissor draw state
 * covering all scissors set so far.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetScissor(VkCommandBuffer commandBuffer,
                 uint32_t firstScissor,
                 uint32_t scissorCount,
                 const VkRect2D *pScissors)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;

   memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));
   cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);

   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);
   tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);
}

/* vkCmdSetLineWidth: the hardware register takes the half-width, hence the
 * division by 2. Only updates the cached GRAS_SU_CNTL value; the register is
 * emitted at draw time via the dirty bit.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
   cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);

   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
}

/* vkCmdSetDepthBias: emitted immediately as a dynamic draw state. */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
                   float depthBiasConstantFactor,
                   float depthBiasClamp,
                   float depthBiasSlopeFactor)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);

   tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
}

/* vkCmdSetBlendConstants: write the four float constants directly into the
 * RB_BLEND_RED_F32..RB_BLEND_ALPHA_F32 registers via a dynamic draw state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
                        const float blendConstants[4])
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);

   tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
   tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
}

/* vkCmdSetDepthBounds: emitted immediately as a dynamic draw state. */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                     float minDepthBounds,
                     float maxDepthBounds)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);

   tu_cs_emit_regs(&cs,
                   A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
                   A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
}

/* Update the low byte (front face) and/or high byte (back face) of a packed
 * 16-bit stencil value (compare mask, write mask, or reference) according to
 * the VkStencilFaceFlags in `face`. `mask` is truncated to 8 bits.
 */
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
{
   if (face & VK_STENCIL_FACE_FRONT_BIT)
      *value = (*value & 0xff00) | (mask & 0xff);
   if (face & VK_STENCIL_FACE_BACK_BIT)
      *value = (*value & 0xff) | (mask & 0xff) << 8;
}

/* vkCmdSetStencilCompareMask: update the packed front/back compare mask and
 * emit it as a dynamic draw state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
                            VkStencilFaceFlags faceMask,
                            uint32_t compareMask)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);

   update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask,
compareMask); 2368 2369 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask)); 2370} 2371 2372VKAPI_ATTR void VKAPI_CALL 2373tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, 2374 VkStencilFaceFlags faceMask, 2375 uint32_t writeMask) 2376{ 2377 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2378 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2); 2379 2380 update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask); 2381 2382 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask)); 2383 2384 cmd->state.dirty |= TU_CMD_DIRTY_LRZ; 2385} 2386 2387VKAPI_ATTR void VKAPI_CALL 2388tu_CmdSetStencilReference(VkCommandBuffer commandBuffer, 2389 VkStencilFaceFlags faceMask, 2390 uint32_t reference) 2391{ 2392 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2393 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2); 2394 2395 update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference); 2396 2397 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref)); 2398} 2399 2400VKAPI_ATTR void VKAPI_CALL 2401tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, 2402 const VkSampleLocationsInfoEXT* pSampleLocationsInfo) 2403{ 2404 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2405 struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9); 2406 2407 assert(pSampleLocationsInfo); 2408 2409 tu6_emit_sample_locations(&cs, pSampleLocationsInfo); 2410} 2411 2412VKAPI_ATTR void VKAPI_CALL 2413tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode) 2414{ 2415 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2416 2417 cmd->state.gras_su_cntl &= 2418 ~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK); 2419 2420 if (cullMode & VK_CULL_MODE_FRONT_BIT) 2421 cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; 2422 if (cullMode & 
VK_CULL_MODE_BACK_BIT) 2423 cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; 2424 2425 cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; 2426} 2427 2428VKAPI_ATTR void VKAPI_CALL 2429tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace) 2430{ 2431 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2432 2433 cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW; 2434 2435 if (frontFace == VK_FRONT_FACE_CLOCKWISE) 2436 cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; 2437 2438 cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; 2439} 2440 2441VKAPI_ATTR void VKAPI_CALL 2442tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer, 2443 VkPrimitiveTopology primitiveTopology) 2444{ 2445 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2446 2447 cmd->state.primtype = tu6_primtype(primitiveTopology); 2448} 2449 2450VKAPI_ATTR void VKAPI_CALL 2451tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, 2452 uint32_t viewportCount, 2453 const VkViewport* pViewports) 2454{ 2455 tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); 2456} 2457 2458VKAPI_ATTR void VKAPI_CALL 2459tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, 2460 uint32_t scissorCount, 2461 const VkRect2D* pScissors) 2462{ 2463 tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors); 2464} 2465 2466VKAPI_ATTR void VKAPI_CALL 2467tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, 2468 VkBool32 depthTestEnable) 2469{ 2470 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2471 2472 cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; 2473 2474 if (depthTestEnable) 2475 cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; 2476 2477 cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; 2478} 2479 2480VKAPI_ATTR void VKAPI_CALL 2481tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, 2482 VkBool32 depthWriteEnable) 2483{ 2484 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2485 2486 cmd->state.rb_depth_cntl &= 
~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 2487 2488 if (depthWriteEnable) 2489 cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 2490 2491 cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; 2492} 2493 2494VKAPI_ATTR void VKAPI_CALL 2495tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, 2496 VkCompareOp depthCompareOp) 2497{ 2498 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2499 2500 cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK; 2501 2502 cmd->state.rb_depth_cntl |= 2503 A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp)); 2504 2505 cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; 2506} 2507 2508VKAPI_ATTR void VKAPI_CALL 2509tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, 2510 VkBool32 depthBoundsTestEnable) 2511{ 2512 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2513 2514 cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; 2515 2516 if (depthBoundsTestEnable) 2517 cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; 2518 2519 cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; 2520} 2521 2522VKAPI_ATTR void VKAPI_CALL 2523tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, 2524 VkBool32 stencilTestEnable) 2525{ 2526 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2527 2528 cmd->state.rb_stencil_cntl &= ~( 2529 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | 2530 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | 2531 A6XX_RB_STENCIL_CONTROL_STENCIL_READ); 2532 2533 if (stencilTestEnable) { 2534 cmd->state.rb_stencil_cntl |= 2535 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | 2536 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | 2537 A6XX_RB_STENCIL_CONTROL_STENCIL_READ; 2538 } 2539 2540 cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL; 2541} 2542 2543VKAPI_ATTR void VKAPI_CALL 2544tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, 2545 VkStencilFaceFlags faceMask, 2546 VkStencilOp failOp, 2547 VkStencilOp passOp, 2548 VkStencilOp depthFailOp, 2549 VkCompareOp compareOp) 2550{ 2551 
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Front-face stencil func/ops live in the low fields of
    * RB_STENCIL_CONTROL; clear then re-set only those fields.
    */
   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));
   }

   /* Back-face state uses the _BF variants of the same fields. */
   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
}

/* vkCmdSetDepthBiasEnableEXT: toggles the POLY_OFFSET bit in the cached
 * GRAS_SU_CNTL value; emitted at draw time via the dirty bit.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer,
                            VkBool32 depthBiasEnable)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
   if (depthBiasEnable)
      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;

   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
}

/* vkCmdSetPrimitiveRestartEnableEXT: only caches the flag; it is consumed
 * when draw packets are built.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 primitiveRestartEnable)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.primitive_restart_enable = primitiveRestartEnable;
}

/* vkCmdSetRasterizerDiscardEnableEXT: discard is controlled by two registers
 * (PC_RASTER_CNTL and VPC_UNKNOWN_9107) which must be kept in sync.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
                                    VkBool32 rasterizerDiscardEnable)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.pc_raster_cntl &= ~A6XX_PC_RASTER_CNTL_DISCARD;
   cmd->state.vpc_unknown_9107 &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
   if (rasterizerDiscardEnable) {
      cmd->state.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
      cmd->state.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
   }

   cmd->state.dirty |= TU_CMD_DIRTY_RASTERIZER_DISCARD;
}

/* Not implemented yet. */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer,
                    VkLogicOp logicOp)
{
   tu_stub();
}

/* Not implemented yet. */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer,
                               uint32_t patchControlPoints)
{
   tu_stub();
}

/* Not implemented yet. */
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
                        uint32_t lineStippleFactor,
                        uint16_t lineStipplePattern)
{
   tu_stub();
}

/* Given a barrier's source accesses (writes that must become visible) and
 * destination accesses (reads/writes that must observe them), accumulate the
 * flush/invalidate bits needed now into cache->flush_bits and remember
 * not-yet-required flushes in cache->pending_flush_bits for later barriers.
 */
static void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask)
{
   enum tu_cmd_flush_bits flush_bits = 0;

   if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
      cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
   }

   if (src_mask & TU_ACCESS_CP_WRITE) {
      /* Flush the CP write queue.
2657 */ 2658 cache->pending_flush_bits |= 2659 TU_CMD_FLAG_WAIT_MEM_WRITES | 2660 TU_CMD_FLAG_ALL_INVALIDATE; 2661 } 2662 2663#define SRC_FLUSH(domain, flush, invalidate) \ 2664 if (src_mask & TU_ACCESS_##domain##_WRITE) { \ 2665 cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \ 2666 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ 2667 } 2668 2669 SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) 2670 SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) 2671 SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) 2672 2673#undef SRC_FLUSH 2674 2675#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \ 2676 if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \ 2677 flush_bits |= TU_CMD_FLAG_##flush; \ 2678 cache->pending_flush_bits |= \ 2679 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ 2680 } 2681 2682 SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) 2683 SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) 2684 2685#undef SRC_INCOHERENT_FLUSH 2686 2687 /* Treat host & sysmem write accesses the same, since the kernel implicitly 2688 * drains the queue before signalling completion to the host. 
2689 */ 2690 if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) { 2691 flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; 2692 } 2693 2694#define DST_FLUSH(domain, flush, invalidate) \ 2695 if (dst_mask & (TU_ACCESS_##domain##_READ | \ 2696 TU_ACCESS_##domain##_WRITE)) { \ 2697 flush_bits |= cache->pending_flush_bits & \ 2698 (TU_CMD_FLAG_##invalidate | \ 2699 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ 2700 } 2701 2702 DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) 2703 DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) 2704 DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) 2705 2706#undef DST_FLUSH 2707 2708#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \ 2709 if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ | \ 2710 TU_ACCESS_##domain##_INCOHERENT_WRITE)) { \ 2711 flush_bits |= TU_CMD_FLAG_##invalidate | \ 2712 (cache->pending_flush_bits & \ 2713 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ 2714 } 2715 2716 DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) 2717 DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) 2718 2719#undef DST_INCOHERENT_FLUSH 2720 2721 cache->flush_bits |= flush_bits; 2722 cache->pending_flush_bits &= ~flush_bits; 2723} 2724 2725static void 2726tu_flush_for_stage(struct tu_cache_state *cache, 2727 enum tu_stage src_stage, enum tu_stage dst_stage) 2728{ 2729 /* As far as we know, flushes take place in the last stage so if there are 2730 * any pending flushes then we have to move down the source stage, because 2731 * the data only becomes available when the flush finishes. In particular 2732 * this can matter when the CP writes something and we need to invalidate 2733 * UCHE to read it. 2734 */ 2735 if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE)) 2736 src_stage = TU_STAGE_PS; 2737 2738 /* Note: if the destination stage is the CP, then the CP also has to wait 2739 * for any WFI's to finish. 
This is already done for draw calls, including
 * before indirect param reads, for the most part, so we just need to WFI.
 *
 * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
 * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
 *
 * Currently we read the draw predicate using CP_MEM_TO_MEM, which
 * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
 * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
 * complete since it's written for DX11 where you can only predicate on the
 * result of a query object. So if we implement 64-bit comparisons in the
 * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
 * comparisons, then this will have to be dealt with.
 */
   if (src_stage > dst_stage)
      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
}

/* Translate a Vulkan access mask into the driver's internal cache-access
 * mask (which caches/agents may be read or written).
 *
 * gmem: true when recording inside a render pass that may execute in GMEM
 * mode; color/depth attachment and transfer accesses are then treated as
 * sysmem accesses instead of CCU accesses (see the comment before the
 * attachment cases below).
 *
 * Note that VK_ACCESS_MEMORY_READ_BIT / VK_ACCESS_MEMORY_WRITE_BIT are
 * folded into every group, since they stand for "all reads"/"all writes".
 */
static enum tu_cmd_access_mask
vk2tu_access(VkAccessFlags flags, bool gmem)
{
   enum tu_cmd_access_mask mask = 0;

   if (flags &
       (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_CP_WRITE;
   }

   /* Host reads see sysmem directly, so a preceding MEMORY_WRITE must be
    * made visible to sysmem readers.
    */
   if (flags &
       (VK_ACCESS_HOST_READ_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_HOST_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_WRITE;
   }

   if (flags &
       (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
        VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
        VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
        /* TODO: Is there a no-cache bit for textures so that we can ignore
         * these?
         */
        VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
        VK_ACCESS_SHADER_READ_BIT | /* Read perfomed by SP/TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   if (flags &
       (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
        VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_UCHE_WRITE;
   }

   /* When using GMEM, the CCU is always flushed automatically to GMEM, and
    * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
    * previous writes in sysmem mode when transitioning to GMEM. Therefore we
    * can ignore CCU and pretend that color attachments and transfers use
    * sysmem directly.
    */

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
        VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
      }
   }

   /* Transfers write through the color CCU (non-incoherent, unlike
    * attachment writes above), except in GMEM mode.
    */
   if (flags &
       (VK_ACCESS_TRANSFER_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_COLOR_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   return mask;
}

/* Map a single Vulkan pipeline-stage bit (exactly one bit set in vk_stage)
 * to the driver's internal pipeline stage.
 *
 * dst: true when the bit comes from a barrier's destination stage mask;
 * some stages map differently depending on whether they are the producer
 * or the consumer side (see the TRANSFER / ALL_* / HOST cases).
 */
static enum tu_stage
vk2tu_single_stage(VkPipelineStageFlags vk_stage, bool dst)
{
   switch (vk_stage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
   case VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT:
   case VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT:
      return TU_STAGE_CP;

   case VK_PIPELINE_STAGE_VERTEX_INPUT_BIT:
      return TU_STAGE_FE;

   case VK_PIPELINE_STAGE_VERTEX_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT:
   case VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT:
      return TU_STAGE_SP_VS;

   case VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT:
   case VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT:
      return TU_STAGE_SP_PS;

   case VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT: /* Yes, really */
   /* See comment in TU_STAGE_GRAS about early fragment tests */
   case VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT:
   case VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT:
      return TU_STAGE_PS;

   case VK_PIPELINE_STAGE_TRANSFER_BIT:
      /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
      return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT:
   case VK_PIPELINE_STAGE_ALL_COMMANDS_BIT:
      /* Be conservative */
      return dst ? TU_STAGE_CP : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_HOST_BIT:
      return dst ?
TU_STAGE_PS : TU_STAGE_CP;
   }

   unreachable("unknown pipeline stage");
}

/* Fold a source stage mask down to the latest (most downstream) internal
 * stage any of its bits maps to. TU_STAGE_CP is the floor, matching
 * TOP_OF_PIPE.
 */
static enum tu_stage
vk2tu_src_stage(VkPipelineStageFlags vk_stages)
{
   enum tu_stage stage = TU_STAGE_CP;
   u_foreach_bit (bit, vk_stages) {
      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
      stage = MAX2(stage, new_stage);
   }

   return stage;
}

/* Fold a destination stage mask down to the earliest (most upstream)
 * internal stage any of its bits maps to. TU_STAGE_PS is the ceiling,
 * matching BOTTOM_OF_PIPE.
 */
static enum tu_stage
vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
{
   enum tu_stage stage = TU_STAGE_PS;
   u_foreach_bit (bit, vk_stages) {
      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
      stage = MIN2(stage, new_stage);
   }

   return stage;
}

/* Splice the command streams of secondary command buffers into the primary.
 * RENDER_PASS_CONTINUE secondaries contribute to draw_cs/draw_epilogue_cs;
 * other secondaries contribute to the main cs. Pending cache flushes are
 * emitted first, and the primary's cache state is conservatively
 * re-initialized afterwards since the secondaries may have performed
 * arbitrary flushes.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCmdBuffers)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   VkResult result;

   assert(commandBufferCount > 0);

   /* Emit any pending flushes. */
   if (cmd->state.pass) {
      tu_flush_all_pending(&cmd->state.renderpass_cache);
      tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
   } else {
      tu_flush_all_pending(&cmd->state.cache);
      tu_emit_cache_flush(cmd, &cmd->cs);
   }

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);

      if (secondary->usage_flags &
          VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         assert(tu_cs_is_empty(&secondary->cs));

         result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
         if (result != VK_SUCCESS) {
            cmd->record_result = result;
            break;
         }

         result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
                                    &secondary->draw_epilogue_cs);
         if (result != VK_SUCCESS) {
            cmd->record_result = result;
            break;
         }

         /* Propagate sticky per-render-pass state from the secondary. */
         if (secondary->state.has_tess)
            cmd->state.has_tess = true;
         if (secondary->state.has_subpass_predication)
            cmd->state.has_subpass_predication = true;
         if (secondary->state.disable_gmem)
            cmd->state.disable_gmem = true;
      } else {
         assert(tu_cs_is_empty(&secondary->draw_cs));
         assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));

         tu_cs_add_entries(&cmd->cs, &secondary->cs);
      }

      cmd->state.index_size = secondary->state.index_size; /* for restart index update */
   }
   cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */

   if (cmd->state.pass) {
      /* After a secondary command buffer is executed, LRZ is not valid
       * until it is cleared again.
       */
      cmd->state.lrz.valid = false;
   }

   /* After executing secondary command buffers, there may have been arbitrary
    * flushes executed, so when we encounter a pipeline barrier with a
    * srcMask, we have to assume that we need to invalidate. Therefore we need
    * to re-initialize the cache with all pending invalidate bits set.
    */
   if (cmd->state.pass) {
      tu_cache_init(&cmd->state.renderpass_cache);
   } else {
      tu_cache_init(&cmd->state.cache);
   }
}

/* vkCreateCommandPool: allocate the pool object and initialize its
 * in-use and free command-buffer lists.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateCommandPool(VkDevice _device,
                     const VkCommandPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkCommandPool *pCmdPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   struct tu_cmd_pool *pool;

   pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                          VK_OBJECT_TYPE_COMMAND_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pAllocator)
      pool->alloc = *pAllocator;
   else
      pool->alloc = device->vk.alloc;

   list_inithead(&pool->cmd_buffers);
   list_inithead(&pool->free_cmd_buffers);

   pool->queue_family_index = pCreateInfo->queueFamilyIndex;

   *pCmdPool = tu_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

/* vkDestroyCommandPool: destroy every command buffer owned by the pool
 * (both live and cached-free) before freeing the pool itself.
 */
VKAPI_ATTR void VKAPI_CALL
tu_DestroyCommandPool(VkDevice _device,
                      VkCommandPool commandPool,
                      const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link)
   {
      tu_cmd_buffer_destroy(cmd_buffer);
   }

   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
                            &pool->free_cmd_buffers, pool_link)
   {
      tu_cmd_buffer_destroy(cmd_buffer);
   }

   vk_object_free(&device->vk, pAllocator, pool);
}

/* vkResetCommandPool: reset every live command buffer in the pool,
 * bailing out on the first failure.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandPool(VkDevice device,
                    VkCommandPool commandPool,
                    VkCommandPoolResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
   VkResult result;

   list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
pool_link)
   {
      result = tu_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

/* vkTrimCommandPool: destroy command buffers sitting on the free list to
 * release their memory; live command buffers are untouched.
 */
VKAPI_ATTR void VKAPI_CALL
tu_TrimCommandPool(VkDevice device,
                   VkCommandPool commandPool,
                   VkCommandPoolTrimFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
                            &pool->free_cmd_buffers, pool_link)
   {
      tu_cmd_buffer_destroy(cmd_buffer);
   }
}

/* Apply a subpass dependency (start barrier) to the appropriate cache
 * state: the external cache for the implicit begin/end dependencies,
 * the render-pass cache otherwise.
 */
static void
tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
                   const struct tu_subpass_barrier *barrier,
                   bool external)
{
   /* Note: we don't know until the end of the subpass whether we'll use
    * sysmem, so assume sysmem here to be safe.
    */
   struct tu_cache_state *cache =
      external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
   enum tu_cmd_access_mask src_flags =
      vk2tu_access(barrier->src_access_mask, false);
   enum tu_cmd_access_mask dst_flags =
      vk2tu_access(barrier->dst_access_mask, false);

   if (barrier->incoherent_ccu_color)
      src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
   if (barrier->incoherent_ccu_depth)
      src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;

   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(barrier->src_stage_mask);
   enum tu_stage dst_stage = vk2tu_dst_stage(barrier->dst_stage_mask);
   tu_flush_for_stage(cache, src_stage, dst_stage);
}

/* vkCmdBeginRenderPass2: record render-pass begin — resolve the attachment
 * views (imageless or from the framebuffer), apply the first subpass's
 * start barrier, set up LRZ (fast-clearing the LRZ buffer when usable),
 * and emit the initial subpass state into draw_cs.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                       const VkRenderPassBeginInfo *pRenderPassBegin,
                       const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
   TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);

   const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo =
      vk_find_struct_const(pRenderPassBegin->pNext,
                           RENDER_PASS_ATTACHMENT_BEGIN_INFO);

   cmd->state.pass = pass;
   cmd->state.subpass = pass->subpasses;
   cmd->state.framebuffer = fb;
   cmd->state.render_area = pRenderPassBegin->renderArea;

   cmd->state.attachments =
      vk_alloc(&cmd->pool->alloc, pass->attachment_count *
               sizeof(cmd->state.attachments[0]), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd->state.attachments) {
      cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return;
   }

   /* Imageless framebuffer: take the views from the pNext chain instead of
    * the framebuffer object.
    */
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      cmd->state.attachments[i] = pAttachmentInfo ?
         tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
         cmd->state.framebuffer->attachments[i].attachment;
   }

   trace_start_render_pass(&cmd->trace, &cmd->cs);

   /* Note: because this is external, any flushes will happen before draw_cs
    * gets called. However deferred flushes could have to happen later as part
    * of the subpass.
    */
   tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
   cmd->state.renderpass_cache.pending_flush_bits =
      cmd->state.cache.pending_flush_bits;
   cmd->state.renderpass_cache.flush_bits = 0;

   if (pass->subpasses[0].feedback_invalidate)
      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;

   /* Track LRZ valid state */
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      struct tu_image *image = cmd->state.attachments[a]->image;
      /* if image has lrz and it isn't a stencil-only clear: */
      if (image->lrz_height &&
          (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
         cmd->state.lrz.image = image;
         cmd->state.lrz.valid = true;
         cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;

         tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);

         /* Clearing writes via CCU color in the PS stage, and LRZ is read via
          * UCHE in the earlier GRAS stage.
          */
         cmd->state.cache.flush_bits |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
            TU_CMD_FLAG_WAIT_FOR_IDLE;
      } else {
         cmd->state.lrz.valid = false;
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }

   cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);

   tu_emit_renderpass_begin(cmd, pRenderPassBegin);

   tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
   tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
   if (cmd->state.subpass->samples)
      tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode);
   tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);

   tu_set_input_attachments(cmd, cmd->state.subpass);
}

/* vkCmdNextSubpass2: resolve the finished subpass's attachments (GMEM and
 * sysmem paths under conditional execution), apply the next subpass's
 * start barrier, and emit the new subpass state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                   const VkSubpassBeginInfo *pSubpassBeginInfo,
                   const VkSubpassEndInfo *pSubpassEndInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   const struct tu_render_pass *pass = cmd->state.pass;
   struct tu_cs *cs = &cmd->draw_cs;

   const struct tu_subpass *subpass = cmd->state.subpass++;

   /* Track LRZ valid state
    *
    * TODO: Improve this tracking for keeping the state of the past depth/stencil images,
    * so if they become active again, we reuse its old state.
 */
   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

   if (subpass->resolve_attachments) {
      tu6_emit_blit_scissor(cmd, cs, true);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu_store_gmem_attachment(cmd, cs, a, gmem_a);

         if (pass->attachments[a].gmem_offset < 0)
            continue;

         /* TODO:
          * check if the resolved attachment is needed by later subpasses,
          * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
          */
         tu_finishme("missing GMEM->GMEM resolve path\n");
         tu_load_gmem_attachment(cmd, cs, a, true);
      }
   }

   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);

   tu6_emit_sysmem_resolves(cmd, cs, subpass);

   tu_cond_exec_end(cs);

   /* Handle dependencies for the next subpass */
   tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);

   if (cmd->state.subpass->feedback_invalidate)
      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;

   /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
   tu6_emit_zs(cmd, cmd->state.subpass, cs);
   tu6_emit_mrt(cmd, cmd->state.subpass, cs);
   if (cmd->state.subpass->samples)
      tu6_emit_msaa(cs, cmd->state.subpass->samples, cmd->state.line_mode);
   tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);

   tu_set_input_attachments(cmd, cmd->state.subpass);
}

/* Compute the number of command-stream dwords that tu6_emit_user_consts()
 * will emit for this stage. Must mirror the emission logic exactly: 4 dwords
 * of packet header/addr per CP_LOAD_STATE6, plus inline payload for push
 * constants and for the zero-fill of short/null UBO descriptors.
 */
static uint32_t
tu6_user_consts_size(const struct tu_pipeline *pipeline,
                     struct tu_descriptor_state *descriptors_state,
                     gl_shader_stage type)
{
   const struct tu_program_descriptor_linkage *link =
      &pipeline->program.link[type];
   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
   uint32_t dwords = 0;

   if (link->push_consts.count > 0) {
      unsigned num_units = link->push_consts.count;
      dwords += 4 + num_units * 4;
   }

   for (uint32_t i = 0; i < state->num_enabled; i++) {
      uint32_t size = state->range[i].end - state->range[i].start;

      /* Clamp to the stage's const file size (constlen is in vec4s). */
      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);

      if (size == 0)
         continue;

      if (!state->range[i].ubo.bindless)
         continue;

      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
         descriptors_state->dynamic_descriptors :
         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
      unsigned block = state->range[i].ubo.block;
      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
      desc_size = desc_size > state->range[i].start ?
         desc_size - state->range[i].start : 0;

      if (desc_size < size) {
         uint32_t zero_size = size - desc_size;
         dwords += 4 + zero_size / 4;
         size = desc_size;
      }

      if (size > 0) {
         dwords += 4;
      }
   }

   return dwords;
}

/* Emit CP_LOAD_STATE6 packets loading push constants (inline) and the
 * ir3-analyzed UBO ranges (indirect, from the bindless descriptor's VA)
 * into the stage's const file. Out-of-range/null UBO reads are zero-filled
 * per VK_EXT_robustness2. Sizing must stay in sync with
 * tu6_user_consts_size() above.
 */
static void
tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
                     struct tu_descriptor_state *descriptors_state,
                     gl_shader_stage type,
                     uint32_t *push_constants)
{
   const struct tu_program_descriptor_linkage *link =
      &pipeline->program.link[type];
   const struct ir3_const_state *const_state = &link->const_state;
   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;

   if (link->push_consts.count > 0) {
      unsigned num_units = link->push_consts.count;
      unsigned offset = link->push_consts.lo;
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(num_units));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      for (unsigned i = 0; i < num_units * 4; i++)
         tu_cs_emit(cs, push_constants[i + offset * 4]);
   }

   for (uint32_t i = 0; i < state->num_enabled; i++) {
      uint32_t size = state->range[i].end - state->range[i].start;
      uint32_t offset = state->range[i].start;

      /* and even if the start of the const buffer is before
       * first_immediate, the end may not be:
       */
      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);

      if (size == 0)
         continue;

      /* things should be aligned to vec4: */
      debug_assert((state->range[i].offset % 16) == 0);
      debug_assert((size % 16) == 0);
      debug_assert((offset % 16) == 0);

      /* Dig out the descriptor from the descriptor state and read the VA from
       * it. All our UBOs are bindless with the exception of the NIR
       * constant_data, which is uploaded once in the pipeline.
       */
      if (!state->range[i].ubo.bindless) {
         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
         continue;
      }

      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
         descriptors_state->dynamic_descriptors :
         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
      unsigned block = state->range[i].ubo.block;
      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
      uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
      desc_size = desc_size > state->range[i].start ?
         desc_size - state->range[i].start : 0;

      /* Handle null UBO descriptors and out-of-range UBO reads by filling the
       * rest with 0, simulating what reading with ldc would do. This behavior
       * is required by VK_EXT_robustness2.
       */
      if (desc_size < size) {
         uint32_t zero_size = size - desc_size;
         uint32_t zero_offset = state->range[i].offset + desc_size;
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
         tu_cs_emit_qw(cs, 0);
         for (unsigned i = 0; i < zero_size / 4; i++) {
            tu_cs_emit(cs, 0);
         }
         size = desc_size;
      }

      if (size > 0) {
         assert(va);
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
         tu_cs_emit_qw(cs, va + offset);
      }
   }
}

/* Build a draw state containing the user constants for one shader stage;
 * returns an empty draw state when the stage has nothing to load.
 */
static struct tu_draw_state
tu6_emit_consts(struct tu_cmd_buffer *cmd,
                const struct tu_pipeline *pipeline,
                struct tu_descriptor_state *descriptors_state,
                gl_shader_stage type)
{
   uint32_t dwords = tu6_user_consts_size(pipeline, descriptors_state, type);
   if (dwords == 0)
      return (struct tu_draw_state) {};

   struct tu_cs cs;
   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);

   tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);

   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
}

/* Build a single draw state covering the user constants of all geometry
 * stages (VS through GS, i.e. every stage before the fragment shader).
 */
static struct tu_draw_state
tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
                      const struct tu_pipeline *pipeline,
                      struct tu_descriptor_state *descriptors_state)
{
   uint32_t dwords = 0;

   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
      dwords += tu6_user_consts_size(pipeline,
descriptors_state, type);

   if (dwords == 0)
      return (struct tu_draw_state) {};

   struct tu_cs cs;
   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);

   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
      tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);

   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
}

/* Size in bytes of the tess param BO needed for draw_count vertices:
 * param_stride bytes per patch.
 */
static uint64_t
get_tess_param_bo_size(const struct tu_pipeline *pipeline,
                       uint32_t draw_count)
{
   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
    * Still not sure what to do here, so just allocate a reasonably large
    * BO and hope for the best for now. */
   if (!draw_count)
      draw_count = 2048;

   /* the tess param BO is pipeline->tess.param_stride bytes per patch,
    * which includes both the per-vertex outputs and per-patch outputs
    * build_primitive_map in ir3 calculates this stride
    */
   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
   uint32_t num_patches = draw_count / verts_per_patch;
   return num_patches * pipeline->tess.param_stride;
}

/* Size in bytes of the tess factor BO needed for draw_count vertices:
 * a fixed per-patch factor stride determined by the tessellation mode.
 */
static uint64_t
get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
                        uint32_t draw_count)
{
   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
    * Still not sure what to do here, so just allocate a reasonably large
    * BO and hope for the best for now. */
   if (!draw_count)
      draw_count = 2048;

   /* Each distinct patch gets its own tess factor output. */
   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
   uint32_t num_patches = draw_count / verts_per_patch;
   uint32_t factor_stride;
   switch (pipeline->tess.patch_type) {
   case IR3_TESS_ISOLINES:
      factor_stride = 12;
      break;
   case IR3_TESS_TRIANGLES:
      factor_stride = 20;
      break;
   case IR3_TESS_QUADS:
      factor_stride = 28;
      break;
   default:
      unreachable("bad tessmode");
   }
   return factor_stride * num_patches;
}

/* Allocate a scratch BO for tess factors/params (when HS/DS actually read
 * the BO addresses, i.e. their regids fall within constlen) and emit the
 * const loads publishing the param and factor iovas to both stages.
 * Returns the factor iova through *factor_iova for the caller.
 */
static VkResult
tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
                     uint32_t draw_count,
                     const struct tu_pipeline *pipeline,
                     struct tu_draw_state *state,
                     uint64_t *factor_iova)
{
   struct tu_cs cs;
   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
   if (result != VK_SUCCESS)
      return result;

   const struct tu_program_descriptor_linkage *hs_link =
      &pipeline->program.link[MESA_SHADER_TESS_CTRL];
   bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;

   const struct tu_program_descriptor_linkage *ds_link =
      &pipeline->program.link[MESA_SHADER_TESS_EVAL];
   bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;

   uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
   uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
   uint64_t tess_bo_size = tess_factor_size + tess_param_size;
   if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
      struct tu_bo *tess_bo;
      result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
      if (result != VK_SUCCESS)
         return result;

      /* Factors live at the start of the BO, params right after. */
      uint64_t tess_factor_iova = tess_bo->iova;
      uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;

      if (hs_uses_bo) {
         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
                    CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
         tu_cs_emit_qw(&cs, tess_param_iova);
         tu_cs_emit_qw(&cs, tess_factor_iova);
      }

      if (ds_uses_bo) {
         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
                    CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
         tu_cs_emit_qw(&cs, tess_param_iova);
         tu_cs_emit_qw(&cs, tess_factor_iova);
      }

      *factor_iova = tess_factor_iova;
   }
   *state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
   return VK_SUCCESS;
}

/* Derive LRZ direction and write-enable from the depth compare op, and
 * flag invalidation for compare ops LRZ cannot represent. Returns the
 * LRZ direction (GREATER/LESS/UNKNOWN) implied by the op.
 */
static enum tu_lrz_direction
tu6_lrz_depth_mode(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                   VkCompareOp depthCompareOp,
                   bool *invalidate_lrz)
{
   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;

   /* LRZ does not support some depth modes.
 */
   switch (depthCompareOp) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      *invalidate_lrz = true;
      gras_lrz_cntl->lrz_write = false;
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      gras_lrz_cntl->lrz_write = false;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl->greater = true;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   return lrz_direction;
}

/* update lrz state based on stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to depth test, in this case we potentially need to disable early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static void
tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                   VkCompareOp func,
                   bool stencil_write,
                   bool *invalidate_lrz)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happens before depth-test.
       */
      if (stencil_write) {
         gras_lrz_cntl->enable = false;
         gras_lrz_cntl->z_test_enable = false;
         *invalidate_lrz = true;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on result
       * of stencil test, which we cannot know when doing binning
       * pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
       * effects from stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         gras_lrz_cntl->enable = false;
         gras_lrz_cntl->z_test_enable = false;
         *invalidate_lrz = true;
      }
      break;
   }
}

/* Compute the GRAS_LRZ_CNTL value for the current draw from the bound
 * pipeline and dynamic depth/stencil state (a = depth/stencil attachment
 * index). Side effects: updates cmd->state.lrz.prev_direction and clears
 * cmd->state.lrz.valid when LRZ must be invalidated. Returns an all-zero
 * value (LRZ off) when there is no attachment or LRZ is invalid.
 */
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   struct tu_pipeline *pipeline = cmd->state.pipeline;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
   bool invalidate_lrz = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ;
   bool force_disable_write = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE;
   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;

   gras_lrz_cntl.enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
   gras_lrz_cntl.lrz_write = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
   gras_lrz_cntl.z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
   gras_lrz_cntl.z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;

   VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
   lrz_direction = tu6_lrz_depth_mode(&gras_lrz_cntl, depth_compare_op, &invalidate_lrz);

   /* LRZ doesn't transition properly between GREATER* and LESS* depth compare ops */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      invalidate_lrz = true;
   }

   cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
   if (stencil_test_enable) {
      /* Write masks come from dynamic state when the pipeline declares the
       * stencil write mask dynamic, otherwise from the pipeline.
       */
      bool stencil_front_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         (cmd->state.dynamic_stencil_wrmask & 0xff) :
         (pipeline->stencil_wrmask & 0xff);

      bool stencil_back_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
         (pipeline->stencil_wrmask & 0xff00) >> 8;

      VkCompareOp stencil_front_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;

      VkCompareOp stencil_back_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;

      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
                         stencil_front_writemask, &invalidate_lrz);

      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
                         stencil_back_writemask, &invalidate_lrz);
   }

   if (force_disable_write)
      gras_lrz_cntl.lrz_write = false;

   if (invalidate_lrz) {
      cmd->state.lrz.valid = false;
   }

   /* In case no depth attachment or invalid, we clear the gras_lrz_cntl register */
   if (a == VK_ATTACHMENT_UNUSED || !cmd->state.lrz.valid)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

/* Build the per-draw LRZ draw state: GRAS_LRZ_CNTL plus the matching
 * RB_LRZ_CNTL enable.
 */
static struct tu_draw_state
tu6_build_lrz(struct tu_cmd_buffer *cmd)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct tu_cs lrz_cs;
   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &lrz_cs, 4);

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);

   tu_cs_emit_regs(&lrz_cs,
A6XX_GRAS_LRZ_CNTL( 3755 .enable = gras_lrz_cntl.enable, 3756 .greater = gras_lrz_cntl.greater, 3757 .lrz_write = gras_lrz_cntl.lrz_write, 3758 .z_test_enable = gras_lrz_cntl.z_test_enable, 3759 .z_bounds_enable = gras_lrz_cntl.z_bounds_enable)); 3760 tu_cs_emit_regs(&lrz_cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable)); 3761 3762 return ds; 3763} 3764 3765static bool 3766tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable) 3767{ 3768 bool depth_write_enable = 3769 cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 3770 3771 VkCompareOp depth_compare_op = 3772 (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT; 3773 3774 bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER; 3775 3776 return depth_test_enable && depth_write_enable && depth_compare_op_writes; 3777} 3778 3779static bool 3780tu6_writes_stencil(struct tu_cmd_buffer *cmd) 3781{ 3782 bool stencil_test_enable = 3783 cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE; 3784 3785 bool stencil_front_writemask = 3786 (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? 3787 (cmd->state.dynamic_stencil_wrmask & 0xff) : 3788 (cmd->state.pipeline->stencil_wrmask & 0xff); 3789 3790 bool stencil_back_writemask = 3791 (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? 
3792 ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) : 3793 (cmd->state.pipeline->stencil_wrmask & 0xff00) >> 8; 3794 3795 VkStencilOp front_fail_op = 3796 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT; 3797 VkStencilOp front_pass_op = 3798 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT; 3799 VkStencilOp front_depth_fail_op = 3800 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT; 3801 VkStencilOp back_fail_op = 3802 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT; 3803 VkStencilOp back_pass_op = 3804 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT; 3805 VkStencilOp back_depth_fail_op = 3806 (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT; 3807 3808 bool stencil_front_op_writes = 3809 front_pass_op != VK_STENCIL_OP_KEEP && 3810 front_fail_op != VK_STENCIL_OP_KEEP && 3811 front_depth_fail_op != VK_STENCIL_OP_KEEP; 3812 3813 bool stencil_back_op_writes = 3814 back_pass_op != VK_STENCIL_OP_KEEP && 3815 back_fail_op != VK_STENCIL_OP_KEEP && 3816 back_depth_fail_op != VK_STENCIL_OP_KEEP; 3817 3818 return stencil_test_enable && 3819 ((stencil_front_writemask && stencil_front_op_writes) || 3820 (stencil_back_writemask && stencil_back_op_writes)); 3821} 3822 3823static struct tu_draw_state 3824tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd) 3825{ 3826 struct tu_cs cs; 3827 struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 4); 3828 3829 enum a6xx_ztest_mode zmode = A6XX_EARLY_Z; 3830 bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; 3831 bool depth_write = tu6_writes_depth(cmd, depth_test_enable); 3832 bool stencil_write = tu6_writes_stencil(cmd); 3833 
3834 if (cmd->state.pipeline->lrz.fs_has_kill && 3835 (depth_write || stencil_write)) { 3836 zmode = cmd->state.lrz.valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z; 3837 } 3838 3839 if (cmd->state.pipeline->lrz.force_late_z || !depth_test_enable) 3840 zmode = A6XX_LATE_Z; 3841 3842 /* User defined early tests take precedence above all else */ 3843 if (cmd->state.pipeline->lrz.early_fragment_tests) 3844 zmode = A6XX_EARLY_Z; 3845 3846 tu_cs_emit_pkt4(&cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); 3847 tu_cs_emit(&cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode)); 3848 3849 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1); 3850 tu_cs_emit(&cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode)); 3851 return ds; 3852} 3853 3854static VkResult 3855tu6_draw_common(struct tu_cmd_buffer *cmd, 3856 struct tu_cs *cs, 3857 bool indexed, 3858 /* note: draw_count is 0 for indirect */ 3859 uint32_t draw_count) 3860{ 3861 const struct tu_pipeline *pipeline = cmd->state.pipeline; 3862 VkResult result; 3863 3864 tu_emit_cache_flush_renderpass(cmd, cs); 3865 3866 bool primitive_restart_enabled = pipeline->ia.primitive_restart; 3867 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE)) 3868 primitive_restart_enabled = cmd->state.primitive_restart_enable; 3869 3870 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0( 3871 .primitive_restart = 3872 primitive_restart_enabled && indexed, 3873 .provoking_vtx_last = pipeline->provoking_vertex_last, 3874 .tess_upper_left_domain_origin = 3875 pipeline->tess.upper_left_domain_origin)); 3876 3877 bool has_tess = 3878 pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; 3879 3880 /* Early exit if there is nothing to emit, saves CPU cycles */ 3881 if (!(cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) && 3882 !has_tess) 3883 return VK_SUCCESS; 3884 3885 bool dirty_lrz = cmd->state.dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | TU_CMD_DIRTY_RB_STENCIL_CNTL); 3886 3887 struct tu_descriptor_state 
      *descriptors_state =
         &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];

   /* Rebuild the LRZ and depth-plane draw states whenever the relevant
    * depth/stencil state changed.
    */
   if (dirty_lrz) {
      cmd->state.lrz.state = tu6_build_lrz(cmd);
      cmd->state.depth_plane_state = tu6_build_depth_plane_z_mode(cmd);
   }

   if (cmd->state.dirty & TU_CMD_DIRTY_RASTERIZER_DISCARD) {
      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4);
      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = cmd->state.pc_raster_cntl));
      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = cmd->state.vpc_unknown_9107));
   }

   if (cmd->state.dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {
      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);
      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));
   }

   if (cmd->state.dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {
      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);
      uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;

      /* The depth buffer must be read whenever the depth test or the depth
       * bounds test is enabled.
       */
      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) ||
          (rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))
         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;

      /* Bounds test without depth test needs a hardware workaround. */
      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE) &&
          !(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE))
         tu6_apply_depth_bounds_workaround(cmd->device, &rb_depth_cntl);

      if (pipeline->rb_depth_cntl_disable)
         rb_depth_cntl = 0;

      tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));
   }

   if (cmd->state.dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {
      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);
      tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
   }

   /* shader_const[0] = geometry-stage consts, [1] = fragment-stage consts */
   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
      cmd->state.shader_const[0] =
         tu6_emit_consts_geom(cmd, pipeline, descriptors_state);
      cmd->state.shader_const[1] =
         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
   }

   struct tu_draw_state tess_consts = {};
   if (has_tess) {
      uint64_t tess_factor_iova = 0;

      cmd->state.has_tess = true;
      result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
      if (result != VK_SUCCESS)
         return result;

      /* this sequence matches what the blob does before every tess draw
       * PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
       * before writing to it
       */
      tu_cs_emit_wfi(cs);

      tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));

      tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
      tu_cs_emit(cs, draw_count);
   }

   /* for the first draw in a renderpass, re-emit all the draw states
    *
    * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
    * used, then draw states must be re-emitted. note however this only happens
    * in the sysmem path, so this can be skipped this for the gmem path (TODO)
    *
    * the two input attachment states are excluded because secondary command
    * buffer doesn't have a state ib to restore it, and not re-emitting them
    * is OK since CmdClearAttachments won't disable/overwrite them
    */
   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));

      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);

      /* Dynamic states override the pipeline's static version when the
       * pipeline declares them dynamic.
       */
      for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
                               ((pipeline->dynamic_state_mask & BIT(i)) ?
                                cmd->state.dynamic_state[i] :
                                pipeline->dynamic_state[i]));
      }
   } else {
      /* emit draw states that were just updated
       * note we eventually don't want to have to emit anything here
       */
      bool emit_binding_stride = false;
      /* Count the groups first so the CP_SET_DRAW_STATE header is sized
       * correctly.
       */
      uint32_t draw_state_count =
         has_tess +
         ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
         (dirty_lrz ? 2 : 0);

      if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
          (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
         emit_binding_stride = true;
         draw_state_count += 1;
      }

      if (draw_state_count > 0)
         tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);

      /* We may need to re-emit tess consts if the current draw call is
       * sufficiently larger than the last draw call.
       */
      if (has_tess)
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
      if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
      }
      if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
      if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
      if (emit_binding_stride) {
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
                               cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
      }
      if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);

      if (dirty_lrz) {
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
      }
   }

   tu_cs_sanity_check(cs);

   /* There are too many graphics dirty bits to list here, so just list the
    * bits to preserve instead. The only things not emitted here are
    * compute-related state.
    */
   cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
   return VK_SUCCESS;
}

/* Build the CP_DRAW_INDX_OFFSET initiator dword for the current pipeline:
 * primitive type, index source, index size, and tess/GS enables.
 */
static uint32_t
tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
{
   const struct tu_pipeline *pipeline = cmd->state.pipeline;
   enum pc_di_primtype primtype = pipeline->ia.primtype;

   if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY)) {
      if (primtype < DI_PT_PATCHES0) {
         /* If tesselation used, only VK_PRIMITIVE_TOPOLOGY_PATCH_LIST can be
          * set via vkCmdSetPrimitiveTopologyEXT, but primtype is already
          * calculated at the pipeline creation based on control points
          * for each patch.
          *
          * Just use the primtype as is for the case.
          */
         primtype = cmd->state.primtype;
      }
   }

   uint32_t initiator =
      CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
      CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
      CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
      CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);

   if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
      initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;

   switch (pipeline->tess.patch_type) {
   case IR3_TESS_TRIANGLES:
      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
      break;
   case IR3_TESS_ISOLINES:
      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
      break;
   case IR3_TESS_NONE:
      /* No tess: patch type is a don't-care and TESS_ENABLE stays clear. */
      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
      break;
   case IR3_TESS_QUADS:
      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
      break;
   }
   return initiator;
}


/* Offset (in vec4 units) of the VS driver params in the const file, or 0
 * when the vertex shader does not consume them.
 */
static uint32_t
vs_params_offset(struct tu_cmd_buffer *cmd)
{
   const struct tu_program_descriptor_linkage *link =
&cmd->state.pipeline->program.link[MESA_SHADER_VERTEX]; 4104 const struct ir3_const_state *const_state = &link->const_state; 4105 4106 if (const_state->offsets.driver_param >= link->constlen) 4107 return 0; 4108 4109 /* this layout is required by CP_DRAW_INDIRECT_MULTI */ 4110 STATIC_ASSERT(IR3_DP_DRAWID == 0); 4111 STATIC_ASSERT(IR3_DP_VTXID_BASE == 1); 4112 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2); 4113 4114 /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */ 4115 assert(const_state->offsets.driver_param != 0); 4116 4117 return const_state->offsets.driver_param; 4118} 4119 4120static void 4121tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd) 4122{ 4123 if (cmd->state.vs_params.iova) { 4124 cmd->state.vs_params = (struct tu_draw_state) {}; 4125 cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS; 4126 } 4127} 4128 4129static void 4130tu6_emit_vs_params(struct tu_cmd_buffer *cmd, 4131 uint32_t vertex_offset, 4132 uint32_t first_instance) 4133{ 4134 /* Beside re-emitting params when they are changed, we should re-emit 4135 * them after constants are invalidated via HLSQ_INVALIDATE_CMD. 4136 */ 4137 if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) && 4138 vertex_offset == cmd->state.last_vs_params.vertex_offset && 4139 first_instance == cmd->state.last_vs_params.first_instance) { 4140 return; 4141 } 4142 4143 uint32_t offset = vs_params_offset(cmd); 4144 4145 struct tu_cs cs; 4146 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 
8 : 0), &cs); 4147 if (result != VK_SUCCESS) { 4148 cmd->record_result = result; 4149 return; 4150 } 4151 4152 tu_cs_emit_regs(&cs, 4153 A6XX_VFD_INDEX_OFFSET(vertex_offset), 4154 A6XX_VFD_INSTANCE_START_OFFSET(first_instance)); 4155 4156 if (offset) { 4157 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4); 4158 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) | 4159 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 4160 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 4161 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | 4162 CP_LOAD_STATE6_0_NUM_UNIT(1)); 4163 tu_cs_emit(&cs, 0); 4164 tu_cs_emit(&cs, 0); 4165 4166 tu_cs_emit(&cs, 0); 4167 tu_cs_emit(&cs, vertex_offset); 4168 tu_cs_emit(&cs, first_instance); 4169 tu_cs_emit(&cs, 0); 4170 } 4171 4172 cmd->state.last_vs_params.vertex_offset = vertex_offset; 4173 cmd->state.last_vs_params.first_instance = first_instance; 4174 4175 struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs); 4176 cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4}; 4177 4178 cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS; 4179} 4180 4181VKAPI_ATTR void VKAPI_CALL 4182tu_CmdDraw(VkCommandBuffer commandBuffer, 4183 uint32_t vertexCount, 4184 uint32_t instanceCount, 4185 uint32_t firstVertex, 4186 uint32_t firstInstance) 4187{ 4188 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4189 struct tu_cs *cs = &cmd->draw_cs; 4190 4191 tu6_emit_vs_params(cmd, firstVertex, firstInstance); 4192 4193 tu6_draw_common(cmd, cs, false, vertexCount); 4194 4195 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); 4196 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); 4197 tu_cs_emit(cs, instanceCount); 4198 tu_cs_emit(cs, vertexCount); 4199} 4200 4201VKAPI_ATTR void VKAPI_CALL 4202tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, 4203 uint32_t indexCount, 4204 uint32_t instanceCount, 4205 uint32_t firstIndex, 4206 int32_t vertexOffset, 4207 uint32_t firstInstance) 4208{ 4209 TU_FROM_HANDLE(tu_cmd_buffer, cmd, 
commandBuffer); 4210 struct tu_cs *cs = &cmd->draw_cs; 4211 4212 tu6_emit_vs_params(cmd, vertexOffset, firstInstance); 4213 4214 tu6_draw_common(cmd, cs, true, indexCount); 4215 4216 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); 4217 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); 4218 tu_cs_emit(cs, instanceCount); 4219 tu_cs_emit(cs, indexCount); 4220 tu_cs_emit(cs, firstIndex); 4221 tu_cs_emit_qw(cs, cmd->state.index_va); 4222 tu_cs_emit(cs, cmd->state.max_index_count); 4223} 4224 4225/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes 4226 * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if 4227 * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's 4228 * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's 4229 * before draw opcodes that don't need it. 4230 */ 4231static void 4232draw_wfm(struct tu_cmd_buffer *cmd) 4233{ 4234 cmd->state.renderpass_cache.flush_bits |= 4235 cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME; 4236 cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME; 4237} 4238 4239VKAPI_ATTR void VKAPI_CALL 4240tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, 4241 VkBuffer _buffer, 4242 VkDeviceSize offset, 4243 uint32_t drawCount, 4244 uint32_t stride) 4245{ 4246 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4247 TU_FROM_HANDLE(tu_buffer, buf, _buffer); 4248 struct tu_cs *cs = &cmd->draw_cs; 4249 4250 tu6_emit_empty_vs_params(cmd); 4251 4252 if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) 4253 draw_wfm(cmd); 4254 4255 tu6_draw_common(cmd, cs, false, 0); 4256 4257 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6); 4258 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); 4259 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) | 4260 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); 4261 tu_cs_emit(cs, drawCount); 4262 tu_cs_emit_qw(cs, 
buf->bo->iova + buf->bo_offset + offset); 4263 tu_cs_emit(cs, stride); 4264} 4265 4266VKAPI_ATTR void VKAPI_CALL 4267tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, 4268 VkBuffer _buffer, 4269 VkDeviceSize offset, 4270 uint32_t drawCount, 4271 uint32_t stride) 4272{ 4273 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4274 TU_FROM_HANDLE(tu_buffer, buf, _buffer); 4275 struct tu_cs *cs = &cmd->draw_cs; 4276 4277 tu6_emit_empty_vs_params(cmd); 4278 4279 if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) 4280 draw_wfm(cmd); 4281 4282 tu6_draw_common(cmd, cs, true, 0); 4283 4284 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9); 4285 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); 4286 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) | 4287 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); 4288 tu_cs_emit(cs, drawCount); 4289 tu_cs_emit_qw(cs, cmd->state.index_va); 4290 tu_cs_emit(cs, cmd->state.max_index_count); 4291 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset); 4292 tu_cs_emit(cs, stride); 4293} 4294 4295VKAPI_ATTR void VKAPI_CALL 4296tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, 4297 VkBuffer _buffer, 4298 VkDeviceSize offset, 4299 VkBuffer countBuffer, 4300 VkDeviceSize countBufferOffset, 4301 uint32_t drawCount, 4302 uint32_t stride) 4303{ 4304 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4305 TU_FROM_HANDLE(tu_buffer, buf, _buffer); 4306 TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer); 4307 struct tu_cs *cs = &cmd->draw_cs; 4308 4309 tu6_emit_empty_vs_params(cmd); 4310 4311 /* It turns out that the firmware we have for a650 only partially fixed the 4312 * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete 4313 * before reading indirect parameters. It waits for WFI's before reading 4314 * the draw parameters, but after reading the indirect count :(. 
4315 */ 4316 draw_wfm(cmd); 4317 4318 tu6_draw_common(cmd, cs, false, 0); 4319 4320 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8); 4321 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); 4322 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) | 4323 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); 4324 tu_cs_emit(cs, drawCount); 4325 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset); 4326 tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset); 4327 tu_cs_emit(cs, stride); 4328} 4329 4330VKAPI_ATTR void VKAPI_CALL 4331tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, 4332 VkBuffer _buffer, 4333 VkDeviceSize offset, 4334 VkBuffer countBuffer, 4335 VkDeviceSize countBufferOffset, 4336 uint32_t drawCount, 4337 uint32_t stride) 4338{ 4339 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4340 TU_FROM_HANDLE(tu_buffer, buf, _buffer); 4341 TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer); 4342 struct tu_cs *cs = &cmd->draw_cs; 4343 4344 tu6_emit_empty_vs_params(cmd); 4345 4346 draw_wfm(cmd); 4347 4348 tu6_draw_common(cmd, cs, true, 0); 4349 4350 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11); 4351 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); 4352 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) | 4353 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); 4354 tu_cs_emit(cs, drawCount); 4355 tu_cs_emit_qw(cs, cmd->state.index_va); 4356 tu_cs_emit(cs, cmd->state.max_index_count); 4357 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset); 4358 tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset); 4359 tu_cs_emit(cs, stride); 4360} 4361 4362VKAPI_ATTR void VKAPI_CALL 4363tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, 4364 uint32_t instanceCount, 4365 uint32_t firstInstance, 4366 VkBuffer _counterBuffer, 4367 VkDeviceSize counterBufferOffset, 4368 uint32_t 
counterOffset, 4369 uint32_t vertexStride) 4370{ 4371 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 4372 TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer); 4373 struct tu_cs *cs = &cmd->draw_cs; 4374 4375 /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. 4376 * Plus, for the common case where the counter buffer is written by 4377 * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to 4378 * complete which means we need a WAIT_FOR_ME anyway. 4379 */ 4380 draw_wfm(cmd); 4381 4382 tu6_emit_vs_params(cmd, 0, firstInstance); 4383 4384 tu6_draw_common(cmd, cs, false, 0); 4385 4386 tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6); 4387 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB)); 4388 tu_cs_emit(cs, instanceCount); 4389 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset); 4390 tu_cs_emit(cs, counterOffset); 4391 tu_cs_emit(cs, vertexStride); 4392} 4393 4394struct tu_dispatch_info 4395{ 4396 /** 4397 * Determine the layout of the grid (in block units) to be used. 4398 */ 4399 uint32_t blocks[3]; 4400 4401 /** 4402 * A starting offset for the grid. If unaligned is set, the offset 4403 * must still be aligned. 4404 */ 4405 uint32_t offsets[3]; 4406 /** 4407 * Whether it's an unaligned compute dispatch. 4408 */ 4409 bool unaligned; 4410 4411 /** 4412 * Indirect compute parameters resource. 
4413 */ 4414 struct tu_buffer *indirect; 4415 uint64_t indirect_offset; 4416}; 4417 4418static void 4419tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, 4420 struct tu_cs *cs, struct tu_pipeline *pipeline, 4421 const struct tu_dispatch_info *info) 4422{ 4423 gl_shader_stage type = MESA_SHADER_COMPUTE; 4424 const struct tu_program_descriptor_linkage *link = 4425 &pipeline->program.link[type]; 4426 const struct ir3_const_state *const_state = &link->const_state; 4427 uint32_t offset = const_state->offsets.driver_param; 4428 unsigned subgroup_size = pipeline->compute.subgroup_size; 4429 unsigned subgroup_shift = util_logbase2(subgroup_size); 4430 4431 if (link->constlen <= offset) 4432 return; 4433 4434 uint32_t num_consts = MIN2(const_state->num_driver_params, 4435 (link->constlen - offset) * 4); 4436 4437 if (!info->indirect) { 4438 uint32_t driver_params[12] = { 4439 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0], 4440 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1], 4441 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2], 4442 [IR3_DP_BASE_GROUP_X] = info->offsets[0], 4443 [IR3_DP_BASE_GROUP_Y] = info->offsets[1], 4444 [IR3_DP_BASE_GROUP_Z] = info->offsets[2], 4445 [IR3_DP_SUBGROUP_SIZE] = subgroup_size, 4446 [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift, 4447 }; 4448 4449 assert(num_consts <= ARRAY_SIZE(driver_params)); 4450 4451 /* push constants */ 4452 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts); 4453 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | 4454 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 4455 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 4456 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | 4457 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4)); 4458 tu_cs_emit(cs, 0); 4459 tu_cs_emit(cs, 0); 4460 uint32_t i; 4461 for (i = 0; i < num_consts; i++) 4462 tu_cs_emit(cs, driver_params[i]); 4463 } else if (!(info->indirect_offset & 0xf)) { 4464 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); 4465 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | 
4466 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 4467 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 4468 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | 4469 CP_LOAD_STATE6_0_NUM_UNIT(1)); 4470 tu_cs_emit_qw(cs, tu_buffer_iova(info->indirect) + info->indirect_offset); 4471 } else { 4472 /* Vulkan guarantees only 4 byte alignment for indirect_offset. 4473 * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment. 4474 */ 4475 4476 uint64_t indirect_iova = tu_buffer_iova(info->indirect) + info->indirect_offset; 4477 4478 for (uint32_t i = 0; i < 3; i++) { 4479 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); 4480 tu_cs_emit(cs, 0); 4481 tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i])); 4482 tu_cs_emit_qw(cs, indirect_iova + i * 4); 4483 } 4484 4485 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); 4486 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); 4487 4488 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); 4489 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | 4490 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 4491 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 4492 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | 4493 CP_LOAD_STATE6_0_NUM_UNIT(1)); 4494 tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0])); 4495 } 4496 4497 /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect 4498 * dispatch. 
4499 */ 4500 if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) { 4501 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7); 4502 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) | 4503 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 4504 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 4505 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | 4506 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4)); 4507 tu_cs_emit_qw(cs, 0); 4508 tu_cs_emit(cs, 0); /* BASE_GROUP_X */ 4509 tu_cs_emit(cs, 0); /* BASE_GROUP_Y */ 4510 tu_cs_emit(cs, 0); /* BASE_GROUP_Z */ 4511 tu_cs_emit(cs, subgroup_size); 4512 if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) { 4513 assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4)); 4514 tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */ 4515 tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */ 4516 tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */ 4517 tu_cs_emit(cs, subgroup_shift); 4518 } 4519 } 4520} 4521 4522static void 4523tu_dispatch(struct tu_cmd_buffer *cmd, 4524 const struct tu_dispatch_info *info) 4525{ 4526 if (!info->indirect && 4527 (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0)) 4528 return; 4529 4530 struct tu_cs *cs = &cmd->cs; 4531 struct tu_pipeline *pipeline = cmd->state.compute_pipeline; 4532 struct tu_descriptor_state *descriptors_state = 4533 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE]; 4534 4535 /* TODO: We could probably flush less if we add a compute_flush_bits 4536 * bitfield. 
    */
   tu_emit_cache_flush(cmd, cs);

   /* note: no reason to have this in a separate IB */
   tu_cs_emit_state_ib(cs,
         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE));

   tu_emit_compute_driver_params(cmd, cs, pipeline, info);

   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
      tu_cs_emit_state_ib(cs, pipeline->load_state);

   cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));

   /* Local sizes are programmed minus one; global size is the full
    * threads-per-axis product.
    */
   const uint32_t *local_size = pipeline->compute.local_size;
   const uint32_t *num_groups = info->blocks;
   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
                                          .localsizex = local_size[0] - 1,
                                          .localsizey = local_size[1] - 1,
                                          .localsizez = local_size[2] - 1),
                   A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
                   A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
                   A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
                   A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
                   A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
                   A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));

   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));

   trace_start_compute(&cmd->trace, cs);

   if (info->indirect) {
      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit_qw(cs, iova);
      tu_cs_emit(cs,
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
   } else {
      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
   }

   trace_end_compute(&cmd->trace, cs,
                     info->indirect != NULL,
                     local_size[0], local_size[1], local_size[2],
                     info->blocks[0], info->blocks[1], info->blocks[2]);

   tu_cs_emit_wfi(cs);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
                   uint32_t base_x,
                   uint32_t base_y,
                   uint32_t base_z,
                   uint32_t x,
                   uint32_t y,
                   uint32_t z)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   struct tu_dispatch_info info = {};

   info.blocks[0] = x;
   info.blocks[1] = y;
   info.blocks[2] = z;

   info.offsets[0] = base_x;
   info.offsets[1] = base_y;
   info.offsets[2] = base_z;
   tu_dispatch(cmd_buffer, &info);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdDispatch(VkCommandBuffer commandBuffer,
               uint32_t x,
               uint32_t y,
               uint32_t z)
{
   /* Plain dispatch is DispatchBase with a zero base. */
   tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                       VkBuffer _buffer,
                       VkDeviceSize offset)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
   struct tu_dispatch_info info = {};

   info.indirect = buffer;
   info.indirect_offset = offset;

   tu_dispatch(cmd_buffer, &info);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                     const VkSubpassEndInfoKHR *pSubpassEndInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);

   tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs);

   tu_cs_end(&cmd_buffer->draw_cs);
   tu_cs_end(&cmd_buffer->tile_store_cs);
   tu_cs_end(&cmd_buffer->draw_epilogue_cs);

   cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);

   /* Replay the recorded draws, either directly to system memory or
    * tile-by-tile through GMEM.
    */
   if (use_sysmem_rendering(cmd_buffer))
      tu_cmd_render_sysmem(cmd_buffer);
   else
      tu_cmd_render_tiles(cmd_buffer);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
    * used.
    */
   tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);

   /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
      rendered */
   tu_cs_discard_entries(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_discard_entries(&cmd_buffer->tile_store_cs);
   tu_cs_begin(&cmd_buffer->tile_store_cs);
   tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* Flushes deferred inside the render pass become the command buffer's
    * responsibility again.
    */
   cmd_buffer->state.cache.pending_flush_bits |=
      cmd_buffer->state.renderpass_cache.pending_flush_bits;
   tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);

   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   /* Clear all per-render-pass state. */
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.has_tess = false;
   cmd_buffer->state.has_subpass_predication = false;
   cmd_buffer->state.disable_gmem = false;

   /* LRZ is not valid next time we use it */
   cmd_buffer->state.lrz.valid = false;
   cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
}

/* Parameters shared by vkCmdPipelineBarrier and vkCmdWaitEvents; the
 * event fields are zero/NULL for plain pipeline barriers.
 */
struct tu_barrier_info
{
   uint32_t eventCount;
   const VkEvent *pEvents;
   VkPipelineStageFlags srcStageMask;
   VkPipelineStageFlags dstStageMask;
};

/* Common implementation of pipeline barriers and event waits: OR together
 * the access masks from every barrier struct, translate them to cache
 * flush/invalidate bits, and (for vkCmdWaitEvents) wait on each event.
 */
static void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t memoryBarrierCount,
           const VkMemoryBarrier *pMemoryBarriers,
           uint32_t bufferMemoryBarrierCount,
           const VkBufferMemoryBarrier
              *pBufferMemoryBarriers,
           uint32_t imageMemoryBarrierCount,
           const VkImageMemoryBarrier *pImageMemoryBarriers,
           const struct tu_barrier_info *info)
{
   /* Inside a render pass the barrier goes into the draw CS. */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
   VkAccessFlags srcAccessMask = 0;
   VkAccessFlags dstAccessMask = 0;

   if (cmd->state.pass) {
      const VkPipelineStageFlags framebuffer_space_stages =
         VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
         VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

      /* We cannot have non-by-region "fb-space to fb-space" barriers.
       *
       * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency":
       *
       *    If the source and destination stage masks both include
       *    framebuffer-space stages, then dependencyFlags must include
       *    VK_DEPENDENCY_BY_REGION_BIT.
       *    [...]
       *    Each of the synchronization scopes and access scopes of a
       *    vkCmdPipelineBarrier2KHR or vkCmdPipelineBarrier command inside
       *    a render pass instance must be a subset of the scopes of one of
       *    the self-dependencies for the current subpass.
       *
       *    If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or
       *    VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier.
       *
       * By-region barriers are ok for gmem. All other barriers would involve
       * vtx stages which are NOT ok for gmem rendering.
       * See dep_invalid_for_gmem().
       */
      if ((info->srcStageMask & ~framebuffer_space_stages) ||
          (info->dstStageMask & ~framebuffer_space_stages)) {
         cmd->state.disable_gmem = true;
      }
   }

   /* Union the access masks of all barrier structures; the flushes we emit
    * cover everything at once.
    */
   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   enum tu_cmd_access_mask src_flags = 0;
   enum tu_cmd_access_mask dst_flags = 0;

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
      if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
         /* The underlying memory for this image may have been used earlier
          * within the same queue submission for a different image, which
          * means that there may be old, stale cache entries which are in the
          * "wrong" location, which could cause problems later after writing
          * to the image. We don't want these entries being flushed later and
          * overwriting the actual image, so we need to flush the CCU.
          */
         src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
      srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
   }

   /* Inside a renderpass, we don't know yet whether we'll be using sysmem
    * so we have to use the sysmem flushes.
    */
   bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
      !cmd->state.pass;
   src_flags |= vk2tu_access(srcAccessMask, gmem);
   dst_flags |= vk2tu_access(dstAccessMask, gmem);

   struct tu_cache_state *cache =
      cmd->state.pass ?
      &cmd->state.renderpass_cache : &cmd->state.cache;
   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(info->srcStageMask);
   enum tu_stage dst_stage = vk2tu_dst_stage(info->dstStageMask);
   tu_flush_for_stage(cache, src_stage, dst_stage);

   /* For vkCmdWaitEvents: stall the CP until each event's memory value
    * equals 1 (the value written by vkCmdSetEvent).
    */
   for (uint32_t i = 0; i < info->eventCount; i++) {
      TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);

      tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
      tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                     CP_WAIT_REG_MEM_0_POLL_MEMORY);
      tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
      tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlags srcStageMask,
                      VkPipelineStageFlags dstStageMask,
                      VkDependencyFlags dependencyFlags,
                      uint32_t memoryBarrierCount,
                      const VkMemoryBarrier *pMemoryBarriers,
                      uint32_t bufferMemoryBarrierCount,
                      const VkBufferMemoryBarrier *pBufferMemoryBarriers,
                      uint32_t imageMemoryBarrierCount,
                      const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   struct tu_barrier_info info;

   /* A plain pipeline barrier has no events to wait on. */
   info.eventCount = 0;
   info.pEvents = NULL;
   info.srcStageMask = srcStageMask;
   info.dstStageMask = dstStageMask;

   tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
              bufferMemoryBarrierCount, pBufferMemoryBarriers,
              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}

/* Write 'value' into the event's BO once the work implied by stageMask has
 * completed. Shared by vkCmdSetEvent (value = 1) and vkCmdResetEvent
 * (value = 0).
 */
static void
write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
            VkPipelineStageFlags stageMask, unsigned value)
{
   struct tu_cs *cs = &cmd->cs;

   /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
   assert(!cmd->state.pass);

   tu_emit_cache_flush(cmd, cs);

   /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
    * read by the CP, so the draw indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (!(stageMask & ~top_of_pipe_flags)) {
      /* Only top-of-pipe stages requested: a direct CP write suffices. */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
      tu_cs_emit(cs, value);
   } else {
      /* Use a RB_DONE_TS event to wait for everything to complete. */
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
      tu_cs_emit_qw(cs, event->bo.iova);
      tu_cs_emit(cs, value);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdSetEvent(VkCommandBuffer commandBuffer,
               VkEvent _event,
               VkPipelineStageFlags stageMask)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_event, event, _event);

   write_event(cmd, event, stageMask, 1);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResetEvent(VkCommandBuffer commandBuffer,
                 VkEvent _event,
                 VkPipelineStageFlags stageMask)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_event, event, _event);

   write_event(cmd, event, stageMask, 0);
}

/* vkCmdWaitEvents: like a pipeline barrier, but additionally waits for the
 * given events to be signalled (see the event loop in tu_barrier()).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
                 uint32_t eventCount,
                 const VkEvent *pEvents,
                 VkPipelineStageFlags srcStageMask,
                 VkPipelineStageFlags dstStageMask,
                 uint32_t memoryBarrierCount,
                 const VkMemoryBarrier *pMemoryBarriers,
                 uint32_t bufferMemoryBarrierCount,
                 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
                 uint32_t imageMemoryBarrierCount,
                 const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct
      tu_barrier_info info;

   info.eventCount = eventCount;
   info.pEvents = pEvents;
   info.srcStageMask = srcStageMask;
   info.dstStageMask = dstStageMask;

   tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
              bufferMemoryBarrierCount, pBufferMemoryBarriers,
              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
{
   /* No-op */
}


/* vkCmdBeginConditionalRenderingEXT: enable predicated draws based on a
 * 32-bit value in the given buffer.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.predication_active = true;
   if (cmd->state.pass)
      cmd->state.has_subpass_predication = true;

   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 1);

   /* Wait for any writes to the predicate to land */
   if (cmd->state.pass)
      tu_emit_cache_flush_renderpass(cmd, cs);
   else
      tu_emit_cache_flush(cmd, cs);

   TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
   uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;

   /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
    * mandates 32-bit comparisons. Our workaround is to copy the reference
    * value to the low 32-bits of a location where the high 32 bits are known
    * to be 0 and then compare that.
    */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
   tu_cs_emit_qw(cs, iova);

   /* Make sure the copy has landed before the CP reads the predicate. */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
}

/* vkCmdEndConditionalRenderingEXT: disable predication so subsequent draws
 * execute unconditionally.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.predication_active = false;

   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0);
}