1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based in part on anv driver which is: 6 * Copyright © 2015 Intel Corporation 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 * DEALINGS IN THE SOFTWARE. 26 */ 27 28#ifndef TU_PRIVATE_H 29#define TU_PRIVATE_H 30 31#include <assert.h> 32#include <pthread.h> 33#include <stdbool.h> 34#include <stdint.h> 35#include <stdio.h> 36#include <stdlib.h> 37#include <string.h> 38#ifdef HAVE_VALGRIND 39#include <memcheck.h> 40#include <valgrind.h> 41#define VG(x) x 42#else 43#define VG(x) ((void)0) 44#endif 45 46#define MESA_LOG_TAG "TU" 47 48#include "c11/threads.h" 49#include "main/macros.h" 50#include "util/bitscan.h" 51#include "util/list.h" 52#include "util/log.h" 53#include "util/macros.h" 54#include "util/u_atomic.h" 55#include "util/u_dynarray.h" 56#include "util/perf/u_trace.h" 57#include "vk_alloc.h" 58#include "vk_debug_report.h" 59#include "vk_device.h" 60#include "vk_dispatch_table.h" 61#include "vk_extensions.h" 62#include "vk_instance.h" 63#include "vk_log.h" 64#include "vk_physical_device.h" 65#include "vk_shader_module.h" 66#include "wsi_common.h" 67 68#include "ir3/ir3_compiler.h" 69#include "ir3/ir3_shader.h" 70 71#include "adreno_common.xml.h" 72#include "adreno_pm4.xml.h" 73#include "a6xx.xml.h" 74#include "fdl/freedreno_layout.h" 75#include "common/freedreno_dev_info.h" 76#include "perfcntrs/freedreno_perfcntr.h" 77 78#include "tu_descriptor_set.h" 79#include "tu_util.h" 80#include "tu_perfetto.h" 81 82/* Pre-declarations needed for WSI entrypoints */ 83struct wl_surface; 84struct wl_display; 85typedef struct xcb_connection_t xcb_connection_t; 86typedef uint32_t xcb_visualid_t; 87typedef uint32_t xcb_window_t; 88 89#include <vulkan/vk_android_native_buffer.h> 90#include <vulkan/vk_icd.h> 91#include <vulkan/vulkan.h> 92 93#include "tu_entrypoints.h" 94 95#include "vk_format.h" 96#include "vk_command_buffer.h" 97#include "vk_queue.h" 98 99#define MAX_VBS 32 100#define MAX_VERTEX_ATTRIBS 32 101#define MAX_RTS 8 102#define MAX_VSC_PIPES 32 103#define MAX_VIEWPORTS 16 104#define MAX_VIEWPORT_SIZE (1 << 14) 105#define MAX_SCISSORS 16 106#define MAX_DISCARD_RECTANGLES 4 107#define MAX_PUSH_CONSTANTS_SIZE 128 108#define MAX_PUSH_DESCRIPTORS 32 109#define MAX_DYNAMIC_UNIFORM_BUFFERS 16 110#define MAX_DYNAMIC_STORAGE_BUFFERS 8 111#define MAX_DYNAMIC_BUFFERS \ 112 (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) 113#define TU_MAX_DRM_DEVICES 8 114#define MAX_VIEWS 16 115#define MAX_BIND_POINTS 2 /* compute + graphics */ 116/* The Qualcomm driver exposes 0x20000058 */ 117#define MAX_STORAGE_BUFFER_RANGE 0x20000000 118/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so 119 * expose the same maximum range. 120 * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual 121 * range might be higher. 122 */ 123#define MAX_UNIFORM_BUFFER_RANGE 0x10000 124 125#define A6XX_TEX_CONST_DWORDS 16 126#define A6XX_TEX_SAMP_DWORDS 4 127 128#define COND(bool, val) ((bool) ? (val) : 0) 129#define BIT(bit) (1u << (bit)) 130 131/* Whenever we generate an error, pass it through this function. Useful for 132 * debugging, where we can break on it. Only call at error site, not when 133 * propagating errors. Might be useful to plug in a stack trace here. 134 */ 135 136struct tu_instance; 137 138VkResult 139__vk_startup_errorf(struct tu_instance *instance, 140 VkResult error, 141 bool force_print, 142 const char *file, 143 int line, 144 const char *format, 145 ...) PRINTFLIKE(6, 7); 146 147/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver 148 * build. 149 */ 150#define vk_startup_errorf(instance, error, format, ...) \ 151 __vk_startup_errorf(instance, error, \ 152 instance->debug_flags & TU_DEBUG_STARTUP, \ 153 __FILE__, __LINE__, format, ##__VA_ARGS__) 154 155void 156__tu_finishme(const char *file, int line, const char *format, ...) 157 PRINTFLIKE(3, 4); 158 159/** 160 * Print a FINISHME message, including its source location. 161 */ 162#define tu_finishme(format, ...) \ 163 do { \ 164 static bool reported = false; \ 165 if (!reported) { \ 166 __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \ 167 reported = true; \ 168 } \ 169 } while (0) 170 171#define tu_stub() \ 172 do { \ 173 tu_finishme("stub %s", __func__); \ 174 } while (0) 175 176struct tu_memory_heap { 177 /* Standard bits passed on to the client */ 178 VkDeviceSize size; 179 VkMemoryHeapFlags flags; 180 181 /** Copied from ANV: 182 * 183 * Driver-internal book-keeping. 184 * 185 * Align it to 64 bits to make atomic operations faster on 32 bit platforms. 186 */ 187 VkDeviceSize used __attribute__ ((aligned (8))); 188}; 189 190uint64_t 191tu_get_system_heap_size(void); 192 193struct tu_physical_device 194{ 195 struct vk_physical_device vk; 196 197 struct tu_instance *instance; 198 199 const char *name; 200 uint8_t driver_uuid[VK_UUID_SIZE]; 201 uint8_t device_uuid[VK_UUID_SIZE]; 202 uint8_t cache_uuid[VK_UUID_SIZE]; 203 204 struct wsi_device wsi_device; 205 206 int local_fd; 207 int master_fd; 208 209 uint32_t gmem_size; 210 uint64_t gmem_base; 211 uint32_t ccu_offset_gmem; 212 uint32_t ccu_offset_bypass; 213 214 struct fd_dev_id dev_id; 215 const struct fd_dev_info *info; 216 217 int msm_major_version; 218 int msm_minor_version; 219 220 /* This is the drivers on-disk cache used as a fallback as opposed to 221 * the pipeline cache defined by apps. 222 */ 223 struct disk_cache *disk_cache; 224 225 struct tu_memory_heap heap; 226}; 227 228enum tu_debug_flags 229{ 230 TU_DEBUG_STARTUP = 1 << 0, 231 TU_DEBUG_NIR = 1 << 1, 232 TU_DEBUG_NOBIN = 1 << 3, 233 TU_DEBUG_SYSMEM = 1 << 4, 234 TU_DEBUG_FORCEBIN = 1 << 5, 235 TU_DEBUG_NOUBWC = 1 << 6, 236 TU_DEBUG_NOMULTIPOS = 1 << 7, 237 TU_DEBUG_NOLRZ = 1 << 8, 238 TU_DEBUG_PERFC = 1 << 9, 239 TU_DEBUG_FLUSHALL = 1 << 10, 240 TU_DEBUG_SYNCDRAW = 1 << 11, 241}; 242 243struct tu_instance 244{ 245 struct vk_instance vk; 246 247 uint32_t api_version; 248 int physical_device_count; 249 struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; 250 251 enum tu_debug_flags debug_flags; 252}; 253 254VkResult 255tu_wsi_init(struct tu_physical_device *physical_device); 256void 257tu_wsi_finish(struct tu_physical_device *physical_device); 258 259bool 260tu_instance_extension_supported(const char *name); 261uint32_t 262tu_physical_device_api_version(struct tu_physical_device *dev); 263bool 264tu_physical_device_extension_supported(struct tu_physical_device *dev, 265 const char *name); 266 267struct cache_entry; 268 269struct tu_pipeline_cache 270{ 271 struct vk_object_base base; 272 273 struct tu_device *device; 274 pthread_mutex_t mutex; 275 276 uint32_t total_size; 277 uint32_t table_size; 278 uint32_t kernel_count; 279 struct cache_entry **hash_table; 280 bool modified; 281 282 VkAllocationCallbacks alloc; 283}; 284 285struct tu_pipeline_key 286{ 287}; 288 289 290/* queue types */ 291#define TU_QUEUE_GENERAL 0 292 293#define TU_MAX_QUEUE_FAMILIES 1 294 295struct tu_syncobj; 296struct tu_u_trace_syncobj; 297 298struct tu_queue 299{ 300 struct vk_queue vk; 301 302 struct tu_device *device; 303 304 uint32_t msm_queue_id; 305 int fence; 306 307 /* Queue containing deferred submits */ 308 struct list_head queued_submits; 309}; 310 311struct tu_bo 312{ 313 uint32_t gem_handle; 314 uint64_t size; 315 uint64_t iova; 316 void *map; 317}; 318 319enum global_shader { 320 GLOBAL_SH_VS_BLIT, 321 GLOBAL_SH_VS_CLEAR, 322 GLOBAL_SH_FS_BLIT, 323 GLOBAL_SH_FS_BLIT_ZSCALE, 324 GLOBAL_SH_FS_COPY_MS, 325 GLOBAL_SH_FS_CLEAR0, 326 GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS, 327 GLOBAL_SH_COUNT, 328}; 329 330#define TU_BORDER_COLOR_COUNT 4096 331#define TU_BORDER_COLOR_BUILTIN 6 332 333#define TU_BLIT_SHADER_SIZE 1024 334 335/* This struct defines the layout of the global_bo */ 336struct tu6_global 337{ 338 /* clear/blit shaders */ 339 uint32_t shaders[TU_BLIT_SHADER_SIZE]; 340 341 uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ 342 uint32_t _pad0; 343 volatile uint32_t vsc_draw_overflow; 344 uint32_t _pad1; 345 volatile uint32_t vsc_prim_overflow; 346 uint32_t _pad2; 347 uint64_t predicate; 348 349 /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ 350 struct { 351 uint32_t offset; 352 uint32_t pad[7]; 353 } flush_base[4]; 354 355 ALIGN16 uint32_t cs_indirect_xyz[3]; 356 357 /* note: larger global bo will be used for customBorderColors */ 358 struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; 359}; 360#define gb_offset(member) offsetof(struct tu6_global, member) 361#define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member)) 362 363/* extra space in vsc draw/prim streams */ 364#define VSC_PAD 0x40 365 366struct tu_device 367{ 368 struct vk_device vk; 369 struct tu_instance *instance; 370 371 struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; 372 int queue_count[TU_MAX_QUEUE_FAMILIES]; 373 374 struct tu_physical_device *physical_device; 375 int fd; 376 int _lost; 377 378 struct ir3_compiler *compiler; 379 380 /* Backup in-memory cache to be used if the app doesn't provide one */ 381 struct tu_pipeline_cache *mem_cache; 382 383#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ 384 385 /* Currently the kernel driver uses a 32-bit GPU address space, but it 386 * should be impossible to go beyond 48 bits. 387 */ 388 struct { 389 struct tu_bo bo; 390 mtx_t construct_mtx; 391 bool initialized; 392 } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; 393 394 struct tu_bo global_bo; 395 396 struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT]; 397 uint64_t global_shader_va[GLOBAL_SH_COUNT]; 398 399 uint32_t vsc_draw_strm_pitch; 400 uint32_t vsc_prim_strm_pitch; 401 BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT); 402 mtx_t mutex; 403 404 /* bo list for submits: */ 405 struct drm_msm_gem_submit_bo *bo_list; 406 /* map bo handles to bo list index: */ 407 uint32_t *bo_idx; 408 uint32_t bo_count, bo_list_size, bo_idx_size; 409 mtx_t bo_mutex; 410 411 /* Command streams to set pass index to a scratch reg */ 412 struct tu_cs *perfcntrs_pass_cs; 413 struct tu_cs_entry *perfcntrs_pass_cs_entries; 414 415 /* Condition variable for timeline semaphore to notify waiters when a 416 * new submit is executed. */ 417 pthread_cond_t timeline_cond; 418 pthread_mutex_t submit_mutex; 419 420#ifdef ANDROID 421 const void *gralloc; 422 enum { 423 TU_GRALLOC_UNKNOWN, 424 TU_GRALLOC_CROS, 425 TU_GRALLOC_OTHER, 426 } gralloc_type; 427#endif 428 429 uint32_t submit_count; 430 431 struct u_trace_context trace_context; 432 433 #ifdef HAVE_PERFETTO 434 struct tu_perfetto_state perfetto; 435 #endif 436}; 437 438void tu_init_clear_blit_shaders(struct tu_device *dev); 439 440void tu_destroy_clear_blit_shaders(struct tu_device *dev); 441 442VkResult _tu_device_set_lost(struct tu_device *device, 443 const char *msg, ...) PRINTFLIKE(2, 3); 444#define tu_device_set_lost(dev, ...) \ 445 _tu_device_set_lost(dev, __VA_ARGS__) 446 447static inline bool 448tu_device_is_lost(struct tu_device *device) 449{ 450 return unlikely(p_atomic_read(&device->_lost)); 451} 452 453VkResult 454tu_device_submit_deferred_locked(struct tu_device *dev); 455 456VkResult 457tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); 458 459uint64_t 460tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); 461 462enum tu_bo_alloc_flags 463{ 464 TU_BO_ALLOC_NO_FLAGS = 0, 465 TU_BO_ALLOC_ALLOW_DUMP = 1 << 0, 466 TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1, 467}; 468 469VkResult 470tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size, 471 enum tu_bo_alloc_flags flags); 472VkResult 473tu_bo_init_dmabuf(struct tu_device *dev, 474 struct tu_bo *bo, 475 uint64_t size, 476 int fd); 477int 478tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo); 479void 480tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); 481VkResult 482tu_bo_map(struct tu_device *dev, struct tu_bo *bo); 483 484/* Get a scratch bo for use inside a command buffer. This will always return 485 * the same bo given the same size or similar sizes, so only one scratch bo 486 * can be used at the same time. It's meant for short-lived things where we 487 * need to write to some piece of memory, read from it, and then immediately 488 * discard it. 489 */ 490VkResult 491tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); 492 493struct tu_cs_entry 494{ 495 /* No ownership */ 496 const struct tu_bo *bo; 497 498 uint32_t size; 499 uint32_t offset; 500}; 501 502struct tu_cs_memory { 503 uint32_t *map; 504 uint64_t iova; 505}; 506 507struct tu_draw_state { 508 uint64_t iova : 48; 509 uint32_t size : 16; 510}; 511 512enum tu_dynamic_state 513{ 514 /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */ 515 TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1, 516 TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 517 TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 518 TU_DYNAMIC_STATE_VB_STRIDE, 519 TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 520 TU_DYNAMIC_STATE_COUNT, 521 /* no associated draw state: */ 522 TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT, 523 TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, 524 /* re-use the line width enum as it uses GRAS_SU_CNTL: */ 525 TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH, 526}; 527 528enum tu_draw_state_group_id 529{ 530 TU_DRAW_STATE_PROGRAM_CONFIG, 531 TU_DRAW_STATE_PROGRAM, 532 TU_DRAW_STATE_PROGRAM_BINNING, 533 TU_DRAW_STATE_TESS, 534 TU_DRAW_STATE_VB, 535 TU_DRAW_STATE_VI, 536 TU_DRAW_STATE_VI_BINNING, 537 TU_DRAW_STATE_RAST, 538 TU_DRAW_STATE_BLEND, 539 TU_DRAW_STATE_SHADER_GEOM_CONST, 540 TU_DRAW_STATE_FS_CONST, 541 TU_DRAW_STATE_DESC_SETS, 542 TU_DRAW_STATE_DESC_SETS_LOAD, 543 TU_DRAW_STATE_VS_PARAMS, 544 TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, 545 TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, 546 TU_DRAW_STATE_LRZ, 547 TU_DRAW_STATE_DEPTH_PLANE, 548 549 /* dynamic state related draw states */ 550 TU_DRAW_STATE_DYNAMIC, 551 TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, 552}; 553 554enum tu_cs_mode 555{ 556 557 /* 558 * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it 559 * is full. tu_cs_begin must be called before command packet emission and 560 * tu_cs_end must be called after. 561 * 562 * This mode may create multiple entries internally. The entries must be 563 * submitted together. 564 */ 565 TU_CS_MODE_GROW, 566 567 /* 568 * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external, 569 * fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no 570 * effect on it. 571 * 572 * This mode does not create any entry or any BO. 573 */ 574 TU_CS_MODE_EXTERNAL, 575 576 /* 577 * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct 578 * command packet emission. tu_cs_begin_sub_stream must be called to get a 579 * sub-stream to emit comamnd packets to. When done with the sub-stream, 580 * tu_cs_end_sub_stream must be called. 581 * 582 * This mode does not create any entry internally. 583 */ 584 TU_CS_MODE_SUB_STREAM, 585}; 586 587struct tu_cs 588{ 589 uint32_t *start; 590 uint32_t *cur; 591 uint32_t *reserved_end; 592 uint32_t *end; 593 594 struct tu_device *device; 595 enum tu_cs_mode mode; 596 uint32_t next_bo_size; 597 598 struct tu_cs_entry *entries; 599 uint32_t entry_count; 600 uint32_t entry_capacity; 601 602 struct tu_bo **bos; 603 uint32_t bo_count; 604 uint32_t bo_capacity; 605 606 /* state for cond_exec_start/cond_exec_end */ 607 uint32_t cond_flags; 608 uint32_t *cond_dwords; 609}; 610 611struct tu_device_memory 612{ 613 struct vk_object_base base; 614 615 struct tu_bo bo; 616}; 617 618struct tu_descriptor_range 619{ 620 uint64_t va; 621 uint32_t size; 622}; 623 624struct tu_descriptor_set 625{ 626 struct vk_object_base base; 627 628 const struct tu_descriptor_set_layout *layout; 629 struct tu_descriptor_pool *pool; 630 uint32_t size; 631 632 uint64_t va; 633 uint32_t *mapped_ptr; 634 635 uint32_t *dynamic_descriptors; 636}; 637 638struct tu_descriptor_pool_entry 639{ 640 uint32_t offset; 641 uint32_t size; 642 struct tu_descriptor_set *set; 643}; 644 645struct tu_descriptor_pool 646{ 647 struct vk_object_base base; 648 649 struct tu_bo bo; 650 uint64_t current_offset; 651 uint64_t size; 652 653 uint8_t *host_memory_base; 654 uint8_t *host_memory_ptr; 655 uint8_t *host_memory_end; 656 uint8_t *host_bo; 657 658 uint32_t entry_count; 659 uint32_t max_entry_count; 660 struct tu_descriptor_pool_entry entries[0]; 661}; 662 663struct tu_descriptor_update_template_entry 664{ 665 VkDescriptorType descriptor_type; 666 667 /* The number of descriptors to update */ 668 uint32_t descriptor_count; 669 670 /* Into mapped_ptr or dynamic_descriptors, in units of the respective array 671 */ 672 uint32_t dst_offset; 673 674 /* In dwords. Not valid/used for dynamic descriptors */ 675 uint32_t dst_stride; 676 677 uint32_t buffer_offset; 678 679 /* Only valid for combined image samplers and samplers */ 680 uint16_t has_sampler; 681 682 /* In bytes */ 683 size_t src_offset; 684 size_t src_stride; 685 686 /* For push descriptors */ 687 const struct tu_sampler *immutable_samplers; 688}; 689 690struct tu_descriptor_update_template 691{ 692 struct vk_object_base base; 693 694 uint32_t entry_count; 695 VkPipelineBindPoint bind_point; 696 struct tu_descriptor_update_template_entry entry[0]; 697}; 698 699struct tu_buffer 700{ 701 struct vk_object_base base; 702 703 VkDeviceSize size; 704 705 VkBufferUsageFlags usage; 706 VkBufferCreateFlags flags; 707 708 struct tu_bo *bo; 709 VkDeviceSize bo_offset; 710}; 711 712static inline uint64_t 713tu_buffer_iova(struct tu_buffer *buffer) 714{ 715 return buffer->bo->iova + buffer->bo_offset; 716} 717 718const char * 719tu_get_debug_option_name(int id); 720 721const char * 722tu_get_perftest_option_name(int id); 723 724struct tu_descriptor_state 725{ 726 struct tu_descriptor_set *sets[MAX_SETS]; 727 struct tu_descriptor_set push_set; 728 uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS]; 729}; 730 731enum tu_cmd_dirty_bits 732{ 733 TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), 734 TU_CMD_DIRTY_VB_STRIDE = BIT(1), 735 TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), 736 TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), 737 TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), 738 TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), 739 TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), 740 TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), 741 TU_CMD_DIRTY_LRZ = BIT(8), 742 TU_CMD_DIRTY_VS_PARAMS = BIT(9), 743 TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), 744 /* all draw states were disabled and need to be re-enabled: */ 745 TU_CMD_DIRTY_DRAW_STATE = BIT(11) 746}; 747 748/* There are only three cache domains we have to care about: the CCU, or 749 * color cache unit, which is used for color and depth/stencil attachments 750 * and copy/blit destinations, and is split conceptually into color and depth, 751 * and the universal cache or UCHE which is used for pretty much everything 752 * else, except for the CP (uncached) and host. We need to flush whenever data 753 * crosses these boundaries. 754 */ 755 756enum tu_cmd_access_mask { 757 TU_ACCESS_UCHE_READ = 1 << 0, 758 TU_ACCESS_UCHE_WRITE = 1 << 1, 759 TU_ACCESS_CCU_COLOR_READ = 1 << 2, 760 TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, 761 TU_ACCESS_CCU_DEPTH_READ = 1 << 4, 762 TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, 763 764 /* Experiments have shown that while it's safe to avoid flushing the CCU 765 * after each blit/renderpass, it's not safe to assume that subsequent 766 * lookups with a different attachment state will hit unflushed cache 767 * entries. That is, the CCU needs to be flushed and possibly invalidated 768 * when accessing memory with a different attachment state. Writing to an 769 * attachment under the following conditions after clearing using the 770 * normal 2d engine path is known to have issues: 771 * 772 * - It isn't the 0'th layer. 773 * - There are more than one attachment, and this isn't the 0'th attachment 774 * (this seems to also depend on the cpp of the attachments). 775 * 776 * Our best guess is that the layer/MRT state is used when computing 777 * the location of a cache entry in CCU, to avoid conflicts. We assume that 778 * any access in a renderpass after or before an access by a transfer needs 779 * a flush/invalidate, and use the _INCOHERENT variants to represent access 780 * by a renderpass. 781 */ 782 TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, 783 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, 784 TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, 785 TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, 786 787 /* Accesses which bypasses any cache. e.g. writes via the host, 788 * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. 789 */ 790 TU_ACCESS_SYSMEM_READ = 1 << 10, 791 TU_ACCESS_SYSMEM_WRITE = 1 << 11, 792 793 /* Memory writes from the CP start in-order with draws and event writes, 794 * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. 795 */ 796 TU_ACCESS_CP_WRITE = 1 << 12, 797 798 TU_ACCESS_READ = 799 TU_ACCESS_UCHE_READ | 800 TU_ACCESS_CCU_COLOR_READ | 801 TU_ACCESS_CCU_DEPTH_READ | 802 TU_ACCESS_CCU_COLOR_INCOHERENT_READ | 803 TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | 804 TU_ACCESS_SYSMEM_READ, 805 806 TU_ACCESS_WRITE = 807 TU_ACCESS_UCHE_WRITE | 808 TU_ACCESS_CCU_COLOR_WRITE | 809 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | 810 TU_ACCESS_CCU_DEPTH_WRITE | 811 TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | 812 TU_ACCESS_SYSMEM_WRITE | 813 TU_ACCESS_CP_WRITE, 814 815 TU_ACCESS_ALL = 816 TU_ACCESS_READ | 817 TU_ACCESS_WRITE, 818}; 819 820/* Starting with a6xx, the pipeline is split into several "clusters" (really 821 * pipeline stages). Each stage has its own pair of register banks and can 822 * switch them independently, so that earlier stages can run ahead of later 823 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at 824 * the same time. 825 * 826 * As a result of this, we need to insert a WFI when an earlier stage depends 827 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any 828 * pending WFI's to complete before starting, and usually before reading 829 * indirect params even, so a WFI also acts as a full "pipeline stall". 830 * 831 * Note, the names of the stages come from CLUSTER_* in devcoredump. We 832 * include all the stages for completeness, even ones which do not read/write 833 * anything. 834 */ 835 836enum tu_stage { 837 /* This doesn't correspond to a cluster, but we need it for tracking 838 * indirect draw parameter reads etc. 839 */ 840 TU_STAGE_CP, 841 842 /* - Fetch index buffer 843 * - Fetch vertex attributes, dispatch VS 844 */ 845 TU_STAGE_FE, 846 847 /* Execute all geometry stages (VS thru GS) */ 848 TU_STAGE_SP_VS, 849 850 /* Write to VPC, do primitive assembly. */ 851 TU_STAGE_PC_VS, 852 853 /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according 854 * to devcoredump so presumably this stage stalls for TU_STAGE_PS when 855 * early depth testing is enabled before dispatching fragments? However 856 * GRAS reads and writes LRZ directly. 857 */ 858 TU_STAGE_GRAS, 859 860 /* Execute FS */ 861 TU_STAGE_SP_PS, 862 863 /* - Fragment tests 864 * - Write color/depth 865 * - Streamout writes (???) 866 * - Varying interpolation (???) 867 */ 868 TU_STAGE_PS, 869}; 870 871enum tu_cmd_flush_bits { 872 TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, 873 TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, 874 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, 875 TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, 876 TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, 877 TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, 878 TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, 879 TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, 880 TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, 881 882 TU_CMD_FLAG_ALL_FLUSH = 883 TU_CMD_FLAG_CCU_FLUSH_DEPTH | 884 TU_CMD_FLAG_CCU_FLUSH_COLOR | 885 TU_CMD_FLAG_CACHE_FLUSH | 886 /* Treat the CP as a sort of "cache" which may need to be "flushed" via 887 * waiting for writes to land with WAIT_FOR_MEM_WRITES. 888 */ 889 TU_CMD_FLAG_WAIT_MEM_WRITES, 890 891 TU_CMD_FLAG_ALL_INVALIDATE = 892 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | 893 TU_CMD_FLAG_CCU_INVALIDATE_COLOR | 894 TU_CMD_FLAG_CACHE_INVALIDATE, 895}; 896 897/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty 898 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change 899 * which part of the gmem is used by the CCU. Here we keep track of what the 900 * state of the CCU. 901 */ 902enum tu_cmd_ccu_state { 903 TU_CMD_CCU_SYSMEM, 904 TU_CMD_CCU_GMEM, 905 TU_CMD_CCU_UNKNOWN, 906}; 907 908struct tu_cache_state { 909 /* Caches which must be made available (flushed) eventually if there are 910 * any users outside that cache domain, and caches which must be 911 * invalidated eventually if there are any reads. 912 */ 913 enum tu_cmd_flush_bits pending_flush_bits; 914 /* Pending flushes */ 915 enum tu_cmd_flush_bits flush_bits; 916}; 917 918enum tu_lrz_force_disable_mask { 919 TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0, 920 TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1, 921}; 922 923enum tu_lrz_direction { 924 TU_LRZ_UNKNOWN, 925 /* Depth func less/less-than: */ 926 TU_LRZ_LESS, 927 /* Depth func greater/greater-than: */ 928 TU_LRZ_GREATER, 929}; 930 931struct tu_lrz_pipeline 932{ 933 uint32_t force_disable_mask; 934 bool fs_has_kill; 935 bool force_late_z; 936 bool early_fragment_tests; 937}; 938 939struct tu_lrz_state 940{ 941 /* Depth/Stencil image currently on use to do LRZ */ 942 struct tu_image *image; 943 bool valid : 1; 944 struct tu_draw_state state; 945 enum tu_lrz_direction prev_direction; 946}; 947 948struct tu_vs_params { 949 uint32_t vertex_offset; 950 uint32_t first_instance; 951}; 952 953struct tu_cmd_state 954{ 955 uint32_t dirty; 956 957 struct tu_pipeline *pipeline; 958 struct tu_pipeline *compute_pipeline; 959 960 /* Vertex buffers, viewports, and scissors 961 * the states for these can be updated partially, so we need to save these 962 * to be able to emit a complete draw state 963 */ 964 struct { 965 uint64_t base; 966 uint32_t size; 967 uint32_t stride; 968 } vb[MAX_VBS]; 969 VkViewport viewport[MAX_VIEWPORTS]; 970 VkRect2D scissor[MAX_SCISSORS]; 971 uint32_t max_viewport, max_scissor; 972 973 /* for dynamic states that can't be emitted directly */ 974 uint32_t dynamic_stencil_mask; 975 uint32_t dynamic_stencil_wrmask; 976 uint32_t dynamic_stencil_ref; 977 978 uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl; 979 uint32_t pc_raster_cntl, vpc_unknown_9107; 980 enum pc_di_primtype primtype; 981 bool primitive_restart_enable; 982 983 /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */ 984 struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; 985 struct tu_draw_state vertex_buffers; 986 struct tu_draw_state shader_const[2]; 987 struct tu_draw_state desc_sets; 988 989 struct tu_draw_state vs_params; 990 991 /* Index buffer */ 992 uint64_t index_va; 993 uint32_t max_index_count; 994 uint8_t index_size; 995 996 /* because streamout base has to be 32-byte aligned 997 * there is an extra offset to deal with when it is 998 * unaligned 999 */ 1000 uint8_t streamout_offset[IR3_MAX_SO_BUFFERS]; 1001 1002 /* Renderpasses are tricky, because we may need to flush differently if 1003 * using sysmem vs. gmem and therefore we have to delay any flushing that 1004 * happens before a renderpass. So we have to have two copies of the flush 1005 * state, one for intra-renderpass flushes (i.e. renderpass dependencies) 1006 * and one for outside a renderpass. 1007 */ 1008 struct tu_cache_state cache; 1009 struct tu_cache_state renderpass_cache; 1010 1011 enum tu_cmd_ccu_state ccu_state; 1012 1013 const struct tu_render_pass *pass; 1014 const struct tu_subpass *subpass; 1015 const struct tu_framebuffer *framebuffer; 1016 VkRect2D render_area; 1017 1018 const struct tu_image_view **attachments; 1019 1020 bool xfb_used; 1021 bool has_tess; 1022 bool has_subpass_predication; 1023 bool predication_active; 1024 bool disable_gmem; 1025 enum a5xx_line_mode line_mode; 1026 1027 struct tu_lrz_state lrz; 1028 1029 struct tu_draw_state depth_plane_state; 1030 1031 struct tu_vs_params last_vs_params; 1032}; 1033 1034struct tu_cmd_pool 1035{ 1036 struct vk_object_base base; 1037 1038 VkAllocationCallbacks alloc; 1039 struct list_head cmd_buffers; 1040 struct list_head free_cmd_buffers; 1041 uint32_t queue_family_index; 1042}; 1043 1044enum tu_cmd_buffer_status 1045{ 1046 TU_CMD_BUFFER_STATUS_INVALID, 1047 TU_CMD_BUFFER_STATUS_INITIAL, 1048 TU_CMD_BUFFER_STATUS_RECORDING, 1049 TU_CMD_BUFFER_STATUS_EXECUTABLE, 1050 TU_CMD_BUFFER_STATUS_PENDING, 1051}; 1052 1053struct tu_cmd_buffer 1054{ 1055 struct vk_command_buffer vk; 1056 1057 struct tu_device *device; 1058 1059 struct tu_cmd_pool *pool; 1060 struct list_head pool_link; 1061 1062 struct u_trace trace; 1063 struct u_trace_iterator trace_renderpass_start; 1064 struct u_trace_iterator trace_renderpass_end; 1065 1066 VkCommandBufferUsageFlags usage_flags; 1067 VkCommandBufferLevel level; 1068 enum tu_cmd_buffer_status status; 1069 1070 struct tu_cmd_state state; 1071 uint32_t queue_family_index; 1072 1073 uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; 1074 VkShaderStageFlags push_constant_stages; 1075 struct tu_descriptor_set meta_push_descriptors; 1076 1077 struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; 1078 1079 VkResult record_result; 1080 1081 struct tu_cs cs; 1082 struct tu_cs draw_cs; 1083 struct tu_cs tile_store_cs; 1084 struct tu_cs draw_epilogue_cs; 1085 struct tu_cs sub_cs; 1086 1087 uint32_t vsc_draw_strm_pitch; 1088 uint32_t vsc_prim_strm_pitch; 1089}; 1090 1091/* Temporary struct for tracking a register state to be written, used by 1092 * a6xx-pack.h and tu_cs_emit_regs() 1093 */ 1094struct tu_reg_value { 1095 uint32_t reg; 1096 uint64_t value; 1097 bool is_address; 1098 struct tu_bo *bo; 1099 bool bo_write; 1100 uint32_t bo_offset; 1101 uint32_t bo_shift; 1102}; 1103 1104 1105void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, 1106 struct tu_cs *cs); 1107 1108void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, 1109 struct tu_cs *cs, 1110 enum tu_cmd_ccu_state ccu_state); 1111 1112void 1113tu6_emit_event_write(struct tu_cmd_buffer *cmd, 1114 struct tu_cs *cs, 1115 enum vgt_event_type event); 1116 1117static inline struct tu_descriptor_state * 1118tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, 1119 VkPipelineBindPoint bind_point) 1120{ 1121 return &cmd_buffer->descriptors[bind_point]; 1122} 1123 1124struct tu_event 1125{ 1126 struct vk_object_base base; 1127 struct tu_bo bo; 1128}; 1129 1130struct tu_push_constant_range 1131{ 1132 uint32_t lo; 1133 uint32_t count; 1134}; 1135 1136struct tu_shader 1137{ 1138 struct ir3_shader *ir3_shader; 1139 1140 struct tu_push_constant_range push_consts; 1141 uint8_t active_desc_sets; 1142 bool multi_pos_output; 1143}; 1144 1145bool 1146tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output, 1147 struct tu_device *dev); 1148 1149nir_shader * 1150tu_spirv_to_nir(struct tu_device *dev, 1151 const VkPipelineShaderStageCreateInfo *stage_info, 1152 gl_shader_stage stage); 1153 1154struct tu_shader * 1155tu_shader_create(struct tu_device *dev, 1156 nir_shader *nir, 1157 unsigned multiview_mask, 1158 struct tu_pipeline_layout *layout, 1159 const VkAllocationCallbacks *alloc); 1160 1161void 1162tu_shader_destroy(struct tu_device *dev, 1163 struct tu_shader *shader, 1164 const VkAllocationCallbacks *alloc); 1165 1166struct tu_program_descriptor_linkage 1167{ 1168 struct ir3_const_state const_state; 1169 1170 uint32_t constlen; 1171 1172 struct tu_push_constant_range push_consts; 1173}; 1174 1175struct tu_pipeline_executable { 1176 gl_shader_stage stage; 1177 1178 struct ir3_info stats; 1179 bool is_binning; 1180 1181 char *nir_from_spirv; 1182 char *nir_final; 1183 char *disasm; 1184}; 1185 1186struct tu_pipeline 1187{ 1188 struct vk_object_base base; 1189 1190 struct tu_cs cs; 1191 1192 /* Separate BO for private memory since it should GPU writable */ 1193 struct tu_bo pvtmem_bo; 1194 1195 struct tu_pipeline_layout *layout; 1196 1197 bool need_indirect_descriptor_sets; 1198 VkShaderStageFlags active_stages; 1199 uint32_t active_desc_sets; 1200 1201 /* mask of enabled dynamic states 1202 * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used 1203 */ 1204 uint32_t dynamic_state_mask; 1205 struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; 1206 1207 /* for dynamic states which use the same register: */ 1208 uint32_t gras_su_cntl, gras_su_cntl_mask; 1209 uint32_t rb_depth_cntl, rb_depth_cntl_mask; 1210 uint32_t rb_stencil_cntl, rb_stencil_cntl_mask; 1211 uint32_t pc_raster_cntl, pc_raster_cntl_mask; 1212 uint32_t vpc_unknown_9107, vpc_unknown_9107_mask; 1213 uint32_t stencil_wrmask; 1214 1215 bool rb_depth_cntl_disable; 1216 1217 enum a5xx_line_mode line_mode; 1218 1219 /* draw states for the pipeline */ 1220 struct tu_draw_state load_state, rast_state, blend_state; 1221 1222 /* for vertex buffers state */ 1223 uint32_t num_vbs; 1224 1225 struct 1226 { 1227 struct tu_draw_state config_state; 1228 struct tu_draw_state state; 1229 struct tu_draw_state binning_state; 1230 1231 struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; 1232 } program; 1233 1234 struct 1235 { 1236 struct tu_draw_state state; 1237 struct tu_draw_state binning_state; 1238 } vi; 1239 1240 struct 1241 { 1242 enum pc_di_primtype primtype; 1243 bool primitive_restart; 1244 } ia; 1245 1246 struct 1247 { 1248 uint32_t patch_type; 1249 uint32_t param_stride; 1250 uint32_t hs_bo_regid; 1251 uint32_t ds_bo_regid; 1252 bool upper_left_domain_origin; 1253 } tess; 1254 1255 struct 1256 { 1257 uint32_t local_size[3]; 1258 uint32_t subgroup_size; 1259 } compute; 1260 1261 bool provoking_vertex_last; 1262 1263 struct tu_lrz_pipeline lrz; 1264 1265 void *executables_mem_ctx; 1266 /* tu_pipeline_executable */ 1267 struct util_dynarray executables; 1268}; 1269 1270void 1271tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport); 1272 1273void 1274tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); 1275 1276void 1277tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); 1278 1279void 1280tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); 1281 1282void 1283tu6_emit_depth_bias(struct tu_cs *cs, 1284 float constant_factor, 1285 float clamp, 1286 float slope_factor); 1287 1288void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, 1289 enum a5xx_line_mode line_mode); 1290 1291void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); 1292 1293void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); 1294 1295void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); 1296 1297void tu6_apply_depth_bounds_workaround(struct tu_device *device, 1298 uint32_t *rb_depth_cntl); 1299 1300struct tu_pvtmem_config { 1301 uint64_t iova; 1302 uint32_t per_fiber_size; 1303 uint32_t per_sp_size; 1304 bool per_wave; 1305}; 1306 1307void 1308tu6_emit_xs_config(struct tu_cs *cs, 1309 gl_shader_stage stage, 1310 const struct ir3_shader_variant *xs); 1311 1312void 1313tu6_emit_xs(struct tu_cs *cs, 1314 gl_shader_stage stage, 1315 const struct ir3_shader_variant *xs, 1316 const struct tu_pvtmem_config *pvtmem, 1317 uint64_t binary_iova); 1318 1319void 1320tu6_emit_vpc(struct tu_cs *cs, 1321 const struct ir3_shader_variant *vs, 1322 const struct ir3_shader_variant *hs, 1323 const struct ir3_shader_variant *ds, 1324 const struct ir3_shader_variant *gs, 1325 const struct ir3_shader_variant *fs, 1326 uint32_t patch_control_points); 1327 1328void 1329tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); 1330 1331struct tu_image_view; 1332 1333void 1334tu_resolve_sysmem(struct tu_cmd_buffer *cmd, 1335 struct tu_cs *cs, 1336 const struct tu_image_view *src, 1337 const struct tu_image_view *dst, 1338 uint32_t layer_mask, 1339 uint32_t layers, 1340 const VkRect2D *rect); 1341 1342void 1343tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, 1344 struct tu_cs *cs, 1345 uint32_t a, 1346 const VkRenderPassBeginInfo *info); 1347 1348void 1349tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, 1350 struct tu_cs *cs, 1351 uint32_t a, 1352 const VkRenderPassBeginInfo *info); 1353 1354void 1355tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, 1356 struct tu_cs *cs, 1357 uint32_t a, 1358 bool force_load); 1359 1360/* expose this function to be able to emit load without checking LOAD_OP */ 1361void 1362tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); 1363 1364/* note: gmem store can also resolve */ 1365void 1366tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, 1367 struct tu_cs *cs, 1368 uint32_t a, 1369 uint32_t gmem_a); 1370 1371struct tu_native_format 1372{ 1373 enum a6xx_format fmt : 8; 1374 enum a3xx_color_swap swap : 8; 1375 enum a6xx_tile_mode tile_mode : 8; 1376}; 1377 1378bool tu6_format_vtx_supported(VkFormat format); 1379struct tu_native_format tu6_format_vtx(VkFormat format); 1380bool tu6_format_color_supported(VkFormat format); 1381struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode); 1382bool tu6_format_texture_supported(VkFormat format); 1383struct tu_native_format tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode); 1384 1385static inline enum a6xx_format 1386tu6_base_format(VkFormat format) 1387{ 1388 /* note: tu6_format_color doesn't care about tiling for .fmt field */ 1389 return tu6_format_color(format, TILE6_LINEAR).fmt; 1390} 1391 1392struct tu_image 1393{ 1394 struct vk_object_base base; 1395 1396 /* The original VkFormat provided by the client. This may not match any 1397 * of the actual surface formats. 1398 */ 1399 VkFormat vk_format; 1400 uint32_t level_count; 1401 uint32_t layer_count; 1402 1403 struct fdl_layout layout[3]; 1404 uint32_t total_size; 1405 1406#ifdef ANDROID 1407 /* For VK_ANDROID_native_buffer, the WSI image owns the memory, */ 1408 VkDeviceMemory owned_memory; 1409#endif 1410 1411 /* Set when bound */ 1412 struct tu_bo *bo; 1413 VkDeviceSize bo_offset; 1414 1415 uint32_t lrz_height; 1416 uint32_t lrz_pitch; 1417 uint32_t lrz_offset; 1418 1419 bool shareable; 1420}; 1421 1422static inline uint32_t 1423tu_get_layerCount(const struct tu_image *image, 1424 const VkImageSubresourceRange *range) 1425{ 1426 return range->layerCount == VK_REMAINING_ARRAY_LAYERS 1427 ? image->layer_count - range->baseArrayLayer 1428 : range->layerCount; 1429} 1430 1431static inline uint32_t 1432tu_get_levelCount(const struct tu_image *image, 1433 const VkImageSubresourceRange *range) 1434{ 1435 return range->levelCount == VK_REMAINING_MIP_LEVELS 1436 ? image->level_count - range->baseMipLevel 1437 : range->levelCount; 1438} 1439 1440struct tu_image_view 1441{ 1442 struct vk_object_base base; 1443 1444 struct tu_image *image; /**< VkImageViewCreateInfo::image */ 1445 1446 uint64_t base_addr; 1447 uint64_t ubwc_addr; 1448 uint32_t layer_size; 1449 uint32_t ubwc_layer_size; 1450 1451 /* used to determine if fast gmem store path can be used */ 1452 VkExtent2D extent; 1453 bool need_y2_align; 1454 1455 bool ubwc_enabled; 1456 1457 uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; 1458 1459 /* Descriptor for use as a storage image as opposed to a sampled image. 1460 * This has a few differences for cube maps (e.g. type). 1461 */ 1462 uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS]; 1463 1464 /* pre-filled register values */ 1465 uint32_t PITCH; 1466 uint32_t FLAG_BUFFER_PITCH; 1467 1468 uint32_t RB_MRT_BUF_INFO; 1469 uint32_t SP_FS_MRT_REG; 1470 1471 uint32_t SP_PS_2D_SRC_INFO; 1472 uint32_t SP_PS_2D_SRC_SIZE; 1473 1474 uint32_t RB_2D_DST_INFO; 1475 1476 uint32_t RB_BLIT_DST_INFO; 1477 1478 /* for d32s8 separate stencil */ 1479 uint64_t stencil_base_addr; 1480 uint32_t stencil_layer_size; 1481 uint32_t stencil_PITCH; 1482}; 1483 1484struct tu_sampler_ycbcr_conversion { 1485 struct vk_object_base base; 1486 1487 VkFormat format; 1488 VkSamplerYcbcrModelConversion ycbcr_model; 1489 VkSamplerYcbcrRange ycbcr_range; 1490 VkComponentMapping components; 1491 VkChromaLocation chroma_offsets[2]; 1492 VkFilter chroma_filter; 1493}; 1494 1495struct tu_sampler { 1496 struct vk_object_base base; 1497 1498 uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; 1499 struct tu_sampler_ycbcr_conversion *ycbcr_sampler; 1500}; 1501 1502void 1503tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1504 1505void 1506tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src); 1507 1508void 1509tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1510 1511void 1512tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1513 1514#define tu_image_view_stencil(iview, x) \ 1515 ((iview->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) 1516 1517VkResult 1518tu_gralloc_info(struct tu_device *device, 1519 const VkNativeBufferANDROID *gralloc_info, 1520 int *dma_buf, 1521 uint64_t *modifier); 1522 1523VkResult 1524tu_import_memory_from_gralloc_handle(VkDevice device_h, 1525 int dma_buf, 1526 const VkAllocationCallbacks *alloc, 1527 VkImage image_h); 1528 1529void 1530tu_image_view_init(struct tu_image_view *iview, 1531 const VkImageViewCreateInfo *pCreateInfo, 1532 bool limited_z24s8); 1533 1534bool 1535ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, 1536 const struct fd_dev_info *info, VkSampleCountFlagBits samples); 1537 1538struct tu_buffer_view 1539{ 1540 struct vk_object_base base; 1541 1542 uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; 1543 1544 struct tu_buffer *buffer; 1545}; 1546void 1547tu_buffer_view_init(struct tu_buffer_view *view, 1548 struct tu_device *device, 1549 const VkBufferViewCreateInfo *pCreateInfo); 1550 1551struct tu_attachment_info 1552{ 1553 struct tu_image_view *attachment; 1554}; 1555 1556struct tu_framebuffer 1557{ 1558 struct vk_object_base base; 1559 1560 uint32_t width; 1561 uint32_t height; 1562 uint32_t layers; 1563 1564 /* size of the first tile */ 1565 VkExtent2D tile0; 1566 /* number of tiles */ 1567 VkExtent2D tile_count; 1568 1569 /* size of the first VSC pipe */ 1570 VkExtent2D pipe0; 1571 /* number of VSC pipes */ 1572 VkExtent2D pipe_count; 1573 1574 /* pipe register values */ 1575 uint32_t pipe_config[MAX_VSC_PIPES]; 1576 uint32_t pipe_sizes[MAX_VSC_PIPES]; 1577 1578 uint32_t attachment_count; 1579 struct tu_attachment_info attachments[0]; 1580}; 1581 1582void 1583tu_framebuffer_tiling_config(struct tu_framebuffer *fb, 1584 const struct tu_device *device, 1585 const struct tu_render_pass *pass); 1586 1587struct tu_subpass_barrier { 1588 VkPipelineStageFlags src_stage_mask; 1589 VkPipelineStageFlags dst_stage_mask; 1590 VkAccessFlags src_access_mask; 1591 VkAccessFlags dst_access_mask; 1592 bool incoherent_ccu_color, incoherent_ccu_depth; 1593}; 1594 1595struct tu_subpass_attachment 1596{ 1597 uint32_t attachment; 1598 1599 /* For input attachments, true if it needs to be patched to refer to GMEM 1600 * in GMEM mode. This is false if it hasn't already been written as an 1601 * attachment. 1602 */ 1603 bool patch_input_gmem; 1604}; 1605 1606struct tu_subpass 1607{ 1608 uint32_t input_count; 1609 uint32_t color_count; 1610 uint32_t resolve_count; 1611 bool resolve_depth_stencil; 1612 1613 /* True if there is any feedback loop at all. */ 1614 bool feedback; 1615 1616 /* True if we must invalidate UCHE thanks to a feedback loop. */ 1617 bool feedback_invalidate; 1618 1619 struct tu_subpass_attachment *input_attachments; 1620 struct tu_subpass_attachment *color_attachments; 1621 struct tu_subpass_attachment *resolve_attachments; 1622 struct tu_subpass_attachment depth_stencil_attachment; 1623 1624 VkSampleCountFlagBits samples; 1625 1626 uint32_t srgb_cntl; 1627 uint32_t multiview_mask; 1628 1629 struct tu_subpass_barrier start_barrier; 1630}; 1631 1632struct tu_render_pass_attachment 1633{ 1634 VkFormat format; 1635 uint32_t samples; 1636 uint32_t cpp; 1637 VkImageAspectFlags clear_mask; 1638 uint32_t clear_views; 1639 bool load; 1640 bool store; 1641 int32_t gmem_offset; 1642 /* for D32S8 separate stencil: */ 1643 bool load_stencil; 1644 bool store_stencil; 1645 int32_t gmem_offset_stencil; 1646}; 1647 1648struct tu_render_pass 1649{ 1650 struct vk_object_base base; 1651 1652 uint32_t attachment_count; 1653 uint32_t subpass_count; 1654 uint32_t gmem_pixels; 1655 uint32_t tile_align_w; 1656 struct tu_subpass_attachment *subpass_attachments; 1657 struct tu_render_pass_attachment *attachments; 1658 struct tu_subpass_barrier end_barrier; 1659 struct tu_subpass subpasses[0]; 1660}; 1661 1662#define PERF_CNTRS_REG 4 1663 1664struct tu_perf_query_data 1665{ 1666 uint32_t gid; /* group-id */ 1667 uint32_t cid; /* countable-id within the group */ 1668 uint32_t cntr_reg; /* counter register within the group */ 1669 uint32_t pass; /* pass index that countables can be requested */ 1670 uint32_t app_idx; /* index provided by apps */ 1671}; 1672 1673struct tu_query_pool 1674{ 1675 struct vk_object_base base; 1676 1677 VkQueryType type; 1678 uint32_t stride; 1679 uint64_t size; 1680 uint32_t pipeline_statistics; 1681 struct tu_bo bo; 1682 1683 /* For performance query */ 1684 const struct fd_perfcntr_group *perf_group; 1685 uint32_t perf_group_count; 1686 uint32_t counter_index_count; 1687 struct tu_perf_query_data perf_query_data[0]; 1688}; 1689 1690uint32_t 1691tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index); 1692 1693void 1694tu_update_descriptor_sets(const struct tu_device *device, 1695 VkDescriptorSet overrideSet, 1696 uint32_t descriptorWriteCount, 1697 const VkWriteDescriptorSet *pDescriptorWrites, 1698 uint32_t descriptorCopyCount, 1699 const VkCopyDescriptorSet *pDescriptorCopies); 1700 1701void 1702tu_update_descriptor_set_with_template( 1703 const struct tu_device *device, 1704 struct tu_descriptor_set *set, 1705 VkDescriptorUpdateTemplate descriptorUpdateTemplate, 1706 const void *pData); 1707 1708VkResult 1709tu_physical_device_init(struct tu_physical_device *device, 1710 struct tu_instance *instance); 1711VkResult 1712tu_enumerate_devices(struct tu_instance *instance); 1713 1714int 1715tu_drm_get_timestamp(struct tu_physical_device *device, 1716 uint64_t *ts); 1717 1718int 1719tu_drm_submitqueue_new(const struct tu_device *dev, 1720 int priority, 1721 uint32_t *queue_id); 1722 1723void 1724tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id); 1725 1726int 1727tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_syncobj *fence2); 1728 1729int 1730tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync); 1731 1732 1733void 1734tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, 1735 void *ts_from, uint32_t from_offset, 1736 void *ts_to, uint32_t to_offset, 1737 uint32_t count); 1738 1739 1740VkResult 1741tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, 1742 struct u_trace **trace_copy); 1743 1744struct tu_u_trace_cmd_data 1745{ 1746 struct tu_cs *timestamp_copy_cs; 1747 struct u_trace *trace; 1748}; 1749 1750void 1751tu_u_trace_cmd_data_finish(struct tu_device *device, 1752 struct tu_u_trace_cmd_data *trace_data, 1753 uint32_t entry_count); 1754 1755struct tu_u_trace_flush_data 1756{ 1757 uint32_t submission_id; 1758 struct tu_u_trace_syncobj *syncobj; 1759 uint32_t trace_count; 1760 struct tu_u_trace_cmd_data *cmd_trace_data; 1761}; 1762 1763#define TU_FROM_HANDLE(__tu_type, __name, __handle) \ 1764 VK_FROM_HANDLE(__tu_type, __name, __handle) 1765 1766VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, 1767 VK_OBJECT_TYPE_COMMAND_BUFFER) 1768VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) 1769VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, 1770 VK_OBJECT_TYPE_INSTANCE) 1771VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, 1772 VK_OBJECT_TYPE_PHYSICAL_DEVICE) 1773VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) 1774 1775VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, base, VkCommandPool, 1776 VK_OBJECT_TYPE_COMMAND_POOL) 1777VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, 1778 VK_OBJECT_TYPE_BUFFER) 1779VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, 1780 VK_OBJECT_TYPE_BUFFER_VIEW) 1781VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, 1782 VK_OBJECT_TYPE_DESCRIPTOR_POOL) 1783VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, 1784 VK_OBJECT_TYPE_DESCRIPTOR_SET) 1785VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, 1786 VkDescriptorSetLayout, 1787 VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) 1788VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, 1789 VkDescriptorUpdateTemplate, 1790 VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) 1791VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, 1792 VK_OBJECT_TYPE_DEVICE_MEMORY) 1793VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) 1794VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, 1795 VK_OBJECT_TYPE_FRAMEBUFFER) 1796VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) 1797VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, 1798 VK_OBJECT_TYPE_IMAGE_VIEW); 1799VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, 1800 VK_OBJECT_TYPE_PIPELINE_CACHE) 1801VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, 1802 VK_OBJECT_TYPE_PIPELINE) 1803VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, 1804 VK_OBJECT_TYPE_PIPELINE_LAYOUT) 1805VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, 1806 VK_OBJECT_TYPE_QUERY_POOL) 1807VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, 1808 VK_OBJECT_TYPE_RENDER_PASS) 1809VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, 1810 VK_OBJECT_TYPE_SAMPLER) 1811VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, 1812 VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) 1813 1814/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */ 1815#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x)) 1816 1817void 1818update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask); 1819 1820#endif /* TU_PRIVATE_H */ 1821